## Model Selection and Training

In [1]:
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from IPython.display import display, Math
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy import stats
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
import joblib
from sklearn.ensemble import ExtraTreesClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC

In [2]:
# Read the raw dataset from disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file location exists.
file_path = "C:/Users/KIIT/Minor Project/synthetic_preterm_3000_final_95.csv"
df = pd.read_csv(file_path)

# Remove the features judged low-impact by the correlation analysis.
low_impact_columns = ["STD", "lenght of contraction"]
df_cleaned = df.drop(columns=low_impact_columns)

# Z-score outlier filter: keep a row only if every column's absolute z-score
# is below 3. The z-scores are computed over all remaining columns.
z_scores = np.abs(stats.zscore(df_cleaned))
rows_within_three_sigma = (z_scores < 3).all(axis=1)
df_cleaned = df_cleaned[rows_within_three_sigma]

In [3]:
import pandas as pd
# Remove outliers from 'Risk Factor Score' using the 1.5 * IQR rule.
# ('lenght of contraction' was already dropped from df_cleaned above.)
def remove_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where ``Q1`` and ``Q3`` are the 25th and 75th percentiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the column used to compute the fences and to filter on.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``column`` value is within the fences. Rows
        where ``column`` is NaN are dropped, since NaN fails both comparisons.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Filter the frame that was passed in. The previous version filtered the
    # notebook-global `df_cleaned`, silently ignoring the `df` argument.
    within_fences = df[column].between(lower_bound, upper_bound)
    return df[within_fences]

# Apply the IQR filter to the 'Risk Factor Score' column of the cleaned frame.
df_cleaned = df_cleaned.pipe(remove_outliers_iqr, 'Risk Factor Score')

In [4]:
# Interaction feature: contraction count multiplied by the risk factor score.
# It is computed from df_cleaned itself rather than the raw `df`, so the result
# does not depend on index alignment with the unfiltered frame. It is built
# with .assign() so that no column is written into a boolean-filtered frame
# (the pattern that triggers pandas' SettingWithCopyWarning).
df_cleaned = df_cleaned.assign(
    Contraction_Risk_Interaction=(
        df_cleaned['Count Contraction'] * df_cleaned['Risk Factor Score']
    )
)

In [5]:
# Inspect the cleaned frame, including the new Contraction_Risk_Interaction column.
df_cleaned

Unnamed: 0,Count Contraction,Entropy,Contraction times,Pre-term,Risk Factor Score,Minor Health Indicator,Lifestyle Factor,Contraction_Risk_Interaction
0,11514,0.044455,1.291351,1,0.471035,0.096713,0.071005,5423.501632
1,11539,0.981544,0.424260,1,0.708752,0.101520,0.096091,8178.286519
2,7684,0.324470,1.005407,1,0.925776,0.213598,0.134627,7113.659200
3,10629,0.667819,-0.940382,0,0.109835,0.105040,0.192961,1167.434579
4,9887,0.335620,0.520655,0,-0.066231,0.054059,0.166894,-654.830643
...,...,...,...,...,...,...,...,...
2995,12009,-0.010580,0.380057,0,0.041634,0.013795,0.035682,499.980007
2996,9739,-0.089561,0.202480,0,-0.149681,0.084975,0.066435,-1457.746853
2997,12039,0.519564,-0.763530,0,-0.172221,0.039890,0.089791,-2073.369642
2998,9730,0.453195,0.951061,1,0.241077,0.158409,0.059210,2345.679290


In [6]:
# Feature-only copy of the cleaned data: the 'Pre-term' target column is removed.
df_cleaned_train = df_cleaned.drop(columns=['Pre-term']).copy()
# Displays the raw frame as loaded (not df_cleaned_train).
df

Unnamed: 0,Count Contraction,lenght of contraction,STD,Entropy,Contraction times,Pre-term,Risk Factor Score,Minor Health Indicator,Lifestyle Factor
0,11514,67796,61903.867958,0.044455,1.291351,1,0.471035,0.096713,0.071005
1,11539,188778,54208.853240,0.981544,0.424260,1,0.708752,0.101520,0.096091
2,7684,141554,58351.975766,0.324470,1.005407,1,0.925776,0.213598,0.134627
3,10629,136777,60630.928699,0.667819,-0.940382,0,0.109835,0.105040,0.192961
4,9887,17023,52016.466243,0.335620,0.520655,0,-0.066231,0.054059,0.166894
...,...,...,...,...,...,...,...,...,...
2995,12009,136254,57967.169488,-0.010580,0.380057,0,0.041634,0.013795,0.035682
2996,9739,215615,52195.639080,-0.089561,0.202480,0,-0.149681,0.084975,0.066435
2997,12039,164586,52096.574916,0.519564,-0.763530,0,-0.172221,0.039890,0.089791
2998,9730,227384,58751.231529,0.453195,0.951061,1,0.241077,0.158409,0.059210


In [7]:
# Names of every int64/float64 column in the cleaned frame.
# NOTE(review): nothing here excludes the 'Pre-term' target, and the next cell
# reassigns numerical_cols, so this value is only used if that cell is skipped.
numeric_frame = df_cleaned.select_dtypes(include=['int64', 'float64'])
numerical_cols = numeric_frame.columns.tolist()

In [8]:
# Every column except the target is treated as a feature to be scaled.
target_col = "Pre-term"
numerical_cols = [col for col in df_cleaned.columns if col != target_col]

# Standardise the feature columns. The target is passed through unchanged;
# the column naming below assumes it comes out as the last column.
num_transformer = ColumnTransformer(
    transformers=[("scaler", StandardScaler(), numerical_cols)],
    remainder="passthrough",
)

# Single-step pipeline wrapping the column transformer.
pipeline = Pipeline(steps=[("scaling", num_transformer)])

# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed in a later cell, so test-set statistics influence the
# scaling. Consider fitting on the training split only.
df_transformed = pipeline.fit_transform(df_cleaned)

# Rebuild a DataFrame from the transformed array: scaled features first, target last.
scaled_column_names = [*numerical_cols, target_col]
df_scaled = pd.DataFrame(df_transformed, columns=scaled_column_names)

# The round-trip through a float array loses the integer dtype of the target;
# copy the original values back positionally.
df_scaled[target_col] = df_cleaned[target_col].values

# Show the first rows of the scaled frame as plain text.
print(df_scaled.head())

   Count Contraction   Entropy  Contraction times  Risk Factor Score  \
0           0.887630 -0.561523           2.210457           0.336371   
1           0.902108  1.474840           0.287612           0.849471   
2          -1.330419  0.046970           1.576354           1.317907   
3           0.375104  0.793093          -2.738593          -0.443265   
4          -0.054607  0.071201           0.501377          -0.823297   

   Minor Health Indicator  Lifestyle Factor  Contraction_Risk_Interaction  \
0               -0.296316         -0.804370                      0.482332   
1               -0.215963         -0.389328                      1.069699   
2                1.657814          0.248231                      0.842702   
3               -0.157102          1.213328                     -0.425134   
4               -1.009442          0.782064                     -0.813672   

   Pre-term  
0         1  
1         1  
2         1  
3         0  
4         0  


In [9]:
# Inspect the standardised feature columns together with the 'Pre-term' target.
df_scaled

Unnamed: 0,Count Contraction,Entropy,Contraction times,Risk Factor Score,Minor Health Indicator,Lifestyle Factor,Contraction_Risk_Interaction,Pre-term
0,0.887630,-0.561523,2.210457,0.336371,-0.296316,-0.804370,0.482332,1
1,0.902108,1.474840,0.287612,0.849471,-0.215963,-0.389328,1.069699,1
2,-1.330419,0.046970,1.576354,1.317907,1.657814,0.248231,0.842702,1
3,0.375104,0.793093,-2.738593,-0.443265,-0.157102,1.213328,-0.425134,0
4,-0.054607,0.071201,0.501377,-0.823297,-1.009442,0.782064,-0.813672,0
...,...,...,...,...,...,...,...,...
2986,1.174297,-0.681117,0.189590,-0.590474,-1.682595,-1.388759,-0.567446,0
2987,-0.140317,-0.852751,-0.204202,-1.003420,-0.492559,-0.879981,-0.984867,0
2988,1.191671,0.470923,-2.346410,-1.052071,-1.246326,-0.493559,-1.116128,0
2989,-0.145529,0.326700,1.455837,-0.159985,0.735138,-0.999501,-0.173912,1


In [10]:
# Feature columns only: the subset of df_scaled named in numerical_cols.
df_scaled[numerical_cols]

Unnamed: 0,Count Contraction,Entropy,Contraction times,Risk Factor Score,Minor Health Indicator,Lifestyle Factor,Contraction_Risk_Interaction
0,0.887630,-0.561523,2.210457,0.336371,-0.296316,-0.804370,0.482332
1,0.902108,1.474840,0.287612,0.849471,-0.215963,-0.389328,1.069699
2,-1.330419,0.046970,1.576354,1.317907,1.657814,0.248231,0.842702
3,0.375104,0.793093,-2.738593,-0.443265,-0.157102,1.213328,-0.425134
4,-0.054607,0.071201,0.501377,-0.823297,-1.009442,0.782064,-0.813672
...,...,...,...,...,...,...,...
2986,1.174297,-0.681117,0.189590,-0.590474,-1.682595,-1.388759,-0.567446
2987,-0.140317,-0.852751,-0.204202,-1.003420,-0.492559,-0.879981,-0.984867
2988,1.191671,0.470923,-2.346410,-1.052071,-1.246326,-0.493559,-1.116128
2989,-0.145529,0.326700,1.455837,-0.159985,0.735138,-0.999501,-0.173912


In [11]:
import warnings
# Install a blanket "ignore" filter: warnings raised after this point are hidden.
# NOTE(review): this presumably also hides deprecation and convergence warnings
# from the models trained below. Consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

## XGBoost, Extra Trees, CatBoost, SVM, Logistic Regression

In [12]:
# Features / target and a reproducible 80/20 hold-out split.
X = df_scaled[numerical_cols]
y = df_scaled['Pre-term']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection: rank the features with a small random forest fitted on
# the training split only, then keep the five most important ones.
rf_temp = RandomForestClassifier(n_estimators=50, random_state=42)
rf_temp.fit(X_train, y_train)
feature_importances = pd.Series(rf_temp.feature_importances_, index=numerical_cols)
selected_features = feature_importances.nlargest(5).index.tolist()

# Restrict both splits to the selected features.
X_train = X_train[selected_features]
X_test = X_test[selected_features]

# Candidate classifiers (all seeded where the estimator accepts a seed).
# XGBoost with L1/L2 regularisation.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', max_depth=5,
                          learning_rate=0.1, reg_lambda=1, reg_alpha=1, random_state=42)
# RBF-kernel support vector machine.
svm_model = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)
# Shallow extra-trees ensemble.
et_model = ExtraTreesClassifier(n_estimators=100, max_depth=5, random_state=42)
# CatBoost with logging silenced.
cat_model = CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, verbose=0, random_state=42)
# Logistic regression baseline.
lr_model = LogisticRegression(max_iter=500)

models = {
    "XGBoost": xgb_model,
    "Extra Trees": et_model,
    "CatBoost": cat_model,
    "SVM": svm_model,
    "Logistic Regression": lr_model,
}

# Fit every model exactly once and cache its predictions. The previous version
# fitted each model twice and recomputed the test predictions several times.
train_predictions = {}
test_predictions = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    train_predictions[name] = model.predict(X_train)
    test_predictions[name] = model.predict(X_test)

# Per-model test predictions kept under their original names.
y_pred_xgb_model = test_predictions["XGBoost"]
y_pred_et = test_predictions["Extra Trees"]
y_pred_cat = test_predictions["CatBoost"]
y_pred_svm = test_predictions["SVM"]
y_pred_lr = test_predictions["Logistic Regression"]

# Report 1: hold-out accuracy and a sample of actual vs predicted labels.
for name in models:
    y_pred = test_predictions[name]
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{name} Accuracy: {accuracy:.4f}")
    print("Actual vs Predicted:")
    result_df = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
    print(result_df.head(10))  # Print first 10 rows
    print("="*60)

# Report 2: train / test / cross-validated accuracy plus a classification report.
# Cross-validation must use the same selected features the models were trained
# on. It previously ran on all columns of X, so the CV score described a
# different feature set from the train/test scores. cross_val_score clones the
# estimator, so the fitted models above are not affected.
# NOTE(review): the features were selected on the training split and the CV
# folds cover all rows, so the CV score may still be slightly optimistic.
X_selected = X[selected_features]
for name, model in models.items():
    train_acc = accuracy_score(y_train, train_predictions[name])
    test_acc = accuracy_score(y_test, test_predictions[name])
    cv_acc = cross_val_score(model, X_selected, y, cv=5).mean()
    print(f"{name} - Train Accuracy: {train_acc:.4f} | Test Accuracy: {test_acc:.4f} | CV Accuracy: {cv_acc:.4f}")
    print("Classification Report:\n", classification_report(y_test, test_predictions[name]))
    print("="*60)



XGBoost Accuracy: 0.9683
Actual vs Predicted:
   Actual  Predicted
0       1          0
1       1          1
2       1          1
3       0          0
4       1          1
5       0          0
6       0          0
7       1          1
8       1          1
9       1          1
Extra Trees Accuracy: 0.9566
Actual vs Predicted:
   Actual  Predicted
0       1          0
1       1          1
2       1          1
3       0          0
4       1          1
5       0          0
6       0          0
7       1          1
8       1          1
9       1          1
CatBoost Accuracy: 0.9666
Actual vs Predicted:
   Actual  Predicted
0       1          0
1       1          1
2       1          1
3       0          0
4       1          1
5       0          0
6       0          0
7       1          1
8       1          1
9       1          1
SVM Accuracy: 0.9566
Actual vs Predicted:
   Actual  Predicted
0       1          0
1       1          1
2       1          1
3       0          0
4       1        

In [13]:
# Show the scaled frame again after model training.
df_scaled

Unnamed: 0,Count Contraction,Entropy,Contraction times,Risk Factor Score,Minor Health Indicator,Lifestyle Factor,Contraction_Risk_Interaction,Pre-term
0,0.887630,-0.561523,2.210457,0.336371,-0.296316,-0.804370,0.482332,1
1,0.902108,1.474840,0.287612,0.849471,-0.215963,-0.389328,1.069699,1
2,-1.330419,0.046970,1.576354,1.317907,1.657814,0.248231,0.842702,1
3,0.375104,0.793093,-2.738593,-0.443265,-0.157102,1.213328,-0.425134,0
4,-0.054607,0.071201,0.501377,-0.823297,-1.009442,0.782064,-0.813672,0
...,...,...,...,...,...,...,...,...
2986,1.174297,-0.681117,0.189590,-0.590474,-1.682595,-1.388759,-0.567446,0
2987,-0.140317,-0.852751,-0.204202,-1.003420,-0.492559,-0.879981,-0.984867,0
2988,1.191671,0.470923,-2.346410,-1.052071,-1.246326,-0.493559,-1.116128,0
2989,-0.145529,0.326700,1.455837,-0.159985,0.735138,-0.999501,-0.173912,1
