In [None]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd

# Scikit-learn models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef

# Train-test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# XGBoost
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

In [None]:

# Load the labelled data and separate the 'smoking' target from the predictors.
df = pd.read_csv('dataset/split_train.csv')
y = df['smoking']               # Target
X = df.drop(columns='smoking')  # Features: every remaining column

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Quick sanity report on the resulting split sizes.
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
print(f"Number of features: {X_train.shape[1]}")

In [None]:
# Work on copies so the raw split stays untouched and this cell can be re-run
# from the same starting point.
X_train_processed, X_test_processed = X_train.copy(), X_test.copy()

# Standardise the listed measurements. The scaler is fitted on the training
# split only and then re-used on the test split, so no test statistics leak
# into training.
scaling_features = [
    'age', 'height(cm)', 'weight(kg)', 'waist(cm)',
    'systolic', 'relaxation', 'hemoglobin',
]
scaler = StandardScaler()
X_train_processed[scaling_features] = scaler.fit_transform(X_train_processed[scaling_features])
X_test_processed[scaling_features] = scaler.transform(X_test_processed[scaling_features])

# Replace the listed lab values by log(1 + x) in both splits
# (presumably to tame right-skewed distributions - TODO confirm).
log_features = ['AST', 'ALT', 'Gtp', 'fasting blood sugar', 'triglyceride']
for feature in log_features:
    for frame in (X_train_processed, X_test_processed):
        frame[feature] = np.log1p(frame[feature])

# Recode both hearing columns from {1, 2} to {0, 1}.
# Series.map turns any code missing from the mapping into NaN.
hearing_codes = {1: 0, 2: 1}
for frame in (X_train_processed, X_test_processed):
    for column in ('hearing(left)', 'hearing(right)'):
        frame[column] = frame[column].map(hearing_codes)


print(f"Processed training set shape: {X_train_processed.shape}")
print(f"Processed test set shape: {X_test_processed.shape}")

In [None]:
def calculate_metrics(y_true, y_pred, y_pred_proba=None):
    """Compute the evaluation metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, passed to the label-based scorers.
        y_pred_proba: Optional scores passed to ``roc_auc_score``. When it is
            ``None`` the AUC entry of the result is ``None``.

    Returns:
        dict: Metric name -> value, with the keys 'Accuracy', 'AUC Score',
        'Precision', 'Recall', 'F1 Score' and 'MCC Score', in that order.
    """
    # Precision, recall and F1 use zero_division=0 so an undefined ratio is
    # reported as 0 instead of triggering a warning.
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'AUC Score': (
            roc_auc_score(y_true, y_pred_proba)
            if y_pred_proba is not None
            else None
        ),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'F1 Score': f1_score(y_true, y_pred, zero_division=0),
        'MCC Score': matthews_corrcoef(y_true, y_pred),
    }


def display_metrics(model_name, metrics):
    """Print a framed, fixed-width report of a model's metrics.

    Args:
        model_name: Label shown in the report header.
        metrics: Mapping of metric name -> numeric value or ``None``.
            ``None`` values are printed as "Not Available"; all other values
            are formatted with four decimal places.
    """
    rule = '=' * 50
    print(f"\n{rule}")
    print(f"Model: {model_name}")
    print(f"{rule}")
    for label, score in metrics.items():
        if score is None:
            print(f"{label:<25} : Not Available")
            continue
        print(f"{label:<25} : {score:.4f}")
    print(f"{rule}\n")

#### Logistic Regression

In [None]:


# Logistic-regression baseline fitted on the preprocessed training split.
# max_iter is raised to 1000 to give the solver more iterations to converge.
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_processed, y_train)

# Hard labels feed the threshold-based metrics. The second predict_proba
# column (the class listed second in lr_model.classes_, presumably
# smoking == 1) is the score used for AUC.
y_pred_lr = lr_model.predict(X_test_processed)
y_pred_proba_lr = lr_model.predict_proba(X_test_processed)[:, 1]

lr_metrics = calculate_metrics(y_test, y_pred_lr, y_pred_proba_lr)
display_metrics("Logistic Regression", lr_metrics)

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before the first artefact is written; otherwise a
# fresh checkout fails with FileNotFoundError after training has finished.
os.makedirs('models', exist_ok=True)
joblib.dump(lr_model, 'models/logistic_regression_model.pkl')
# The fitted scaler is saved next to the model so the same standardisation
# can be reapplied to new data.
joblib.dump(scaler, 'models/scaler.pkl')

#### Decision Tree Classifier

In [None]:

# Decision tree with library-default hyper-parameters and a fixed seed for
# reproducibility; fit() returns the estimator itself, so it can be chained.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_processed, y_train)

# Hard labels plus the second predict_proba column, which is used for AUC.
y_pred_dt = dt_model.predict(X_test_processed)
y_pred_proba_dt = dt_model.predict_proba(X_test_processed)[:, 1]

dt_metrics = calculate_metrics(
    y_true=y_test,
    y_pred=y_pred_dt,
    y_pred_proba=y_pred_proba_dt,
)
display_metrics(model_name="Decision Tree Classifier", metrics=dt_metrics)

# Persist the fitted tree.
joblib.dump(dt_model, 'models/decision_tree_model.pkl')

#### K-Nearest Neighbour

In [None]:
# k-NN with 31 neighbours, distance-weighted votes and Euclidean distance.
# NOTE(review): only the columns in `scaling_features` are standardised in
# preprocessing, so the remaining columns enter the distance on their own
# scale - confirm this is intended.
knn_model = KNeighborsClassifier(n_neighbors=31, weights='distance', metric='euclidean', n_jobs=-1)
knn_model.fit(X_train_processed, y_train)

# Hard labels plus the second predict_proba column, which is used for AUC.
y_pred_knn = knn_model.predict(X_test_processed)
y_pred_proba_knn = knn_model.predict_proba(X_test_processed)[:, 1]

knn_metrics = calculate_metrics(
    y_true=y_test,
    y_pred=y_pred_knn,
    y_pred_proba=y_pred_proba_knn,
)
display_metrics(model_name="K-Nearest Neighbor Classifier", metrics=knn_metrics)
joblib.dump(knn_model, 'models/knn_model.pkl')

#### Gaussian Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings; fit() returns the estimator,
# so construction and training are chained.
gnb_model = GaussianNB().fit(X_train_processed, y_train)

# Hard labels plus the second predict_proba column, which is used for AUC.
y_pred_gnb = gnb_model.predict(X_test_processed)
y_pred_proba_gnb = gnb_model.predict_proba(X_test_processed)[:, 1]

gnb_metrics = calculate_metrics(
    y_true=y_test,
    y_pred=y_pred_gnb,
    y_pred_proba=y_pred_proba_gnb,
)
display_metrics(model_name="Gaussian Naive Bayes Classifier", metrics=gnb_metrics)
joblib.dump(gnb_model, 'models/gaussian_nb_model.pkl')

#### Ensemble - Random Forest

In [None]:
# Random forest: 100 trees capped at depth 15, seeded for reproducibility
# and trained on all available cores (n_jobs=-1).
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    random_state=42,
    n_jobs=-1,
).fit(X_train_processed, y_train)

# Hard labels plus the second predict_proba column, which is used for AUC.
y_pred_rf = rf_model.predict(X_test_processed)
y_pred_proba_rf = rf_model.predict_proba(X_test_processed)[:, 1]

rf_metrics = calculate_metrics(
    y_true=y_test,
    y_pred=y_pred_rf,
    y_pred_proba=y_pred_proba_rf,
)
display_metrics(model_name="Random Forest Classifier", metrics=rf_metrics)
joblib.dump(rf_model, 'models/random_forest_model.pkl')

In [None]:
# Gradient-boosted trees: 100 rounds, depth 6, learning rate 0.1, evaluated
# with log-loss. `use_label_encoder` is deliberately not passed: the flag is
# deprecated, and newer XGBoost releases no longer recognise it and report
# it as an unused parameter.
# NOTE(review): assumes the 'smoking' labels are already 0/1 integers - confirm.
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    eval_metric='logloss',
    n_jobs=-1
)
xgb_model.fit(X_train_processed, y_train)

# Hard labels plus the second predict_proba column, which is used for AUC.
y_pred_xgb = xgb_model.predict(X_test_processed)
y_pred_proba_xgb = xgb_model.predict_proba(X_test_processed)[:, 1]

xgb_metrics = calculate_metrics(y_test, y_pred_xgb, y_pred_proba_xgb)
display_metrics("XGBoost Classifier", xgb_metrics)
joblib.dump(xgb_model, 'models/xgboost_model.pkl')