In [181]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

data = pd.read_csv("data/train.csv")

# Feature Engineering 

In [182]:
# 1. BMI Calculation
def feature_engineering(data):
    data['BMI'] = data['weight(kg)'] / (data['height(cm)'] / 100) ** 2

    # 2. Average Eyesight
    data['avg_eyesight'] = (data['eyesight(left)'] + data['eyesight(right)']) / 2

    # 3. Average Hearing
    data['avg_hearing'] = (data['hearing(left)'] + data['hearing(right)']) / 2

    # 4. Total Cholesterol
    data['total_cholesterol'] = data['HDL'] + data['LDL']

    # 5. AST to ALT ratio
    data['AST_to_ALT_ratio'] = data['AST'] / data['ALT']

    # Display the first few rows of the dataset with the new features
    columns_to_drop = ['id', 'eyesight(left)', 'eyesight(right)', 'hearing(left)', 'hearing(right)', 'HDL', 'LDL', 'smoking']
    data = data.drop(columns=columns_to_drop)
    return data 

X_features = feature_engineering(data)

In [183]:
continuous_columns = ['age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'systolic',
                      'hemoglobin', 'serum creatinine', 'AST', 'ALT', 'Gtp', 
                      'BMI', 'avg_eyesight', 'avg_hearing', 'total_cholesterol', 'AST_to_ALT_ratio', "fasting blood sugar", 'Cholesterol', 'triglyceride', 'relaxation']

categorical_columns = [col for col in X_features.columns if col not in continuous_columns]

# Adjust the column transformer to handle unknown categories
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_columns),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_columns)
    ])

In [159]:
from sklearn.model_selection import train_test_split

# Drop redundant columns
columns_to_drop = ['id', 'eyesight(left)', 'eyesight(right)', 'hearing(left)', 'hearing(right)', 'HDL', 'LDL']
data = data.drop(columns=columns_to_drop)

# Split the data into training and validation sets (80% train, 20% validation)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Separate features and target variable for both training and validation sets
X_train = train_data["smoking"]
y_train = train_data['smoking']

X_val = val_data.drop(columns='smoking')
y_val = val_data['smoking']

X_train.shape, X_val.shape


((127404, 21), (31852, 21))

In [165]:
# Identifying continuous and categorical columns
continuous_columns = ['age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'systolic',
                      'hemoglobin', 'serum creatinine', 'AST', 'ALT', 'Gtp', 
                      'BMI', 'avg_eyesight', 'avg_hearing', 'total_cholesterol', 'AST_to_ALT_ratio', "fasting blood sugar", 'Cholesterol', 'triglyceride', 'relaxation']

categorical_columns = [col for col in X_train.columns if col not in continuous_columns]

continuous_columns, categorical_columns


(['age',
  'height(cm)',
  'weight(kg)',
  'waist(cm)',
  'systolic',
  'hemoglobin',
  'serum creatinine',
  'AST',
  'ALT',
  'Gtp',
  'BMI',
  'avg_eyesight',
  'avg_hearing',
  'total_cholesterol',
  'AST_to_ALT_ratio',
  'fasting blood sugar',
  'Cholesterol',
  'triglyceride',
  'relaxation'],
 ['Urine protein', 'dental caries'])

In [166]:
# Adjust the column transformer to handle unknown categories
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_columns),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_columns)
    ])

# Fit the transformer on the training data and transform the training and validation data
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

X_train_transformed.shape, X_val_transformed.shape

((127404, 25), (31852, 25))

In [167]:
K_folds = 10

In [168]:
X = data.drop("smoking", axis=1).to_numpy()
y = data["smoking"].to_numpy()

In [178]:
model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
model.fit(X_train_transformed, y_train)
preds = model.predict_proba(X_val_transformed)[:, 1]
print(roc_auc_score(y_val, preds))

0.8613910973002025


In [135]:

scores = cross_val_score(model, Xtrai, y, cv=K_folds, scoring="roc_auc")
print(np.mean(scores), np.var(scores))

0.863735551103294 1.9040727217446967e-05


# 



# Lets do a bayesian hyperparmeter search over xgboost

In [136]:
space = {
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'max_depth': hp.choice('max_depth', range(1, 15, 1)),
    'n_estimators': hp.choice('n_estimators', range(20, 205, 5)),
    'gamma': hp.quniform('gamma', 0, 0.5, 0.01),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)
}


In [137]:
def objective(params):
    model = xgb.XGBClassifier(**params)
    scores = cross_val_score(model, X, y, cv=K_folds, scoring="roc_auc")
    loss = 1 - np.mean(scores)
    return {'loss': loss, 'status': STATUS_OK}


In [138]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10,  # Number of iterations
            trials=trials)

100%|██████████| 10/10 [02:21<00:00, 14.11s/trial, best loss: 0.13433569053900718]


In [139]:
model = xgb.XGBClassifier(**best)

In [140]:
model.fit(X, y)

In [141]:
preds = model.predict_proba(X)[:, 1].flatten()
roc_auc_score(y, preds)

0.8569495120946011

In [142]:
test_data = pd.read_csv("data/test.csv")
print(len(test_data))
ids = test_data["id"].to_numpy()
print(len(ids))

106171
106171


In [143]:
preds = model.predict_proba(test_data.to_numpy())[:, 1].flatten()
print(len(preds))

ValueError: Feature shape mismatch, expected: 28, got 23

In [None]:
predictions = {"id":ids, "smoking":preds}
test_df = pd.DataFrame.from_dict(predictions)
print(test_df.head())

       id   smoking
0  159256  0.608602
1  159257  0.222500
2  159258  0.691817
3  159259  0.070218
4  159260  0.500467


In [None]:
test_df.to_csv("test_predictions.csv", index=False)