In [65]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

data = pd.read_csv("data/train.csv")
print(data.shape)

(159256, 24)


# Train and Test Split

In [66]:
from sklearn.model_selection import train_test_split
# Split the data into training and validation sets (80% train, 20% validation)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=20)



# Separate features and target variable for both training and validation sets
X_train = train_data.drop(columns='smoking')
y_train = train_data['smoking']

X_val = val_data.drop(columns='smoking')
y_val = val_data['smoking']

X_train.shape, X_val.shape

X_train.shape, X_val.shape

((127404, 23), (31852, 23))

# Feature Engineering 

In [67]:
# 1. BMI Calculation
def feature_engineering(dataset):
    data = dataset
    print("HEEEEEEEEEEEEEEEEEEEELO")
    data['BMI'] = data['weight(kg)'] / (data['height(cm)'] / 100) ** 2
    print("HI")

    # 2. Average Eyesight
    data['avg_eyesight'] = (data['eyesight(left)'] + data['eyesight(right)']) / 2

    # 3. Average Hearing
    data['avg_hearing'] = (data['hearing(left)'] + data['hearing(right)']) / 2

    # 4. Total Cholesterol
    data['total_cholesterol'] = data['HDL'] + data['LDL']

    # 5. AST to ALT ratio
    data['AST_to_ALT_ratio'] = data['AST'] / data['ALT']

    data['HDL-triglyceride_ratio'] = data["HDL"] / data["triglyceride"]

    data["LDL-triglyceride_ratio"] = data["LDL"] / data["triglyceride"]

    data["HDH-LDL Ratio"] = data["HDL"] / data["LDL"]

    data['log_triglyceride'] = np.log1p(data['triglyceride'])

    data['age_squared'] = data['age'] ** 2

    data['weight_to_height_ratio'] = data['weight(kg)'] / data['height(cm)']

    bins = [0, 25, 50, 75, np.inf]
    labels = ['Young', 'Middle-aged', 'Senior', 'Elderly']
    data['age_group'] = pd.cut(data['age'], bins=bins, labels=labels, right=False)

    threshold = 100  # You can adjust this based on clinical guidelines or data distribution
    data['high_fasting_sugar'] = (data['fasting blood sugar'] > threshold).astype(int)

    # Display the first few rows of the dataset with the new features
    columns_to_drop = ['id']
    data = data.drop(columns=columns_to_drop)
    return data

X_train = feature_engineering(X_train)
X_val = feature_engineering(X_val)

X_train.shape, X_val.shape

HEEEEEEEEEEEEEEEEEEEELO
HI
HEEEEEEEEEEEEEEEEEEEELO
HI


((127404, 35), (31852, 35))

In [68]:
continuous_columns = ['age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'systolic',
                      'hemoglobin', 'serum creatinine', 'AST', 'ALT', 'Gtp', 
                      'BMI', 'avg_eyesight', 'avg_hearing', 'total_cholesterol', 'AST_to_ALT_ratio', 
                      'HDL', 'LDL', 'hearing(right)', 'hearing(left)', "fasting blood sugar", 'Cholesterol', 
                      'eyesight(left)', 'eyesight(right)', 'triglyceride', 'relaxation', 'weight_to_height_ratio',
                      'age_squared', 'log_triglyceride', 'HDL-triglyceride_ratio', "LDL-triglyceride_ratio",
                      "HDH-LDL Ratio"]

categorical_columns = [col for col in X_train.columns if col not in continuous_columns]

# Adjust the column transformer to handle unknown categories
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_columns),
        ('cat', OneHotEncoder(drop='first'), categorical_columns)  # drop='first' to avoid collinearity
    ])


X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)



# Modelling

In [20]:
model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
model.fit(X_train_transformed, y_train)
X_train_transformed.shape, X_val_transformed.shape
preds = model.predict_proba(X_val_transformed)[:, 1]
print(roc_auc_score(y_val, preds))


0.8622746571535798


In [21]:
#from sklearn.svm import SVC
#clf = SVC(probability=True)
#clf.fit(X_train_transformed, y_train)
#probabilities = clf.predict_proba(X_val_transformed)

In [22]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter=2000)
clf.fit(X_train_transformed, y_train)
probabilities = clf.predict_proba(X_val_transformed)[:, 1]
print(roc_auc_score(y_val, probabilities ))

0.8416599279368032


In [23]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_transformed, y_train)
y_pred = gnb.predict(X_val_transformed)
print(roc_auc_score(y_val, y_pred))

0.7119533683357663


In [24]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_transformed, y_train)
y_pred = knn.predict(X_val_transformed)
probabilities = knn.predict_proba(X_val_transformed)[:, 1]
print(roc_auc_score(y_val, probabilities))

0.8004946731368157


In [25]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_transformed, y_train)
probabilities = clf.predict_proba(X_val_transformed)[:, 1]
print(roc_auc_score(y_val, probabilities))

0.8517777312217895


In [26]:
importances = clf.feature_importances_
for feature, importance in zip(train_data, importances):
    print(f"{feature}: {importance:.4f}")

id: 0.0187
age: 0.0895
height(cm): 0.0405
weight(kg): 0.0273
waist(cm): 0.0268
eyesight(left): 0.0933
eyesight(right): 0.0327
hearing(left): 0.0260
hearing(right): 0.0284
systolic: 0.0804
relaxation: 0.0230
fasting blood sugar: 0.0195
Cholesterol: 0.0013
triglyceride: 0.0291
HDL: 0.0336
LDL: 0.0253
hemoglobin: 0.0271
Urine protein: 0.0009
serum creatinine: 0.0009
AST: 0.0279
ALT: 0.0268
Gtp: 0.0144
dental caries: 0.0149
smoking: 0.0368


# 



# Lets do a bayesian hyperparmeter search over xgboost

In [27]:
space = {
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'max_depth': hp.choice('max_depth', range(1, 15, 1)),
    'n_estimators': hp.choice('n_estimators', range(20, 205, 5)),
    'gamma': hp.quniform('gamma', 0, 0.5, 0.01),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)
    
}


In [28]:
def objective(params):
    params['objective'] = 'binary:logistic'
    model = xgb.XGBClassifier(**params)
    model.fit(X_train_transformed, y_train)
    preds = model.predict_proba(X_val_transformed)[:, 1]
    score = roc_auc_score(y_val, preds)
    loss = 1 - score
    return {'loss': loss, 'status': STATUS_OK}


In [30]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=5,  # Number of iterations
            trials=trials)

100%|██████████| 5/5 [01:24<00:00, 16.94s/trial, best loss: 0.14728386970589535]


In [31]:
model = xgb.XGBClassifier(**best)
model.fit(X_train_transformed, y_train)
preds = model.predict_proba(X_val_transformed)[:, 1]
score = roc_auc_score(y_val, preds)
print("Best ROC_AUC socre was: ", score)

Best ROC_AUC socre was:  0.8314632248645102


# Train on All the Data 

In [32]:
test_data = pd.read_csv("data/test.csv")

X_all = data.drop(columns='smoking')
y_all = data['smoking']

X_all = feature_engineering(data)
X_test = feature_engineering(test_data)
X_all_transformed = preprocessor.fit_transform(X_all)
X_test_transformed = preprocessor.transform(X_test)


model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
model.fit(X_all_transformed, y_all)
preds = model.predict_proba(X_all_transformed)[:, 1]
print(roc_auc_score(y_all, preds))

0.9028379421898358


# Add New Data to the Training Set

In [72]:
data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

preds = model.predict_proba(X_test_transformed)[:, 1]
pos_mask = preds > 0.8
neg_mask = preds < 0.2
pos_data = test_data[pos_mask]
neg_data = test_data[neg_mask]
pos_data['smoking'] = 1
neg_data['smoking'] = 0

X_test_extended = pd.concat([pos_data, neg_data], axis=0)
X_test_extended = X_test_extended.sample(frac=1)
X_test_extended = pd.concat([X_test_extended, data], axis=0)
X_train_extended = X_test_extended.sample(frac=1)
y_train = X_test_extended['smoking']
X_train_extended = feature_engineering(X_train_extended)
X_train_extended_transformed = preprocessor.fit_transform(X_train_extended)


HEEEEEEEEEEEEEEEEEEEELO
HI


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_data['smoking'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neg_data['smoking'] = 0


KeyError: 'BMI'

In [70]:

X_test = feature_engineering(test_data)

X_test_transformed = preprocessor.transform(X_test)


HEEEEEEEEEEEEEEEEEEEELO
HI


IndexError: index 24 is out of bounds for axis 0 with size 24

In [None]:
model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
model.fit(X_train_extended_transformed, y_train)
train_preds = model.predict_proba(X_train_extended_transformed)[:, 1]
roc_auc_score(y_train, train_preds)

((106171,), (106171,))

In [160]:
df = pd.DataFrame.from_dict(predictions)
df.head()


Unnamed: 0,id,smoking
0,159256,0.596708
1,159257,0.231896
2,159258,0.532802
3,159259,0.023536
4,159260,0.513952


In [163]:
df.to_csv("test_predictions.csv", index=False)

In [162]:
test_data.columns

Index(['id', 'age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries', 'BMI', 'avg_eyesight', 'avg_hearing',
       'total_cholesterol', 'AST_to_ALT_ratio', 'log_triglyceride',
       'age_squared', 'weight_to_height_ratio'],
      dtype='object')