In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import sklearn as sklearn
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the competition training set. The previous empty-DataFrame assignment was
# dead code (immediately overwritten), so the frame is created directly here.
# header='infer' takes the column names from the first row of the CSV.
df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv', header='infer')

In [None]:
# Preview the first rows of the freshly loaded training frame.
df.head()

In [None]:
# Replace the string-coded 'EJ' column with one 0/1 indicator column per level
# ('A' and 'B'), then drop the original column and print the frame summary.
for level in ('A', 'B'):
    df['EJ_' + level] = np.where(df['EJ'] == level, 1, 0)
df = df.drop(columns=['EJ'])
df.info()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the 'Class' target.
X = df.drop(columns=['Class'])
y = df['Class']

# Hold out 20% of the rows for evaluation. The split is stratified on the target
# because the notebook treats the classes as imbalanced (the LightGBM model sets
# is_unbalance=True); without stratification the small hold-out set can end up
# with a class ratio that differs noticeably from the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Only the last expression of a cell is displayed, so show both types together
# (previously type(X_train) was evaluated and silently discarded).
type(X_train), type(y_train)

In [None]:
from sklearn.impute import SimpleImputer

# Mean-impute missing values. The imputer is fitted on the training split only and
# then applied unchanged to the hold-out split, so no hold-out statistics leak in.
imputer = SimpleImputer(strategy="mean")

# 'Id' is a row identifier, not a feature. (The earlier set_index('Id') calls
# discarded their result and therefore did nothing; they have been removed.)
X_train = X_train.drop(columns='Id')
X_test = X_test.drop(columns='Id')

# Downstream cells expect every feature to carry the "_Imputed" suffix, in the
# original column order, so that naming is preserved here.
imputed_columns = [col + "_Imputed" for col in X_train.columns]

# One fit over all columns replaces the per-column refit loop; the column means
# are identical, but `imputer` now holds the statistics for every feature
# instead of only the last one processed.
X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=imputed_columns, index=X_train.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test), columns=imputed_columns, index=X_test.index
)

In [None]:
# Summarise the imputed hold-out features (column names, non-null counts, dtypes).
X_test.info()

In [None]:
# Model 1: Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Small entropy-based decision tree: depth is capped at 3, the tree may have at
# most 5 leaves, and both splits and leaves need at least 10 samples.
clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=3,
    max_leaf_nodes=5,
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions for the hold-out split.
y_pred_dt = clf.predict(X_test)

In [None]:
# String versions of the fitted class labels, used as display names in reports.
class_names = list(map(str, clf.classes_))

In [None]:
# Dump a few internals of the fitted tree structure, one attribute per print:
# node count, per-node impurity, left-child indices and split thresholds.
fitted_tree = clf.tree_
for attr_name in ("node_count", "impurity", "children_left", "threshold"):
    print(getattr(fitted_tree, attr_name))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the hold-out labels against the decision-tree predictions
# (rows are true classes, columns are predicted classes).
confusion_matrix(y_test, y_pred_dt)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the decision tree on the hold-out split.
dt_report = classification_report(y_test, y_pred_dt, target_names=class_names)
print(dt_report)

In [None]:
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, log_loss

# Hold-out metrics for the decision tree. Accuracy, kappa and F1 are computed from
# the hard class predictions.
print("Accuracy = {:.2f}".format(accuracy_score(y_test, y_pred_dt)))
print("Kappa = {:.2f}".format(cohen_kappa_score(y_test, y_pred_dt)))
print("F1 Score = {:.2f}".format(f1_score(y_test, y_pred_dt)))
# Log loss is a probabilistic metric: feeding it hard 0/1 labels treats every
# prediction as 100% confident and yields a meaningless (clipped) value, so it is
# computed from the tree's predicted class probabilities instead. `labels` pins
# the column order to the fitted classes.
print("Log Loss = {:.2f}".format(
    log_loss(y_test, clf.predict_proba(X_test), labels=clf.classes_)))

In [None]:
# Model 2: Random Forest (Bagging)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with no depth cap and the minimum split size of 2,
# seeded for repeatability.
rf_settings = {
    "n_estimators": 100,
    "max_depth": None,
    "min_samples_split": 2,
    "random_state": 0,
}
clf_rf = RandomForestClassifier(**rf_settings)

# Fit on the training split; as the last expression, the fitted estimator is
# also displayed as the cell output.
clf_rf.fit(X_train, y_train)

In [None]:
# Hard class predictions of the random forest for the hold-out split.
# NOTE(review): this overwrites `y_pred_dt`, which previously held the decision-tree
# predictions; from here on the name refers to random-forest output.
y_pred_dt = clf_rf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the random forest: at this point `y_pred_dt` holds the
# random-forest predictions, not the decision-tree ones.
confusion_matrix(y_test, y_pred_dt)

In [None]:
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, log_loss

# Hold-out metrics for the random forest (`y_pred_dt` holds its hard predictions
# at this point). Accuracy, kappa and F1 use the hard labels.
print("Accuracy = {:.2f}".format(accuracy_score(y_test, y_pred_dt)))
print("Kappa = {:.2f}".format(cohen_kappa_score(y_test, y_pred_dt)))
print("F1 Score = {:.2f}".format(f1_score(y_test, y_pred_dt)))
# Log loss must be computed from predicted probabilities; hard 0/1 labels would be
# interpreted as fully confident predictions and give a meaningless value.
print("Log Loss = {:.2f}".format(
    log_loss(y_test, clf_rf.predict_proba(X_test), labels=clf_rf.classes_)))

In [None]:
import time

In [None]:
# Model 3: LightGBM

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_validate

# Cross-validation works on the training split only; X / y are aliases for it and
# are reused by the later model cells. The hold-out split stays untouched.
X = X_train
y = y_train

# LightGBM hyper-parameters for this experiment.
params = {
      "cat_l2": 10,
      "cat_smooth": 10,
      "colsample_bytree": 0.8,
      "feature_fraction_bynode":  0.8,
      "learning_rate": 0.03,
      "max_depth": 6,
      "min_child_samples": 63,
      "min_data_per_group": 100,
      "n_estimators": 200,
      "num_leaves": 63,
      "path_smooth": 0,
      "reg_alpha":  0.05,
      "reg_lambda": 0.05,
      "subsample_freq": 1,
      "subsample": 0.8,
      "max_bin": 127,
      "extra_trees": False,
      "is_unbalance": True,
      "boosting_type": 'gbdt',
      "n_jobs": 1,
      "verbosity": -1,
      "seed": 77,
}

# Extra keyword arguments forwarded to LGBMClassifier.fit for every fold.
fit_params= {
    'feature_name': "auto",
}

start = time.time()
estimator = LGBMClassifier(**params)

# The more CV folds, the better our estimate of the score.
inner_cv_scores = cross_validate(estimator, X, y,
                                 fit_params=fit_params,
                                 cv=15,
                                 scoring="roc_auc",
                                 n_jobs=5,
                                 verbose=0,
                                 return_train_score=True)
cv_scores=inner_cv_scores['test_score'].tolist()
fit_times=inner_cv_scores['fit_time'].tolist()
duration = time.time() - start

print("CV Scores:")
print(["{:0.4f}".format(cv_score) for cv_score in cv_scores])
print("CV Score mean: {:.4f} ".format(np.mean(cv_scores)))
# "Range" here is mean +/- one standard deviation of the fold scores.
print("CV Score range: {:0.4f} -- {:0.4f}".format(np.mean(cv_scores) - np.std(cv_scores), np.mean(cv_scores) + np.std(cv_scores)))
# Report the per-fold fit times returned by cross_validate; previously this line
# printed the total wall-clock duration a second time.
print("Fit times: {}".format(["{:0.4f}".format(fit_time) for fit_time in fit_times]))
print("Total duration: {}".format(duration))

In [None]:
# With the parameters settled, build a fresh LightGBM model and fit it once on all
# of X / y (the training split that was cross-validated above), forwarding the
# same fit-time keyword arguments.
estimator = LGBMClassifier(**params).fit(X, y, **fit_params)

In [None]:
# Hard class predictions of the refitted LightGBM model for the hold-out split.
y_pred = estimator.predict(X_test)

In [None]:
# Hold-out metrics for the LightGBM model. Accuracy, kappa and F1 use the hard
# class predictions in `y_pred`.
print("Accuracy = {:.2f}".format(accuracy_score(y_test, y_pred)))
print("Kappa = {:.2f}".format(cohen_kappa_score(y_test, y_pred)))
print("F1 Score = {:.2f}".format(f1_score(y_test, y_pred)))
# Log loss must be computed from predicted probabilities; hard 0/1 labels would be
# interpreted as fully confident predictions and give a meaningless value.
print("Log Loss = {:.2f}".format(
    log_loss(y_test, estimator.predict_proba(X_test), labels=estimator.classes_)))

In [None]:
# Model 4: XGBClassifier

In [None]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# XGBoost hyper-parameters. The LightGBM-only keys that had been copied over
# (min_child_samples, min_data_per_group, num_leaves, boosting_type) are not
# XGBoost parameters and were ignored by it; they are removed, the booster is
# selected through XGBoost's own `booster` key, and the seed is passed through
# the scikit-learn style `random_state` argument.
params = {
      "learning_rate": 0.03,
      "max_depth": 6,
      "n_estimators": 200,
      "reg_alpha":  0.05,
      "reg_lambda": 0.05,
      "subsample": 0.8,
      "booster": 'gbtree',
      "n_jobs": 1,
      "verbosity": 0,
      "random_state": 77,
}

# The features are already numeric (the 'EJ' category was encoded by hand and all
# columns were mean-imputed), so no encoder step is used. One-hot encoding the
# continuous columns would turn every distinct value into its own indicator and
# map unseen validation values to all-zeros, and the resulting CV score would not
# describe the encoder-free model that is refitted afterwards.
pipe = Pipeline(steps=[
   ('clf', XGBClassifier(**params)),
])

start = time.time()

# cross_validate clones and fits the pipeline for every fold, so no separate
# pipe.fit(X, y) is needed beforehand.
# The more CV folds, the better our estimate of the score.
inner_cv_scores = cross_validate(pipe, X, y,
                                 cv=5,
                                 scoring="roc_auc",
                                 n_jobs=15,
                                 verbose=0,
                                 return_train_score=True)
cv_scores=inner_cv_scores['test_score'].tolist()
fit_times=inner_cv_scores['fit_time'].tolist()
duration = time.time() - start

print("CV Scores:")
print(["{:0.4f}".format(cv_score) for cv_score in cv_scores])
print("CV Score mean: {:.4f} ".format(np.mean(cv_scores)))
# "Range" here is mean +/- one standard deviation of the fold scores.
print("CV Score range: {:0.4f} -- {:0.4f}".format(np.mean(cv_scores) - np.std(cv_scores), np.mean(cv_scores) + np.std(cv_scores)))
print("Fit times: {}".format(["{:0.4f}".format(fit_time) for fit_time in fit_times]))
print("Total duration: {}".format(duration))

In [None]:
# Build a fresh XGBoost classifier from `params` and fit it once on all of X / y;
# this rebinds `estimator`, replacing the LightGBM model it held before.
estimator = XGBClassifier(**params).fit(X, y)

In [None]:
# Hard class predictions of the refitted XGBoost model for the hold-out split.
y_pred = estimator.predict(X_test)

In [None]:
# Hold-out metrics for the XGBoost model. Accuracy, kappa and F1 use the hard
# class predictions in `y_pred`.
print("Accuracy = {:.2f}".format(accuracy_score(y_test, y_pred)))
print("Kappa = {:.2f}".format(cohen_kappa_score(y_test, y_pred)))
print("F1 Score = {:.2f}".format(f1_score(y_test, y_pred)))
# Log loss must be computed from predicted probabilities; hard 0/1 labels would be
# interpreted as fully confident predictions and give a meaningless value.
print("Log Loss = {:.2f}".format(
    log_loss(y_test, estimator.predict_proba(X_test), labels=estimator.classes_)))

In [None]:
# Model 5: XGBoost Tuned by Optuna

In [None]:
pip install optuna

In [None]:
import optuna

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

    
def objective_dt(trial, X, y):
  """Optuna objective: score one XGBoost configuration by cross-validation.

  Despite the ``_dt`` suffix, the model built here is an ``XGBClassifier``.

  Parameters:
    trial: Optuna trial used to sample a value for every tunable hyper-parameter.
    X: feature matrix passed to ``cross_val_score``.
    y: target labels passed to ``cross_val_score``.

  Returns:
    Mean macro-averaged F1 over 10 cross-validation folds; the study uses this
    number to judge the sampled configuration.
  """
  # Search space: each entry is sampled by Optuna for this trial, on the
  # stepped grid given by (low, high, step).
  search_space = {
      'learning_rate': trial.suggest_float('learning_rate', 0.005, 1, step=0.005),
      'min_split_loss': trial.suggest_float('min_split_loss', 0.1, 10, step=0.1),
      'max_depth': trial.suggest_int('max_depth', 1, 12, step=1),
      'max_leaves': trial.suggest_int('max_leaves', 0, 1000, step=1),
      'min_child_weight': trial.suggest_float('min_child_weight', 0, 1, step=0.05),
      'n_estimators': trial.suggest_int('n_estimators', 10, 500, step=10),
      'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 0.10, step=0.01),
      'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 0.10, step=0.01),
  }

  # The seed is fixed rather than tuned, so it is passed separately from the
  # sampled values.
  model = XGBClassifier(random_state=77, **search_space)

  # Evaluate the sampled configuration and report the average fold score.
  fold_scores = cross_val_score(model, X, y, cv=10, scoring="f1_macro")
  return np.mean(fold_scores)

In [None]:
# Maximise the objective's cross-validated score. The sampler is seeded so that a
# fresh "Restart & Run All" explores the same sequence of trials; TPE is Optuna's
# default sampler, so only the seed differs from the implicit default.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=77),
)

In [None]:
# Run 100 trials; each one calls objective_dt with the training data bound in via
# the lambda. gc_after_trial=True asks Optuna to run garbage collection after
# every trial.
study.optimize(lambda trial: objective_dt(trial, X, y), n_trials=100,  gc_after_trial=True)

In [None]:
# Hyper-parameter values of the best trial (only the values sampled via `trial`).
study.best_params

In [None]:
# Objective value (mean cross-validated macro F1) achieved by the best trial.
study.best_value

In [None]:
# Refit the best configuration on all of X / y. `study.best_params` only contains
# the values sampled through `trial`, so the fixed random_state=77 that every
# trial used has to be supplied again; otherwise the final model would not be
# the configuration that was actually scored during the search.
clf = XGBClassifier(**study.best_params, random_state=77)
clf.fit(X,y)

In [None]:
# Hard class predictions of the Optuna-tuned XGBoost model for the hold-out split.
y_pred = clf.predict(X_test)

In [None]:
# Hold-out metrics for the Optuna-tuned XGBoost model. Accuracy, kappa and F1 use
# the hard class predictions in `y_pred`.
print("Accuracy = {:.2f}".format(accuracy_score(y_test, y_pred)))
print("Kappa = {:.2f}".format(cohen_kappa_score(y_test, y_pred)))
print("F1 Score = {:.2f}".format(f1_score(y_test, y_pred)))
# Log loss must be computed from predicted probabilities; hard 0/1 labels would be
# interpreted as fully confident predictions and give a meaningless value.
print("Log Loss = {:.2f}".format(
    log_loss(y_test, clf.predict_proba(X_test), labels=clf.classes_)))

In [None]:
# Load the competition test file. From here on `df` and `X_test` no longer refer to
# the training frame / hold-out split used above.
# NOTE(review): this path points at a different dataset directory than the one the
# training CSV was read from — confirm it is the intended test file.
df = pd.read_csv( '/kaggle/input/d/bigswipp/icr-identify-age-related-conditions/test.csv', header = 'infer' )
# `X_test` is bound to the same object as `df` (no copy), so the two indicator
# columns added below also appear in `X_test`.
X_test = df

# Same manual encoding of 'EJ' as was applied to the training data.
df['EJ_A'] = np.where(df['EJ'] == 'A',1,0)
df['EJ_B'] = np.where(df['EJ'] == 'B',1,0)
# drop() returns a new frame: `df` loses 'EJ', while `X_test` still has it
# (together with 'Id', which the submission cell reads).
df = df.drop(['EJ'],axis=1)

In [None]:
# Preview the raw competition test rows (including 'Id', 'EJ' and the new indicators).
X_test.head()

In [None]:
# Feature-only view of the competition test set: the 'Id' identifier is removed.
X_test_2 = df.drop(columns=['Id'])

In [None]:
# Column names of the competition test features, for comparison with the training columns.
X_test_2.columns.unique()

In [None]:
# Column names the models were trained on (these carry the "_Imputed" suffix).
X_train.columns.unique()

In [None]:
# Preview the competition test features before imputation.
X_test_2.head()

In [None]:
from sklearn.impute import SimpleImputer
# Fill missing values in the competition test features with *training* statistics.
# Refitting an imputer on the test rows would preprocess them differently from
# the data the model was trained on. Mean imputation leaves a column's mean
# unchanged, so the means of the already-imputed training frame are exactly the
# fill values that were learned on the training split.
train_means = X_train.mean()

X_test_2 = (
    X_test_2
    .add_suffix("_Imputed")        # same feature names as the training frame
    .loc[:, X_train.columns]       # same column order; raises if a column is missing
    .astype(float)                 # same dtype the imputed training features have
    .fillna(train_means)
)


In [None]:
# Hard class predictions of the Optuna-tuned XGBoost model (`clf`) for the
# competition test rows.
y_pred = clf.predict(X_test_2)

In [None]:
# Display the predicted labels for the competition test rows.
y_pred

In [None]:
# Display the imputed competition test features that were fed to the model.
X_test_2

In [None]:
# Build the submission file: one row per test 'Id' with the predicted probability
# of each class. The class_0 / class_1 columns are probabilities, so the model's
# predict_proba output is written instead of hard 0/1 labels, which would claim
# full certainty for every row.
test_proba = clf.predict_proba(X_test_2)

# Locate the probability column of label 1 explicitly rather than assuming its
# position; class_0 is its complement.
positive_column = list(clf.classes_).index(1)
class_1 = test_proba[:, positive_column]

submission = pd.DataFrame(X_test['Id'], columns=['Id'])
submission['class_0'] = 1.0 - class_1
submission['class_1'] = class_1
submission.to_csv('submission.csv', index=False)