In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df_train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
df_test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
df_greeks=pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')
df_submit = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
df_train.head()

In [None]:
df_test.head()

In [None]:
df_submit

In [None]:
#check class counts
target_column = 'Class'
target_count = df_train['Class'].value_counts()
print(target_count)

In [None]:
import matplotlib.pyplot as plt
target_count.plot(kind='bar')
plt.title('Class Distribution')
plt.ylabel('Count')
plt.xlabel('Class')
plt.show()

In [None]:
#check missing values
missing_value = df_train.isnull().sum()
total_missing_value = df_train.isnull().sum().sum()
column_missing_value = missing_value[missing_value>0]
print(column_missing_value)
print(f'total missing value {total_missing_value}')

In [None]:
missing_value_by_class = df_train.groupby(target_column).apply(lambda x: x.isnull().sum()[x.isnull().sum()>0])

print(missing_value_by_class )

In [None]:
#handle categorical column for both train and test dataset
df_train['EJ'] = df_train['EJ'].replace({'A':0, 'B':1})
df_test['EJ'] = df_test['EJ'].replace({'A':0, 'B':1})
print(df_train['EJ'])

In [None]:
train = df_train.drop(df_train.columns[0], axis=1)
train

In [None]:
test1=df_test.drop('Id',axis=1)
test1

In [None]:
y = train['Class']
y = y.values
X = train.drop(columns='Class')
X.shape

In [None]:
X=X.fillna(np.nan)
test1=test1.fillna(np.nan)

In [None]:
y.shape

#Apply median for missing values

In [None]:
from sklearn.impute import SimpleImputer

imputer=SimpleImputer(missing_values=np.nan, strategy='median')

X_imputed = imputer.fit_transform(X)
test1_imputed = imputer.transform(test1)

In [None]:
X=np.array(X_imputed)
test1=np.array(test1_imputed)
test1

In [None]:
test1_imputed.shape

In [None]:
#Split the train dataset into train & test datasets in a ration 75:25
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.25, random_state=42, shuffle = True, stratify = y)

print(X_train.shape)
print(X_test.shape)

In [None]:
#Classifers 

# KNN

In [None]:
from scipy.spatial.distance import hamming
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
#GridSearch to get best tuned parameters
tuned_parameters = [{'n_neighbors': [3,4,5,6,7,8,9,10],
                      'metric':['hamming', 'manhattan', 'euclidean']}]

knn_model = GridSearchCV(estimator=KNeighborsClassifier(weights='uniform'),
                         param_grid=tuned_parameters,
                         refit=True,
                         cv=4,
                         n_jobs=None,
                         scoring='f1')

knn_model.fit(X_train, y_train)

print('Best F1 score: %.2f%%' % (knn_model.best_score_*100))
print('Best Params:', knn_model.best_params_)

In [None]:
#classifier model run
knn_model = KNeighborsClassifier(weights = 'uniform',n_neighbors=5, metric='euclidean')
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
num_correct_predictions = (y_pred == y_test).sum()
accuracy = (num_correct_predictions / y_test.shape[0]) * 100
print('Accuracy score: %.2f%%' % accuracy)

In [None]:
#log loss calculation
from sklearn.metrics import log_loss

def balance_logloss(y_test, y_pred):
    logloss_positive = log_loss(y_test, y_pred)
    logloss_negative = log_loss(1 - y_test, 1-y_pred)
    balance_logloss = (logloss_positive + logloss_negative)/2.0
    
    return balance_logloss

balance_logloss(y_test, y_pred)

In [None]:
#Evaluation metrics

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

y_pred = knn_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)


precision = precision_score(y_test, y_pred)
print("Precision:", precision)


recall = recall_score(y_test, y_pred)
print("Recall:", recall)


f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

roc_auc = roc_auc_score(y_test, knn_model.predict_proba(X_test)[:, 1])
print("ROC-AUC:", roc_auc)

class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[122   6]
[ 18   9]]

Accuracy: 0.8451612903225807

Precision: 0.6

Recall: 0.3333333333333333

F1-Score: 0.42857142857142855

ROC-AUC: 0.685619212962963

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       128
           1       0.60      0.33      0.43        27

    accuracy                           0.85       155
   macro avg       0.74      0.64      0.67       155
weighted avg       0.82      0.85      0.83       155

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=10000, solver='liblinear',random_state=42, penalty="l1")
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy score: %.2f%%' % accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [None]:
#evaluation metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

y_pred = log_reg.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)


precision = precision_score(y_test, y_pred)
print("Precision:", precision)


recall = recall_score(y_test, y_pred)
print("Recall:", recall)


f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

roc_auc = roc_auc_score(y_test, log_reg.predict_proba(X_test)[:, 1])
print("ROC-AUC:", roc_auc)

class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[124   4]
 [ 12  15]]

Accuracy: 0.896774193548387

Precision: 0.7894736842105263

Recall: 0.5555555555555556

F1-Score: 0.6521739130434783

ROC-AUC: 0.8718171296296295

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94       128
           1       0.79      0.56      0.65        27

    accuracy                           0.90       155
   macro avg       0.85      0.76      0.80       155
weighted avg       0.89      0.90      0.89       155


In [None]:
#log loss calculation
from sklearn.metrics import log_loss

def balance_logloss(y_test, y_pred):
    logloss_positive = log_loss(y_test, y_pred)
    logloss_negative = log_loss(1 - y_test, 1-y_pred)
    balance_logloss = (logloss_positive + logloss_negative)/2.0
    
    return balance_logloss

balance_logloss(y_test, y_pred)

# Catboost

In [None]:
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight

In [None]:
#class weight assigned 
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

#cat boost classifier
catboost = CatBoostClassifier(class_weights=class_weights, random_state=42)


tuned_parameters = {'learning_rate': [0.01, 0.1],
                    'depth': [3,6,9],
                    'iterations': [100,200,300]}


grid_search = GridSearchCV(estimator=catboost, param_grid= tuned_parameters, cv=5, n_jobs=None, scoring='f1')
grid_search.fit(X_train, y_train)


print("Best Parameters: ", grid_search.best_params_)
print("Best F1-score: {:.2f}%".format(grid_search.best_score_ * 100))


best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: {:.2f}%".format(accuracy * 100))

In [None]:
from sklearn.utils.class_weight import compute_class_weight
 
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))


In [None]:
#classifer with tuned parameters
from catboost import CatBoostClassifier

catboost_model = CatBoostClassifier(depth=6,iterations=100, learning_rate=0.1, class_weights=class_weights, random_state=42)
catboost_model.fit(X_train, y_train)

y_pred = catboost_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: {:.2f}%".format(accuracy * 100))

In [None]:
#Evaluation metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

y_pred = catboost_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)


precision = precision_score(y_test, y_pred)
print("Precision:", precision)


recall = recall_score(y_test, y_pred)
print("Recall:", recall)


f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

roc_auc = roc_auc_score(y_test, catboost_model.predict_proba(X_test)[:, 1])
print("ROC-AUC:", roc_auc)

class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[120   8]
 [  6  21]]

Accuracy: 0.9096774193548387

Precision: 0.7241379310344828

Recall: 0.7777777777777778

F1-Score: 0.75

ROC-AUC: 0.9635416666666666

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94       128
           1       0.72      0.78      0.75        27

    accuracy                           0.91       155
   macro avg       0.84      0.86      0.85       155
weighted avg       0.91      0.91      0.91       155

In [None]:
#log loss calculation
from sklearn.metrics import log_loss

def balance_logloss(y_test, y_pred):
    logloss_positive = log_loss(y_test, y_pred)
    logloss_negative = log_loss(1 - y_test, 1-y_pred)
    balance_logloss = (logloss_positive + logloss_negative)/2.0
    
    return balance_logloss

balance_logloss(y_test, y_pred)

log loss 3.255555789984775

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils.class_weight import compute_sample_weight

In [None]:
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

tuned_parameters = [{'max_depth': [ 3,4,5,6],
                     'n_estimators': [200, 300,400,500],
                     'learning_rate': [0.01, 0.1, 0.001]}]

gs = GridSearchCV(estimator=GradientBoostingClassifier(random_state=121),
                  param_grid=tuned_parameters,
                  refit=True,
                  cv=5,
                  n_jobs=None,
                  scoring='f1')

gs.fit(X_train, y_train, sample_weight=sample_weights)

print('Best F1 score: %.2f%%' % (gs.best_score_*100))
print('Best Params:', gs.best_params_)

In [None]:
#classifier with tuned parameters
gs_model = GradientBoostingClassifier(random_state=121, learning_rate=0.01, max_depth=3, n_estimators=400)
gs_model.fit(X_train, y_train, sample_weights)
y_pred_test = gs_model.predict(X_test)
f1_test = f1_score(y_test, y_pred_test)

print("F1 score on Test Set: %.2f%%" % (f1_test*100))

In [None]:
#Evaluation Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

y_pred = gs_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)


precision = precision_score(y_test, y_pred)
print("Precision:", precision)


recall = recall_score(y_test, y_pred)
print("Recall:", recall)


f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

roc_auc = roc_auc_score(y_test, gs_model.predict_proba(X_test)[:, 1])
print("ROC-AUC:", roc_auc)

class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[121   7]
 [  6  21]]

Accuracy: 0.9161290322580645

Precision: 0.75

Recall: 0.7777777777777778

F1-Score: 0.7636363636363638

ROC-AUC: 0.9551504629629629

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       128
           1       0.75      0.78      0.76        27

    accuracy                           0.92       155
   macro avg       0.85      0.86      0.86       155
weighted avg       0.92      0.92      0.92       155

In [None]:
#log loss calculation
from sklearn.metrics import log_loss

def balance_logloss(y_test, y_pred):
    logloss_positive = log_loss(y_test, y_pred)
    logloss_negative = log_loss(1 - y_test, 1-y_pred)
    balance_logloss = (logloss_positive + logloss_negative)/2.0
    
    return balance_logloss

balance_logloss(y_test, y_pred)

3.0230160907001484

# XGBoost

In [None]:
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

In [None]:
#classifier 
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic', 
    eval_metric='logloss',       
    use_label_encoder=False      
)

xgb_model.fit(X_train, y_train, sample_weights)
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))


In [None]:
#Evaluation metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

y_pred = xgb_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)


precision = precision_score(y_test, y_pred)
print("Precision:", precision)


recall = recall_score(y_test, y_pred)
print("Recall:", recall)


f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

roc_auc = roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])
print("ROC-AUC:", roc_auc)

class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[123   5]
 [  5  22]]

Accuracy: 0.9354838709677419

Precision: 0.8148148148148148

Recall: 0.8148148148148148

F1-Score: 0.8148148148148148

ROC-AUC: 0.9635416666666666

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       128
           1       0.81      0.81      0.81        27

    accuracy                           0.94       155
   macro avg       0.89      0.89      0.89       155
weighted avg       0.94      0.94      0.94       155

In [None]:
#log loss calculation
from sklearn.metrics import log_loss

def balance_logloss(y_test, y_pred):
    logloss_positive = log_loss(y_test, y_pred)
    logloss_negative = log_loss(1 - y_test, 1-y_pred)
    balance_logloss = (logloss_positive + logloss_negative)/2.0
    
    return balance_logloss

balance_logloss(y_test, y_pred)

2.325396992846268

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_sample_weight

sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

In [None]:
dt = DecisionTreeClassifier()

tuned_parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3,7,8,9],
    'min_samples_split': [2, 3,5,8, 10],
    'min_samples_leaf': [1, 2, 4,5,7,8,10]
}

grid_search = GridSearchCV(estimator= dt, param_grid= tuned_parameters, cv=5, n_jobs=None, scoring='accuracy')
grid_search.fit(X_train, y_train, sample_weight=sample_weights)

print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: {:.2f}%".format(grid_search.best_score_ * 100))

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
tree = DecisionTreeClassifier(max_depth=8,min_samples_leaf=1,min_samples_split= 3, criterion='entropy', random_state=42)
tree.fit(X_train, y_train, sample_weight=sample_weights)

y_pred = tree.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

In [None]:
#Evaluation metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

y_pred = tree.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)


precision = precision_score(y_test, y_pred)
print("Precision:", precision)


recall = recall_score(y_test, y_pred)
print("Recall:", recall)


f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

roc_auc = roc_auc_score(y_test, tree.predict_proba(X_test)[:, 1])
print("ROC-AUC:", roc_auc)

class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[117  11]
 [  7  20]]

Accuracy: 0.8838709677419355

Precision: 0.6451612903225806

Recall: 0.7407407407407407

F1-Score: 0.689655172413793

ROC-AUC: 0.8274016203703703

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.91      0.93       128
           1       0.65      0.74      0.69        27

    accuracy                           0.88       155
   macro avg       0.79      0.83      0.81       155
weighted avg       0.89      0.88      0.89       155

In [None]:
#log loss calculation
from sklearn.metrics import log_loss

def balance_logloss(y_test, y_pred):
    logloss_positive = log_loss(y_test, y_pred)
    logloss_negative = log_loss(1 - y_test, 1-y_pred)
    balance_logloss = (logloss_positive + logloss_negative)/2.0
    
    return balance_logloss

balance_logloss(y_test, y_pred)

4.185714587123282

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
rf_model = RandomForestClassifier(random_state=42)

tuned_parameters = {'n_estimators': [50, 100,200, 300],
                    'max_depth': [3,5,10, 20],
                    'min_samples_split': [5, 10, 15, 20],
                    'min_samples_leaf': [1, 2, 4, 5, 8]}


grid_search = GridSearchCV(estimator=rf_model, param_grid= tuned_parameters, cv=5, n_jobs=None, scoring='accuracy')


grid_search.fit(X_train, y_train)


print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: {:.2f}%".format(grid_search.best_score_ * 100))

best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)


In [None]:
rf_model = RandomForestClassifier(class_weight ='balanced',max_depth=10, min_samples_leaf= 1,min_samples_split= 5, n_estimators= 200, random_state=123)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
#Evaluation Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

y_pred = rf_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)


precision = precision_score(y_test, y_pred)
print("Precision:", precision)


recall = recall_score(y_test, y_pred)
print("Recall:", recall)


f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

roc_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
print("ROC-AUC:", roc_auc)

class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[125   3]
 [ 11  16]]

Accuracy: 0.9096774193548387 

Precision: 0.8421052631578947

Recall: 0.5925925925925926

F1-Score: 0.6956521739130435

ROC-AUC: 0.9641203703703703

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       128
           1       0.84      0.59      0.70        27

    accuracy                           0.91       155
   macro avg       0.88      0.78      0.82       155
weighted avg       0.91      0.91      0.90       155

In [None]:
#log loss calculation
from sklearn.metrics import log_loss

def balance_logloss(y_test, y_pred):
    logloss_positive = log_loss(y_test, y_pred)
    logloss_negative = log_loss(1 - y_test, 1-y_pred)
    balance_logloss = (logloss_positive + logloss_negative)/2.0
    
    return balance_logloss

balance_logloss(y_test, y_pred)

3.255555789984775

# Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
#bagging with gradident boost
bagging_gs = BaggingClassifier(gs_model,
                               n_estimators=100,       
                               max_samples=0.6,       
                               max_features=0.5,       
                               bootstrap=False,         
                               random_state=42)

bagging_gs.fit(X_train, y_train)
y_pred = bagging_gs.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
#bagging with catboost
bagging_cat = BaggingClassifier(catboost_model,
                                n_estimators=100,       
                                max_samples=0.8,       
                                max_features=0.5,       
                                bootstrap=False,         
                                random_state=42)


bagging_cat.fit(X_train, y_train)
y_pred = bagging_cat.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
#Bagging with xgboost
bagging_xgb = BaggingClassifier(xgb_model,
                                n_estimators=100,       
                                max_samples=0.8,       
                                max_features=0.5,       
                                bootstrap=False,         
                                random_state=42)


bagging_xgb.fit(X_train, y_train)
y_pred = bagging_xgb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
#evaluation metrics for best one (catboost)
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

y_pred = bagging_cat.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)


precision = precision_score(y_test, y_pred)
print("Precision:", precision)


recall = recall_score(y_test, y_pred)
print("Recall:", recall)


f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

roc_auc = roc_auc_score(y_test, bagging_cat.predict_proba(X_test)[:, 1])
print("ROC-AUC:", roc_auc)

class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[124   4]
 [  6  21]]

Accuracy: 0.9354838709677419

Precision: 0.84

Recall: 0.7777777777777778

F1-Score: 0.8076923076923077

ROC-AUC: 0.9629629629629629

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       128
           1       0.84      0.78      0.81        27

    accuracy                           0.94       155
   macro avg       0.90      0.87      0.88       155
weighted avg       0.93      0.94      0.93       155

In [None]:
#log loss calculation
from sklearn.metrics import log_loss

def balance_logloss(y_test, y_pred):
    logloss_positive = log_loss(y_test, y_pred)
    logloss_negative = log_loss(1 - y_test, 1-y_pred)
    balance_logloss = (logloss_positive + logloss_negative)/2.0
    
    return balance_logloss

balance_logloss(y_test, y_pred)

2.325396992846268

# Majority Voting

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
#base models

# clf1 = tree
clf2 = rf_model
clf3 = catboost_model
clf4 = xgb_model
clf5 = gs_model
clf6 = bagging_gs
clf7 = bagging_cat
clf8 = bagging_xgb

voting_classifier = VotingClassifier(
    estimators=[('rf', clf2), ('cat', clf3), ('xgb', clf4), ('gs', clf5),('bg_gs', clf6), ('bg_cat', clf7), ('bg_xgb', clf8)],
    voting='soft' 
)

voting_classifier.fit(X_train, y_train)
y_pred = voting_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

print("Classification Report:")
print(classification_report(y_test, y_pred))


In [None]:
#Evaluation Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

y_pred = voting_classifier.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)


precision = precision_score(y_test, y_pred)
print("Precision:", precision)


recall = recall_score(y_test, y_pred)
print("Recall:", recall)


f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

roc_auc = roc_auc_score(y_test, voting_classifier.predict_proba(X_test)[:, 1])
print("ROC-AUC:", roc_auc)

class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[124   4]
 [  6  21]]

Accuracy: 0.9354838709677419
Precision: 0.84
Recall: 0.7777777777777778
F1-Score: 0.8076923076923077
ROC-AUC: 0.9632523148148148

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       128
           1       0.84      0.78      0.81        27

    accuracy                           0.94       155
   macro avg       0.90      0.87      0.88       155
weighted avg       0.93      0.94      0.93       155

In [None]:
#log loss calculation
from sklearn.metrics import log_loss

def balance_logloss(y_test, y_pred):
    logloss_positive = log_loss(y_test, y_pred)
    logloss_negative = log_loss(1 - y_test, 1-y_pred)
    balance_logloss = (logloss_positive + logloss_negative)/2.0
    
    return balance_logloss

balance_logloss(y_test, y_pred)

2.325396992846268

In [None]:
# test_preds = voting_classifier.predict_proba(test1)
# test_preds

In [None]:
# # test_preds = voting_classifier.predict_proba(test1)
# test_preds
# submit = pd.DataFrame({'Id':df_test['Id'], 'class_0': test_preds[:, 0], 'class_1': test_preds[:, 1]})
# submit

In [None]:
# submit.to_csv('/kaggle/working/submission.csv', index=False)
# submit=pd.read_csv("/kaggle/working/submission.csv")
# submit.head()

In [None]:
X_test.shape

# Stack

In [91]:
from sklearn.ensemble import StackingClassifier


rf = rf_model

xgb = xgb_model

catboost = catboost_model

gb_boost = bagging_cat

lg_classifier = log_reg

#stacking classifier

estimators = [('rf', rf),
              ('xgb', xgb),
              ('cat', catboost),
              ('gb', gb_boost)]

stacking = StackingClassifier(estimators=estimators,
                              final_estimator=lg_classifier,
                              cv=4)

rf.fit(X_train, y_train)
print(f"Random forest model training Accuracy: {rf.score(X_train, y_train):0.2f}")
print(f"Random forest model test Accuracy: {rf.score(X_test, y_test):0.2f}")

xgb.fit(X_train, y_train)
print(f"\nXGBoost model training Accuracy: {xgb.score(X_train, y_train):0.2f}")
print(f"XGBoost model test Accuracy: {xgb.score(X_test, y_test):0.2f}")


gb_boost.fit(X_train, y_train)
print(f"\nCatBoost model training Accuracy: {gb_boost.score(X_train, y_train):0.2f}")
print(f"CatBoost model test Accuracy: {gb_boost.score(X_test, y_test):0.2f}")

catboost.fit(X_train, y_train)
print(f"\nCatBoost model training Accuracy: {catboost.score(X_train, y_train):0.2f}")
print(f"CatBoost model test Accuracy: {catboost.score(X_test, y_test):0.2f}")

stacking.fit(X_train, y_train)
print(f"\nStacking classifier training Accuracy: {stacking.score(X_train, y_train):0.2f}")
print(f"Stacking classifier test Accuracy: {stacking.score(X_test, y_test):0.2f}")

Random forest model training Accuracy: 1.00
Random forest model test Accuracy: 0.91

XGBoost model training Accuracy: 1.00
XGBoost model test Accuracy: 0.92
0:	learn: 0.6075713	total: 4.48ms	remaining: 443ms
1:	learn: 0.5419651	total: 9.03ms	remaining: 442ms
2:	learn: 0.4649892	total: 13.5ms	remaining: 437ms
3:	learn: 0.4036531	total: 18.1ms	remaining: 435ms
4:	learn: 0.3650346	total: 24.4ms	remaining: 464ms
5:	learn: 0.3331446	total: 29.4ms	remaining: 461ms
6:	learn: 0.2950778	total: 33.6ms	remaining: 447ms
7:	learn: 0.2667211	total: 37.9ms	remaining: 436ms
8:	learn: 0.2425027	total: 42.2ms	remaining: 426ms
9:	learn: 0.2301782	total: 46.6ms	remaining: 420ms
10:	learn: 0.2130236	total: 50.7ms	remaining: 410ms
11:	learn: 0.1980975	total: 54.8ms	remaining: 402ms
12:	learn: 0.1867005	total: 58.9ms	remaining: 394ms
13:	learn: 0.1753575	total: 62.9ms	remaining: 387ms
14:	learn: 0.1625547	total: 67.2ms	remaining: 381ms
15:	learn: 0.1524652	total: 71.5ms	remaining: 375ms
16:	learn: 0.1454715	

In [93]:
#Evaluation Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

y_pred = stacking.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)


precision = precision_score(y_test, y_pred)
print("Precision:", precision)


recall = recall_score(y_test, y_pred)
print("Recall:", recall)


f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

roc_auc = roc_auc_score(y_test, stacking.predict_proba(X_test)[:, 1])
print("ROC-AUC:", roc_auc)

class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[124   4]
 [  6  21]]

Accuracy: 0.9354838709677419
Precision: 0.84
Recall: 0.7777777777777778
F1-Score: 0.8076923076923077
ROC-AUC: 0.9661458333333333

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       128
           1       0.84      0.78      0.81        27

    accuracy                           0.94       155
   macro avg       0.90      0.87      0.88       155
weighted avg       0.93      0.94      0.93       155



Confusion Matrix:
[[124   4]
 [  6  21]]

Accuracy: 0.9354838709677419
Precision: 0.84
Recall: 0.7777777777777778
F1-Score: 0.8076923076923077
ROC-AUC: 0.9661458333333333

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       128
           1       0.84      0.78      0.81        27

    accuracy                           0.94       155
   macro avg       0.90      0.87      0.88       155
weighted avg       0.93      0.94      0.93       155

In [92]:
#log loss calculation
from sklearn.metrics import log_loss

def balance_logloss(y_test, y_pred):
    logloss_positive = log_loss(y_test, y_pred)
    logloss_negative = log_loss(1 - y_test, 1-y_pred)
    balance_logloss = (logloss_positive + logloss_negative)/2.0
    
    return balance_logloss

balance_logloss(y_test, y_pred)

2.325396992846268

##### The best result of stacking is used for predicting the provided submit-test for the competition submission

In [94]:
test_preds = stacking.predict_proba(test1)
test_preds

array([[0.88194551, 0.11805449],
       [0.88194551, 0.11805449],
       [0.88194551, 0.11805449],
       [0.88194551, 0.11805449],
       [0.88194551, 0.11805449]])

In [95]:

submit = pd.DataFrame({'Id':df_test['Id'], 'class_0': test_preds[:, 0], 'class_1': test_preds[:, 1]})
submit

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.881946,0.118054
1,010ebe33f668,0.881946,0.118054
2,02fa521e1838,0.881946,0.118054
3,040e15f562a2,0.881946,0.118054
4,046e85c7cc7f,0.881946,0.118054


In [96]:
submit.to_csv('/kaggle/working/submission.csv', index=False)
submit=pd.read_csv("/kaggle/working/submission.csv")
submit.head()

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.881946,0.118054
1,010ebe33f668,0.881946,0.118054
2,02fa521e1838,0.881946,0.118054
3,040e15f562a2,0.881946,0.118054
4,046e85c7cc7f,0.881946,0.118054
