In [296]:
#import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgb

In [297]:
# Read dataset
train = pd.read_csv("Train Dataset .csv")
test = pd.read_csv("Test Dataset.csv")
var = pd.read_csv("Variable_Definitions.csv") 
submission = pd.read_csv("Sample Submission.csv")

# Rename the columns in the DataFrame
test.rename(columns={'id': 'Id', 'age': 'Age', 'sex': 'Sex'}, inplace=True)

#concatenating the train and test dataframe
train["targ"] = "train"
test["targ"] = "test"

df = pd.concat([train, test], ignore_index=True)
df.shape

(10000, 16)

In [298]:
train.shape, test.shape  #checking the shape of the train and test data

((7303, 16), (2697, 15))

In [299]:
df.head()  #checking the first five rows of the dataframe

Unnamed: 0,Id,Age,Sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,targ
0,16167,33,0,1,158,205,1,0,154,0,1.5,1,4,1,1.0,train
1,11275,53,1,2,198,154,0,1,104,0,0.8,2,1,0,0.0,train
2,13251,37,1,2,101,202,1,0,155,0,2.1,1,3,1,1.0,train
3,19921,75,0,0,113,306,1,2,88,1,4.9,0,2,2,1.0,train
4,11293,35,1,2,139,419,1,1,166,1,0.9,2,4,0,1.0,train


In [300]:
var

Unnamed: 0,Variable defination
0,age
1,sex
2,chest pain type (4 values)
3,resting blood pressure
4,serum cholestoral in mg/dl
5,fasting blood sugar > 120 mg/dl
6,resting electrocardiographic results (values 0...
7,maximum heart rate achieved
8,exercise induced angina
9,oldpeak = ST depression induced by exercise re...


In [301]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7303 entries, 0 to 7302
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Id        7303 non-null   int64  
 1   Age       7303 non-null   int64  
 2   Sex       7303 non-null   int64  
 3   cp        7303 non-null   int64  
 4   trestbps  7303 non-null   int64  
 5   chol      7303 non-null   int64  
 6   fbs       7303 non-null   int64  
 7   restecg   7303 non-null   int64  
 8   thalach   7303 non-null   int64  
 9   exang     7303 non-null   int64  
 10  oldpeak   7303 non-null   float64
 11  slope     7303 non-null   int64  
 12  ca        7303 non-null   int64  
 13  thal      7303 non-null   int64  
 14  target    7303 non-null   int64  
 15  targ      7303 non-null   object 
dtypes: float64(1), int64(14), object(1)
memory usage: 913.0+ KB


In [302]:
train['targ'].value_counts()

targ
train    7303
Name: count, dtype: int64

In [303]:
train['target'].value_counts()

target
1    5941
0    1362
Name: count, dtype: int64

In [304]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2697 entries, 0 to 2696
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Id        2697 non-null   int64  
 1   Age       2697 non-null   int64  
 2   Sex       2697 non-null   int64  
 3   cp        2697 non-null   int64  
 4   trestbps  2697 non-null   int64  
 5   chol      2697 non-null   int64  
 6   fbs       2697 non-null   int64  
 7   restecg   2697 non-null   int64  
 8   thalach   2697 non-null   int64  
 9   exang     2697 non-null   int64  
 10  oldpeak   2697 non-null   float64
 11  slope     2697 non-null   int64  
 12  ca        2697 non-null   int64  
 13  thal      2697 non-null   int64  
 14  targ      2697 non-null   object 
dtypes: float64(1), int64(13), object(1)
memory usage: 316.2+ KB


In [305]:
test['targ'].value_counts()

targ
test    2697
Name: count, dtype: int64

In [306]:
test.isnull().sum()

Id          0
Age         0
Sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
targ        0
dtype: int64

In [307]:
train.isnull().sum()

Id          0
Age         0
Sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
targ        0
dtype: int64

#### Removing 'targ' from the test and the train datasets

In [309]:
train = train.drop(['targ'], axis=1)

In [310]:
test = test.drop(['targ'], axis=1)

In [311]:
train

Unnamed: 0,Id,Age,Sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,16167,33,0,1,158,205,1,0,154,0,1.5,1,4,1,1
1,11275,53,1,2,198,154,0,1,104,0,0.8,2,1,0,0
2,13251,37,1,2,101,202,1,0,155,0,2.1,1,3,1,1
3,19921,75,0,0,113,306,1,2,88,1,4.9,0,2,2,1
4,11293,35,1,2,139,419,1,1,166,1,0.9,2,4,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7298,19401,30,1,2,107,177,1,2,119,0,2.7,1,0,0,0
7299,10446,42,1,2,96,551,1,2,76,0,1.9,2,3,2,1
7300,13219,51,1,0,151,165,1,0,190,1,0.9,0,0,2,1
7301,15349,29,0,0,195,287,1,2,161,1,3.4,1,1,0,1


In [312]:
test

Unnamed: 0,Id,Age,Sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,16501,70,1,0,163,495,0,2,170,1,2.0,1,0,1
1,10444,61,1,0,131,238,0,2,74,1,4.9,2,2,2
2,14288,53,1,0,95,558,1,1,73,1,0.7,1,1,0
3,10409,37,0,1,178,287,0,1,192,1,5.7,1,0,0
4,17330,35,0,3,104,281,0,0,122,0,1.3,1,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2692,14964,34,0,3,136,291,0,1,163,0,2.6,2,2,3
2693,16774,72,0,1,104,166,1,2,95,1,0.3,0,2,2
2694,18884,31,1,0,153,457,1,1,170,1,3.8,0,3,1
2695,10000,71,0,0,111,242,1,0,147,1,5.3,0,1,2


In [313]:
train.describe()

Unnamed: 0,Id,Age,Sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0
mean,15021.535396,53.172669,0.499658,1.502533,147.447487,342.80597,0.493085,1.013008,136.506093,0.503218,3.129851,0.99151,2.019033,1.502259,0.813501
std,2886.02608,14.18597,0.500034,1.115594,31.099538,127.291998,0.499986,0.815806,38.141966,0.500024,1.79116,0.817291,1.410546,1.113137,0.389535
min,10001.0,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12521.5,41.0,0.0,1.0,120.0,231.0,0.0,0.0,104.0,0.0,1.6,0.0,1.0,1.0,1.0
50%,15054.0,53.0,0.0,1.0,148.0,341.0,0.0,1.0,137.0,1.0,3.1,1.0,2.0,1.0,1.0
75%,17513.5,65.0,1.0,3.0,174.0,450.0,1.0,2.0,170.0,1.0,4.7,2.0,3.0,2.0,1.0
max,19998.0,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


### Exploratory Data Analysis

In [315]:
# Define the categorical features for easy data manipulation and EDA
#cat_cols = ['Sex', 'cp', 'fbs', 'exang', 'slope']
#for col in cat_cols:
#    df[col] = df[col].astype('object')

In [316]:
#sns.heatmap(df.isnull(), cbar=False)  #checking for missing values in the dataframe

In [317]:
#sns.countplot(x=df['target']);  #checking the distribution of the target variable

Note: The target variable is imbalanced

### Feature Engineering

In [320]:
# function to group the ages into bins
def group_age(x):
    if x < 30:
        return 'Young'
    elif 30 <= x < 40:
        return 'Adult'
    elif 40 <= x < 60:
        return 'Middle Age'
    else:
        return 'Old'

df['Age_bin'] = df['Age'].apply(group_age)

In [321]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Id        10000 non-null  int64  
 1   Age       10000 non-null  int64  
 2   Sex       10000 non-null  int64  
 3   cp        10000 non-null  int64  
 4   trestbps  10000 non-null  int64  
 5   chol      10000 non-null  int64  
 6   fbs       10000 non-null  int64  
 7   restecg   10000 non-null  int64  
 8   thalach   10000 non-null  int64  
 9   exang     10000 non-null  int64  
 10  oldpeak   10000 non-null  float64
 11  slope     10000 non-null  int64  
 12  ca        10000 non-null  int64  
 13  thal      10000 non-null  int64  
 14  target    7303 non-null   float64
 15  targ      10000 non-null  object 
 16  Age_bin   10000 non-null  object 
dtypes: float64(2), int64(13), object(2)
memory usage: 1.3+ MB


Note: Perform more feature engineering techniques

### Data Preprocessing

In [324]:
#label encoding the categorical columns
cat_cols.append('Age_bin')
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [325]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Id        10000 non-null  int64  
 1   Age       10000 non-null  int64  
 2   Sex       10000 non-null  int64  
 3   cp        10000 non-null  int64  
 4   trestbps  10000 non-null  int64  
 5   chol      10000 non-null  int64  
 6   fbs       10000 non-null  int64  
 7   restecg   10000 non-null  int64  
 8   thalach   10000 non-null  int64  
 9   exang     10000 non-null  int64  
 10  oldpeak   10000 non-null  float64
 11  slope     10000 non-null  int64  
 12  ca        10000 non-null  int64  
 13  thal      10000 non-null  int64  
 14  target    7303 non-null   float64
 15  targ      10000 non-null  object 
 16  Age_bin   10000 non-null  int64  
dtypes: float64(2), int64(14), object(1)
memory usage: 1.3+ MB


In [354]:
# splitting the data back to train and test and dropping the unnecessary columns
train_data = df[df["targ"] == "train"].drop(["targ", "Id"], axis=1)
test_data = df[df["targ"] == "test"].drop(["targ", "target", "Id"], axis=1)

In [356]:
X = train_data.drop(columns=["target"])
X.head()

Unnamed: 0,Age,Sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,Age_bin
0,33,0,1,158,205,1,0,154,0,1.5,1,4,1,0
1,53,1,2,198,154,0,1,104,0,0.8,2,1,0,1
2,37,1,2,101,202,1,0,155,0,2.1,1,3,1,0
3,75,0,0,113,306,1,2,88,1,4.9,0,2,2,2
4,35,1,2,139,419,1,1,166,1,0.9,2,4,0,0


In [358]:
test_data.head(2)

Unnamed: 0,Age,Sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,Age_bin
7303,70,1,0,163,495,0,2,170,1,2.0,1,0,1,2
7304,61,1,0,131,238,0,2,74,1,4.9,2,2,2,2


In [360]:
y = train_data["target"]
print(y.value_counts())

target
1.0    5941
0.0    1362
Name: count, dtype: int64


In [362]:
#Feature Scaling (Standardization)
#scaler = StandardScaler()
#X = scaler.fit_transform(X)
#test = scaler.fit_transform(test_data)

In [364]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [366]:
# viewing the shape of the X_train and X_test
X_train.shape, X_test.shape

((6207, 14), (1096, 14))

In [368]:
# viewing the shape of the y_train and y_test
y_train.shape, y_test.shape

((6207,), (1096,))

### Unbalanced Data Handelling

In [372]:
# Install imbalanced-learn if you have not used before
#!pip install imbalanced-learn

In [374]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
# summarize class distribution
counter = Counter(y)
print(counter)
# define pipeline
over = SMOTE(sampling_strategy=0.3,random_state=1) #sampling_strategy=0.1,random_state=1
under = RandomUnderSampler(sampling_strategy=0.6)
steps = [ ('o', over),('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
Xb, Yb = pipeline.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(Yb)
print(counter)

Counter({1.0: 5941, 0.0: 1362})
Counter({1.0: 2970, 0.0: 1782})


In [376]:
x_trainb, x_testb, y_trainb, y_testb = train_test_split(Xb, Yb, test_size=0.25, random_state=42)
# 42 is just any random seed number"

In [378]:
#resampling the data using SMOTE because the target variable is imbalanced
#from imblearn.over_sampling import SMOTE

#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

 #Print class distribution before and after SMOTE
#print("Class distribution before SMOTE:")
#print(y.value_counts())

#print("\nClass distribution after SMOTE:")
#print(pd.Series(y_resampled).value_counts())

In [380]:
 #Print class distribution before and after SMOTE
#print("Class distribution before SMOTE:")
#print(y.value_counts())

#print("\nClass distribution after SMOTE:")
#print(pd.Series(y_resampled).value_counts())

### Model Building

In [383]:
from sklearn import metrics #accuracy measure
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier,  ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from catboost import CatBoostClassifier
from sklearn import model_selection
from sklearn.metrics import auc, roc_auc_score, accuracy_score
from sklearn.model_selection import KFold,StratifiedKFold,GridSearchCV,RandomizedSearchCV, train_test_split #For splitting

#Evaluation Metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [385]:
models = []
models.append(('log_model', LogisticRegression()))
models.append(('bc', BaggingClassifier()))
models.append(('dT', DecisionTreeClassifier()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('RFC', RandomForestClassifier()))
models.append(('GBC', GradientBoostingClassifier()))
models.append(('Adb', AdaBoostClassifier()))
models.append(('Extra_Tree', ExtraTreesClassifier()))
models.append(('Xgb', XGBClassifier()))
models.append(('lightgbm', LGBMClassifier()))
models.append(('cat', CatBoostClassifier()))

In [387]:
results =[]
names =[]
for name, model in models:
    model.fit(x_trainb, y_trainb)
    score = metrics.accuracy_score(y_testb, model.predict(x_testb))
    results.append(score)
    names.append(name)
    msg ="%s: %f " % (name, score)
    print(msg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


log_model: 0.805556 
bc: 0.856061 
dT: 0.801347 
LDA: 0.844276 
RFC: 0.856902 
GBC: 0.866162 
Adb: 0.860269 
Extra_Tree: 0.856902 
Xgb: 0.833333 
[LightGBM] [Info] Number of positive: 2240, number of negative: 1324
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 818
[LightGBM] [Info] Number of data points in the train set: 3564, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.628507 -> initscore=0.525818
[LightGBM] [Info] Start training from score 0.525818
lightgbm: 0.845118 
Learning rate set to 0.017726
0:	learn: 0.6677314	total: 4.28ms	remaining: 4.28s
1:	learn: 0.6466967	total: 7.93ms	remaining: 3.96s
2:	learn: 0.6272699	total: 12.9ms	remaining: 4.3s
3:	learn: 0.6080457	total: 16.4ms	remaining: 4.09s
4:	learn: 0.5913083	total: 19.7ms	remaining: 3.92s
5:	learn: 0.5698125	total: 22.2ms	remaining: 3.68s
6:	learn: 0

In [206]:
X.head()

Unnamed: 0,Age,Sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,Age_bin
0,33,0,1,158,205,1,0,154,0,1.5,1,4,1,0
1,53,1,2,198,154,0,1,104,0,0.8,2,1,0,1
2,37,1,2,101,202,1,0,155,0,2.1,1,3,1,0
3,75,0,0,113,306,1,2,88,1,4.9,0,2,2,2
4,35,1,2,139,419,1,1,166,1,0.9,2,4,0,0


In [208]:
test_data.head()

Unnamed: 0,Age,Sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,Age_bin
7303,70,1,0,163,495,0,2,170,1,2.0,1,0,1,2
7304,61,1,0,131,238,0,2,74,1,4.9,2,2,2,2
7305,53,1,0,95,558,1,1,73,1,0.7,1,1,0,1
7306,37,0,1,178,287,0,1,192,1,5.7,1,0,0,0
7307,35,0,3,104,281,0,0,122,0,1.3,1,4,3,0


In [210]:
sub = pd.read_csv("Sample Submission.csv")

In [389]:
LGBM = LGBMClassifier(n_estimators=500, max_depth=8,learning_rate=0.01,n_jobs=-1)
LGBM_model = LGBM.fit(x_trainb, y_trainb)
LGBM_y_predict = GBC_model.predict(x_testb)

[LightGBM] [Info] Number of positive: 2240, number of negative: 1324
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000635 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 818
[LightGBM] [Info] Number of data points in the train set: 3564, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.628507 -> initscore=0.525818
[LightGBM] [Info] Start training from score 0.525818


In [391]:
# performance
print(f'Accuracy Score: {accuracy_score(y_testb,LGBM_y_predict)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_testb, LGBM_y_predict)}')
print(f'Area Under Curve: {roc_auc_score(y_testb, LGBM_y_predict)}')
print(f'Recall score: {recall_score(y_testb,LGBM_y_predict)}')
print(f'Precision score: {precision_score(y_testb,LGBM_y_predict)}')
print(f'f1 score: {f1_score(y_testb,LGBM_y_predict)}')

Accuracy Score: 0.8838383838383839
Confusion Matrix: 
[[421  37]
 [101 629]]
Area Under Curve: 0.8904289047077825
Recall score: 0.8616438356164383
Precision score: 0.9444444444444444
f1 score: 0.9011461318051576


In [393]:
LGBM_test_pred = LGBM_model.predict(test_data)  #making predictions on the test data

In [395]:
# Create a submission DataFrame
LGBM_submission = pd.DataFrame({
    'ID': sub['Id'], 
    'Target': LGBM_test_pred.astype(int)
})

# Save the submission DataFrame to a CSV file
LGBM_submission.to_csv('LGBM_submission4.csv', index=False)

In [397]:
GBC = GradientBoostingClassifier(learning_rate=0.01, n_estimators=500, max_depth=8, random_state=1)
GBC_model = GBC.fit(x_trainb, y_trainb)
GBC_y_predict = GBC_model.predict(x_testb)

In [399]:
# performance
print(f'Accuracy Score: {accuracy_score(y_testb,GBC_y_predict)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_testb, GBC_y_predict)}')
print(f'Area Under Curve: {roc_auc_score(y_testb, GBC_y_predict)}')
print(f'Recall score: {recall_score(y_testb,GBC_y_predict)}')
print(f'Precision score: {precision_score(y_testb,GBC_y_predict)}')
print(f'f1 score: {f1_score(y_testb,GBC_y_predict)}')

Accuracy Score: 0.8476430976430976
Confusion Matrix: 
[[415  43]
 [138 592]]
Area Under Curve: 0.8585362206137465
Recall score: 0.810958904109589
Precision score: 0.9322834645669291
f1 score: 0.8673992673992673


In [401]:
GBC_test_pred = GBC_model.predict(test_data)  #making predictions on the test data

In [403]:
GBC_test_pred

array([1., 1., 1., ..., 1., 1., 0.])

In [405]:
sub.head()

Unnamed: 0,Id
0,16501
1,10444
2,14288
3,10409
4,17330


In [407]:
# Create a submission DataFrame
GBC_submission = pd.DataFrame({
    'ID': sub['Id'], 
    'Target': GBC_test_pred.astype(int)
})

# Save the submission DataFrame to a CSV file
GBC_submission.to_csv('GBC_submission9.csv', index=False)

#### SVM with balanced Data

In [409]:
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score

In [411]:
clf = svm.SVC(gamma=0.25, C=10)
clf.fit(x_trainb , y_trainb)
y_predictb = clf.predict(x_testb)

In [412]:
# performance
print(f'Accuracy Score: {accuracy_score(y_testb,y_predictb)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_testb, y_predictb)}')
print(f'Area Under Curve: {roc_auc_score(y_testb, y_predictb)}')
print(f'Recall score: {recall_score(y_testb,y_predictb)}')
print(f'Precision score: {precision_score(y_testb,y_predictb)}')
print(f'f1 score: {f1_score(y_testb,y_predictb)}')

Accuracy Score: 0.6388888888888888
Confusion Matrix: 
[[ 29 429]
 [  0 730]]
Area Under Curve: 0.5316593886462881
Recall score: 1.0
Precision score: 0.6298533218291631
f1 score: 0.7728957120169402


### Random Forest classifier with Balanced Data

In [415]:
rfcl = RandomForestClassifier(n_estimators=500, n_jobs=-1, max_depth=8, criterion="entropy", random_state=1)
rfcl = rfcl.fit(x_trainb, y_trainb)
rf_y_predict = rfcl.predict(x_testb)

In [417]:
# performance
print(f'Accuracy Score: {accuracy_score(y_testb,rf_y_predict)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_testb, rf_y_predict)}')
print(f'Area Under Curve: {roc_auc_score(y_testb, rf_y_predict)}')
print(f'Recall score: {recall_score(y_testb,rf_y_predict)}')
print(f'Precision score: {precision_score(y_testb,rf_y_predict)}')
print(f'f1 score: {f1_score(y_testb,rf_y_predict)}')

Accuracy Score: 0.8712121212121212
Confusion Matrix: 
[[457   1]
 [152 578]]
Area Under Curve: 0.8947987079021356
Recall score: 0.7917808219178082
Precision score: 0.998272884283247
f1 score: 0.8831168831168831


In [419]:
rfcl_test_pred = rfcl.predict(test_data)  #making predictions on the test data

In [421]:
# Create a submission DataFrame
rfcl_submission = pd.DataFrame({
    'ID': sub['Id'], 
    'Target': rfcl_test_pred.astype(int)
})

# Save the submission DataFrame to a CSV file
rfcl_submission.to_csv('rfcl_submission2.csv', index=False)

In [423]:
cat_model = CatBoostClassifier(iterations=500, learning_rate=0.01, max_depth=8, loss_function="CrossEntropy")
cat_model.fit(x_trainb , y_trainb)
cat_y_pred = model.predict(x_testb)

0:	learn: 0.6813636	total: 6.86ms	remaining: 3.42s
1:	learn: 0.6699217	total: 13.8ms	remaining: 3.43s
2:	learn: 0.6588353	total: 20.5ms	remaining: 3.39s
3:	learn: 0.6482208	total: 27.1ms	remaining: 3.36s
4:	learn: 0.6369329	total: 34.5ms	remaining: 3.42s
5:	learn: 0.6262172	total: 41ms	remaining: 3.38s
6:	learn: 0.6168076	total: 48.4ms	remaining: 3.4s
7:	learn: 0.6079993	total: 54.8ms	remaining: 3.37s
8:	learn: 0.5975431	total: 59.8ms	remaining: 3.26s
9:	learn: 0.5881468	total: 67.6ms	remaining: 3.31s
10:	learn: 0.5796388	total: 76.4ms	remaining: 3.4s
11:	learn: 0.5683175	total: 81.1ms	remaining: 3.3s
12:	learn: 0.5594486	total: 97.2ms	remaining: 3.64s
13:	learn: 0.5524075	total: 106ms	remaining: 3.68s
14:	learn: 0.5460819	total: 112ms	remaining: 3.63s
15:	learn: 0.5389780	total: 119ms	remaining: 3.6s
16:	learn: 0.5297143	total: 122ms	remaining: 3.47s
17:	learn: 0.5232216	total: 130ms	remaining: 3.48s
18:	learn: 0.5175947	total: 136ms	remaining: 3.45s
19:	learn: 0.5118960	total: 145ms	

In [425]:
# performance
print(f'Accuracy Score: {accuracy_score(y_testb,cat_y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_testb, cat_y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_testb, cat_y_pred)}')
print(f'Recall score: {recall_score(y_testb,cat_y_pred)}')
print(f'Precision score: {precision_score(y_testb,cat_y_pred)}')
print(f'f1 score: {f1_score(y_testb,cat_y_pred)}')

Accuracy Score: 0.8552188552188552
Confusion Matrix: 
[[429  29]
 [143 587]]
Area Under Curve: 0.8703954058742598
Recall score: 0.8041095890410959
Precision score: 0.952922077922078
f1 score: 0.8722139673105498


In [427]:
cat_test_pred = cat_model.predict(test_data)  #making predictions on the test data

In [429]:
cat_test_pred

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [431]:
# Create a submission DataFrame
submission = pd.DataFrame({
    'ID': sub['Id'], 
    'Target': cat_test_pred.astype(int)
})

# Save the submission DataFrame to a CSV file
submission.to_csv('cat_submission4.csv', index=False)

In [451]:
# 5.2 Model Training with XGBoost
model = xgb.XGBClassifier(learning_rate=0.001, max_depth=8, n_estimators=600, n_jobs=-1, random_state=1)
#model.fit(X_resampled, y_resampled)
model.fit(x_trainb , y_trainb)

In [453]:
# 5.3 Model Evaluation
y_pred = model.predict(x_testb)
accuracy = accuracy_score(y_testb, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_testb, y_pred))
print(f'Confusion Matrix: \n{confusion_matrix(y_testb, y_pred)}')

Accuracy: 0.83
              precision    recall  f1-score   support

         0.0       0.78      0.77      0.77       458
         1.0       0.86      0.86      0.86       730

    accuracy                           0.83      1188
   macro avg       0.82      0.82      0.82      1188
weighted avg       0.83      0.83      0.83      1188

Confusion Matrix: 
[[353 105]
 [102 628]]


In [443]:
# Evaluate on train and test data
acc_train = model.score(x_trainb , y_trainb)
acc_test = model.score(X_test, y_test)

print("Training Accuracy:", acc_train)
print("Test Accuracy:", acc_test)

Training Accuracy: 0.9037598204264871
Test Accuracy: 0.8613138686131386


In [445]:
test_pred = model.predict(test_data)  #making predictions on the test data

In [447]:
sub = pd.read_csv("Sample Submission.csv")

In [1727]:
sub.head() #checking the first five rows of the submission dataframe

Unnamed: 0,Id
0,16501
1,10444
2,14288
3,10409
4,17330


In [449]:
# Create a submission DataFrame
submission = pd.DataFrame({
    'ID': sub['Id'], 
    'Target': test_pred
})

# Save the submission DataFrame to a CSV file
submission.to_csv('submission5.csv', index=False)

### ADABOOST

In [455]:
#Creating model
ada_model=AdaBoostClassifier(base_estimator=GradientBoostingClassifier(learning_rate=0.01, n_estimators=500, max_depth=8),
                             learning_rate=0.1, n_estimators=100, )

#Training Catboost Model on train set
ada_model.fit(x_trainb,y_trainb)

#Predictiing on Test Set
y_pred_ada=ada_model.predict(x_testb)



In [457]:
# performance
print(f'Accuracy Score: {accuracy_score(y_testb,y_pred_ada)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_testb, y_pred_ada)}')
print(f'Area Under Curve: {roc_auc_score(y_testb, y_pred_ada)}')
print(f'Recall score: {recall_score(y_testb,y_pred_ada)}')
print(f'Precision score: {precision_score(y_testb,y_pred_ada)}')
print(f'f1 score: {f1_score(y_testb,y_pred_ada)}')

Accuracy Score: 0.8425925925925926
Confusion Matrix: 
[[408  50]
 [137 593]]
Area Under Curve: 0.8515792307232158
Recall score: 0.8123287671232877
Precision score: 0.9222395023328149
f1 score: 0.8638018936635106


In [459]:
ada_test_pred = ada_model.predict(test_data)  #making predictions on the test data

In [461]:
# Create a submission DataFrame
ada_submission = pd.DataFrame({
    'ID': sub['Id'], 
    'Target': ada_test_pred.astype(int)
})

# Save the submission DataFrame to a CSV file
ada_submission.to_csv('ada_submission5.csv', index=False)

### Out of fold cross validation

In [228]:
from sklearn.model_selection import cross_val_score, KFold,StratifiedKFold,RepeatedStratifiedKFold
kfold = StratifiedKFold(n_splits =5)

In [250]:
cat_err_list = []
cat_test_pred = []
fold=RepeatedStratifiedKFold(n_splits =10,n_repeats=3,random_state=1)
i = 1
for train_index, test_index in fold.split(X,y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = CatBoostClassifier(learning_rate= 0.1,n_estimators =500,eval_metric = 'Accuracy', use_best_model= True,
    max_depth=8, border_count=32,l2_leaf_reg=3, loss_function = 'CrossEntropy')
    model.fit(X_train, y_train, eval_set =[(X_train,y_train),(X_test, y_test)],early_stopping_rounds=100)
    preds=model.predict(X_test)
    print("err: {}".format(metrics.f1_score(y_test, preds)) )
    cat_err_list.append(metrics.f1_score(y_test, preds))

    p2 = model.predict(test_data)

    cat_test_pred.append(p2)

np.mean(cat_err_list)

0:	learn: 0.8304930	test: 0.8304930	test1: 0.8207934	best: 0.8207934 (0)	total: 7.1ms	remaining: 3.54s
1:	learn: 0.8374924	test: 0.8374924	test1: 0.7989056	best: 0.8207934 (0)	total: 15.7ms	remaining: 3.91s
2:	learn: 0.8408399	test: 0.8408399	test1: 0.8112175	best: 0.8207934 (0)	total: 22.4ms	remaining: 3.71s
3:	learn: 0.8446439	test: 0.8446439	test1: 0.8057456	best: 0.8207934 (0)	total: 29ms	remaining: 3.6s
4:	learn: 0.8525563	test: 0.8525563	test1: 0.8125855	best: 0.8207934 (0)	total: 34.2ms	remaining: 3.38s
5:	learn: 0.8587949	test: 0.8587949	test1: 0.8248974	best: 0.8248974 (5)	total: 39.3ms	remaining: 3.24s
6:	learn: 0.8642727	test: 0.8642727	test1: 0.8248974	best: 0.8248974 (5)	total: 44.5ms	remaining: 3.13s
7:	learn: 0.8600122	test: 0.8600122	test1: 0.8180575	best: 0.8248974 (5)	total: 48.1ms	remaining: 2.96s
8:	learn: 0.8589470	test: 0.8589470	test1: 0.8043776	best: 0.8248974 (5)	total: 53.3ms	remaining: 2.91s
9:	learn: 0.8636640	test: 0.8636640	test1: 0.8071135	best: 0.8248974

0.8909922856688379

In [254]:
cat_test_pred

[array([1, 1, 1, ..., 1, 1, 0], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 0], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 0], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 0], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 0], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 0], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 0], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 0], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 0], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 0], dt

In [256]:
# Create a submission DataFrame
submission = pd.DataFrame({
    'ID': sub['Id'], 
    'Target': cat_test_pred[28].astype(int)
})

# Save the submission DataFrame to a CSV file
submission.to_csv('otf_submission5.csv', index=False)