In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the labelled training data and preview its first rows.
train_csv_path = '/content/train.csv'
df = pd.read_csv(train_csv_path)
df.head(n=5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15585961.0,Hs?,684.0,France,Male,41.0,10.0,0.0,2.0,1.0,1.0,173948.4,1.0
1,1,15643378.0,Bellucci,807.0,France,Male,32.0,2.0,0.0,2.0,1.0,0.0,144532.85,0.0
2,2,15651022.0,O'Donnell,553.0,Germany,Male,53.0,9.0,102278.52,1.0,1.0,0.0,158816.03,1.0
3,3,15676521.0,Chiang,587.0,France,Female,34.0,6.0,0.0,1.0,1.0,0.0,167984.72,1.0
4,4,15772650.0,Kambinachi,732.0,Germany,Female,30.0,5.0,135070.92,1.0,1.0,1.0,116097.26,0.0


Use `LabelEncoder` to convert the categorical columns (`Geography`, `Gender`) to integer codes

In [None]:
# Encode the categorical columns as integer codes.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# category -> code mapping of every column can be inspected, inverted, or
# re-applied to other data. Refitting a single shared encoder would discard
# every mapping except the one for the last column.
categorical_features = ['Geography', 'Gender']
label_encoders = {}
for feature in categorical_features:
    label_encoder = LabelEncoder()
    df[feature] = label_encoder.fit_transform(df[feature])
    label_encoders[feature] = label_encoder

df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15585961.0,Hs?,684.0,0,1,41.0,10.0,0.0,2.0,1.0,1.0,173948.4,1.0
1,1,15643378.0,Bellucci,807.0,0,1,32.0,2.0,0.0,2.0,1.0,0.0,144532.85,0.0
2,2,15651022.0,O'Donnell,553.0,1,1,53.0,9.0,102278.52,1.0,1.0,0.0,158816.03,1.0
3,3,15676521.0,Chiang,587.0,0,0,34.0,6.0,0.0,1.0,1.0,0.0,167984.72,1.0
4,4,15772650.0,Kambinachi,732.0,1,0,30.0,5.0,135070.92,1.0,1.0,1.0,116097.26,0.0


Split the data into training and test sets, then standardize the features

In [None]:
# Separate the features from the target. The 'id', 'CustomerId' and 'Surname'
# columns are excluded from the features, as is the target column 'Exited'.
x = df.drop(columns=['id', 'CustomerId', 'Surname', 'Exited'])
y = df['Exited']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then reuse the same statistics for
# the held-out rows so nothing from the test set influences the scaling.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


# **Logistic regression**

---



In [None]:
# Baseline model: logistic regression on the standardized features.
logistic_model = LogisticRegression()
logistic_model.fit(x_train, y_train)
logistic_pred = logistic_model.predict(x_test)
# ROC AUC measures how well the model ranks examples, so it must be given the
# predicted probability of the positive class (second column of predict_proba).
# Passing thresholded 0/1 labels collapses the curve to a single operating point.
logistic_proba = logistic_model.predict_proba(x_test)[:, 1]

print(f"F1 Score: {f1_score(y_test, logistic_pred)} \nROC AUC Score: {roc_auc_score(y_test, logistic_proba)}")

F1 Score: 0.5902293120638086 
ROC AUC Score: 0.7241637010429332


# **Decision Tree**

---



In [None]:
# Decision tree classifier. A fixed random_state makes the fitted tree, and
# therefore the reported scores, reproducible across runs.
desision_tree_model = DecisionTreeClassifier(random_state=42)
desision_tree_model.fit(x_train, y_train)
desision_tree_pred = desision_tree_model.predict(x_test)
# ROC AUC needs a score per example (probability of the positive class),
# not the thresholded class labels used for the F1 score.
desision_tree_proba = desision_tree_model.predict_proba(x_test)[:, 1]

print(f"F1 Score: {f1_score(y_test, desision_tree_pred)} \nROC AUC Score: {roc_auc_score(y_test, desision_tree_proba)}")

F1 Score: 0.6175266175266175 
ROC AUC Score: 0.7629978242459594


# **Random Forest**

---



In [None]:
# The random forest is the best-performing model of the three, so it is the
# one used to produce the submission.
# A fixed random_state makes the fitted forest, and therefore the reported
# scores and the submission, reproducible across runs.
random_forest_model = RandomForestClassifier(random_state=42)

random_forest_model.fit(x_train, y_train)
random_forest_pred = random_forest_model.predict(x_test)
# ROC AUC needs a score per example (probability of the positive class),
# not the thresholded class labels used for the F1 score.
random_forest_proba = random_forest_model.predict_proba(x_test)[:, 1]

print(f"F1 Score: {f1_score(y_test, random_forest_pred)} \nROC AUC Score: {roc_auc_score(y_test, random_forest_proba)}")

F1 Score: 0.7124542124542125 
ROC AUC Score: 0.8023684299961368


In [None]:
submission_df = pd.read_csv('/content/test (1).csv')

# Encode the categorical columns with encoders fitted on the *training* file.
# Fitting fresh encoders on the submission file would derive the integer codes
# from whichever categories happen to appear there, which can silently differ
# from the codes the model was trained on.
categorical_features = ['Geography', 'Gender']
train_categories = pd.read_csv('/content/train.csv', usecols=categorical_features)
for feature in categorical_features:
    label_encoder = LabelEncoder()
    label_encoder.fit(train_categories[feature])
    # transform() raises ValueError for a category that was absent from the
    # training data, instead of silently assigning it a misleading code.
    submission_df[feature] = label_encoder.transform(submission_df[feature])

# Apply the same column selection and the already-fitted scaler used for training.
x_submission = submission_df.drop(columns=['id', 'CustomerId', 'Surname'])
x_submission = scaler.transform(x_submission)


In [None]:
# Predict a label for every submission row and write the id/prediction pairs
# to the CSV file used for the submission.
submission_pred = random_forest_model.predict(x_submission)
results_df = submission_df[['id']].assign(Exited=submission_pred)
results_df.to_csv('results.csv', index=False)