In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.dummy import DummyClassifier
from sklearn.utils import shuffle

import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation and convergence warnings raised by the libraries used below).
warnings.filterwarnings("ignore")



In [None]:
# Load the raw churn dataset. pd.read_csv already returns a DataFrame, so no
# extra pd.DataFrame(...) wrapper is needed.
data = pd.read_csv("/kaggle/input/bank-customer-churn-modeling/Churn_Modelling.csv")

In [None]:
# Column dtypes and non-null counts, followed by a preview of the first rows.
data.info()
data.head(n=5)

In [None]:
# Summary statistics with the default quartiles spelled out.
data.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Pairwise correlations over the numeric columns only. The frame still holds
# text columns (e.g. Surname) at this point, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently skipping it.
data.select_dtypes(include='number').corr()

In [None]:
# Remove the columns that are dropped before modelling.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [None]:
# Preview the frame after the column drop.
data.head(5)

In [None]:
# Normalise every column label to lower case.
data.columns = data.columns.map(str.lower)

In [None]:
# Summary statistics for the tenure column.
data.loc[:, 'tenure'].describe()

In [None]:
# Percentage of rows whose tenure is missing.
100 * int(data['tenure'].isna().sum()) / data.shape[0]

In [None]:
# Peek at the rows where tenure is missing.
data.loc[data['tenure'].isna()].head()

In [None]:
# Replace missing tenure values with the sentinel -1.
data['tenure'] = data['tenure'].where(data['tenure'].notna(), other=-1)

In [None]:
#Проведем кодирование с помощью OHE
data = pd.get_dummies(data, drop_first=True)

#Стандартизируем признаки с помощью StandardScaler
scaler = StandardScaler()
numeric = ['creditscore', 'age', 'balance', 'estimatedsalary']
scaler.fit(data[numeric])
data[numeric] = scaler.transform(data[numeric])

In [None]:
#С помощью train_test_split Разделим наш датасет на следующие выборки 
features = data.drop(['exited'], axis=1)
target = data['exited']

features_train, features_x, target_train, target_x = train_test_split(features, target, train_size=0.6, test_size=0.4, random_state=12345)
features_test, features_valid, target_test, target_valid = train_test_split(features_x, target_x, test_size=0.5, random_state=12345)

print('Training set size:', features_train.shape[0])
print('Validating set size:', features_valid.shape[0])
print('Test set size:',features_test.shape[0])

In [None]:
# Reset the best-so-far trackers before the decision-tree search.
best_model = None
best_f1 = best_roc = best_depth = 0

In [None]:
# Try tree depths 1..12 and keep the model with the highest validation F1.
for depth in range(1, 13):
    model = DecisionTreeClassifier(max_depth=depth, random_state=12345)
    model.fit(features_train, target_train)

    predictions_model = model.predict(features_valid)
    f1_model = f1_score(target_valid, predictions_model)

    # Only a strict improvement replaces the current best.
    if f1_model <= best_f1:
        continue

    best_model, best_f1, best_depth = model, f1_model, depth
    best_roc = roc_auc_score(target_valid, model.predict_proba(features_valid)[:, 1])

print('F1 Decision Tree:', best_f1)
print('ROC Best result:', best_roc)
print('Depth Tree:', best_depth)

In [None]:
# Reset the best-so-far trackers before the random-forest search.
best_model, best_f1, best_roc, best_depth, best_est = None, 0, 0, 0, 0

In [None]:
# Grid search: 1..60 estimators x depth 1..30, scored by F1 on the validation
# split. The first combination reaching the top score is the one kept.
for est in range(1, 61):
    for depth in range(1, 31):
        model = RandomForestClassifier(n_estimators=est, max_depth=depth, random_state=12345)
        model.fit(features_train, target_train)

        predictions_model = model.predict(features_valid)
        f1_model = f1_score(target_valid, predictions_model)

        # Only a strict improvement replaces the current best.
        if f1_model <= best_f1:
            continue

        best_model, best_f1 = model, f1_model
        best_depth, best_est = depth, est
        best_roc = roc_auc_score(target_valid, model.predict_proba(features_valid)[:, 1])

print('F1 Random Forest:', best_f1)
print('ROC Best result:', best_roc)
print('Forest depth:', best_depth)
print('Amount of estimators:', best_est)

In [None]:
# Logistic regression fitted on the training split and scored on validation.
model = LogisticRegression(solver='liblinear', random_state=12345)
model.fit(features_train, target_train)

predictions_model = model.predict(features_valid)
f1_model = f1_score(target_valid, predictions_model)
# ROC AUC uses the predicted probability of the positive class.
roc_model = roc_auc_score(target_valid, model.predict_proba(features_valid)[:, 1])

print('F1 Model:', f1_model)
print('ROC Model:', roc_model)

In [None]:
# Class shares in the training target.
target_train.value_counts(normalize=True)

In [None]:
# Class shares in the validation target.
target_valid.value_counts(normalize=True)

In [None]:
# Class shares in the test target.
target_test.value_counts(normalize=True)

In [None]:
# Grid search over a class-weighted random forest (1..25 estimators x depth
# 1..25); the model with the highest validation F1 is kept.
best_model = None
best_f1 = 0
best_roc = 0
best_depth = 0
best_est = 0

for est in range(1, 26):
    for depth in range(1, 26):
        model = RandomForestClassifier(random_state=12345, n_estimators=est, max_depth=depth, class_weight='balanced')
        model.fit(features_train, target_train)

        predictions_model = model.predict(features_valid)
        f1_model = f1_score(target_valid, predictions_model)

        # Only a strict improvement replaces the current best.
        if f1_model > best_f1:
            best_model = model
            best_f1 = f1_model
            best_roc = roc_auc_score(target_valid, model.predict_proba(features_valid)[:, 1])
            best_depth = depth
            best_est = est

# Report best_f1: f1_model only holds the score of the last combination tried,
# not the score of the selected model.
print("F1 Best Model:", best_f1)
print("ROC Best Model:", best_roc)
print("Depth Tree:", best_depth)
print("Amount of estimators:", best_est)

In [None]:
def upsample(features, target, repeat, random_state=12345):
    """Oversample the positive class by repeating its rows.

    Rows whose target equals 1 are duplicated ``repeat`` times and
    concatenated after the rows whose target equals 0; the combined
    features and target are then shuffled together.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature rows, indexable by a boolean mask built from ``target``.
    target : pandas.Series
        Labels; only rows equal to 0 or 1 are kept.
    repeat : int
        Number of copies of the positive-class rows in the result.
    random_state : int, default 12345
        Seed passed to ``shuffle``. The default keeps the previously
        hard-coded value, so existing calls are unaffected.

    Returns
    -------
    tuple
        ``(features_upsampled, target_upsampled)`` in matching shuffled order.
    """
    is_negative = target == 0
    is_positive = target == 1

    features_upsampled = pd.concat([features[is_negative]] + [features[is_positive]] * repeat)
    target_upsampled = pd.concat([target[is_negative]] + [target[is_positive]] * repeat)

    # Shuffle both objects with the same seed so rows and labels stay aligned.
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=random_state
    )
    return features_upsampled, target_upsampled

In [None]:
# Build the upsampled training set with four copies of the positive class.
features_upsampled, target_upsampled = upsample(
    features=features_train,
    target=target_train,
    repeat=4,
)

In [None]:
# Absolute class counts after upsampling.
target_upsampled.value_counts(normalize=False)