In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn.utils import shuffle
import matplotlib.pyplot as plt


np.random.seed(12345)

data = pd.read_csv('D:/Churn.csv')

data = data.drop(['RowNumber', 'Surname', 'CustomerId'], axis=1)

data['Tenure'] = data['Tenure'].fillna(1)

print(data['Geography'].value_counts())
data = pd.get_dummies(data, drop_first=True)

features = data.drop('Exited', axis=1)
target = data['Exited']

# соотношение выборок: 3:1:1
features_train, features_rem, target_train, target_rem = train_test_split(features, target, train_size=0.6)
features_valid, features_test, target_valid, target_test = train_test_split(features_rem, target_rem, test_size=0.2)

# первая техника: upsampling. target_train.value_counts до: 0: 4804, 1: 1196; после: 0: 4804, 1: 3588
'''
ratio = len(data.loc[data['Exited'] == 0]) // len(data.loc[data['Exited'] == 1])
def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]
    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)
    features_upsampled, target_upsampled = shuffle(features_upsampled, target_upsampled, random_state=12345)
    
    return features_upsampled, target_upsampled

features_upsampled_train, target_upsampled_train = upsample(features_train, target_train, ratio)
'''
# вторая техника: downsampling. target_train.value_counts до: 0: 4804, 1: 1196; после: 0: 979, 1: 1196
print(target_train.value_counts())
fraction = 0.2037
def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]
    features_downsampled = pd.concat([features_zeros.sample(frac=fraction, random_state=12345)] + [features_ones])
    target_downsampled = pd.concat([target_zeros.sample(frac=fraction, random_state=12345)] + [target_ones])
    features_downsampled, target_downsampled = shuffle(features_downsampled, target_downsampled, random_state=12345)
    return features_downsampled, target_downsampled

features_downsampled_train, target_downsampled_train = downsample(features_train, target_train, fraction)
print(target_downsampled_train.value_counts())

# 1 model: tree classifier
best_score = 0
best_depth = 0
for depth in range(1, 14):
    model_decision_tree = DecisionTreeClassifier(max_depth=depth)
    model_decision_tree.fit(features_train, target_train)
    model_decision_tree_predictions = model_decision_tree.predict(features_valid)
    score = f1_score(target_valid, model_decision_tree_predictions)
    if score > best_score:
        best_depth = depth
        best_score = score
print(best_score, ' ', best_depth)

# 2 model: random forest classifier
best_score = 0
best_depth = 0
best_est = 0
for est in range(10, 23):
    for depth in range(1, 18):
        model_forest = RandomForestClassifier(n_estimators=est, max_depth=depth)
        model_forest.fit(features_train, target_train)
        forest_predict = model_forest.predict(features_valid)
        score = f1_score(target_valid, forest_predict)
        if score > best_score:
            best_depth = depth
            best_score = score
            best_est = est
print(best_score, ' ', best_depth, ' ', best_est)            
  
# 3 model: logistic regression
model = LogisticRegression(solver='liblinear', max_iter=100)
model.fit(features_train, target_train)
predict_regr = model.predict(features_valid)
print(f1_score(target_valid, predict_regr))
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
fpr, tpr, thresholds = roc_curve(target_valid, probabilities_one_valid)
auc_roc = roc_auc_score(target_valid, probabilities_one_valid)
print(auc_roc)
'''
plt.figure()
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-кривая')
plt.show()
'''

France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64
0    4804
1    1196
Name: Exited, dtype: int64
1    1196
0     979
Name: Exited, dtype: int64
0.5523091423185674   6
0.57397504456328   14   15
0.11340206185567009
0.6557291822178353


"\nplt.figure()\nplt.plot(fpr, tpr)\nplt.xlim([0.0, 1.0])\nplt.ylim([0.0, 1.0])\nplt.xlabel('False Positive Rate')\nplt.ylabel('True Positive Rate')\nplt.title('ROC-кривая')\nplt.show()\n"

# Результаты до обработки на валидационной выборке
## f1_score
### tree classifier model: 0.5523
### random forest: 0.5739
### logistic regression: 0.1134
### auc_roc score: 0.6557

# Upsampling: результаты на тестовой выборке
## f1_score
### tree classifier model: 0.5885
### random forest: 0.6441
### logistic regression: 0.4538
### auc_roc score: 0.6999

# Downsampling: результаты на тестовой выборке
## f1_score
### tree classifier model: 0.5871
### random forest: 0.5920
### logistic regression: 0.430
### auc_roc score: 0.6983