# Нужно спрогнозировать, уйдёт клиент ли клиент из банка (Exited). Рассмотреть задачу при помощи разных (как минимум 3) моделей классификации, для каждой из которых подобрать гиперпараметры (хотя бы для одной - при помощи GridSearchCV). Датасет и небольшой тестовый пример (на него не сильно опирайтесь, смотрите лекцию) приложены к заданию

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

**Импорт данных**

In [None]:
file_path ='/content/Churn.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


**Предобработка данных**

In [None]:
df.isnull().sum()

Unnamed: 0,0
RowNumber,0
CustomerId,0
Surname,0
CreditScore,0
Geography,0
Gender,0
Age,0
Tenure,909
Balance,0
NumOfProducts,0


In [None]:
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

df['Tenure'] = df['Tenure'].fillna(df['Tenure'].median())

df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [None]:
X = df.drop('Exited', axis=1)
y = df['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['Geography', 'Gender']),
        ('num', StandardScaler(), ['CreditScore', 'Age', 'Tenure', 'Balance',
                                   'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'])
    ])

In [None]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print("\nAccuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("\n", classification_report(y_test, y_pred))

In [None]:
log_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])

log_reg.fit(X_train, y_train)
print("Логистическая регрессия:")
evaluate_model(log_reg, X_test, y_test)

Логистическая регрессия:

Accuracy: 0.8115
Precision: 0.5563380281690141
Recall: 0.2010178117048346
F1 Score: 0.2953271028037383

               precision    recall  f1-score   support

           0       0.83      0.96      0.89      1607
           1       0.56      0.20      0.30       393

    accuracy                           0.81      2000
   macro avg       0.69      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000



In [None]:
tree = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

param_grid_tree = {
    'classifier__max_depth': [3, 5, 7, 10, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

grid_tree = GridSearchCV(tree, param_grid_tree, cv=5, scoring='f1', n_jobs=-1)
grid_tree.fit(X_train, y_train)
print("Tree:")
print("\nГиперпараметры для Tree:", grid_tree.best_params_)
evaluate_model(grid_tree.best_estimator_, X_test, y_test)

Tree:

Гиперпараметры для Tree: {'classifier__max_depth': 7, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2}

Accuracy: 0.852
Precision: 0.6789667896678967
Recall: 0.4681933842239186
F1 Score: 0.5542168674698795

               precision    recall  f1-score   support

           0       0.88      0.95      0.91      1607
           1       0.68      0.47      0.55       393

    accuracy                           0.85      2000
   macro avg       0.78      0.71      0.73      2000
weighted avg       0.84      0.85      0.84      2000



In [None]:
knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])

param_grid_knn = {
    'classifier__n_neighbors': [3, 5, 7, 9, 11],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]
}

grid_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='f1', n_jobs=-1)
grid_knn.fit(X_train, y_train)
print("K-NN:")
print("\nГиперпараметры для  K-NN:", grid_knn.best_params_)
evaluate_model(grid_knn.best_estimator_, X_test, y_test)

K-NN:

Гиперпараметры для  K-NN: {'classifier__n_neighbors': 3, 'classifier__p': 2, 'classifier__weights': 'uniform'}

Accuracy: 0.8375
Precision: 0.6214285714285714
Recall: 0.44274809160305345
F1 Score: 0.5170876671619614

               precision    recall  f1-score   support

           0       0.87      0.93      0.90      1607
           1       0.62      0.44      0.52       393

    accuracy                           0.84      2000
   macro avg       0.75      0.69      0.71      2000
weighted avg       0.82      0.84      0.83      2000



In [None]:
svm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(probability=True, random_state=42))
])

param_grid_svm = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 'auto']
}

grid_svm = GridSearchCV(svm, param_grid_svm, cv=3, scoring='f1', n_jobs=-1)
grid_svm.fit(X_train, y_train)

print("SVM:")
print("\nГиперпараметры для SVM:", grid_svm.best_params_)
evaluate_model(grid_svm.best_estimator_, X_test, y_test)

SVM:

Гиперпараметры для SVM: {'classifier__C': 10, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'}

Accuracy: 0.8605
Precision: 0.7355371900826446
Recall: 0.4529262086513995
F1 Score: 0.5606299212598426

               precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.74      0.45      0.56       393

    accuracy                           0.86      2000
   macro avg       0.81      0.71      0.74      2000
weighted avg       0.85      0.86      0.85      2000

