In [19]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, label_binarize

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier     
from lightgbm import LGBMClassifier          

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)

In [4]:
data = pd.read_csv('/Users/tarlanjabiyev/Desktop/Sprint 19/churn.csv')

In [6]:
data.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,2,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,3,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,4,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,5,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


## Datanın hazırlanması - Data Preprocessing

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 18 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         45211 non-null  int64 
 1   age        45211 non-null  int64 
 2   job        45211 non-null  object
 3   marital    45211 non-null  object
 4   education  45211 non-null  object
 5   default    45211 non-null  object
 6   balance    45211 non-null  int64 
 7   housing    45211 non-null  object
 8   loan       45211 non-null  object
 9   contact    45211 non-null  object
 10  day        45211 non-null  int64 
 11  month      45211 non-null  object
 12  duration   45211 non-null  int64 
 13  campaign   45211 non-null  int64 
 14  pdays      45211 non-null  int64 
 15  previous   45211 non-null  int64 
 16  poutcome   45211 non-null  object
 17  y          45211 non-null  object
dtypes: int64(8), object(10)
memory usage: 6.2+ MB


In [11]:
data["y"] = data["y"].replace({"yes": 1, "no": 0}).astype("category")

  data["y"] = data["y"].replace({"yes": 1, "no": 0}).astype("category")


In [13]:
data["y"].value_counts(normalize=True).round(2)

y
0    0.88
1    0.12
Name: proportion, dtype: float64

In [15]:
data.isna().sum()

id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [17]:
target = 'y'
exclude = ['id']

In [179]:
X = data.drop(columns=[target])
y = data[target]

In [86]:
# Datanın bölünməsi - Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [88]:
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

In [98]:
#For Tree/Forest/XGB: numeric passthrough (no scaling), one-hot for categoricals
preprocess_basic = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop",
)
preprocess_basic

In [100]:
# For GLM/KNN/SVM: scaling for numeric, one-hot for categoricals
preprocess_scaled = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("scaler", StandardScaler())]), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop",
)
preprocess_scaled

## Alqoritmalar

In [181]:
pos_label = 1

In [183]:
def eval_clf(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    # Basic metrics
    acc = accuracy_score(y_test, y_hat)
    f1 = f1_score(y_test, y_hat, average="weighted")
    prec = precision_score(y_test, y_hat, average="weighted", zero_division=0)
    rec = recall_score(y_test, y_hat, average="weighted", zero_division=0)

    metrics = {"model": name, "accuracy": acc, "f1_weighted": f1,
               "precision_weighted": prec, "recall_weighted": rec}

    # Probabilities (if available)
    roc = np.nan
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)
        roc = roc_auc_score(y_test, proba[:, pos_label])
        metrics.update({"roc_auc": roc})

    print(f"\n[{name}]")
    print(f"Accuracy: {acc:,.4f}")
    print(f"F1 score: {f1:,.4f}")
    print(f"Prec: {prec:,.4f}")
    print(f"Recall: {rec:,.4f}")
    print(f"ROC AUC: {roc:,.4f}")

    return metrics

In [185]:
results = []

### GLM — Logistic Regression

In [188]:
glm_pipe = Pipeline([
    ("prep", preprocess_scaled),
    ("model", LogisticRegression(max_iter=1000)) 
])
results.append(eval_clf("GLM (Logistic)", glm_pipe, X_train, y_train, X_test, y_test))


[GLM (Logistic)]
Accuracy: 0.9069
F1 score: 0.8950
Prec: 0.8945
Recall: 0.9069
ROC AUC: 0.9246


### KNN

In [191]:
knn_pipe = Pipeline([
    ("prep", preprocess_scaled),
    ("model", KNeighborsClassifier(n_neighbors=5))
])
results.append(eval_clf("KNN", knn_pipe, X_train, y_train, X_test, y_test))


[KNN]
Accuracy: 0.8976
F1 score: 0.8883
Prec: 0.8846
Recall: 0.8976
ROC AUC: 0.8493


### Naive Bayes

In [193]:
nb_pipe = Pipeline([
    ("prep", preprocess_basic),
    ("model", GaussianNB())
])
results.append(eval_clf("Naive Bayes", nb_pipe, X_train, y_train, X_test, y_test))


[Naive Bayes]
Accuracy: 0.8609
F1 score: 0.8684
Prec: 0.8783
Recall: 0.8609
ROC AUC: 0.8514


### SVM

In [204]:
svm_pipe = Pipeline([
    ("prep", preprocess_scaled),
    ("model", SVC(kernel="rbf", probability=True, random_state=123))
])
results.append(eval_clf("SVM", svm_pipe, X_train, y_train, X_test, y_test))


[SVM (RBF)]
Accuracy: 0.9120
F1 score: 0.9022
Prec: 0.9016
Recall: 0.9120
ROC AUC: 0.9187


### Decision Tree

In [207]:
tree_pipe = Pipeline([
    ("prep", preprocess_basic),
    ("model", DecisionTreeClassifier(max_depth=15, min_samples_split=2, random_state=123))
])
results.append(eval_clf("Decision Tree", tree_pipe, X_train, y_train, X_test, y_test))


[Decision Tree]
Accuracy: 0.9007
F1 score: 0.8983
Prec: 0.8964
Recall: 0.9007
ROC AUC: 0.7410


### Random Forest (Bagging)

In [210]:
rf_pipe = Pipeline([
    ("prep", preprocess_basic),
    ("model", RandomForestClassifier(
        n_estimators=500,
        min_samples_leaf=5,
        random_state=123
    ))
])
results.append(eval_clf("Random Forest", rf_pipe, X_train, y_train, X_test, y_test))


[Random Forest]
Accuracy: 0.9142
F1 score: 0.9057
Prec: 0.9048
Recall: 0.9142
ROC AUC: 0.9418


### XGBoost (Boosting)

In [213]:
xgb_pipe = Pipeline([
    ("prep", preprocess_basic),
    ("model", XGBClassifier(
        objective='binary:logistic',
        eval_metric="logloss",
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        min_child_weight=5,
        random_state=123
    ))
])
results.append(eval_clf("XGBoost", xgb_pipe, X_train, y_train, X_test, y_test))


[XGBoost]
Accuracy: 0.9104
F1 score: 0.9076
Prec: 0.9056
Recall: 0.9104
ROC AUC: 0.9428


### LightGBM (Boosting)

In [216]:
lgb_pipe = Pipeline([
    ("prep", preprocess_basic),
    ("model", LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        num_leaves=2**6,
        min_child_samples=5,
        random_state=123
    ))
])
results.append(eval_clf("LightGBM", lgb_pipe, X_train, y_train, X_test, y_test))

[LightGBM] [Info] Number of positive: 4232, number of negative: 31936
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1285
[LightGBM] [Info] Number of data points in the train set: 36168, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.117010 -> initscore=-2.021059
[LightGBM] [Info] Start training from score -2.021059

[LightGBM]
Accuracy: 0.9093
F1 score: 0.9061
Prec: 0.9039
Recall: 0.9093
ROC AUC: 0.9411


## Ən yaxşı model seçimi

In [227]:
pd.DataFrame(results).sort_values("roc_auc", ascending=False).reset_index(drop=True)

Unnamed: 0,model,accuracy,f1_weighted,precision_weighted,recall_weighted,roc_auc
0,XGBoost,0.910428,0.907554,0.905554,0.910428,0.942796
1,Random Forest,0.914188,0.905671,0.904763,0.914188,0.94176
2,LightGBM,0.909322,0.906059,0.903877,0.909322,0.941058
3,GLM (Logistic),0.906889,0.895045,0.894484,0.906889,0.924579
4,SVM (RBF),0.911976,0.902192,0.901628,0.911976,0.918723
5,Naive Bayes,0.860887,0.868365,0.878317,0.860887,0.851447
6,KNN,0.8976,0.888285,0.884562,0.8976,0.849302
7,Decision Tree,0.900697,0.898311,0.896393,0.900697,0.740957
