In [406]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.metrics import roc_auc_score

### Out data

#### Our dataset consists of telecom company customers, where we need to analyze their churn rate

In [407]:
data = pd.read_csv('./train.csv')

In [408]:
# Num features
num_cols = [
    'ClientPeriod',
    'MonthlySpending',
    'TotalSpent'
]

# Categorial features
cat_cols = [
    'Sex',
    'IsSeniorCitizen',
    'HasPartner',
    'HasChild',
    'HasPhoneService',
    'HasMultiplePhoneNumbers',
    'HasInternetService',
    'HasOnlineSecurityService',
    'HasOnlineBackup',
    'HasDeviceProtection',
    'HasTechSupportAccess',
    'HasOnlineTV',
    'HasMovieSubscription',
    'HasContractPhone',
    'IsBillingPaperless',
    'PaymentMethod'
]

feature_cols = num_cols + cat_cols
target_col = 'Churn'

### Something about data

In [409]:
data.sample(5)

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod,Churn
710,70,48.4,3442.8,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,Yes,Two year,No,Bank transfer (automatic),0
4073,28,35.9,973.65,Female,1,Yes,Yes,No,No phone service,DSL,No,No,No,No,Yes,No,Month-to-month,Yes,Electronic check,0
4983,63,75.7,4676.7,Male,0,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,No,Yes,Two year,No,Mailed check,0
2053,52,74.0,3877.65,Female,0,No,No,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,No,One year,No,Credit card (automatic),0
2725,71,76.9,5522.7,Male,0,No,Yes,Yes,No,DSL,Yes,Yes,Yes,Yes,Yes,No,Two year,Yes,Credit card (automatic),0


In [410]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5282 entries, 0 to 5281
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ClientPeriod              5282 non-null   int64  
 1   MonthlySpending           5282 non-null   float64
 2   TotalSpent                5282 non-null   object 
 3   Sex                       5282 non-null   object 
 4   IsSeniorCitizen           5282 non-null   int64  
 5   HasPartner                5282 non-null   object 
 6   HasChild                  5282 non-null   object 
 7   HasPhoneService           5282 non-null   object 
 8   HasMultiplePhoneNumbers   5282 non-null   object 
 9   HasInternetService        5282 non-null   object 
 10  HasOnlineSecurityService  5282 non-null   object 
 11  HasOnlineBackup           5282 non-null   object 
 12  HasDeviceProtection       5282 non-null   object 
 13  HasTechSupportAccess      5282 non-null   object 
 14  HasOnlin

In [411]:
data.isna().sum()

ClientPeriod                0
MonthlySpending             0
TotalSpent                  0
Sex                         0
IsSeniorCitizen             0
HasPartner                  0
HasChild                    0
HasPhoneService             0
HasMultiplePhoneNumbers     0
HasInternetService          0
HasOnlineSecurityService    0
HasOnlineBackup             0
HasDeviceProtection         0
HasTechSupportAccess        0
HasOnlineTV                 0
HasMovieSubscription        0
HasContractPhone            0
IsBillingPaperless          0
PaymentMethod               0
Churn                       0
dtype: int64

### Let's analyze our data and draw graphs

In [412]:
data["TotalSpent"] = data["TotalSpent"].replace(' ', np.nan)
data.dropna(inplace=True)

In [413]:
data[num_cols] = data[num_cols].astype(float)

In [414]:
px.box(data[num_cols])

In [415]:
for key in data.keys():
    print(data[key].value_counts())
    print()

ClientPeriod
1.0     457
72.0    284
2.0     165
3.0     162
4.0     136
       ... 
49.0     45
57.0     39
39.0     35
44.0     34
36.0     34
Name: count, Length: 72, dtype: int64

MonthlySpending
20.05    46
19.65    35
19.95    34
20.00    32
19.80    32
         ..
65.05     1
73.25     1
95.55     1
98.45     1
98.20     1
Name: count, Length: 1466, dtype: int64

TotalSpent
20.20      9
19.75      8
20.05      6
19.65      6
19.90      5
          ..
41.85      1
4326.25    1
950.20     1
4264.00    1
1375.60    1
Name: count, Length: 4977, dtype: int64

Sex
Male      2651
Female    2622
Name: count, dtype: int64

IsSeniorCitizen
0    4431
1     842
Name: count, dtype: int64

HasPartner
No     2704
Yes    2569
Name: count, dtype: int64

HasChild
No     3676
Yes    1597
Name: count, dtype: int64

HasPhoneService
Yes    4754
No      519
Name: count, dtype: int64

HasMultiplePhoneNumbers
No                  2508
Yes                 2246
No phone service     519
Name: count, dtype: 

In [416]:
def ceil(x):
    return int(x) + 1 * (x != int(x))

In [417]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [418]:
tot_col = 5
tot_rows = ceil(data.columns.size / tot_col)

fig = make_subplots(rows=tot_rows, cols=tot_col, specs=[[{'type': 'bar'}] * tot_col] * tot_rows, 
                    subplot_titles=data.keys())


for row in range(tot_rows):
    for col in range(tot_col):
        fig.add_trace(
            go.Bar(y=data.iloc[:, row * tot_col + col].value_counts().values,
                    x=data.iloc[:, row * tot_col + col].value_counts().index,),
            row = row + 1, col=col + 1)


fig.update_layout(height=1200, width=1000, title_text="Data")
fig.show()

### Count target classes

In [419]:
(data["Churn"].value_counts(normalize=True).round(2) * 100).astype(str) + "%"

Churn
0    74.0%
1    26.0%
Name: proportion, dtype: object

#### Try Linear Models to out problem

In [420]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline

#### Train test split

In [421]:
train_data, test_data = train_test_split(data, test_size = 0.3, shuffle=True, random_state=42)

In [422]:
train_data.shape, test_data.shape

((3691, 20), (1582, 20))

### Normalize

In [423]:
scaler = StandardScaler()
train_data[num_cols] = scaler.fit_transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])

### One-hot

In [424]:
encoder = OneHotEncoder(drop="first")

In [425]:
encoded_train = pd.DataFrame(encoder.fit_transform(train_data[cat_cols]).todense())
encoded_test = pd.DataFrame(encoder.transform(test_data[cat_cols]).todense())
encoded_train.index = train_data[cat_cols].index
encoded_test.index = test_data[cat_cols].index

train_data = train_data.drop(cat_cols, axis=1)
test_data = test_data.drop(cat_cols, axis=1)

train_data = pd.concat([train_data, encoded_train], axis=1)
test_data = pd.concat([test_data, encoded_test], axis=1)

In [426]:
train_data.columns = train_data.columns.astype(str)
test_data.columns = test_data.columns.astype(str)

In [427]:
X = train_data.drop(target_col, axis=1)
y = train_data[target_col]

#### LogisticRegression with cross validation

In [428]:
model = LogisticRegressionCV(cv=5, n_jobs=-1).fit(X, y)

In [429]:
model.score(test_data.drop(target_col, axis=1), test_data[target_col])

0.7958280657395702

### Best params our model

In [430]:
model.get_params()

{'Cs': 10,
 'class_weight': None,
 'cv': 5,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1.0,
 'l1_ratios': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': -1,
 'penalty': 'l2',
 'random_state': None,
 'refit': True,
 'scoring': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0}

### Now let's try Random Forest Classifier (Bootstrap aggregating on decision trees)

In [431]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [432]:
forest = RandomForestClassifier()

In [433]:
np.mean(cross_val_score(forest, X, y, cv = 5))

0.7875910829473654

In [434]:
forest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [435]:
grid_params = {
	"n_estimators" : np.arange(100, 501, 100),
	"max_depth" : [3, 5, 7, 9]
}

In [436]:
from sklearn.model_selection import GridSearchCV

In [437]:
grid_clf = GridSearchCV(forest, grid_params, cv=5)
grid_clf.fit(X, y)

In [438]:
grid_clf.best_params_

{'max_depth': 7, 'n_estimators': 400}

In [439]:
best_forest = RandomForestClassifier(max_depth=9, n_estimators=500)

In [440]:
np.mean(cross_val_score(best_forest, X, y, cv = 5))

0.8000550073159731

#### At the end let's get best score using Categorial Boosting

In [460]:
catt_cols = np.arange(0, 27).astype(str)
X[[*catt_cols]] = X[[*catt_cols]].astype(int)
test_data[[*catt_cols]] = test_data[[*catt_cols]].astype(int)

In [441]:
from catboost import CatBoostClassifier, Pool

In [442]:
cat = CatBoostClassifier(verbose=False)

In [444]:
cat.fit(X, y);
preds = cat.predict_proba(test_data.drop(target_col, axis=1))[:, 1]

In [445]:
f"{round(roc_auc_score(test_data[target_col], preds) * 100, 2)}% Almost do notghing"

'83.29% Almost do notghing'

#### Let's increase params and define learning rate

In [499]:
cat = CatBoostClassifier(num_trees=2000, learning_rate=1e-3, verbose=False)

In [500]:
cat.fit(X, y, cat_features=catt_cols);
preds = cat.predict_proba(test_data.drop(target_col, axis=1))[:, 1]

In [501]:
y_pred = cat.predict_proba(test_data.drop(target_col, axis=1))[:, 1]

In [502]:
roc_auc_score(test_data[target_col], y_pred)

0.8405841588279362