In [46]:
import pickle
import warnings

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# NOTE(review): this silences every warning for the rest of the session, which
# can hide useful diagnostics — consider narrowing the filter to specific
# categories or modules.
warnings.filterwarnings('ignore')

In [44]:
# Read the pickled, already-cleaned dataset from the data directory.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
with open('../data/df_clean.pkl', mode='rb') as pickle_file:
    df_clean = pickle.load(pickle_file)

In [28]:
# Preview the loaded frame (pandas renders a truncated view of large frames).
df_clean

Unnamed: 0,Churn_Yes,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,...,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,cluster_1,cluster_2
0,0,0,1,0,0,1,0,0,0,0,...,1,0,1,0,0,1,29.85,29.85,0,1
1,0,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,34,56.95,1889.5,1,0
2,1,1,0,0,1,0,0,0,0,0,...,1,0,0,1,0,2,53.85,108.15,1,0
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,45,42.30,1840.75,0,1
4,1,0,0,0,1,0,0,1,0,0,...,1,0,1,0,0,2,70.70,151.65,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,1,1,1,0,1,0,0,0,...,1,0,0,1,0,24,84.80,1990.5,1,0
7039,0,0,1,1,1,0,1,1,0,0,...,1,1,0,0,0,72,103.20,7362.9,1,0
7040,0,0,1,1,0,1,0,0,0,0,...,1,0,1,0,0,11,29.60,346.45,0,1
7041,1,1,1,0,1,0,1,1,0,0,...,1,0,0,1,1,4,74.40,306.6,1,0


In [29]:
# Feature matrix: every column except the target. Dropping 'Churn_Yes' by name
# (instead of slicing by position) keeps the split correct even if the column
# order of df_clean changes; drop() also returns a new frame rather than a
# slice of df_clean.
X = df_clean.drop(columns=['Churn_Yes'])

In [30]:
# Target vector: the binary churn indicator column.
y = df_clean.loc[:, 'Churn_Yes']

In [31]:
# We now split our dataset between train and test (80/20, fixed seed for
# reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Work on explicit copies: the columns below are overwritten, and assigning
# into frames derived from another DataFrame can trigger
# SettingWithCopyWarning (which the global warnings filter would hide).
X_train = X_train.copy()
X_test = X_test.copy()

# Scaling of our X_train and X_test.
# The scaler is fitted on the training split only, so no statistics from the
# test split leak into the transformation; the same fitted scaler is then
# applied to both splits.
columns_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']
std = StandardScaler()
std.fit(X_train[columns_to_scale])
X_train[columns_to_scale] = std.transform(X_train[columns_to_scale])
X_test[columns_to_scale] = std.transform(X_test[columns_to_scale])

In [32]:
# Inspect the training features; tenure, MonthlyCharges and TotalCharges were
# standardized in the split/scale cell above.
X_train

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,...,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,cluster_1,cluster_2
2964,1,1,0,1,0,1,0,0,0,1,...,1,0,1,0,0,-0.340832,-0.210557,-0.409637,1,0
5113,0,1,1,1,0,1,1,0,0,1,...,0,1,0,0,0,1.574024,1.630408,2.602889,1,0
5363,1,1,1,1,0,1,0,1,1,0,...,0,0,0,1,0,1.533282,-1.309143,-0.246971,0,0
5074,0,0,1,1,0,0,0,1,1,0,...,0,1,0,0,0,0.677709,-1.505557,-0.597636,0,0
156,0,0,0,1,0,1,1,0,0,0,...,1,0,0,0,0,-0.422315,0.648338,-0.199795,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4939,1,0,0,1,0,0,0,1,1,0,...,0,0,0,1,0,-1.277888,-1.523866,-0.996249,0,0
3269,1,0,0,1,0,1,0,1,1,0,...,0,1,0,0,0,1.329574,-1.327453,-0.297417,0,0
1658,1,1,1,1,0,1,1,0,0,1,...,0,1,0,0,0,0.718450,1.374071,1.355014,1,0
2612,0,1,1,1,0,1,0,0,0,0,...,0,1,0,0,0,-0.748248,0.525163,-0.502333,1,0


In [33]:
# Inspect the test features; the same three columns were transformed with the
# scaler fitted on the training split.
X_test

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,...,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,cluster_1,cluster_2
5561,1,0,0,1,0,0,0,1,1,0,...,0,0,1,0,0,-1.277888,-1.465608,-0.995476,0,0
5814,1,0,0,1,0,0,0,1,1,0,...,1,1,0,0,0,-0.666764,-1.467273,-0.858718,0,0
2645,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,-1.277888,-1.493905,-0.995851,0,0
3983,1,0,0,0,1,0,0,0,0,0,...,1,0,1,0,0,-1.277888,-1.345762,-0.993886,0,1
6438,1,0,0,1,0,1,1,0,0,0,...,1,0,1,0,1,-1.277888,0.323756,-0.971733,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2757,0,1,1,1,0,1,1,0,0,1,...,0,0,0,0,0,1.492541,0.658325,1.578880,1,0
5702,1,1,1,1,0,1,1,0,0,1,...,1,1,0,0,1,1.574024,1.345774,2.329593,1,0
1662,1,0,0,1,0,0,1,0,0,0,...,1,0,1,0,0,-1.114922,0.168955,-0.861213,1,0
2766,0,1,0,1,0,1,1,0,0,1,...,1,0,1,0,0,1.288833,1.675350,2.262473,1,0


In [35]:
# Baseline pipeline: PCA with default settings feeding a logistic regression
# (max_iter raised to 1000). The pipeline is only constructed here, not fitted.
steps = [
    ('dim_reducer', PCA()),
    ('estimator', LogisticRegression(max_iter=1000)),
]

pipe = Pipeline(steps)

In [37]:
# Show the pipeline's repr to check its configured steps.
pipe

Pipeline(steps=[('dim_reducer', PCA()),
                ('estimator', LogisticRegression(max_iter=1000))])

In [38]:
# Default-configured LogisticRegression instance; the following cell reads its
# class name.
clf = LogisticRegression()

In [42]:
# Name of the estimator's class as a string.
clf.__class__.__name__

'LogisticRegression'

In [74]:
# PCA (22 components) -> AdaBoost pipeline, scored with F1 on the held-out
# test split.
# Both steps are seeded (same seed as train_test_split) so the score is
# reproducible on Restart & Run All: PCA may select a randomized SVD solver
# depending on the data shape, and AdaBoost hands random seeds to its base
# estimators. A score recorded from an earlier unseeded run may differ slightly.
steps = [('dim_reducer', PCA(n_components=22, random_state=0)),
         ('estimator', AdaBoostClassifier(n_estimators=20, learning_rate=1.2, random_state=0))]

pipe = Pipeline(steps=steps)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(f1_score(y_test, y_pred))

0.5882352941176471
