In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

np.random.seed(59)

# set the max columns to none
pd.set_option('display.max_columns', None)

In [2]:
rel_path = 'featured_data\\train.csv'
df = pd.read_csv(rel_path)
df.head()

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn,total_minutes,total_calls,total_charge,area_code_area_code_408,area_code_area_code_415,area_code_area_code_510,state_MD,state_MN,state_NJ,state_TX,state_WV,state_other,avg_mt_minutes,avg_mt_calls,avg_mt_charge,avg_call_charge,avg_intl_call_charge,avg_call_minutes,avg_intl_call_minutes,both_plans
0,107,0,1,26,13.7,3,3.7,1,0,611.5,329,55.54,0,1,0,0,0,0,0,0,1,5.714953,3.074766,0.519065,0.168815,1.233333,1.858663,4.566667,0
1,137,0,0,0,12.2,5,3.29,0,0,527.2,328,59.0,0,1,0,0,0,1,0,0,0,3.848175,2.394161,0.430657,0.179878,0.658,1.607317,2.44,0
2,84,1,0,0,6.6,7,1.78,2,0,558.2,248,65.02,1,0,0,0,0,0,0,0,1,6.645238,2.952381,0.774048,0.262177,0.254286,2.250806,0.942857,0
3,75,1,0,0,10.1,3,2.73,3,0,501.9,356,49.36,0,1,0,0,0,0,0,0,1,6.692,4.746667,0.658133,0.138652,0.91,1.409831,3.366667,0
4,121,0,1,24,7.5,7,2.03,3,0,779.3,314,76.28,0,0,1,0,0,0,0,0,1,6.440496,2.595041,0.630413,0.24293,0.29,2.481847,1.071429,0


### Without Dealing with Imbalance

In [3]:
X = df.drop('churn', axis=1)
y = df.churn

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [4]:
# prepare the cross-validation procedure
cv = StratifiedKFold(n_splits=10, random_state=59, shuffle=True)
# create model
model = LogisticRegression()
# evaluate model
scores = cross_val_score(model, X, y, scoring='f1_weighted', cv=cv, n_jobs=-1)
# report performance
print('Weighted F1-Score: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Weighted F1-Score: 0.845 (0.014)


### With SMOTE

In [5]:
X = df.drop('churn', axis=1)
y = df.churn

In [6]:
from collections import Counter
from imblearn.over_sampling import SMOTE

# summarize class distribution
counter = Counter(y)
print(counter)
# transform the dataset
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y)
print(counter)

scaler = StandardScaler()
X = scaler.fit_transform(X)

Counter({0: 3597, 1: 598})
Counter({0: 3597, 1: 3597})


In [7]:
# prepare the cross-validation procedure
cv = StratifiedKFold(n_splits=10, random_state=59, shuffle=True)
# create model
model = LogisticRegression()
# evaluate model
scores = cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-1)
# report performance
print('F1-Score: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

F1-Score: 0.844 (0.010)


### With SMOTEENN (SMOTE + ENN - duet of undersampling and oversampling)

In [8]:
from collections import Counter
from imblearn.combine import SMOTEENN

X = df.drop('churn', axis=1)
y = df.churn
# summarize class distribution
counter = Counter(y)
print(counter)
# transform the dataset
oversample = SMOTEENN()
X, y = oversample.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y)
print(counter)

scaler = StandardScaler()
X = scaler.fit_transform(X)

Counter({0: 3597, 1: 598})
Counter({1: 3157, 0: 2154})


In [9]:
# prepare the cross-validation procedure
cv = StratifiedKFold(n_splits=10, random_state=59, shuffle=True)
# create model
model = LogisticRegression()
# evaluate model
scores = cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-1)
# report performance
print('F1-Score: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

F1-Score: 0.882 (0.013)
