In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
churndata = pd.read_csv('Customer-Churn.csv')
churndata

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [3]:
churndata.Churn.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [4]:
churndata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [5]:
#correcting the dtype of the column : Total_charge

churndata["TotalCharges"] = pd.to_numeric(churndata["TotalCharges"], errors='coerce')
churndata[["TotalCharges"]]

Unnamed: 0,TotalCharges
0,29.85
1,1889.50
2,108.15
3,1840.75
4,151.65
...,...
7038,1990.50
7039,7362.90
7040,346.45
7041,306.60


In [6]:
churndata.shape

(7043, 16)

In [7]:
# Checking the nan values.. 

churndata.isnull().sum()/len(churndata)

gender              0.000000
SeniorCitizen       0.000000
Partner             0.000000
Dependents          0.000000
tenure              0.000000
PhoneService        0.000000
OnlineSecurity      0.000000
OnlineBackup        0.000000
DeviceProtection    0.000000
TechSupport         0.000000
StreamingTV         0.000000
StreamingMovies     0.000000
Contract            0.000000
MonthlyCharges      0.000000
TotalCharges        0.001562
Churn               0.000000
dtype: float64

In [8]:
churndata["TotalCharges"] = churndata["TotalCharges"].fillna(churndata["TotalCharges"].mean())

In [9]:
churndata.shape

(7043, 16)

In [10]:
churndata["Churn"] = churndata["Churn"].replace({'No':0, 'Yes':1})

In [11]:
# features: tenure, SeniorCitizen, MonthlyCharges and TotalCharges:

numerical = churndata.select_dtypes("number")
numerical

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0,1,29.85,29.85,0
1,0,34,56.95,1889.50,0
2,0,2,53.85,108.15,1
3,0,45,42.30,1840.75,0
4,0,2,70.70,151.65,1
...,...,...,...,...,...
7038,0,24,84.80,1990.50,0
7039,0,72,103.20,7362.90,0
7040,0,11,29.60,346.45,0
7041,1,4,74.40,306.60,1


In [12]:
# X,y Split

y = numerical['Churn']
X = numerical.drop(['Churn'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Scaling data
transformer = MinMaxScaler().fit(X_train) # need to keep transformer
X_train_normalized = transformer.transform(X_train)
X_train_norm = pd.DataFrame(X_train_normalized, columns=X_train.columns)

In [14]:
transformer2 = MinMaxScaler().fit(X_test) 
X_test_normalized = transformer2.transform(X_test)
X_test_norm = pd.DataFrame(X_test_normalized, columns=X_test.columns)

In [43]:
X_test_norm.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0.0,0.013889,0.06097,0.000682
1,0.0,0.569444,0.065467,0.112965
2,0.0,0.722222,0.006497,0.117039
3,0.0,0.013889,0.576212,0.006639
4,0.0,0.930556,0.318341,0.374551


In [15]:
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True) 

In [16]:
LR = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train_norm, y_train)

In [17]:
LR.score(X_test_norm, y_test)

0.8041163946061036

In [18]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

pred = LR.predict(X_test_norm)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.6947791164658634
recall:  0.46380697050938335
f1:  0.5562700964630225


In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred)

array([[960,  76],
       [200, 173]], dtype=int64)

# Oversampling


In [20]:
from sklearn.utils import resample

In [21]:
train = pd.concat([X_train_norm, y_train],axis=1)
train.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0.0,0.291667,0.464375,0.152089,0
1,0.0,0.75,0.786746,0.589736,0
2,0.0,0.013889,0.051819,0.000537,1
3,0.0,0.055556,0.517688,0.025288,1
4,0.0,0.0,0.434978,0.261309,0


In [22]:
no_churn = train[train['Churn']==0]
yes_churn = train[train['Churn']==1]

In [23]:
display(no_churn.shape)
display(yes_churn.shape)

(4138, 5)

(1496, 5)

In [24]:
yes_churn_oversampled = resample(yes_churn,
                                    replace=True,
                                    n_samples = len(no_churn),
                                    random_state=0)

In [25]:
display(no_churn.shape)
display(yes_churn_oversampled.shape)

(4138, 5)

(4138, 5)

In [26]:
train_oversampled = pd.concat([no_churn,yes_churn_oversampled],axis=0)
train_oversampled.tail()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
54,0.0,0.375,0.760339,0.312232,1
1364,1.0,0.055556,0.564524,0.034958,1
4912,0.0,0.166667,0.811659,0.14074,1
4765,1.0,0.291667,0.78276,0.232114,1
1522,0.0,0.111111,0.848032,0.095604,1


In [27]:
y_train_over = train_oversampled['Churn'].copy()
X_train_over = train_oversampled.drop('Churn',axis = 1).copy()

In [28]:
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_over, y_train_over)
pred1 = LR.predict(X_test_norm)
print('Accuracy is ', LR.score(X_test_norm, y_test))
print("precision: ",precision_score(y_test,pred1))
print("recall: ",recall_score(y_test,pred1))
print("f1: ",f1_score(y_test,pred1))

Accuracy is  0.7437899219304471
precision:  0.5107913669064749
recall:  0.7613941018766756
f1:  0.6114101184068891


In [29]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred1)

array([[764, 272],
       [ 89, 284]], dtype=int64)

# Undersampling

In [30]:
no_churn_undersampled = resample(no_churn,
                                    replace=False,
                                    n_samples = len(yes_churn),
                                    random_state=0)

In [31]:
display(no_churn_undersampled.shape)
display(yes_churn.shape)

(1496, 5)

(1496, 5)

In [32]:
train_undersampled = pd.concat([yes_churn,no_churn_undersampled])
train_undersampled.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
2,0.0,0.013889,0.051819,0.000537,1
3,0.0,0.055556,0.517688,0.025288,1
8,0.0,0.263889,0.213254,0.082454,1
10,1.0,0.291667,0.506726,0.168007,1
26,1.0,0.111111,0.735924,0.082062,1


In [33]:
y_train_under = train_undersampled['Churn'].copy()
X_train_under = train_undersampled.drop('Churn',axis = 1).copy()

In [34]:
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_under, y_train_under)
pred2 = LR.predict(X_test_norm)
print('Accuracy is ', LR.score(X_test_norm, y_test))
print("precision: ",precision_score(y_test,pred2))
print("recall: ",recall_score(y_test,pred2))
print("f1: ",f1_score(y_test,pred2))

Accuracy is  0.7437899219304471
precision:  0.5107142857142857
recall:  0.7667560321715817
f1:  0.6130760986066451


In [35]:
confusion_matrix(y_test,pred2)

array([[762, 274],
       [ 87, 286]], dtype=int64)

# SMOTE

In [39]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=100,k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train, y_train)

In [40]:
X_train_SMOTE.shape

(8276, 4)

In [42]:
LR_smote = LogisticRegression(max_iter=1000)
LR_smote.fit(X_train_SMOTE, y_train_SMOTE)
pred3 = LR.predict(X_test)

print('Accuracy is ', LR_smote.score(X_test_norm, y_test))
print("precision: ",precision_score(y_test,pred3))
print("recall: ",recall_score(y_test,pred3))
print("f1: ",f1_score(y_test,pred3))

confusion_matrix(y_test,pred3)

Accuracy is  0.7352732434350603
precision:  0.26931407942238267
recall:  1.0
f1:  0.42434584755403865


array([[  24, 1012],
       [   0,  373]], dtype=int64)