Import the required libraries and modules that you would need.

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

Read that data into Python and call the dataframe churnData.

In [17]:
churnData = pd.read_csv('Customer-Churn.csv')

In [18]:
churnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


Check the datatypes of all the columns in the data. You would see that the column TotalCharges is object type. Convert this column into numeric type using pd.to_numeric function.

In [19]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')


Check for null values in the dataframe. Replace the null values.

In [20]:
churnData.fillna(0, inplace=True)

In [21]:
churnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.50,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.50,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.90,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.60,Yes


In [22]:
churnData.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Use the following features: tenure, SeniorCitizen, MonthlyCharges and TotalCharges:
Scale the features either by using normalizer or a standard scaler.
Split the data into a training set and a test set.
Fit a logistic regression model on the training data.
Check the accuracy on the test data.

In [23]:
totalcharges_median = churnData['TotalCharges'].median()
churnData['TotalCharges'].fillna(totalcharges_median, inplace=True)

In [24]:
#Use the following features: tenure, SeniorCitizen, MonthlyCharges and TotalCharges:
features = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']
X = churnData[features]
y = churnData['Churn']

In [25]:

#Scale the features either by using normalizer or a standard scaler.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [26]:
#Split the data into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, churnData['Churn'], test_size=0.2, random_state=42)

In [27]:
#Fit a logistic regression model on the training data.
classification = LogisticRegression()
classification.fit(X_train, y_train)

Check the accuracy on the test data.

In [28]:
predictions = classification.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

          No       0.83      0.93      0.88      1036
         Yes       0.70      0.47      0.56       373

    accuracy                           0.81      1409
   macro avg       0.76      0.70      0.72      1409
weighted avg       0.79      0.81      0.79      1409



Check for the imbalance.

In [30]:
churnData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.

In [31]:
from imblearn.over_sampling import RandomOverSampler

X = churnData[features]
y = churnData['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train_scaled, y_train)

y_train.value_counts()

No     4139
Yes    4139
Name: Churn, dtype: int64

In [32]:
classification_over = LogisticRegression(random_state=42, max_iter=10000)
classification_over.fit(X_train, y_train)

predictions_over = classification_over.predict(X_test_scaled)
print(classification_report(y_test, predictions_over))

              precision    recall  f1-score   support

          No       0.89      0.72      0.79      1035
         Yes       0.49      0.75      0.59       374

    accuracy                           0.73      1409
   macro avg       0.69      0.73      0.69      1409
weighted avg       0.78      0.73      0.74      1409



In [33]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)

X = churnData[features]
y = churnData['Churn']

X_, y_ = rus.fit_resample(X, y)
y_.value_counts()

No     1869
Yes    1869
Name: Churn, dtype: int64

In [34]:
#model check 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train_, y_train_ = rus.fit_resample(X_train, y_train)
classification_under = LogisticRegression(random_state=42, max_iter=100000)
classification_under.fit(X_train_, y_train_)
predictions_under = classification_under.predict(X_test)
print(classification_report(y_test, predictions_under))

              precision    recall  f1-score   support

          No       0.89      0.72      0.79      1035
         Yes       0.49      0.75      0.59       374

    accuracy                           0.73      1409
   macro avg       0.69      0.73      0.69      1409
weighted avg       0.78      0.73      0.74      1409

