In [28]:
#Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report
import warnings

warnings.filterwarnings('ignore')

In [3]:
#Reading the data
churnData = pd.read_csv('files_for_lab\Customer-Churn.csv')

In [6]:
#Checking the shape of the data
churnData.shape

(7043, 16)

In [7]:
#Checking the head of the data
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [8]:
#Describe the data
churnData.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [9]:
#Checking the data types
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [10]:
#Converting the TotalCharges column to numeric
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

In [11]:
#Checking for null values
churnData.isnull().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [12]:
#Checking the TotalCharges column
churnData['TotalCharges'].value_counts()

TotalCharges
20.20      11
19.75       9
20.05       8
19.90       8
19.65       8
           ..
6849.40     1
692.35      1
130.15      1
3211.90     1
6844.50     1
Name: count, Length: 6530, dtype: int64

In [13]:
#Filling the null values with 0, as the null values may represent no charges.
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(0)

In [15]:
#Applying StandardScaler to the features tenure, SeniorCitizen, MonthlyCharges and TotalCharges
scaler = StandardScaler()
churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']])

In [16]:
#Splitting the data into train and test using the prior features
X = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
y = churnData['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
#Fitting a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(np.array(X_test))

In [31]:
#Checking the accuracy of the test data
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.83      0.93      0.88      1036
         Yes       0.70      0.47      0.56       373

    accuracy                           0.81      1409
   macro avg       0.76      0.70      0.72      1409
weighted avg       0.79      0.81      0.79      1409



Now we will deal with imbalance

In [20]:
#Checking for imbalance in the data
churnData['Churn'].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [21]:
#Oversampling the minority class
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X, y)
y_ros.value_counts()

Churn
No     5174
Yes    5174
Name: count, dtype: int64

In [32]:
#Fitting a logistic regression model with the oversampled data
X_train_ros, X_test_ros, y_train_ros, y_test_ros = train_test_split(X_ros, y_ros, test_size=0.2, random_state=42)
model_ros = LogisticRegression()
model_ros.fit(X_train_ros, y_train_ros)
model_ros.score(X_test_ros, y_test_ros)
predictions = model.predict(np.array(X_test_ros))

In [33]:
#Checking the accuracy of the test data
print(classification_report(y_test_ros, predictions))

              precision    recall  f1-score   support

          No       0.62      0.92      0.74      1021
         Yes       0.85      0.46      0.60      1049

    accuracy                           0.69      2070
   macro avg       0.74      0.69      0.67      2070
weighted avg       0.74      0.69      0.67      2070



In [23]:
#Now undersampling the majority class
rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_resample(X, y)
y_rus.value_counts()

Churn
No     1869
Yes    1869
Name: count, dtype: int64

In [34]:
#Fitting a logistic regression model with the undersampled data
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(X_rus, y_rus, test_size=0.2, random_state=42)
model_rus = LogisticRegression()
model_rus.fit(X_train_rus, y_train_rus)
predictions = model.predict(np.array(X_test_rus))

In [35]:
#Checking the accuracy of the test data
print(classification_report(y_test_rus, predictions))

              precision    recall  f1-score   support

          No       0.64      0.92      0.75       379
         Yes       0.85      0.47      0.60       369

    accuracy                           0.70       748
   macro avg       0.75      0.69      0.68       748
weighted avg       0.74      0.70      0.68       748

