<a href="https://colab.research.google.com/github/Olamilek4n/LEK4N/blob/main/ChunAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
# Import necessary libraries

import pandas as pd
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import  train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Generating a Tele-communication dataset for the churn analysis

# Creating an empty DataFrame

churn_data = pd.DataFrame(columns=[
    'customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod',
    'MonthlyCharges', 'TotalCharges', 'Churn'
])

# Populating the DataFrame with random data

for i in range(10000):
    churn_data = churn_data.append({
        'customerID': f'{i:04d}-XXXXX',
        'gender': random.choice(['Male', 'Female']),
        'SeniorCitizen': random.randint(0, 1),
        'Partner': random.choice(['Yes', 'No']),
        'Dependents': random.choice(['Yes', 'No']),
        'tenure': random.randint(1, 72),
        'PhoneService': random.choice(['Yes', 'No']),
        'MultipleLines': random.choice(['Yes', 'No', 'No phone service']),
        'InternetService': random.choice(['DSL', 'Fiber optic', 'No']),
        'OnlineSecurity': random.choice(['Yes', 'No', 'No internet service']),
        'DeviceProtection': random.choice(['Yes', 'No', 'No internet service']),
        'TechSupport': random.choice(['Yes', 'No', 'No internet service']),
        'StreamingTV': random.choice(['Yes', 'No', 'No internet service']),
        'StreamingMovies': random.choice(['Yes', 'No', 'No internet service']),
        'Contract': random.choice(['Month-to-month', 'One year', 'Two year']),
        'PaperlessBilling': random.choice(['Yes', 'No']),
        'PaymentMethod': random.choice(['Electronic check', 'Mailed check', 'Bank transfer (automatic)', 'Credit card (automatic)']),
        'MonthlyCharges': round(random.uniform(20, 100), 2),
        'TotalCharges': round(random.uniform(50, 5000), 2),
        'Churn': random.choice(['Yes', 'No'])
    }, ignore_index=True)

In [3]:
# Writing the DataFrame to a CSV file

churn_data.to_csv('churn_data.csv', index=False)

In [4]:
# Getting an overview of the dataset

churn_data.head(20)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0000-XXXXX,Male,1,Yes,No,36,No,Yes,Fiber optic,Yes,No internet service,No,Yes,No,Two year,No,Credit card (automatic),69.69,108.47,No
1,0001-XXXXX,Male,1,Yes,No,21,No,Yes,DSL,No internet service,Yes,No,Yes,Yes,One year,No,Credit card (automatic),83.75,1669.93,No
2,0002-XXXXX,Male,1,No,Yes,52,Yes,Yes,Fiber optic,No internet service,No internet service,Yes,No internet service,No,Month-to-month,No,Credit card (automatic),63.57,794.52,No
3,0003-XXXXX,Female,0,No,Yes,43,Yes,Yes,DSL,No,No internet service,No,Yes,No internet service,One year,Yes,Bank transfer (automatic),31.86,4674.21,No
4,0004-XXXXX,Male,0,Yes,Yes,32,Yes,Yes,No,No,No internet service,Yes,No internet service,Yes,Month-to-month,Yes,Electronic check,21.36,1431.1,No
5,0005-XXXXX,Male,0,Yes,No,30,Yes,Yes,No,No internet service,Yes,No internet service,No,Yes,Month-to-month,No,Mailed check,38.16,1952.03,No
6,0006-XXXXX,Male,1,Yes,No,71,No,No,DSL,No internet service,No internet service,No,No internet service,No,One year,Yes,Electronic check,22.79,398.73,Yes
7,0007-XXXXX,Male,0,No,Yes,7,Yes,No,Fiber optic,No,Yes,No internet service,No,No,Two year,Yes,Credit card (automatic),33.12,4957.07,No
8,0008-XXXXX,Female,1,Yes,Yes,13,No,Yes,DSL,No,No,No,No internet service,Yes,One year,No,Bank transfer (automatic),67.87,1390.45,Yes
9,0009-XXXXX,Female,0,No,No,15,No,No,DSL,Yes,No,Yes,No internet service,Yes,Month-to-month,No,Credit card (automatic),91.36,4890.43,Yes


In [5]:
#Some columns are not needed for this analysis, lets see the list of columns and then remove unwanted ones

churn_data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [6]:
# Removing the customerID column

churn_data.drop('customerID',axis=1, inplace=True)

In [10]:
# Getting info on the dataset (Data Types)

churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            10000 non-null  object 
 1   SeniorCitizen     10000 non-null  object 
 2   Partner           10000 non-null  object 
 3   Dependents        10000 non-null  object 
 4   tenure            10000 non-null  object 
 5   PhoneService      10000 non-null  object 
 6   MultipleLines     10000 non-null  object 
 7   InternetService   10000 non-null  object 
 8   OnlineSecurity    10000 non-null  object 
 9   DeviceProtection  10000 non-null  object 
 10  TechSupport       10000 non-null  object 
 11  StreamingTV       10000 non-null  object 
 12  StreamingMovies   10000 non-null  object 
 13  Contract          10000 non-null  object 
 14  PaperlessBilling  10000 non-null  object 
 15  PaymentMethod     10000 non-null  object 
 16  MonthlyCharges    10000 non-null  float64

In [11]:
# To ease the analysis for Logistic Regression purposes, the column [Churn] with values "Yes" and "No" by replacing the churn column with 0 and 1

churn_data['Churn'].replace({'Yes':1,'No':0},inplace=True)

In [12]:
# Converting the column [TotalCharges] to numeric

churn_data['TotalCharges'] = pd.to_numeric(churn_data['TotalCharges'],errors='coerce')

churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            10000 non-null  object 
 1   SeniorCitizen     10000 non-null  object 
 2   Partner           10000 non-null  object 
 3   Dependents        10000 non-null  object 
 4   tenure            10000 non-null  object 
 5   PhoneService      10000 non-null  object 
 6   MultipleLines     10000 non-null  object 
 7   InternetService   10000 non-null  object 
 8   OnlineSecurity    10000 non-null  object 
 9   DeviceProtection  10000 non-null  object 
 10  TechSupport       10000 non-null  object 
 11  StreamingTV       10000 non-null  object 
 12  StreamingMovies   10000 non-null  object 
 13  Contract          10000 non-null  object 
 14  PaperlessBilling  10000 non-null  object 
 15  PaymentMethod     10000 non-null  object 
 16  MonthlyCharges    10000 non-null  float64

In [13]:
# Checking for totals for nulls or NaN values

churn_data.isnull().any().sum()

0

In [14]:
# If there are nulls are NaNs, the line of code below will eliminnate them!

churn_data.dropna(axis=0,inplace=True)

In [15]:
# Summary of Numerical Values

churn_data[['tenure','MonthlyCharges']].describe()

Unnamed: 0,MonthlyCharges
count,10000.0
mean,60.207389
std,22.901324
min,20.0
25%,40.3375
50%,60.765
75%,79.8325
max,99.99


In [16]:
# Check for correlation in the Dataset

churn_data.corr()

Unnamed: 0,MonthlyCharges,TotalCharges,Churn
MonthlyCharges,1.0,-0.001334,-0.016245
TotalCharges,-0.001334,1.0,0.005818
Churn,-0.016245,0.005818,1.0


This situation of a very low or no correlation is possible with random dataset

In [18]:
# Converting columns to Binary ( 1 and 0)

churn_data = pd.get_dummies(churn_data)

In [19]:
# Outcome of the previous process

churn_data.head(10)

Unnamed: 0,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,69.69,108.47,0,0,1,0,1,0,1,1,...,0,0,0,1,1,0,0,1,0,0
1,83.75,1669.93,0,0,1,0,1,0,1,1,...,1,0,1,0,1,0,0,1,0,0
2,63.57,794.52,0,0,1,0,1,1,0,0,...,0,1,0,0,1,0,0,1,0,0
3,31.86,4674.21,0,1,0,1,0,1,0,0,...,0,0,1,0,0,1,1,0,0,0
4,21.36,1431.1,0,0,1,1,0,0,1,0,...,1,1,0,0,0,1,0,0,1,0
5,38.16,1952.03,0,0,1,1,0,0,1,1,...,1,1,0,0,1,0,0,0,0,1
6,22.79,398.73,1,0,1,0,1,0,1,1,...,0,0,1,0,0,1,0,0,1,0
7,33.12,4957.07,0,0,1,1,0,1,0,0,...,0,0,0,1,0,1,0,1,0,0
8,67.87,1390.45,1,1,0,0,1,0,1,0,...,1,0,1,0,1,0,1,0,0,0
9,91.36,4890.43,1,1,0,1,0,1,0,1,...,1,1,0,0,1,0,0,1,0,0


In [20]:
# Data Type Confirmation

churn_data.dtypes

MonthlyCharges                             float64
TotalCharges                               float64
Churn                                        int64
gender_Female                                uint8
gender_Male                                  uint8
                                            ...   
PaperlessBilling_Yes                         uint8
PaymentMethod_Bank transfer (automatic)      uint8
PaymentMethod_Credit card (automatic)        uint8
PaymentMethod_Electronic check               uint8
PaymentMethod_Mailed check                   uint8
Length: 115, dtype: object

Now that Data preparation is almost complete, the data will be splitted into partitions

In [21]:
# Seperating the "Churn" column from the initial Dataset

X= churn_data.drop('Churn', axis=1)
y = churn_data['Churn']

In [22]:
# Performing Train Test Split using the recently concluded parameters X and y

X_train,X_test,y_train,y_test  = train_test_split(X,y)

In [23]:
# Checking the split size

X_train.shape

(7500, 114)

In [25]:
# Fitting the Model on the training datasets

log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)

In [26]:
# Churn Prediction begins!!!

y_pred = log_reg.predict(X_test)

In [28]:
# Using the confusion matrix to see how well the model is performing

confusion_matrix(y_test, y_pred)

array([[862, 365],
       [858, 415]])

In [29]:
# Using the F1 Score to check accuracy of the model

f1 = f1_score(y_test, y_pred)

print(f1)

0.40428641013151484


Conclusion: This model is having a 40% prediction accuracy, which is very low due to the usage of random Dataset with a low correlation. Now, imagine your company's dataset which is more realistic on this model.  