## This is a supervised machine learning problem(classification as i have a target column and have to find yes or no)

In [1]:
#  Import all the required libraries

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    recall_score,
    precision_score,
    roc_auc_score
)

# Load the data

In [3]:
try:
    data = pd.read_csv("dataset/Telco_Customer_Churn_lyst1769326950438.csv")
    print(f"\nSuccessfully loaded the Customer Churn data:{data.shape}")
except Exception as e:
    print(e)


Successfully loaded the Customer Churn data:(7043, 21)


In [4]:
type(data)

pandas.core.frame.DataFrame

# Cleaning Data

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
# Clean total charges column
data["TotalCharges"] = pd.to_numeric(data["TotalCharges"],errors="coerce")
# erros = coerce => converts any number which cannot be convert into Nan
original_data = len(data)

data = data.dropna(subset=["TotalCharges"])
# data.info()
churn_rate = data.groupby("Churn").size()
print(churn_rate)


Churn
No     5163
Yes    1869
dtype: int64


In [7]:
data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

## Dependent and Target Features

In [8]:
target_features = 'Churn'
numeric_features = ['tenure','TotalCharges','MonthlyCharges']
categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents','PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod',]


## Combine all features for x and y

In [9]:
#X -> independent features
#Y -> dependent features
X = data[numeric_features+categorical_features]
Y = data[target_features]

### Target -> Churn
### 0 -> Not churn
### 1-> Not churn

In [10]:
from collections import Counter
# The data is imbalanced, and biased. So i want my model to be unbiased and be balanced
Counter(Y)
# The model has fewer patterns of "churners" to learn from
#there is more risk of biasing toward predicting No


Counter({'No': 5163, 'Yes': 1869})

#  Split the data into train and test set

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=42,stratify=Y)
# churn_rate_train = Y_train.mean()
print(Y_train)
print(Y_test)

4499    Yes
1933     No
4668     No
5681     No
3610    Yes
       ... 
5161     No
3451     No
4135     No
4249     No
272     Yes
Name: Churn, Length: 4922, dtype: object
4221    No
1820    No
2375    No
5462    No
1791    No
        ..
4685    No
4768    No
6150    No
3234    No
4451    No
Name: Churn, Length: 2110, dtype: object


In [12]:
print(Y_train)

4499    Yes
1933     No
4668     No
5681     No
3610    Yes
       ... 
5161     No
3451     No
4135     No
4249     No
272     Yes
Name: Churn, Length: 4922, dtype: object


In [13]:
Y_train = Y_train.astype(str).str.strip().replace({"Yes":1,"No":0}).astype(int)
Y_test = Y_test.astype(str).str.strip().replace({"Yes":1,"No":0}).astype(int)

  Y_train = Y_train.astype(str).str.strip().replace({"Yes":1,"No":0}).astype(int)
  Y_test = Y_test.astype(str).str.strip().replace({"Yes":1,"No":0}).astype(int)


In [14]:
print(f"Training data churn rate: {Y_train.mean():.4f}")
print(f"Testing data churn rate: {Y_test.mean():.4f}")

# Verifying that the churn rate is similar in testing and training set

Training data churn rate: 0.2657
Testing data churn rate: 0.2659


## Creating a pipeline

In [15]:
from sklearn.compose import ColumnTransformer
#------1 . Numeric preprocessing batch
numeric_transformer = Pipeline(
    steps = [('imputer',SimpleImputer(strategy='median')),
            ('scaler',StandardScaler()) #priority between small and big values
             #Standard Scaler z= (x-u)/s
            ]
)

#----- 2. Categorical preprocessing batch
categorical_transformer = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('onehot',OneHotEncoder(handle_unknown='ignore'))
    ]
)

# ----- 3. Combine the above batches with Column transformer
preprocessor = ColumnTransformer(
    transformers = [
     ('num',numeric_transformer,numeric_features),
     ('cat',categorical_transformer,categorical_features)
    ],
    remainder = 'drop'
)

#----4 . Creat the final, full stack pipeline

clf_pipeline = Pipeline(steps=
    [('preprocessor',preprocessor),
    ('classifier',LogisticRegression(
        class_weight = "balanced",
     random_state = 42
    )   
    ),
    ]
)



## Model training on training set

In [16]:
clf_pipeline.fit(X_train,Y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,100


## Saving the pipeline

In [None]:
import joblib
joblib.dump(clf_pipeline,"churn_pipeline.joblib")

## Model Prediction

In [22]:
y_predict = clf_pipeline.predict(X_test)
print(y_predict)

[0 1 0 ... 1 0 1]


In [18]:
y_prob  = clf_pipeline.predict_proba(X_test)[:,1]
print(y_prob)

[0.39717109 0.57209269 0.12870455 ... 0.74641361 0.03938791 0.76153802]


## Model evaluation

In [19]:
print(confusion_matrix(Y_test,y_predict))

[[1110  439]
 [ 115  446]]


In [20]:
print(recall_score(Y_test,y_predict))

0.7950089126559715


In [21]:
print(precision_score(Y_test,y_predict))

0.503954802259887
