In this dataset we are going to work on customer details and try to predict whether the customer has left the company or not.

In [None]:
#import the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Read & Observe Data

/Users/dhanrajtentu/ds-projects/projects/01-customer-churn/code.ipynb

In [None]:
# Load the Telco customer-churn CSV (path is relative to the working directory).
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Print each column's dtype and non-null count.
df.info()

In [None]:
# Count missing (NaN) values per column.
# Blank or whitespace-only strings are not NaN, so they are not counted here.
df.isna().sum()

In [None]:
# List the column names of the raw data.
df.columns

In [None]:
# remove some unnecessary columns (currently disabled)
#df.drop(columns=['customerID','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection', 'TechSupport','StreamingTV', 'StreamingMovies','PaymentMethod'],inplace=True)

In [None]:
# The customer identifier is not used as a model input, so remove it.
df = df.drop(columns="customerID")

In [None]:
# Show the distinct values of every column, one column per section.
for name, values in df.items():
    print(f"{name}: {values.unique()}")
    print('-' * 50)

## Data Cleaning

In [None]:
# Drop three columns that will not be used as model inputs.
# NOTE(review): the original note gave "imbalanced data" as the reason for
# dropping these columns — confirm against the value counts.
df = df.drop(['gender', 'PhoneService', 'MultipleLines'], axis=1)

In [None]:
# Preview the data after dropping columns.
df.head()

In [None]:
# (rows, columns) remaining after the drops.
df.shape

In [None]:
# Column names remaining after the drops.
df.columns

First, we need to convert the Yes/No columns to integer 0/1 values.

In [None]:
# Report the dtype of every column to see which ones still hold text.
for col, dtype in df.dtypes.items():
    print(f"{col} : {dtype}")

In [None]:
# create a function to change the yes no to 1 or 0
def convert_string_int(df):
    """Convert yes/no style text columns of ``df`` to integer 0/1 flags.

    A column is converted when it holds at least one value and every distinct
    value is a string that, after trimming whitespace and lower-casing, is
    one of ``yes``, ``no`` or ``no internet service``.  ``yes`` becomes 1;
    ``no`` and ``no internet service`` both become 0.  Every other column is
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be reassigned.
    """
    flag_values = {'yes': 1, 'no': 0, 'no internet service': 0}

    def is_flag_column(col):
        distinct = df[col].unique()
        # ``all()`` is True for an empty sequence, so a zero-row column must
        # be rejected explicitly; otherwise non-text columns would reach the
        # ``.str`` accessor below and raise.
        return len(distinct) > 0 and all(
            isinstance(val, str) and val.strip().lower() in flag_values
            for val in distinct
        )

    # Decide which columns qualify before changing any of them.
    flag_columns = [col for col in df.columns if is_flag_column(col)]

    for col in flag_columns:
        normalised = df[col].str.strip().str.lower()
        df[col] = normalised.map(flag_values).astype(int)
    return df
    

In [None]:
# Encode the yes/no style text columns as integer 0/1 flags.
df = convert_string_int(df)

In [None]:
# Preview the data after the yes/no encoding.
df.head()

In [None]:
# Save the cleaned dataset.
# index=False keeps the row index out of the file; with the default, the index
# is written as an extra unnamed first column that reappears when the CSV is
# read back.
df.to_csv("Churn_Cleaned_dataset.csv", index=False)

In [None]:
# Re-check dtypes and non-null counts after cleaning.
df.info()

In [None]:
# Pairwise relationship plots across the DataFrame's columns, coloured by the Churn value.
sns.pairplot(data=df, hue='Churn')
plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
correlations = df.corr(numeric_only=True)
plt.figure(figsize=(12, 6))
sns.heatmap(correlations, annot=True, fmt='.1f', cmap='crest')
plt.show()

## Feature Selection and Modelling

In [None]:
# import libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Separate the target column from the predictors.
y = df['Churn']
X = df.drop(columns=['Churn'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# churn classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Split the predictor columns by dtype for the preprocessing step.
# Every numeric column (integer or float) is scaled.  Selecting by dtype instead
# of appending a hard-coded column name means a new float column is picked up
# automatically rather than being silently dropped by the ColumnTransformer.
numeric_features = X.select_dtypes(include='number').columns.to_list()
# Everything that is not numeric is one-hot encoded.  This does not depend on
# how text columns are stored, but it also picks up any boolean or datetime
# columns that may be present.
# NOTE(review): a numeric quantity stored as text ends up here and is one-hot
# encoded value by value — check df.info() for such columns.
cat_features = X.select_dtypes(exclude='number').columns.to_list()

In [None]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones.

# Numeric branch: StandardScaler on every numeric feature.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: dense one-hot encoding that drops the first level of each
# feature and ignores categories not seen during fit.
cat_transformer = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
])

# Route each group of columns to its branch; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', cat_transformer, cat_features),
    ],
    remainder='drop',
)

In [None]:
# Display the assembled preprocessing transformer.
preprocessor

In [None]:
# from sklearn.ensemble import GradientBoostingClassifier
import warnings
# Ignore UserWarning messages coming from sklearn modules, and every
# RuntimeWarning regardless of origin.
# NOTE(review): presumably done to keep the training output readable; these
# filters can also hide warnings that point at real problems — confirm they
# are still needed.
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
warnings.filterwarnings("ignore", category=RuntimeWarning)


In [None]:

# Candidate classifiers, keyed by the name printed in the report below.
models = {
    'Logistic Regression': LogisticRegression(
        class_weight='balanced',
        max_iter=500,
        C=0.1,
        solver='lbfgs',
    ),
    # Fixed seed so the forest's scores are reproducible between runs.
    'Random Forest': RandomForestClassifier(random_state=42),
    #"GBosst" : GradientBoostingClassifier()
}

# Probability cut-off for predicting churn.  A value below 0.5 flags more
# customers as churners, trading precision for recall.
CHURN_THRESHOLD = 0.40

# Fitted pipelines by model name, so a specific model can be retrieved after
# the loop (the loop variable `pipe` only holds the last one).
fitted_pipelines = {}

for name, clf in models.items():
    print(name)

    pipe = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('classifier', clf)
    ])

    pipe.fit(X_train, y_train)
    fitted_pipelines[name] = pipe

    # Column 1 of predict_proba is the probability of the second class label
    # (assumed to be Churn == 1 after the 0/1 encoding).  The labels come from
    # the custom threshold, not from pipe.predict().
    y_proba = pipe.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= CHURN_THRESHOLD).astype(int)

    # Computed once and reused for both the printed and the plotted matrix.
    cm = confusion_matrix(y_test, y_pred)

    # print(f"Accuracy Score   : {accuracy_score(y_test, y_pred)}")
    print(f"F1 Score         : {f1_score(y_test, y_pred)}")
    print(f"Precision Score  : {precision_score(y_test, y_pred)}")
    print(f"Recall Score: {recall_score(y_test,y_pred)}")
    print(f"Confustion Matrix : \n{cm}")

    # Draw on explicitly created axes; without `ax`, the display creates its
    # own figure and a separately created figure would be left empty.
    _, ax = plt.subplots(figsize=(4, 2))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["No Churn", "Churn"])
    disp.plot(ax=ax, cmap="Blues", values_format='d')
    ax.set_title("Confusion Matrix")
    plt.show()
    print("-" * 50)



After tuning the model and adjusting the parameters of logistic regression, we got the highest recall score, 0.87. Because this model is predicting a business outcome, both false positives and false negatives matter and should be kept as low as possible.

In [None]:
import pickle 

In [31]:
# Persist the logistic-regression pipeline, the model selected above.
# The training loop leaves `pipe` bound to the last model it fitted, so the
# pipeline to save is rebuilt explicitly from its named classifier.
best_pipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', models['Logistic Regression']),
])
best_pipe.fit(X_train, y_train)

# NOTE: the 0.40 probability threshold used during evaluation is not stored in
# the pickle; apply it to predict_proba() output when using the loaded model.
with open("pipeline_churn.pkl", 'wb') as model_file:
    pickle.dump(best_pipe, model_file)