In [5]:
import numpy as np
import pandas as pd

import plotly.express as px #for visualization
import matplotlib.pyplot as plt #for visualization


from sklearn.preprocessing import LabelEncoder

#Creating data frame by reading dataset
data_df = pd.read_excel("../Dataset/customer_churn_large_dataset.xlsx")

#Data Cleaning
data_df.dropna(inplace=True)  # Remove rows with missing values

data_df.drop_duplicates(inplace=True)  # Remove duplicate rows


gender_mapping = {'Male': 1, 'Female': 0}
location_mapping = {'Los Angeles': 0, 'Chicago': 1, 'Miami': 2, 'New York': 3, 'Houston': 4}

# Apply gender mapping to the 'Gender' column
data_df['Gender'] = data_df['Gender'].map(gender_mapping)

# Apply location mapping to the 'Location' column
data_df['Location'] = data_df['Location'].map(location_mapping)

data_df.drop(["CustomerID","Name"],axis=1,inplace = True)

print(data_df.head())


   Age  Gender  Location  Subscription_Length_Months  Monthly_Bill  \
0   63       1         0                          17         73.36   
1   62       0         3                           1         48.76   
2   24       0         0                           5         85.47   
3   36       0         2                           3         97.94   
4   46       0         2                          19         58.14   

   Total_Usage_GB  Churn  
0             236      0  
1             172      0  
2             460      0  
3             297      1  
4             266      0  


In [6]:
#feature scaling
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()

data_df['Subscription_Length_Months'] = sc.fit_transform(data_df[['Subscription_Length_Months']])
data_df['Monthly_Bill'] = sc.fit_transform(data_df[['Monthly_Bill']])
data_df['Total_Usage_GB'] = sc.fit_transform(data_df[['Total_Usage_GB']])

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

#Import metric for performance evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import GridSearchCV



#Split data into train and test sets
from sklearn.model_selection import train_test_split
X = data_df.drop('Churn', axis=1)
y = data_df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

#Defining the modelling function
def modeling(alg, alg_name, params={}):
    model = alg(**params) #Instantiating the algorithm class and unpacking parameters if any
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    #Performance evaluation
    def print_scores(alg, y_true, y_pred):
        print(alg_name)
        acc_score = accuracy_score(y_true, y_pred)
        print("accuracy: ",acc_score)
        pre_score = precision_score(y_true, y_pred)
        print("precision: ",pre_score)
        rec_score = recall_score(y_true, y_pred)
        print("recall: ",rec_score)
        f_score = f1_score(y_true, y_pred, average='weighted')
        print("f1_score: ",f_score)

    print_scores(alg, y_test, y_pred)
    return model

# Running logistic regression model
log_model = modeling(LogisticRegression, 'Logistic Regression')

Logistic Regression
accuracy:  0.49946666666666667
precision:  0.49507862880416337
recall:  0.29312077165248845
f1_score:  0.4774073123830897


In [8]:
#Random forest
rf_model = modeling(RandomForestClassifier, "Random Forest Classification")

Random Forest Classification
accuracy:  0.4955
precision:  0.4928282968945829
recall:  0.4741107910777681
f1_score:  0.49527452905170916


In [9]:
#Decision tree
dt_model = modeling(DecisionTreeClassifier, "Decision Tree Classification")

Decision Tree Classification
accuracy:  0.5006333333333334
precision:  0.4982719659710222
recall:  0.502176970996048
f1_score:  0.5006349562761271


In [None]:
### Trying other machine learning algorithms: SVC
svc_model = modeling(SVC, 'SVC Classification')

In [10]:
#Naive bayes
nb_model = modeling(GaussianNB, "Naive Bayes Classification")

Naive Bayes Classification
accuracy:  0.4992333333333333
precision:  0.4951436247158504
recall:  0.32098600040190234
f1_score:  0.4829504314568535


In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define a grid of hyperparameters to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l2']
}

# Create a logistic regression model
model = LogisticRegression()

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)


from sklearn.linear_model import LogisticRegression

# Instantiate a logistic regression model with the best hyperparameters
best_model = LogisticRegression(C=best_params['C'], penalty=best_params['penalty'])

# Train the model on your full training dataset
best_model.fit(X_train, y_train)


from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the test dataset
y_pred = best_model.predict(X_test)

# Calculate accuracy and other relevant metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Model Accuracy:", accuracy)
print("Classification Report:\n", report)

Best Hyperparameters: {'C': 10, 'penalty': 'l2'}
Model Accuracy: 0.4995
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.70      0.59     15071
           1       0.50      0.29      0.37     14929

    accuracy                           0.50     30000
   macro avg       0.50      0.50      0.48     30000
weighted avg       0.50      0.50      0.48     30000



In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

# Perform 5-fold cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy:", scores.mean())

Cross-Validation Accuracy: 0.5007900000000001


In [15]:
#Saving best model
import joblib
#Sava the model to disk
filename = 'model.sav'
print(log_model)
joblib.dump(log_model, filename)

LogisticRegression()


['model.sav']