In [None]:
#Importing necessary libraries
import pandas as pd
import os
import numpy as np
import joblib
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Loading the dataset
Dataset = pd.read_csv("customer_churn_large_dataset.csv")
Dataset

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0
...,...,...,...,...,...,...,...,...,...
99995,99996,Customer_99996,33,Male,Houston,23,55.13,226,1
99996,99997,Customer_99997,62,Female,New York,19,61.65,351,0
99997,99998,Customer_99998,64,Male,Chicago,17,96.11,251,1
99998,99999,Customer_99999,51,Female,New York,20,49.25,434,1


In [None]:
#Performing Pre-processing
Dataset.isnull().sum()

CustomerID                    0
Name                          0
Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64

In [None]:
Dataset.describe()

Unnamed: 0,CustomerID,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,50000.5,44.02702,12.4901,65.053197,274.39365,0.49779
std,28867.657797,15.280283,6.926461,20.230696,130.463063,0.499998
min,1.0,18.0,1.0,30.0,50.0,0.0
25%,25000.75,31.0,6.0,47.54,161.0,0.0
50%,50000.5,44.0,12.0,65.01,274.0,0.0
75%,75000.25,57.0,19.0,82.64,387.0,1.0
max,100000.0,70.0,24.0,100.0,500.0,1.0


In [None]:
Dataset.columns

Index(['CustomerID', 'Name', 'Age', 'Gender', 'Location',
       'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB',
       'Churn'],
      dtype='object')

In [None]:
Dataset = Dataset.drop(['CustomerID','Name'],axis=1)

In [None]:
Dataset.head()

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,63,Male,Los Angeles,17,73.36,236,0
1,62,Female,New York,1,48.76,172,0
2,24,Female,Los Angeles,5,85.47,460,0
3,36,Female,Miami,3,97.94,297,1
4,46,Female,Miami,19,58.14,266,0


In [None]:
Dataset['Gender'].unique()


array(['Male', 'Female'], dtype=object)

In [None]:
Dataset['Location'].unique()

array(['Los Angeles', 'New York', 'Miami', 'Chicago', 'Houston'],
      dtype=object)

In [None]:
X = Dataset.drop(['Churn'],axis=1)
y = Dataset['Churn']

In [None]:
le = LabelEncoder()
X['Gender']=le.fit_transform(X['Gender'])
X['Location']=le.fit_transform(X['Location'])

In [None]:
X.head()

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB
0,63,Male,Los Angeles,17,73.36,236
1,62,Female,New York,1,48.76,172
2,24,Female,Los Angeles,5,85.47,460
3,36,Female,Miami,3,97.94,297
4,46,Female,Miami,19,58.14,266


In [88]:
#Handling imbalanced data using SMOTE
X_res,y_res = SMOTE().fit_resample(X,y)

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X_res,y_res,test_size=0.30,random_state=42)

In [89]:
y_res.value_counts()

0    50221
1    50221
Name: Churn, dtype: int64

In [None]:
X['Gender'].value_counts()

0    50216
1    49784
Name: Gender, dtype: int64

In [None]:
#Performing Feature scaling
Feature_scaling = StandardScaler()


In [None]:
X_train = Feature_scaling.fit_transform(X_train)
X_test = Feature_scaling.transform(X_test)

In [None]:
#Performing Logistic Regression
Logist = LogisticRegression()

In [None]:
Logist.fit(X_train,y_train)

In [None]:
y_pred =Logist.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred)

0.4991869379086052

In [None]:
precision_score(y_test,y_pred)

0.4994535519125683

In [None]:
recall_score(y_test,y_pred)

0.4850414593698176

In [None]:
f1_score(y_test,y_pred)

0.4921420158169275

In [None]:
#Performing GradientBoostingClassifier
gbc = GradientBoostingClassifier()

In [None]:
gbc.fit(X_train,y_train)

In [None]:
y_pred1 = gbc.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred1)

0.500978993130455

In [None]:
precision_score(y_test,y_pred1)

0.5014378689268957

In [None]:
recall_score(y_test,y_pred1)

0.4395356550580431

In [None]:
f1_score(y_test,y_pred1)

0.46845063452225244

In [None]:
#Random Forest Classifier
Rf = RandomForestClassifier()

In [None]:
Rf.fit(X_train,y_train)

In [None]:
y_pred2 = Rf.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred2)

0.4969302757773869

In [None]:
precision_score(y_test,y_pred2)

0.4971169686985173

In [None]:
recall_score(y_test,y_pred2)

0.4803980099502488

In [None]:
f1_score(y_test,y_pred2)

0.48861451270114364

In [None]:
X_res = Feature_scaling.fit_transform(X_res)

In [None]:
Rf.fit(X_res,y_res)

In [None]:
#Saving the Model
joblib.dump(Rf,'Customer-Churn-Prediction')

['Customer-Churn-Prediction']

In [None]:
Ml_model = joblib.load('Customer-Churn-Prediction')

In [None]:
Dataset.columns

Index(['Age', 'Gender', 'Location', 'Subscription_Length_Months',
       'Monthly_Bill', 'Total_Usage_GB', 'Churn'],
      dtype='object')

In [None]:
#Input feature variable to predict
#For categorical Data->Gender(0=Female,1=Male)&Location(0=Chicago,1=Houston,2=Los Angeles,3=Miami,4=New York)
Ml_model.predict([[62,0,4,1,48.76,172]])

array([0])