In [26]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

In [4]:
# Path of the raw churn workbook; kept in one named constant so it is easy to repoint.
DATA_PATH = '/content/customer_churn_large_dataset.xlsx'

# Read the Excel workbook into the working DataFrame.
df = pd.read_excel(DATA_PATH)

In [5]:
# Preview the first five rows (five is head()'s default row count).
df.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


In [6]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(100000, 9)

In [7]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  100000 non-null  int64  
 1   Name                        100000 non-null  object 
 2   Age                         100000 non-null  int64  
 3   Gender                      100000 non-null  object 
 4   Location                    100000 non-null  object 
 5   Subscription_Length_Months  100000 non-null  int64  
 6   Monthly_Bill                100000 non-null  float64
 7   Total_Usage_GB              100000 non-null  int64  
 8   Churn                       100000 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 6.9+ MB


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,CustomerID,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,50000.5,44.02702,12.4901,65.053197,274.39365,0.49779
std,28867.657797,15.280283,6.926461,20.230696,130.463063,0.499998
min,1.0,18.0,1.0,30.0,50.0,0.0
25%,25000.75,31.0,6.0,47.54,161.0,0.0
50%,50000.5,44.0,12.0,65.01,274.0,0.0
75%,75000.25,57.0,19.0,82.64,387.0,1.0
max,100000.0,70.0,24.0,100.0,500.0,1.0


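#### As we can see, no column has extreme outliers: for every numeric column the mean is close to the median, and there is no large gap between the 75th percentile and the max. Note that this shows the columns are symmetric and well bounded, but it does not by itself establish that they are normally distributed — the evenly spaced quartiles (e.g. Age: 18, 31, 44, 57, 70) are just as consistent with a uniform distribution, so the histograms should be checked before assuming normality.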

In [9]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

CustomerID                    0
Name                          0
Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64

#### This dataset is very clean: no column contains null values.

In [None]:
# Plot the distribution of every column: a histogram (with KDE) for numeric
# columns and a count plot for categorical (object) columns.

# Object columns with more distinct values than this are skipped: a count plot
# draws one bar per distinct value, which is unreadable and extremely slow for
# identifier-like columns.
MAX_CATEGORIES = 50

for column in tqdm(df.columns):
    series = df[column]

    # Numeric column -> histogram. is_numeric_dtype covers every int/float
    # width; the earlier test `dtype in [[np.int64, np.float64]]` compared the
    # dtype against a nested list, so this branch was never taken.
    if pd.api.types.is_numeric_dtype(series):
        plt.figure(figsize=(5, 3))
        sns.histplot(series, kde=True)
        plt.title(f'Distribution of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')
        plt.show()

    # Categorical column -> count plot, unless it has too many categories.
    elif series.dtype == 'object':
        n_categories = series.nunique()
        if n_categories > MAX_CATEGORIES:
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(f'Skipping count plot of {column}: {n_categories} distinct values')
            continue

        plt.figure(figsize=(5, 3))
        sns.countplot(data=df, x=column)
        plt.title(f'Count of {column}')
        plt.xlabel(column)
        plt.ylabel('Count')
        plt.xticks(rotation=45)  # tilt labels so longer category names do not overlap
        plt.show()

## Note: the plotting cell above takes a very long time to run on my CPU, so it was left unexecuted.

## categorical to numerical

In [10]:
# Replace the string 'Gender' column with integer codes, overwriting it in place.
# In the rows displayed by this notebook 'Female' becomes 0 and 'Male' becomes 1.
# The fitted encoder is kept in `label_encoder` so the mapping stays available.
label_encoder=LabelEncoder()
df['Gender']=label_encoder.fit_transform(df['Gender'])

In [11]:
# List the column labels after encoding 'Gender'.
df.columns

Index(['CustomerID', 'Name', 'Age', 'Gender', 'Location',
       'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB',
       'Churn'],
      dtype='object')

In [12]:
# Distinct values of 'Location', inspected before one-hot encoding the column.
df['Location'].unique()

array(['Los Angeles', 'New York', 'Miami', 'Chicago', 'Houston'],
      dtype=object)

In [13]:
# One-hot encode 'Location': the column is replaced by one 'Location_<city>'
# indicator column per distinct value.
# NOTE: this rebinds `df`; re-running the cell fails because 'Location' is gone.
df = pd.get_dummies(
    df,
    columns=['Location'],
    prefix='Location',
)

In [14]:
# Preview the frame again to confirm the new 'Location_*' indicator columns.
df.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn,Location_Chicago,Location_Houston,Location_Los Angeles,Location_Miami,Location_New York
0,1,Customer_1,63,1,17,73.36,236,0,0,0,1,0,0
1,2,Customer_2,62,0,1,48.76,172,0,0,0,0,0,1
2,3,Customer_3,24,0,5,85.47,460,0,0,0,1,0,0
3,4,Customer_4,36,0,3,97.94,297,1,0,0,0,1,0
4,5,Customer_5,46,0,19,58.14,266,0,0,0,0,1,0


## split the data into training and testing

In [15]:
# Columns excluded from the feature matrix: the two identifier columns and the label.
NON_FEATURE_COLUMNS = ['CustomerID', 'Name', 'Churn']

X = df.drop(columns=NON_FEATURE_COLUMNS)
y = df['Churn']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Report the (rows, columns) shape of each split.
for split_label, split_frame in (('Training Dataset ', X_train), ('Testing Dataset ', X_test)):
    print(split_label, split_frame.shape)

Training Dataset  (80000, 10)
Testing Dataset  (20000, 10)


## Feature engineering

In [17]:
# Generating relevant features ('Bill_to_Usage_Ratio')
X_train['Bill_to_Usage_Ratio'] = X_train['Monthly_Bill'] / X_train['Total_Usage_GB']
X_test['Bill_to_Usage_Ratio'] = X_test['Monthly_Bill'] / X_test['Total_Usage_GB']

In [18]:
#2. Feature Engineering: Finding Customer Tenure
X_train['Customer_Tenure'] = X_train['Age'] - X_train['Subscription_Length_Months']
X_test['Customer_Tenure'] = X_test['Age'] - X_test['Subscription_Length_Months']

## Feature scaling with StandardScaler

In [19]:
# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits so nothing from the test set leaks into the fit.
# NOTE: X_train / X_test are rebound to the scaler's output from here on.
sr = StandardScaler()
sr.fit(X_train)
X_train = sr.transform(X_train)
X_test = sr.transform(X_test)

In [20]:
# Instantiate the candidate classifiers that are compared below.
# KNeighborsClassifier comes from sklearn.neighbors; it must be imported in the
# imports cell, otherwise this cell raises NameError on a fresh kernel.
logreg = LogisticRegression()
svc_classifier = SVC()
# Fixed seed so the tree (and its reported score) is reproducible between runs.
dt_classifier = DecisionTreeClassifier(random_state=0)
# NOTE(review): knn_classifier is created here but never fitted or evaluated in
# the cells that follow — either add it to the comparison or drop it.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
# 1000 trees is expensive; n_jobs=-1 builds them on all available cores without
# changing the result, since random_state is fixed.
rf_classifier = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    random_state=0,
    n_jobs=-1,
)


In [21]:

# Train every model on the scaled training split.
for estimator in (logreg, svc_classifier, dt_classifier):
    estimator.fit(X_train, y_train)

# Fitted last and left as the final expression so the cell still displays it.
rf_classifier.fit(X_train, y_train)

In [22]:
# Predict the test-split labels with each fitted model, in the same order as the
# target names on the left-hand side.
logreg_ypred, svc_classifier_ypred, dt_classifier_ypred, rf_classifier_ypred = (
    fitted_model.predict(X_test)
    for fitted_model in (logreg, svc_classifier, dt_classifier, rf_classifier)
)

In [29]:
def _evaluate(y_true, y_pred):
    """Return (accuracy, classification report text, confusion matrix) for one model."""
    return (
        accuracy_score(y_true, y_pred),
        classification_report(y_true, y_pred),
        confusion_matrix(y_true, y_pred),
    )


# Score every model's test-set predictions against the true labels.
logreg_acc, classification_rep_log, conf_matrix_log = _evaluate(y_test, logreg_ypred)
svc_classifier_acc, classification_rep_svc, conf_matrix_svc = _evaluate(y_test, svc_classifier_ypred)
dt_classifier_acc, classification_rep_dt, conf_matrix_dt = _evaluate(y_test, dt_classifier_ypred)
rf_classifier_acc, classification_rep_rf, conf_matrix_rf = _evaluate(y_test, rf_classifier_ypred)

In [30]:
# One (label, accuracy, report, confusion matrix) entry per model, in display order.
model_summaries = [
    ("Logistic Regression : ", logreg_acc, classification_rep_log, conf_matrix_log),
    ("Support Vector      : ", svc_classifier_acc, classification_rep_svc, conf_matrix_svc),
    ("Decision Tree       : ", dt_classifier_acc, classification_rep_dt, conf_matrix_dt),
    ("Random Forest       : ", rf_classifier_acc, classification_rep_rf, conf_matrix_rf),
]

for position, (label, acc, report, matrix) in enumerate(model_summaries):
    # Separator goes between models only, never before the first or after the last.
    if position > 0:
        print('-----'*30)
    print(label, round(acc*100, 2))  # accuracy as a percentage, two decimals
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", matrix)


Logistic Regression :  50.21
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.62      0.56     10079
           1       0.50      0.38      0.43      9921

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.49     20000
weighted avg       0.50      0.50      0.49     20000

Confusion Matrix:
 [[6253 3826]
 [6132 3789]]
------------------------------------------------------------------------------------------------------------------------------------------------------
Support Vector      :  50.04
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.59      0.54     10079
           1       0.50      0.41      0.45      9921

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000

Confusion Matrix:
 [[5898 4181]
 [5811 4110

In [31]:
# Define the hyperparameters and their possible values
# Search space for logistic regression: penalty type, inverse regularization
# strength C, and the optimization solver.
param_grid = dict(
    penalty=['l1', 'l2'],
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
)

# NOTE: this rebinds `logreg` to a new, unfitted estimator that only serves as the
# template for the search; the tuned, fitted model is `best_logreg` below.
logreg = LogisticRegression()

# Exhaustive 5-fold cross-validated search, scored on accuracy.
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and the estimator refit with it.
best_params, best_logreg = grid_search.best_params_, grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split.
logreg_ypred = best_logreg.predict(X_test)
logreg_acc = accuracy_score(y_test, logreg_ypred)

print("Best Hyperparameters:", best_params)
print("Accuracy on Test Set (after tuning):", logreg_acc)

Best Hyperparameters: {'C': 10, 'penalty': 'l2', 'solver': 'saga'}
Accuracy on Test Set (after tuning): 0.5021


In [None]:
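# NOTE(review): this entire cell is commented out and was never executed. The grid
# below spans 48 parameter combinations x 5 CV folds = 240 Random Forest fits, so it
# was presumably disabled for run time — re-enable it deliberately or delete the cell.
# # Define the hyperparameters and their possible values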
# param_grid = {
#     'n_estimators': [100, 200],            # Number of trees in the forest
#     'max_depth': [None, 10, 20],            # Maximum depth of the trees
#     'min_samples_split': [2, 5],           # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2],             # Minimum number of samples required to be at a leaf node
#     'bootstrap': [True, False]                 # Whether bootstrap samples are used when building trees
# }

# # Create a Random Forest Classifier
# rf_classifier = RandomForestClassifier()

# # Create a GridSearchCV object
# grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# # Fit the grid search to the training data
# grid_search.fit(X_train, y_train)

# # Get the best hyperparameters
# best_params = grid_search.best_params_

# # Get the best estimator (model)
# best_rf_classifier = grid_search.best_estimator_

# # Make predictions on the test set using the best model
# rf_ypred = best_rf_classifier.predict(X_test)

# # Calculate accuracy using the best model
# rf_acc = accuracy_score(y_test, rf_ypred)

# # Print the best hyperparameters and accuracy
# print("Best Hyperparameters:", best_params)
# print("Accuracy on Test Set (after tuning):", rf_acc)

In [34]:
# saving the model
# Persist the tuned, fitted model. `logreg` was rebound to a fresh, unfitted
# LogisticRegression() before the grid search, so pickling it would save a model
# that cannot predict; `best_logreg` is the fitted estimator the search selected.
# The model was trained on StandardScaler output, so anything that loads this file
# must scale its inputs the same way (the fitted scaler `sr` is not saved here).
with open("churn.pkl", mode="wb") as pickle_out:  # closed even if dump() raises
    pickle.dump(best_logreg, pickle_out)