# Importing Modules

In [57]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [104]:
import pandas as pd
import numpy as np

churn_data = pd.read_excel("customer_churn_large_dataset.xlsx")
churn_data.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


# Data PreProcessing

In [105]:
churn_data.dropna()
churn_data.drop("Name" , axis = 1 , inplace= True)
churn_data.drop("CustomerID" , axis = 1 , inplace= True)

#Vectorizing Genders

churn_data["Gender"] = churn_data["Gender"].apply(lambda g: 1 if g=="Male" else 0)

#Vectorizing Cities

locations = list(churn_data["Location"].unique())
churn_data["Location"] = churn_data["Location"].apply(lambda x: locations.index(x))

churn_data


Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,63,1,0,17,73.36,236,0
1,62,0,1,1,48.76,172,0
2,24,0,0,5,85.47,460,0
3,36,0,2,3,97.94,297,1
4,46,0,2,19,58.14,266,0
...,...,...,...,...,...,...,...
99995,33,1,4,23,55.13,226,1
99996,62,0,1,19,61.65,351,0
99997,64,1,3,17,96.11,251,1
99998,51,0,1,20,49.25,434,1


# Training Validation And Test Datasets

In [106]:
churn_df = churn_data.sample(frac = 1)
train , test = np.split(churn_df , [int(0.8*len(churn_df))])   

In [107]:
print(len(train))
train.head()


80000


Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
24184,55,1,4,6,61.88,156,1
51487,33,1,0,2,35.21,109,0
80275,29,1,1,6,75.41,487,0
167,33,0,4,1,95.13,320,0
59897,36,0,3,17,95.76,375,0


In [108]:

print(len(test))
test.head()


20000


Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
79806,55,1,1,21,84.31,327,1
39001,29,1,3,6,68.84,488,0
65213,29,1,0,6,38.16,117,0
21450,38,1,4,9,39.39,235,1
40842,36,1,1,22,45.98,85,0


In [109]:
def scale_dataset(df, oversample=False):
    x = df[df.columns[:-1]].values
    y = df[df.columns[-1]].values

    scaler = StandardScaler()
    X = scaler.fit_transform(x)

    if oversample:
        ros = RandomOverSampler()
        X,y = ros.fit_resample(X , y)

    data = np.hstack((X , np.reshape( y , (-1, 1))))

    return data , X , y

In [110]:
np.unique(train.columns[-1])

array(['Churn'], dtype='<U5')

In [111]:
train , x_train , y_train = scale_dataset(train,oversample=True)
test , x_test , y_test = scale_dataset(test , oversample= False)

In [112]:
x_train[5000]

array([-1.4421323 , -0.99513683, -0.00161641,  0.21851728, -0.74767178,
       -0.6212928 ])

# Logistic Regression Model

In [113]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [114]:
lr_model = LogisticRegression()
lr_model.fit(x_train,y_train)

In [115]:
y_pred = lr_model.predict(x_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.50      0.50     40249
           1       0.50      0.50      0.50     40249

    accuracy                           0.50     80498
   macro avg       0.50      0.50      0.50     80498
weighted avg       0.50      0.50      0.50     80498



# Support Vector Machine

In [116]:
from sklearn.svm import SVC

In [117]:
svm_model = SVC()
svm_model.fit(x_train, y_train)

In [118]:
y_pred = svm_model.predict(x_test)
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.53      0.50      0.52     10497
           1       0.48      0.51      0.49      9503

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.51      0.50      0.50     20000

