In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt   
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_auc_score,roc_curve,classification_report,confusion_matrix,plot_confusion_matrix

In [2]:
df = pd.read_excel("Cellphone.xlsx")

In [3]:
df.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,1,108.0,0.0,0.0,0.38,2.0,115.1,114.0,41.8,10.57,13.8
1,0,113.0,1.0,1.0,3.59,1.0,44.9,63.0,55.9,6.71,13.3
2,0,101.0,1.0,0.0,0.0,0.0,257.3,84.0,60.0,9.24,13.5
3,0,80.0,1.0,1.0,2.89,4.0,166.4,92.0,77.9,11.92,10.7
4,0,70.0,1.0,0.0,0.0,0.0,175.4,130.0,44.0,7.98,11.6


In [4]:
df.describe()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
count,3333.0,3303.0,3315.0,3324.0,3317.0,3281.0,3298.0,3322.0,3320.0,3309.0,3326.0
mean,0.144914,101.159552,0.902866,0.276474,0.814827,1.563852,179.863069,100.434377,56.288735,10.048598,10.237974
std,0.352067,39.879736,0.296185,0.447321,1.270329,1.31859,54.58144,20.079248,16.438343,2.539063,2.793192
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0
25%,0.0,74.0,1.0,0.0,0.0,1.0,143.7,87.0,45.0,8.33,8.5
50%,0.0,101.0,1.0,0.0,0.0,1.0,179.6,101.0,53.5,10.07,10.3
75%,0.0,127.0,1.0,1.0,1.78,2.0,216.7,114.0,66.125,11.77,12.1
max,1.0,243.0,1.0,1.0,5.4,9.0,350.8,165.0,111.3,18.19,20.0


In [5]:
df.isnull().sum()

Churn               0
AccountWeeks       30
ContractRenewal    18
DataPlan            9
DataUsage          16
CustServCalls      52
DayMins            35
DayCalls           11
MonthlyCharge      13
OverageFee         24
RoamMins            7
dtype: int64

In [6]:
cols = ["DataPlan", "ContractRenewal"]
for c in cols:
    mode = df[c].mode()[0]
    df[c].fillna(mode, inplace = True)
    
df.isnull().sum()

Churn               0
AccountWeeks       30
ContractRenewal     0
DataPlan            0
DataUsage          16
CustServCalls      52
DayMins            35
DayCalls           11
MonthlyCharge      13
OverageFee         24
RoamMins            7
dtype: int64

In [7]:
from sklearn.impute import SimpleImputer
SI = SimpleImputer(strategy = "median")

In [8]:
df = pd.DataFrame(SI.fit_transform(df), columns = df.columns)

In [9]:
df.isnull().sum()

Churn              0
AccountWeeks       0
ContractRenewal    0
DataPlan           0
DataUsage          0
CustServCalls      0
DayMins            0
DayCalls           0
MonthlyCharge      0
OverageFee         0
RoamMins           0
dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Churn            3333 non-null   float64
 1   AccountWeeks     3333 non-null   float64
 2   ContractRenewal  3333 non-null   float64
 3   DataPlan         3333 non-null   float64
 4   DataUsage        3333 non-null   float64
 5   CustServCalls    3333 non-null   float64
 6   DayMins          3333 non-null   float64
 7   DayCalls         3333 non-null   float64
 8   MonthlyCharge    3333 non-null   float64
 9   OverageFee       3333 non-null   float64
 10  RoamMins         3333 non-null   float64
dtypes: float64(11)
memory usage: 286.6 KB


In [11]:
df.Churn.value_counts(normalize=True)

0.0    0.855086
1.0    0.144914
Name: Churn, dtype: float64

In [12]:
df_copy = df.copy()

In [13]:
X = df_copy.drop("Churn", axis = 1)
y = df_copy.Churn

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.30, random_state = 1, stratify = y)

In [15]:
X_train.shape

(2333, 10)

In [16]:
clf = LinearDiscriminantAnalysis()
model = clf.fit(X_train, y_train)

In [17]:
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

In [18]:
confusion_matrix(y_test, pred_test)

array([[811,  44],
       [105,  40]], dtype=int64)

In [19]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

         0.0       0.89      0.95      0.92       855
         1.0       0.48      0.28      0.35       145

    accuracy                           0.85      1000
   macro avg       0.68      0.61      0.63      1000
weighted avg       0.83      0.85      0.83      1000



In [20]:
pred_train = model.predict_proba(X_train)
np.array(pred_train[:,1])[2]

0.1542254825795781

In [21]:
pred_train[:,1]

array([0.02069932, 0.20179416, 0.15422548, ..., 0.10058096, 0.16733976,
       0.0528347 ])

In [22]:
for j in np.arange(0.1,0.9, 0.1):
    custom_prob = j
    custom_cutoff = []
    
    for i in range(0,len(y_train)):
        if np.array(pred_train[:,1])[i] > custom_prob:
            a = 1
        else:
            a = 0
            
        custom_cutoff.append(a)

In [23]:
custom_cutoff

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
