In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('telecom_customer_churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.shape

(7043, 21)

### Drop irrelevant columns

In [6]:
df1 = df.drop({'customerID','gender','StreamingMovies','PaperlessBilling','PaymentMethod'}, axis='columns')
df1.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,Contract,MonthlyCharges,TotalCharges,Churn
0,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,Month-to-month,29.85,29.85,No
1,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,One year,56.95,1889.5,No
2,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,Month-to-month,53.85,108.15,Yes
3,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,One year,42.3,1840.75,No
4,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [10]:
df1.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [37]:
df1['TotalCharges'].min()

' '

In [42]:
df1[df1['TotalCharges'] == ' ']

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,Contract,MonthlyCharges,TotalCharges,Churn
488,0,Yes,Yes,0,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,Two year,52.55,,No
753,0,No,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.25,,No
936,0,Yes,Yes,0,Yes,No,DSL,Yes,Yes,Yes,No,Yes,Two year,80.85,,No
1082,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.75,,No
1340,0,Yes,Yes,0,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,Two year,56.05,,No
3331,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,19.85,,No
3826,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.35,,No
4380,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.0,,No
5218,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,One year,19.7,,No
6670,0,Yes,Yes,0,Yes,Yes,DSL,No,Yes,Yes,Yes,Yes,Two year,73.35,,No


In [44]:
df2 = df1[df1['TotalCharges'] != ' ']
df2.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,Contract,MonthlyCharges,TotalCharges,Churn
0,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,Month-to-month,29.85,29.85,No
1,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,One year,56.95,1889.5,No
2,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,Month-to-month,53.85,108.15,Yes
3,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,One year,42.3,1840.75,No
4,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [47]:
df2.dtypes

SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [49]:
df2['TotalCharges'] = df2['TotalCharges'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['TotalCharges'] = df2['TotalCharges'].astype(float)


In [50]:
df2.dtypes

SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
Contract             object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

### Remove outliers using Z-score

##### Check for outliers

In [25]:
df1[df1.tenure<(df1.tenure.mean()-3*df.tenure.std())]

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,Contract,MonthlyCharges,TotalCharges,Churn


In [53]:
df2[df2.tenure>(df2.tenure.mean()+3*df2.tenure.std())]

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,Contract,MonthlyCharges,TotalCharges,Churn


In [52]:
df2[df2.MonthlyCharges<(df2.MonthlyCharges.mean()-3*df2.MonthlyCharges.std())]

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,Contract,MonthlyCharges,TotalCharges,Churn


In [55]:
df2[df2.MonthlyCharges>(df2.MonthlyCharges.mean()+3*df2.MonthlyCharges.std())]

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,Contract,MonthlyCharges,TotalCharges,Churn


In [56]:
df2[df2.TotalCharges>(df2.TotalCharges.mean()+3*df2.TotalCharges.std())]

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,Contract,MonthlyCharges,TotalCharges,Churn


In [57]:
df2[df2.TotalCharges<(df2.TotalCharges.mean()-3*df2.TotalCharges.std())]

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,Contract,MonthlyCharges,TotalCharges,Churn


In [58]:
df2.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,Contract,MonthlyCharges,TotalCharges,Churn
0,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,Month-to-month,29.85,29.85,No
1,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,One year,56.95,1889.5,No
2,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,Month-to-month,53.85,108.15,Yes
3,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,One year,42.3,1840.75,No
4,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [60]:
df3 = pd.get_dummies(df2, drop_first=True)
df3.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,...,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,Contract_One year,Contract_Two year,Churn_Yes
0,0,1,29.85,29.85,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,34,56.95,1889.5,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,0,2,53.85,108.15,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0,45,42.3,1840.75,0,0,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
4,0,2,70.7,151.65,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [69]:
X = df3.drop('Churn', axis='columns')
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,...,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,Contract_One year,Contract_Two year
0,0,1,29.85,29.85,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,34,56.95,1889.5,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,2,53.85,108.15,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,45,42.3,1840.75,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,1,0
4,0,2,70.7,151.65,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [70]:
y = df3.Churn

In [71]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Use Cross Score Validation to find the best performing model

In [86]:
RFC_score = cross_val_score(RandomForestClassifier(), X_scaled, y)
RFC_score

array([0.78820185, 0.78606965, 0.76600284, 0.78662873, 0.80654339])

In [87]:
RFC_score.mean()

0.7866892928165513

In [90]:
LGR_score = cross_val_score(LogisticRegression(), X_scaled, y)
LGR_score

array([0.80170576, 0.81307747, 0.78520626, 0.80156472, 0.80156472])

In [91]:
LGR_score.mean()

0.8006237861697407

In [76]:
SVM_score = cross_val_score(SVC(), X_scaled, y)
SVM_score

array([0.79033404, 0.80241649, 0.7802276 , 0.79445235, 0.80085349])

In [77]:
SVM_score.mean()

0.7936567922428095

### Use the best performing model to make predictions

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y,test_size=0.2)

In [92]:
model = LogisticRegression()

In [94]:
model.fit(X_train,y_train)

LogisticRegression()

In [95]:
model.score(X_train,y_train)

0.8030222222222222

In [98]:
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,...,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,Contract_One year,Contract_Two year
0,0,1,29.85,29.85,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,34,56.95,1889.5,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,2,53.85,108.15,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,45,42.3,1840.75,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,1,0
4,0,2,70.7,151.65,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [104]:
model.predict([[0,34,56.95,1889.50,0,0,1,0,0,0,1,0,0,0,1,1,0,1,0,0,1,0,1]])

array([1], dtype=uint8)