<a href="https://colab.research.google.com/github/SurekhaBerlin/Python/blob/main/predictiveanalytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [None]:
df = pd.read_csv('/content/Customer-Churn.csv')

In [None]:
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


#### Data Cleaning

In [None]:
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
df.dropna(how='any', inplace=True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 
 17  

In [None]:
# y-variable distribution
df.Churn.value_counts()

Churn
No     5163
Yes    1869
Name: count, dtype: int64

In [None]:
df.Churn.value_counts()/len(df)*100

Churn
No     73.421502
Yes    26.578498
Name: count, dtype: float64

In [None]:
df.drop(columns=['customerID'], inplace=True)

#### Feature Encoding

- Dummy Encoding
- OneHot Encoding
- Label Encoding --> Not recommended for X-variables
- Target Encoding
- Hash Encoding
- Binary Encoding

In [None]:
df_dummies = pd.get_dummies(df, drop_first=True)

In [None]:
df_dummies.head(5)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,True,False,True,False,False
1,0,34,56.95,1889.5,True,False,False,True,False,False,...,False,False,False,True,False,False,False,False,True,False
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,False,True,False,False,True,True
3,0,45,42.3,1840.75,True,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
4,0,2,70.7,151.65,False,False,False,True,False,False,...,False,False,False,False,False,True,False,True,False,True


#### Model Building - Pre Requisites

In [None]:
df_dummies.head(5)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,True,False,True,False,False
1,0,34,56.95,1889.5,True,False,False,True,False,False,...,False,False,False,True,False,False,False,False,True,False
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,False,True,False,False,True,True
3,0,45,42.3,1840.75,True,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
4,0,2,70.7,151.65,False,False,False,True,False,False,...,False,False,False,False,False,True,False,True,False,True


In [None]:
X = df_dummies.drop('Churn_Yes', axis=1)
y = df_dummies['Churn_Yes']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
len(df_dummies)

7032

In [None]:
len(X_train)

5625

In [None]:
len(X_test)

1407

#### Feature Scaling

In [None]:
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

#### Model Building

##### **kNN Classifier**

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)

In [None]:
y_pred = knn.predict(X_test)



In [None]:
print('Accuracy of kNN model is: ',round(accuracy_score(y_pred, y_test)*100,2), '%')

Accuracy of kNN model is:  73.56 %


##### **DT Classifier**

In [None]:
dt = DecisionTreeClassifier()
dt.fit(X_train_scaled, y_train)

In [None]:
y_pred_dt = dt.predict(X_test)



In [None]:
print('Accuracy of Decision Tree model is: ',round(accuracy_score(y_pred_dt, y_test)*100,2), '%')

Accuracy of Decision Tree model is:  67.16 %


##### **RF Classifier**

In [None]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train_scaled, y_train)

In [None]:
y_pred_rf = rf.predict(X_test)



In [None]:
print('Accuracy of Random Forest model is: ',round(accuracy_score(y_pred_rf, y_test)*100,2), '%')

Accuracy of Random Forest model is:  73.7 %


##### **AdaBoost Classifier**

In [None]:
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(X_train_scaled, y_train)

In [None]:


y_pred_ada = ada.predict(X_test)



In [None]:
print('Accuracy of Ada Boost model is: ',round(accuracy_score(y_pred_ada, y_test)*100,2), '%')

Accuracy of Ada Boost model is:  73.56 %


#### **XGBoost**

In [None]:
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train_scaled, y_train)
y_pred_xgb = xgboost_model.predict(X_test)
print('Accuracy of XG Boost model is: ',round(accuracy_score(y_pred_xgb, y_test)*100,2), '%')

Accuracy of XG Boost model is:  73.56 %



## NAIVE BAYES.  

In [None]:

from sklearn.naive_bayes import BernoulliNB

In [None]:
model_nb = BernoulliNB()

In [None]:
model_nb.fit(X_train_scaled, y_train)

In [None]:
y_pred_nb = model_nb.predict(X_test_scaled)

In [None]:
print(accuracy_score(y_test,y_pred_nb)*100)

75.26652452025586


## SVM Classifier

In [None]:
from sklearn.svm import SVC

In [None]:
model_svc = SVC()

In [None]:
model_svc.fit(X_train_scaled,y_train)

In [None]:
y_pred_svc = model_svc.predict(X_test_scaled)

In [None]:
print(accuracy_score(y_test,y_pred_svc)*100)

78.53589196872778


## Logistic Classifier

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model_logisticregression = LogisticRegression()

In [None]:
model_logisticregression.fit(X_train_scaled,y_train)

In [None]:
y_pred_logisticregression = model_logisticregression.predict(X_test_scaled)

In [None]:
print(accuracy_score(y_test,y_pred_logisticregression)*100)

79.60199004975125
