**Conversational AI: Data Science (UCS663)**
**Lab Evaluation**
# **Telecom Churn Prediction**
**Performed By: Shobheet Pandey**

**Roll No: 101916041**

**Group: 3CS10**

In [241]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively list every file under the Kaggle input directory so the
# available datasets (and their exact paths) are printed in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, entry) for entry in filenames)
    for filename in full_paths:
        print(filename)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [242]:
#importing required libraries
import cudf
import cupy as cp
import cuml
from cuml.linear_model import LogisticRegression
from cuml.ensemble import RandomForestClassifier
from cuml.neighbors import KNeighborsClassifier
from cuml.model_selection import train_test_split
from cuml.datasets.classification import make_classification
from sklearn.metrics import accuracy_score,classification_report
import matplotlib.pyplot as plt
# Suppress all warnings for the rest of the session (keeps cell output short,
# but also hides deprecation notices from the libraries used below).
import warnings
warnings.filterwarnings("ignore")

# Raise the pandas display limits so long Series / wide sequences are shown in full.
for option_name in ("display.max_seq_items", "display.max_rows"):
    pd.set_option(option_name, 8000)

In [243]:
# Load the Telco customer-churn CSV into a cuDF DataFrame
telco_csv_path = '../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = cudf.read_csv(telco_csv_path)

# Cell output: (number of rows, number of columns)
df.shape

In [244]:
# Preview the first few rows of the freshly loaded (still un-encoded) data
df.head()

In [245]:
# Count missing (null) values per column
df.isnull().sum()

**Exploratory Data Analysis**

In [246]:
# EDA: overall class balance - how many customers churned vs. stayed.
import seaborn as sns

# seaborn works on pandas objects, so take a pandas copy of the cuDF frame.
dfp = df.to_pandas()

# Pass the column by keyword (x=..., data=...) like every other plot in this
# notebook; a bare positional Series is not accepted as `x` by seaborn
# releases that make `x` keyword-only.
sns.countplot(x='Churn', data=dfp);

In [247]:
# Churn counts split by customer gender
sns.countplot(data=dfp, x='gender', hue='Churn');

In [248]:
# Churn counts for senior vs. non-senior customers
sns.countplot(data=dfp, x='SeniorCitizen', hue='Churn');

In [249]:
# Churn counts per type of internet service
sns.countplot(data=dfp, x='InternetService', hue='Churn');

In [250]:
# Churn counts per payment method
sns.countplot(data=dfp, x='PaymentMethod', hue='Churn');

In [251]:
# Churn counts by whether the customer has phone service
sns.countplot(data=dfp, x='PhoneService', hue='Churn');

In [252]:
# Churn counts by streaming-movies subscription
sns.countplot(data=dfp, x='StreamingMovies', hue='Churn');

In [253]:
# Churn counts by streaming-TV subscription
sns.countplot(data=dfp, x='StreamingTV', hue='Churn');

In [254]:
# Churn counts by online-security add-on
sns.countplot(data=dfp, x='OnlineSecurity', hue='Churn');

In [255]:
# Churn counts by paperless-billing setting
sns.countplot(data=dfp, x='PaperlessBilling', hue='Churn');

In [256]:
# Churn counts by tech-support add-on
sns.countplot(data=dfp, x='TechSupport', hue='Churn');

In [257]:
# Churn counts by whether the customer has dependents
sns.countplot(data=dfp, x='Dependents', hue='Churn');

In [258]:
# Churn counts by device-protection add-on
sns.countplot(data=dfp, x='DeviceProtection', hue='Churn');

In [259]:
# Histogram of customer tenure: counts only (kde=False), 36 bins, black bar edges.
# kde_kws is still forwarded as in the original call even though no KDE is drawn.
ax = sns.distplot(
    dfp['tenure'],
    bins=36,  # int(180/5)
    hist=True,
    kde=False,
    color='yellow',
    hist_kws=dict(edgecolor='black'),
    kde_kws=dict(linewidth=4),
)
ax.set_xlabel('Tenure')
ax.set_ylabel('Customers')
ax.set_title('No. of Customers by their tenure')

In [277]:
# Grid of histograms for the pandas copy of the data, drawn on a 20x16 figure
# (pandas' DataFrame.hist plots one panel per numeric column).
dfp.hist(figsize=(20,16))
plt.show() 

**Modelling Data:**

**Label Encoding**

In [261]:
# Label-encode every non-numeric column of `df` in place, replacing its values
# with integer codes. Columns that are already numeric are left untouched.
#
# The numeric test uses pandas' dtype helper: comparing a dtype to `np.number`
# with `==` is not a valid "is this numeric?" check (it never matches integer
# dtypes), which caused numeric columns to be label-encoded as well.
#
# Once every column is numeric, re-running this cell is a no-op.
le=cuml.preprocessing.LabelEncoder()
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col].dtype):
        continue
    df[col]=le.fit_transform(df[col])

In [262]:
# Inspect column dtypes / non-null counts after label encoding
df.info()

In [263]:
# Correlation heatmap of the encoded data.
# Re-take the pandas copy so it reflects the current (label-encoded) `df`.
dfp = df.to_pandas()
sns.heatmap(dfp.corr())

In [264]:
# Preview the first few rows after label encoding
df.head()

In [265]:
# Separate the feature matrix (every column except 'Churn') from the target.
X = df.drop(columns=['Churn'])
Y = df['Churn']

In [266]:
# Standardise the features with cuml's StandardScaler.
# NOTE: X is overwritten with its scaled version, so running this cell twice
# scales already-scaled data.
from cuml.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [267]:
# Hold out 30% of the rows for testing; the seed is fixed for repeatable splits.
x_train, x_test, y_train, y_test = cuml.train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

**Logistic Regression**

In [268]:
# Logistic regression (cuml) on the float32-cast train split.
# NOTE: this import rebinds `accuracy_score` to cuml's implementation for all
# following cells, replacing the scikit-learn one imported at the top.
from cuml.metrics import accuracy_score

lr = LogisticRegression()
lr.fit(
    x_train.astype('float32'),
    y_train.astype('float32'),
)

# Score on the held-out split
predict = lr.predict(x_test.astype('float32'))
print("Accuracy Score: ", accuracy_score(y_test, predict))

**K-Nearest Neighbours**

In [269]:
# K-Nearest Neighbours (cuml) with default settings, trained on float32 data.
from cuml.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(
    x_train.astype('float32'),
    y_train.astype('float32'),
)

# Score on the held-out split
predict = knn.predict(x_test.astype('float32'))
print("Accuracy Score: ", accuracy_score(y_test, predict))

**Gradient Boosting Classifier**

In [270]:
# Gradient Boosting (scikit-learn).
# Gradient boosting / XGBoost are not provided by RAPIDS here, so the cuDF
# train/test splits are first converted to pandas objects; the `*p` variables
# are reused by the following model cells.
x_trainp = x_train.to_pandas()
x_testp = x_test.to_pandas()
y_trainp = y_train.to_pandas()
y_testp = y_test.to_pandas()

from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
gb.fit(
    x_trainp.astype('float32'),
    y_trainp.astype('float32'),
)

# Score on the held-out split
predict = gb.predict(x_testp.astype('float32'))
print("Accuracy Score: ", accuracy_score(y_testp, predict))

**Extreme Gradient Boosting Classifier (XGB)**

In [271]:
# Extreme Gradient Boosting on the pandas copies of the train/test splits.
from xgboost import XGBClassifier

xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=400,
    max_depth=15,
    learning_rate=0.3,
    use_label_encoder=False,
    random_state=42,
)
xgb.fit(
    x_trainp.astype('float32'),
    y_trainp.astype('float32'),
)

# Score on the held-out split (a blank line is printed first, as before)
predict = xgb.predict(x_testp.astype('float32'))
print()
print("Accuracy Score: ", accuracy_score(y_testp, predict))

**Random Forest Classifier**

In [272]:
# Random Forest with 500 trees and a fixed seed, trained on the pandas splits.
# NOTE(review): on a fresh top-to-bottom run `RandomForestClassifier` is the
# cuml class imported in the imports cell; the tuning cell further down
# re-imports scikit-learn's class under the same name, so re-running this cell
# afterwards would silently use a different implementation.
rfc = RandomForestClassifier(
    n_estimators=500,
    max_features="auto",
    random_state=50,
)
rfc.fit(
    x_trainp.astype('float32'),
    y_trainp.astype('float32'),
)

# Score on the held-out split
predict = rfc.predict(x_testp.astype('float32'))
print("Accuracy Score: ", accuracy_score(y_testp, predict))

In [273]:
# Hyperparameter tuning of a scikit-learn Random Forest with a randomized,
# 5-fold cross-validated search, followed by scoring on the held-out split.
from sklearn.model_selection import RandomizedSearchCV
# Imported under an alias so the cuml `RandomForestClassifier` used by the
# previous cell is not shadowed (keeps earlier cells re-runnable).
from sklearn.ensemble import RandomForestClassifier as SkRandomForestClassifier

# Search space sampled by RandomizedSearchCV.
estimators = range(50, 100)
# 'auto' was only an alias of 'sqrt' for classifiers (and has been removed from
# scikit-learn), so the original ['auto', 'sqrt'] held one distinct option.
max_features = ['sqrt', 'log2']
max_depth = range(4, 12)
min_samples_split = range(2, 8)
min_samples_leaf = range(1, 8)
bootstrap = [True, False]
criterion = ['gini', 'entropy']
random_grid = {'n_estimators': estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion}

# Base estimator. `oob_score=True` is deliberately NOT set: out-of-bag scoring
# requires bootstrap=True, so every sampled candidate with bootstrap=False
# would fail to fit (and the OOB score was never read anyway).
rf = SkRandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=50,
                              max_features="sqrt", max_leaf_nodes=30)

# Seed the search so the sampled candidates are reproducible across runs.
rf = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                        cv=5, verbose=1, random_state=50)
rf.fit(x_trainp, y_trainp)
best_estimate = rf.best_params_

# With the default refit=True the search has already refit the winning
# configuration on the full training split. Using it directly evaluates exactly
# the model that was selected; rebuilding from `best_params_` alone would drop
# the base estimator's random_state / max_leaf_nodes / n_jobs.
rf2 = rf.best_estimator_
predict = rf2.predict(x_testp)
print("Accuracy Score: ", accuracy_score(y_testp, predict))

As we can see, **Logistic Regression** gave us the best accuracy score among the models tried.

**Evaluation Metrics**

In [274]:
# Evaluation of the logistic-regression model (the best scorer above):
# confusion matrix computed with cuml, classification report with scikit-learn.
from cuml.metrics import confusion_matrix

# Refit and predict again so this cell does not depend on `predict` still
# holding the logistic-regression output (later cells overwrite it).
lr.fit(x_train.astype('float32'), y_train.astype('float32'))
predict = lr.predict(x_test.astype('float32'))

print("Confusion Matrix:")
print(confusion_matrix(y_test.astype('int'), predict.astype('int')))

# pandas copies of the splits and predictions for the scikit-learn report
x_trainp = x_train.to_pandas()
x_testp = x_test.to_pandas()
y_trainp = y_train.to_pandas()
y_testp = y_test.to_pandas()
predictp = predict.to_pandas()

print(classification_report(y_testp, predictp))

**Feature Importance**

In [275]:
# Feature importance from logistic regression (the best-scoring model):
# fit scikit-learn's LogisticRegression on the pandas splits and plot its
# coefficients, largest (most positive) first.
# Imported under an alias so the cuml `LogisticRegression` used earlier in the
# notebook is not shadowed.
from sklearn.linear_model import LogisticRegression as SkLogisticRegression

l_r = SkLogisticRegression()
l_r.fit(x_trainp.astype('float32'), y_trainp.astype('float32'))
predict = l_r.predict(x_testp.astype('float32'))

# Label each coefficient with its real feature name. X was built as
# df.drop('Churn') and then scaled, so the feature order is df's column order
# minus 'Churn'; using names avoids having to map bar positions back to columns.
feature_names = [name for name in df.columns if name != 'Churn']
weights = pd.Series(l_r.coef_[0], index=feature_names)

# Bar labels come from the Series index, so no `x=` argument is needed, and the
# Axes is used directly rather than print()-ed (which only emitted its repr).
ax = weights.sort_values(ascending=False)[:20].plot(kind='bar')
ax.set_xlabel('Feature')
ax.set_ylabel('Logistic regression coefficient')
ax.set_title('Feature weights (logistic regression)');

In [276]:
# Column index / name / dtype listing of the pandas copy, used to map the
# column positions in the feature-importance discussion back to column names
dfp.info()

So, according to the feature importance plot, column no. 18, i.e. **"Monthly Charges"**, contributes the most to customers churning, followed by **"Total Charges"** (column 19) and **"Paperless Billing"** (column 16).