In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('churn.csv')

In [None]:
df.head(5)

In [None]:
# Data Dictionary --> Details about the data and about each feature

In [None]:
# EDA & Data Cleaning

In [None]:
# Univariate & Multi-variate analysis --> Measures of Central tendency, dispersion & shape

In [None]:
df.isna().sum()

In [None]:
df.info()

In [None]:
round(df.describe(),2).T

In [None]:
df.head(5)

In [None]:
df['Geography'].unique()

In [None]:
df['Geography'].value_counts()

In [None]:
import numpy as np

In [None]:
df.columns

In [None]:
df.groupby('Geography').agg({'EstimatedSalary':[np.mean,np.std],
                             'Exited':[np.mean,'count'],'Balance':['min','max',np.mean]})

###### Cause for concern: German customers (the most premium segment) are leaving

In [None]:
df.head()

In [None]:
# Do a detailed analysis/cleaning & visualisation of the data

In [None]:
# Class distribution for Classification

In [None]:
# Distribution of the target variable --> Regression

In [None]:
df['Exited'].value_counts()

In [None]:
# Baseline accuracy

In [None]:
7963/10000

In [None]:
2037/10000

In [None]:
#!pip install --user seaborn

In [None]:
import seaborn as sns

In [None]:
sns.distplot(df['CreditScore'])

In [None]:
sns.boxplot(y=df['CreditScore'])

In [None]:
df.head()

In [None]:
x = df.iloc[:,[2,5,6,7,8,9,10,11]]

In [None]:
# One hot, label encoding --> numpy arrays

In [None]:
dummies = pd.get_dummies(df[['Geography','Gender']], drop_first=True)

In [None]:
x = pd.concat([x,dummies], axis=1)

In [None]:
y=df['Exited']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=2)

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

In [None]:
1617/2000

In [None]:
6346/8000

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lg = LogisticRegression()

In [None]:
model_lg = lg.fit(x_train,y_train)

In [None]:
prediction = model_lg.predict(x_test)

In [None]:
pd.crosstab(y_test,prediction)

In [None]:
# Accuracy
(1584+15)/2000

In [None]:
# Precision

In [None]:
15/(15+33)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtree = DecisionTreeClassifier(min_samples_leaf=10,min_samples_split=5, max_depth=4)

In [None]:
model_dt = dtree.fit(x_train,y_train)

In [None]:
pred_dtree = model_dt.predict(x_test)

In [None]:
pd.crosstab(y_test,pred_dtree)

In [None]:
(1414+174)/2000

In [None]:
174/(174+204)

In [None]:
# Accuracy from hand-entered confusion-matrix counts (correct predictions / 2000 rows).
# NOTE(review): which model run these counts were copied from is not recorded - confirm or recompute.
(1560+137)/2000

In [None]:
# Precision from hand-entered counts.
# NOTE(review): presumably true positives / (true positives + false positives) - confirm.
137/(137+57)

In [None]:
# Explore Ensemble models for better results

In [None]:
!whereis spark

In [None]:
import findspark

In [None]:
# Point findspark at the Spark installation. SPARK_HOME, when set, takes
# precedence so the notebook is not tied to one machine's directory layout;
# the previously hard-coded path remains the fallback.
import os

findspark.init(os.environ.get('SPARK_HOME', '/usr/local/spark'))

In [None]:
import pyspark

In [None]:
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder.appName("classifier").getOrCreate()

In [None]:
spark

In [None]:
df1 = spark.read.csv('churn.csv', header=True, inferSchema=True)

In [None]:
df1.head()

In [None]:
df1.printSchema()

In [None]:
# EDA, cleaning & detailed analysis

In [None]:
# df1.toPandas

In [None]:
# Categorical-feature transformers used in the cells below.
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [None]:
cat_id = StringIndexer(inputCol="Geography", outputCol="Geo").fit(df1)

In [None]:
df1 = cat_id.transform(df1)

In [None]:
df1.select('Geo','Geography').show()

In [None]:
cat_id1 = StringIndexer(inputCol="Gender", outputCol="Gen").fit(df1)

In [None]:
df1 = cat_id1.transform(df1)

In [None]:
df1.select('Gen','Gender').show(15)

In [None]:
cat_one = OneHotEncoder(inputCol="Geo",outputCol="Geon_encoded")

In [None]:
df1=cat_one.transform(df1)

In [None]:
df1.select('Geo','Geography',"Geon_encoded").show(15)

In [None]:
cat_two = OneHotEncoder(inputCol="Gen",outputCol="Gen_encoded")

In [None]:
df1=cat_two.transform(df1)

In [None]:
# from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# Columns of the Spark DataFrame (including the indexed and encoded columns
# added above), from which the VectorAssembler inputs are chosen. The pandas
# frame `df` does not contain the encoded columns.
df1.columns

In [None]:
df_assembler = VectorAssembler(inputCols=['CreditScore','Age','Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary','Geon_encoded','Gen_encoded'], outputCol="features")

In [None]:
df1 = df_assembler.transform(df1)

In [None]:
df1.select("features").show(5)

In [None]:
final_df = df1.select("features","Exited")

In [None]:
final_df.show(5)

In [None]:
train_data,test_data = final_df.randomSplit([0.8,0.2], seed=42)

In [None]:
train_data.count()

In [None]:
test_data.count()

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression

In [None]:
lg1 = LogisticRegression(labelCol="Exited")

In [None]:
model_lg1 = lg1.fit(train_data)

In [None]:
pred_lg1 = model_lg1.evaluate(test_data)

In [None]:
pred_lg1.predictions.show(5)

In [None]:
pred_lg1.predictions.collect()

In [None]:
pred_lg2 = model_lg1.transform(test_data)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [None]:
eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Exited')

In [None]:
eval.evaluate(pred_lg2)

In [None]:
dtree = DecisionTreeClassifier(featuresCol='features', labelCol='Exited')

In [None]:
model_tree = dtree.fit(train_data)

In [None]:
pred_tree=model_tree.transform(test_data)

In [None]:
pred_tree.show()

In [None]:
eval_tree = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Exited')

In [None]:
acc = eval_tree.evaluate(pred_tree)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
y_origin = pred_tree.select('Exited').collect()

In [None]:
y_predicted = pred_tree.select('prediction').collect()

In [None]:
cm = confusion_matrix(y_true=y_origin,y_pred=y_predicted)

In [None]:
# Accuracy
(187+1465)/2000

In [None]:
# Precision
187/(187+88)