# Classification on Customer Churn Dataset for the Data Analytics Session
by Rathachai C.


## 0) To load libraries

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score



---



## 1) To load data and check

In [0]:
# Read the customer-churn CSV directly from its hosted URL into a DataFrame.
DATA_URL = 'https://rathachai.github.io/DA101/data/customer-churn-data.csv'
churn = pd.read_csv(DATA_URL)

In [0]:
# Show the freshly loaded DataFrame as the cell output.
churn

In [0]:
# Summarise columns, non-null counts and dtypes to spot missing values.
churn.info()

In [0]:
# Descriptive statistics of the DataFrame's columns.
churn.describe()



---



## 2) To do data pre-processing

In [0]:
# Remove, in place, every row that contains at least one missing value.
churn.dropna(how="any", inplace=True)

In [0]:
# Re-check the column summary after the rows with missing values were dropped.
churn.info()

## 3) To do data exploration

### 3.1) to view data

In [0]:
# Bar chart of how many rows fall under each Churn value.
sns.countplot(data=churn, x="Churn");


In [0]:
# Pairwise plots of the DataFrame's variables, coloured by Churn value.
sns.pairplot(data=churn, hue="Churn")

In [0]:
# Row counts per Gender, split by Churn value.
sns.countplot(data=churn, x="Gender", hue="Churn");


In [0]:
# Row counts per Payment Method, split by Churn value.
sns.countplot(data=churn, x="Payment Method", hue="Churn");

### 3.2) to do label encoding

In [0]:
# Encoder that maps each distinct label to an integer code.
lb_make = LabelEncoder()

In [0]:
# Turn the Gender labels into integer codes and keep them in a new column.
gender_codes = lb_make.fit_transform(churn["Gender"])
churn["Gender Code"] = gender_codes

### 3.3) to do one-hot encoding

In [0]:
# Replace "Payment Method" with one indicator column per distinct value.
# Note: re-running this cell fails because the source column no longer exists.
churn = pd.get_dummies(data=churn, columns=["Payment Method"])

In [0]:
# Show the DataFrame again, now including the encoded columns.
churn



---



## 4) To do **`Logistic Regression`**

### 4.1) To select data

In [0]:
# Feature columns: Age, LastTransaction, the encoded gender,
# and the one-hot payment-method indicators.
feature_columns = [
    "Age",
    "LastTransaction",
    "Gender Code",
    "Payment Method_cash",
    "Payment Method_cheque",
    "Payment Method_credit card",
]
X = churn[feature_columns]

# Target column the classifier has to predict.
y = churn["Churn"]

In [0]:
# Show the selected feature columns.
X

In [0]:
# Show the target column.
y

### 4.2) to split train and test datasets

In [0]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

### 4.3) to train and create a logistic regression model

In [0]:
# Logistic-regression classifier created with its default settings.
clf = LogisticRegression()

In [0]:
# Fit the classifier on the training split only.
clf.fit(X_train,y_train)

In [0]:
# Coefficients learned by the fitted model.
clf.coef_

In [0]:
# Intercept (bias) term learned by the fitted model.
clf.intercept_

### 4.4) To predict from the test set

In [0]:
# Predict the class of every row in the held-out test split.
y_pred = clf.predict(X_test)

In [0]:
# Show the predicted classes.
y_pred

### 4.5) To evaluate the predicted value with the test set

In [0]:
# The distinct Churn values fix the row/column order of the matrix;
# they are printed first so the matrix can be read against them.
cm_labels = churn["Churn"].unique()
print(cm_labels)
confusion_matrix(y_true=y_test, y_pred=y_pred, labels=cm_labels)

In [0]:
# Text report of the per-class metrics for the test-set predictions.
report = classification_report(y_test, y_pred)
print(report)

In [0]:
# F1 score of the test-set predictions, using the 'weighted' averaging mode.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
print("F1 =", f1)



---



## 5) To do **Decision Tree**

In [0]:
# Import required libraries
from sklearn import tree

# Select Data (same feature columns and target as the logistic-regression section)
feature_columns = [
    "Age",
    "LastTransaction",
    "Gender Code",
    "Payment Method_cash",
    "Payment Method_cheque",
    "Payment Method_credit card",
]
X = churn[feature_columns]
y = churn["Churn"]

# Split Data: 30% held out for testing, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

# Train and Create a Model
# Fit on the training split only. Fitting on the full X, y would let the tree
# see the test rows during training and inflate every score reported below.
# random_state is fixed so repeated runs build the same tree.
clf = tree.DecisionTreeClassifier(random_state=101)
clf.fit(X_train, y_train)

# Predict on the held-out test split
y_pred = clf.predict(X_test)

# Print Confusion Matrix (rows/columns ordered by the printed labels)
print("***** Confusion Matrix *****")
cm_labels = churn["Churn"].unique()
print(cm_labels)
print(confusion_matrix(y_test, y_pred, labels=cm_labels))

# Print Report
print()
print("***** Report *****")
print(classification_report(y_test, y_pred))

# Check F1 value
print()
print("***** F1 *****")
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 = ", f1)

## Exercises


1.   work with 5-fold cross-validation  (hint: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html)
2.   use other classifier techniques, e.g. Bayes, SVM, ANN


### do your exercises here

In [0]:
# run your code here