In [None]:
import pandas as pd
import numpy as np
from plotly import express as px
from matplotlib import pyplot as plt

# Introduction

For this lecture we will be using the customer churn dataset. The dataset contains information about customers of a telecom company. The goal is to predict whether a customer will churn (i.e., stop using the company's services) based on the customer's demographic information, account information, and usage of various services.

1. Basic data exploration,
2. Data cleaning and Dummy encoding of categorical features.
3. Use Weight of Evidence (WoE) to transform categorical features into numeric features that can be used for prediction.
4. Use Principal Component Analysis (PCA) to reduce the dimensionality of the dataset.
5. Use Logistic Regression to predict whether a customer will churn.
6. Build and deploy a basic neural network using TensorFlow and Keras to predict whether a customer will churn.

In [None]:
# load data
churn_df = pd.read_csv('data/telecom_customer_churn.csv')

# 1. Data Exploration

In [None]:
churn_df.head()

In [None]:
churn_df.info()

In [None]:
# count number of nulls
churn_df.isnull().sum()

In [None]:
# report how many distinct values each object-typed (categorical) column holds
for col, series in churn_df.items():
    if series.dtype != 'object':
        continue
    print(f'{col}: {series.nunique()} unique Categories')

In [None]:
print('Number of unique cities: ',churn_df['City'].nunique())

In [None]:
# list every distinct value that a single feature takes
feat = 'Streaming TV'
print(f'Categories in the feature "{feat}": ')
distinct_values = churn_df[feat].unique()
for category in distinct_values:
    print('\t',category)

In [None]:
# print the 10 cities with the most customers
churn_df['City'].value_counts().head(10)

In [None]:
churn_df['Customer Status'].value_counts()

In [None]:
# we want to find the customer status for each city, so we group by city and get the value counts of Customer Status
churn_df.groupby('City')['Customer Status'].value_counts().head(25)

In [None]:
# use px.density_mapbox to show the density of customers by location
# NOTE: the 'stamen-*' tile styles are no longer served without a Stadia Maps
#       account, so the token-free 'open-street-map' base map is used instead.
# NOTE(review): no explicit `center` is passed; the view relies on Plotly Express'
#       default centering (presumably on the plotted points) - confirm it frames California.
fig = px.density_mapbox(
    churn_df,
    lat='Latitude',
    lon='Longitude',
    radius=10,
    zoom=5.5,
    mapbox_style='open-street-map',
    height=1200,
    width=900,
)
fig.show()

# 2. Data Cleaning

In [None]:
# columns treated as irrelevant for modelling are removed up front
garbage_cols = [
    'Gender',
    'Customer ID',
    'Latitude',
    'Longitude',
    'Zip Code',
    'City',
    'Churn Category',
    'Churn Reason',
]
churn_df_clean = churn_df.drop(columns=garbage_cols)

In [None]:
churn_df_clean.info()

In [None]:
# customers whose status is 'Joined' are excluded; every other row is kept
churn_df_clean = churn_df_clean.loc[churn_df_clean['Customer Status'].ne('Joined')]

In [None]:
# map the values of the feature 'Customer Status' to numerical values and call it Churned
# (Stayed -> 0, Churned -> 1), then discard the original text column.
# Built with assign/drop so the boolean-filtered frame is never mutated in place,
# which is what makes pandas emit a SettingWithCopyWarning.
churn_df_clean = (
    churn_df_clean
    .assign(Churned=churn_df_clean['Customer Status'].map({'Stayed': 0, 'Churned': 1}))
    .drop(columns='Customer Status')
)

In [None]:
# missing values in categorical features will be replaced with 'Not Applicable';
# start by collecting the object-typed (categorical) columns
categorical_features = [
    name for name, dtype in churn_df_clean.dtypes.items() if dtype == 'object'
]
print(categorical_features)

In [None]:
# every categorical column gets the same placeholder for its missing entries
placeholder_by_column = {col: 'Not Applicable' for col in categorical_features}
churn_df_clean = churn_df_clean.fillna(placeholder_by_column)

In [None]:
# you can also replace missing values with the mode of the feature
#  in case you forgot mode is the most frequent value in a feature
# NOTE: this is an alternative strategy shown for illustration. When it runs after
#       the 'Not Applicable' fill on the same columns there is nothing left to fill,
#       so the loop leaves the data unchanged.
for col in categorical_features:
    # mode() returns the most frequent value(s); [0] picks the first of them
    churn_df_clean[col] = churn_df_clean[col].fillna(churn_df_clean[col].mode()[0])

In [None]:
churn_df_clean.info()

In [None]:
# a few numerical features have missing values; each is filled with its own column mean
numerical_features = [
    name for name, dtype in churn_df_clean.dtypes.items() if dtype != 'object'
]
print(numerical_features)

for num_f in numerical_features:
    column = churn_df_clean[num_f]
    churn_df_clean[num_f] = column.fillna(column.mean())

In [None]:
churn_df_clean.info()

Dummy encoding is a common way to transform categorical features into numeric features that can be used for prediction. The process involves creating a new binary feature for each category in a categorical feature. For example, the feature "Offer" has 5 categories: "Offer A", "Offer B", "Offer C", "Offer D", and "Offer E". Dummy encoding will create 5 new binary features: "Offer A", "Offer B", "Offer C", "Offer D", and "Offer E". Each of these new features will have a value of 1 if the customer was offered that particular offer and 0 otherwise. Note that the code below passes `drop_first=True`, which omits the column for the first category of every feature (it is fully determined by the remaining columns), so each feature yields one column fewer than it has categories.

While this can be useful, features with a large number of categories can lead to a large number of new features. For example, the feature "City" has 112 categories. Dummy encoding will create 112 new features, one for each city. This can lead to a large number of features and the curse of dimensionality. To avoid this, we will only dummy encode features with a small number of categories.

In [None]:
# Dummy encode the categorical features
# Each column listed in `categorical_features` is replaced by 0/1 indicator columns;
# drop_first=True omits the indicator of the first category of every feature.
churn_df_dum = pd.get_dummies(churn_df_clean, columns=categorical_features, drop_first=True)

In [None]:
churn_df_dum.info()

# 3.2 Weight of Evidence (WoE) and Information Value (IV)

Weight of Evidence (WoE) is a statistical technique commonly used in credit scoring and other applications to assess the predictive power of independent variables in a logistic regression model. It involves transforming categorical variables into numeric values that can be used as inputs for predictive models. Here's how you can use WoE:

$$\text{WoE} = \ln\left(\frac{\text{Distribution of Good}}{\text{Distribution of Bad}}\right)$$

Understand the purpose: WoE is used to measure the relationship between a categorical variable and the likelihood of an event occurring (e.g., defaulting on a loan or in our case a customer churn). It calculates the relative "evidence" provided by each category in predicting the event.

1. *Calculate the WoE*: To compute the WoE for each category of a categorical variable, follow these steps:

    a. For each category, calculate its share of all non-events (e.g., stayed; the "Distribution of Good") and its share of all events (e.g., customer churn; the "Distribution of Bad").

    b. Calculate the ratio of the non-event share to the event share for each category.

    c. Take the natural logarithm of the ratio obtained in step b.

    d. Optionally, multiply the result from step c by 100 to scale the WoE values. The code in this notebook does not apply this scaling.

    The formula for WoE is: WoE = ln(Non-event Share / Event Share), which is the displayed formula with Good = non-event and Bad = event.

2. *Replace categorical values with WoE*: Once you have calculated the WoE for each category, you can replace the original categorical values in your dataset with their corresponding WoE values. This transformation ensures that the categorical variable retains its predictive power while being expressed numerically.

3. *Handle missing values*: If you have missing values in your categorical variable, you can assign a separate category or use a special WoE value to represent those missing values.

4. *Interpretation*: After converting categorical values to WoE, you can interpret the magnitude and direction of the WoE values. With WoE = ln(Distribution of Good / Distribution of Bad), where Good is the non-event (stayed) and Bad is the event (churned), higher positive values indicate a lower likelihood of the event occurring, while lower negative values indicate a higher likelihood. A value of zero means that the event and non-event proportions are equal.

Information Value (IV) is a measure of the predictive power of an independent variable in a logistic regression model. It is calculated by summing the differences between the proportions of events and non-events for each category of the variable, multiplied by the WoE for that category. The higher the IV, the more predictive power the variable has. Here's how you can use IV:
$$\text{IV} = \sum_{i=1}^{n} (\text{Distribution of Good}_i - \text{Distribution of Bad}_i) \times \text{WoE}_i$$
1. *Calculate the IV*: To compute the IV of a categorical variable, follow these steps:

    a. For each category, calculate its share of all non-events (e.g., stayed; the "Distribution of Good") and its share of all events (e.g., customer churn; the "Distribution of Bad").

    b. Calculate the difference between the non-event share and the event share for each category.

    c. Multiply the result from step b by the WoE for that category.

    d. Sum the results from step c to obtain the IV for the variable.

    The formula for IV is: IV = sum((Non-event Share - Event Share) * WoE), which is the displayed formula with Good = non-event and Bad = event. Each term is non-negative because the difference and ln(Good / Bad) always have the same sign.
2. *Interpretation*: After calculating the IV for each variable, you can interpret the predictive power of each variable. According to the literature, the following guidelines can be used to interpret the IV:

    * $< 0.02$: Useless for prediction
    * $0.02 \text{ to } 0.1$: Weak predictor
    * $0.1 \text{ to } 0.3$: Medium predictor
    * $0.3 \text{ to } 0.5$: Strong predictor
    * $> 0.5$: Suspicious predictor

What makes a variable a suspicious predictor? According to the literature, a variable with an IV greater than 0.5 is too good to be true and may indicate data leakage or other problems. In this case, you should investigate further to determine the cause of the high IV.





In [None]:
# create a function that calculates the weight of evidence of each category in a feature
#  add information value of the feature
def _woe_and_iv(keys, is_good, is_bad, total_good, total_bad):
    """Return ({group -> WoE}, information value) for one grouping key."""
    counts = (
        pd.DataFrame({'key': keys.to_numpy(), 'Good': is_good, 'Bad': is_bad})
        .groupby('key')[['Good', 'Bad']]
        .sum()
    )
    n_groups = len(counts)
    # additive 0.5 smoothing keeps both distributions strictly positive, so the
    # ratio and its logarithm are always defined even for groups with no Good/Bad rows
    dist_good = (counts['Good'] + 0.5) / (total_good + 0.5 * n_groups)
    dist_bad = (counts['Bad'] + 0.5) / (total_bad + 0.5 * n_groups)
    woe = np.log(dist_good / dist_bad)
    information_value = ((dist_good - dist_bad) * woe).sum()
    return woe.to_dict(), information_value


def calc_weight_of_evidence(df, target, num_bins=10, bin_numerical=False):
    """Replace feature values by their Weight of Evidence and compute Information Values.

    Rows with ``target == 0`` count as "Good" and rows with ``target == 1`` as "Bad";
    WoE is ``ln(Distribution of Good / Distribution of Bad)`` per category.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is NOT modified; a transformed copy is returned.
    target : str
        Name of the binary (0/1) target column.
    num_bins : int, default 10
        Number of equal-width bins used for numerical features. Only used when
        ``bin_numerical`` is True.
    bin_numerical : bool, default False
        When False only object-typed (categorical) columns are transformed and
        numerical columns are returned unchanged. When True, numerical columns
        are first cut into ``num_bins`` bins and then WoE-encoded as well.

    Returns
    -------
    (pandas.DataFrame, dict)
        The transformed copy of ``df`` and a ``{feature: information value}`` dict
        containing one entry per transformed feature.
    """
    # work on a copy so the caller's frame keeps its original values and columns
    df = df.copy()
    is_good = np.where(df[target] == 0, 1, 0)
    is_bad = np.where(df[target] == 1, 1, 0)
    total_good = is_good.sum()
    total_bad = is_bad.sum()
    iv = {}
    for feature in df.columns:
        if feature == target:
            continue
        # exactly one strategy per feature: a categorical column must not be
        # binned again after it has been mapped to (float) WoE values
        if df[feature].dtype == 'object':
            keys = df[feature]
        elif bin_numerical:
            keys = pd.cut(df[feature], bins=num_bins, labels=False)
        else:
            continue
        woe_dict, information_value = _woe_and_iv(keys, is_good, is_bad, total_good, total_bad)
        df[feature] = keys.map(woe_dict)
        iv[feature] = information_value
    return df, iv

In [None]:
# calculate the weight of evidence and IV for the churn df
churn_df_woe, churn_iv = calc_weight_of_evidence(churn_df_clean, 'Churned')

In [None]:
churn_df_woe.info()

In [None]:
# show each feature next to its information value
for feature, value in churn_iv.items():
    print(feature, value)

In [None]:
# flag features whose IV is <= 0.02 (useless) or >= 0.5 (suspicious)
bad_features = [
    feature
    for feature, value in churn_iv.items()
    if value <= 0.02 or value >= 0.5
]

for feature in bad_features:
    print(f'Dropping {feature} because its IV is {churn_iv[feature]}')

Look at bad features further, let's plot hist of each feature

In [None]:
# one histogram per flagged feature, drawn through the explicit Axes interface
for feature in bad_features:
    ax = plt.figure(figsize=(8, 6)).add_subplot()
    ax.hist(churn_df_woe[feature])
    ax.set_title(feature)
    plt.show()

In [None]:
# reassign instead of dropping in place, and tolerate columns that are already
# gone, so this cell can be re-run without raising a KeyError
churn_df_woe = churn_df_woe.drop(columns=bad_features, errors='ignore')

In [None]:
churn_df_woe.info()


# 4. Principal Component Analysis (PCA)

PCA is a dimensionality reduction technique that can be used to reduce the number of features in a dataset while retaining most of the information.
It does this by creating new features that are combinations of the original features, and then dropping the original features.
The new features are known as principal components, and they are orthogonal (i.e., at right angles) to each other.
The first principal component captures the largest amount of variation in the data, and each subsequent component captures the largest amount of remaining variation that is orthogonal to the previous components.

In [None]:
# use PCA  for churn df
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# features only: the target must not take part in the projection
pca_features = churn_df_woe.drop(columns='Churned')

# standardize the data so every feature contributes on the same scale
scaler = StandardScaler()
scaled_features = scaler.fit_transform(pca_features)

# create a PCA object
num_components = 3
# these components are the new features

pca = PCA(n_components=num_components)

# fit the PCA object
pca.fit(scaled_features)

# transform the data
components = pca.transform(scaled_features)

# create a dataframe with the PCA data
#  call columns PC_x depending on the component number
# The original row labels are kept: `churn_df_woe` no longer has a 0..n-1 index
# (rows were filtered out during cleaning), and a frame built without `index=`
# would get a fresh RangeIndex, so any later label-aligned assignment between the
# two frames would produce NaNs and pair targets with the wrong rows.
churn_df_pca = pd.DataFrame(
    components,
    columns=[f'PC{x}' for x in range(1, num_components + 1)],
    index=churn_df_woe.index,
)


In [None]:
# Both frames hold the same rows in the same order, but their indexes may differ
# (`churn_df_woe` keeps the labels left after filtering). Assigning a Series would
# align on labels and yield NaNs / mismatched rows, so copy the values by position.
churn_df_pca['Churned'] = churn_df_woe['Churned'].to_numpy()

In [None]:
churn_df_woe['Churned'].isnull().sum()

In [None]:
churn_df_pca['Churned'].isnull().sum()

In [None]:
# The missing values do not come from PCA itself. `churn_df_pca` was created with a
# fresh 0..n-1 index while `churn_df_woe` still carries the row labels left after
# filtering, so a label-aligned column assignment between the two leaves NaNs where
# a label no longer exists and pairs the remaining targets with the wrong rows.
# Both frames hold the same rows in the same order, so the target is copied by
# position for every row (patching only the NaN rows would leave the other rows
# mislabeled, and a chained `df[col].iloc[...] = ...` assignment is not reliable).
churn_df_pca['Churned'] = churn_df_woe['Churned'].to_numpy()
assert churn_df_pca['Churned'].notna().all()

In [None]:
churn_df_pca['Churned'].isnull().sum()

In [None]:
# just to make sure
churn_df_pca.info()

In [None]:
# plot the data px scatter plot
px.scatter(churn_df_pca, x='PC1', y='PC2', color='Churned')

# 5. Logistic Regression

Logistic regression is a statistical model that uses a logistic function to model a binary dependent variable.
In our case, the dependent variable is whether a customer has churned or not.
Logistic regression is a popular technique for modeling customer churn because it is easy to interpret and implement, and it performs well on simple datasets.

In [None]:
# use logistic regression for churn_df_pca

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# work on a private copy of the PCA frame
churn_df_lr = churn_df_pca.copy()

# hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    churn_df_lr.drop(columns='Churned'),
    churn_df_lr['Churned'],
    test_size=0.2,
    random_state=42,
)

# build and fit the classifier in one step (fit returns the estimator itself)
lr = LogisticRegression().fit(X_train, y_train)

# predicted class labels for the held-out rows
y_pred = lr.predict(X_test)





In [None]:
y_pred

## 5.2 Model Evaluation

**Accuracy** is the proportion of correct predictions out of all predictions made. It is a good measure when the classes are balanced, but it can be misleading when there is a large class imbalance.

**Precision** is the proportion of correct positive predictions out of all positive predictions made. It is a good measure when the cost of false positives is high.

**Recall** is the proportion of correct positive predictions out of all actual positive instances. It is a good measure when the cost of false negatives is high.

**F1 score** is the [harmonic mean](https://en.wikipedia.org/wiki/Harmonic_mean) of precision and recall. It is a good measure when you want to balance precision and recall, and when there is an uneven class distribution.

**Confusion matrix** is a table that shows the number of correct and incorrect predictions made by a model. It is a good way to evaluate the performance of a model.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def eval_model(y_test, y_pred):
    """Print accuracy, precision, recall and F1 score for a set of predictions.

    y_test holds the true labels and y_pred the predicted labels. Precision and
    F1 are computed with zero_division=1.0; recall uses the library default.
    Nothing is returned.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')

    precision = precision_score(y_test, y_pred, zero_division=1.0)
    print(f'Precision: {precision}')

    recall = recall_score(y_test, y_pred)
    print(f'Recall: {recall}')

    f1 = f1_score(y_test, y_pred, zero_division=1.0)
    print(f'F1 Score: {f1}')

In [None]:
eval_model(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)


# 6. Neural Networks
Neural Networks (NNs) are a class of machine learning algorithms that draw inspiration from the structure and function of the human brain. They are widely used for solving complex problems in classification and regression tasks.

NNs are composed of interconnected layers of artificial neurons. Each neuron receives inputs from the neurons in the previous layer and applies a set of weights to those inputs, along with a bias term. These weighted inputs are then transformed using an activation function to produce an output.

One of the key strengths of NNs lies in their ability to learn complex relationships within the data. This is achieved through a process called training, where the NN adjusts its weights and biases based on a training dataset. The objective is to minimize a predefined loss function that measures the disparity between the predicted outputs and the true labels in the training data.


During the training process, NNs use optimization algorithms like gradient descent to update the weights and biases iteratively. The backpropagation algorithm plays a crucial role in this process, as it efficiently calculates the gradients of the loss function with respect to the weights and biases, enabling the network to make adjustments that reduce the error.
Choosing an appropriate loss function is critical and depends on the problem at hand. For regression tasks, common loss functions include mean squared error (MSE) and mean absolute error (MAE), while for classification tasks, cross-entropy loss is often used.
For this lecture we will be using Tensorflow and Keras to build a neural network model. Tensorflow is an open-source machine learning library developed by Google, and Keras is a high-level API that runs on top of Tensorflow. Keras provides a simple and intuitive interface for building neural networks, while Tensorflow provides the backend for executing the computations required by the network.

In [None]:
# Build a tensorflow model

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# seed the Python, NumPy and TensorFlow random generators in one call so that
# weight initialisation and dropout are repeatable between runs
tf.keras.utils.set_random_seed(42)

# create a copy of the churn df
churn_df_nn = churn_df_pca.copy()

# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(churn_df_nn.drop('Churned', axis=1), churn_df_nn['Churned'], test_size=0.2, random_state=42)

# create a tensorflow model
# the input width is the number of feature columns actually fed to the network
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dropout(0.2),
    # a single sigmoid unit outputs a value in (0, 1) for the binary target
    Dense(1, activation='sigmoid')
])

# compile the model
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# fit the model
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)




In [None]:
# evaluate the model
model.evaluate(X_test, y_test, verbose=0)
#  the output is the loss and accuracy of the model
#  for binary crossentropy loss, the lower the better

In [None]:
preds = model.predict(X_test)

In [None]:
# flatten the model output and threshold it at 0.5 in one vectorised step
# (1.0 when the predicted value is above 0.5, else 0.0)
preds = (np.asarray(preds).ravel() > 0.5).astype(float)

In [None]:
y_test

In [None]:
# use the custom eval_model function to evaluate the model
eval_model(y_test, preds)

Those models sucked !! Let's try training them with woe df instead of pca df

First let's create our train and test sets

In [None]:
# create a copy of the churn df
churn_df_lr = churn_df_woe.copy()

# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(churn_df_lr.drop('Churned', axis=1), churn_df_lr['Churned'], test_size=0.2, random_state=42)

# fit the scaler on the train set only and transform it
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# transform the test set with the statistics learned from the train set;
# fitting a second scaler on the test data would put the two sets on different
# scales and let test-set statistics influence the evaluation
X_test = scaler.transform(X_test)


In [None]:
# build and fit the logistic regression in one step (fit returns the estimator itself)
lr = LogisticRegression().fit(X_train, y_train)

# predicted class labels for the held-out rows
y_pred = lr.predict(X_test)

# report the metrics with our custom function
eval_model(y_test, y_pred)

In [None]:
# now let's create a new TF model
# since the shape of our df has changed, we need to update the input shape

# seed the Python, NumPy and TensorFlow random generators in one call so that
# weight initialisation and dropout are repeatable between runs
tf.keras.utils.set_random_seed(42)

# create a tensorflow model
# the input width is the number of feature columns actually fed to the network
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dropout(0.2),
    # a single sigmoid unit outputs a value in (0, 1) for the binary target
    Dense(1, activation='sigmoid')
])

# compile the model
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# fit the model
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)

In [None]:
# evaluate the model
model.evaluate(X_test, y_test, verbose=0)

In [None]:
preds = model.predict(X_test)
# don't forget to convert them to 0s and 1s:
# flatten the model output and threshold it at 0.5 in one vectorised step
preds = (np.asarray(preds).ravel() > 0.5).astype(float)

In [None]:
# use the custom eval_model function to evaluate the model
eval_model(y_test, preds)