In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.graph_objects as go
from plotly import express as px

In [3]:
churn_df = pd.read_csv('data/telecom_customer_churn.csv')

In [5]:
churn_df.head(10)

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability
5,0013-MHZWF,Female,23,No,3,Midpines,95345,37.581496,-119.972762,0,...,Credit Card,69.4,571.45,0.0,0,150.93,722.38,Stayed,,
6,0013-SMEOE,Female,67,Yes,0,Lompoc,93437,34.757477,-120.550507,1,...,Bank Withdrawal,109.7,7904.25,0.0,0,707.16,8611.41,Stayed,,
7,0014-BMAQU,Male,52,Yes,0,Napa,94558,38.489789,-122.27011,8,...,Credit Card,84.65,5377.8,0.0,20,816.48,6214.28,Stayed,,
8,0015-UOCOJ,Female,68,No,0,Simi Valley,93063,34.296813,-118.685703,0,...,Bank Withdrawal,48.2,340.35,0.0,0,73.71,414.06,Stayed,,
9,0016-QLJIS,Female,43,Yes,1,Sheridan,95681,38.984756,-121.345074,3,...,Credit Card,90.45,5957.9,0.0,0,1849.9,7807.8,Stayed,,


In [8]:
print('Number of columns (or features): ', len(churn_df.columns))
print('Number of rows: ', len(churn_df))
for col in churn_df.columns:
    print(col)

Number of columns (or features):  38
Number of rows:  7043
Customer ID
Gender
Age
Married
Number of Dependents
City
Zip Code
Latitude
Longitude
Number of Referrals
Tenure in Months
Offer
Phone Service
Avg Monthly Long Distance Charges
Multiple Lines
Internet Service
Internet Type
Avg Monthly GB Download
Online Security
Online Backup
Device Protection Plan
Premium Tech Support
Streaming TV
Streaming Movies
Streaming Music
Unlimited Data
Contract
Paperless Billing
Payment Method
Monthly Charge
Total Charges
Total Refunds
Total Extra Data Charges
Total Long Distance Charges
Total Revenue
Customer Status
Churn Category
Churn Reason


In [8]:
churn_df['City'].nunique()

1106

In [13]:
# print the 5 cities with the most customers
churn_df['City'].value_counts().head(10)


Los Angeles      293
San Diego        285
San Jose         112
Sacramento       108
San Francisco    104
Fresno            61
Long Beach        60
Oakland           52
Escondido         51
Stockton          44
Name: City, dtype: int64

In [22]:
churn_df.value_counts()

Customer ID  Gender  Age  Married  Number of Dependents  City          Zip Code  Latitude   Longitude    Number of Referrals  Tenure in Months  Offer    Phone Service  Avg Monthly Long Distance Charges  Multiple Lines  Internet Service  Internet Type  Avg Monthly GB Download  Online Security  Online Backup  Device Protection Plan  Premium Tech Support  Streaming TV  Streaming Movies  Streaming Music  Unlimited Data  Contract        Paperless Billing  Payment Method   Monthly Charge  Total Charges  Total Refunds  Total Extra Data Charges  Total Long Distance Charges  Total Revenue  Customer Status  Churn Category   Churn Reason                   
0004-TLHLJ   Male    50   No       0                     Costa Mesa    92627     33.645672  -117.922613  0                    4                 Offer E  Yes            33.65                              No              Yes               Fiber Optic    30.0                     No               No             Yes                     No           

In [34]:
# map the values of the feature 'Customer Status' to numerical values
churn_df['Customer Status'] = churn_df['Customer Status'].map({'Stayed': 1, 'Churned': 0})

In [55]:
# create a function that calculates the weight of evidence of each category in a feature
def calc_weight_of_evidence(df, feature, target):
    df[f'Good'] = np.where(df[target] == 0, 1, 0)
    df[f'Bad'] = np.where(df[target] == 1, 1, 0)
    total_good = df['Good'].sum()
    total_bad = df['Bad'].sum()
    grouped = df.groupby(feature).agg({'Good': 'sum', 'Bad': 'sum'})
    grouped['DistributionGood'] = grouped['Good'] / total_good
    grouped['DistributionBad'] = grouped['Bad'] / total_bad
    grouped['WoE'] = np.log(grouped['DistributionGood'] / grouped['DistributionBad'])
    return grouped['WoE'].to_dict()

In [56]:
# Example usage
calc_weight_of_evidence(churn_df, 'City', 'Customer Status')


divide by zero encountered in log



{'Acampo': 2.025017559792148,
 'Acton': -inf,
 'Adelanto': -0.4598890899958524,
 'Adin': 0.9264052711240383,
 'Agoura Hills': 1.6195524516839837,
 'Aguanga': -0.17220701754407142,
 'Ahwahnee': 0.2332580905640929,
 'Alameda': -inf,
 'Alamo': -inf,
 'Albany': 0.9264052711240383,
 'Albion': -inf,
 'Alderpoint': -0.17220701754407142,
 'Alhambra': 0.6387231986722574,
 'Aliso Viejo': -inf,
 'Alleghany': -inf,
 'Alpaugh': 2.025017559792148,
 'Alpine': 1.3318703792322026,
 'Alta': -inf,
 'Altadena': -0.17220701754407142,
 'Alturas': -inf,
 'Alviso': 0.2332580905640929,
 'Amador City': 2.025017559792148,
 'Amboy': -0.17220701754407142,
 'Anaheim': -0.018056337716813243,
 'Anderson': 1.6195524516839837,
 'Angels Camp': 0.9264052711240383,
 'Angelus Oaks': 0.2332580905640929,
 'Angwin': -0.17220701754407142,
 'Annapolis': -0.17220701754407142,
 'Antelope': 0.9264052711240383,
 'Antioch': 0.9264052711240383,
 'Anza': 0.9264052711240383,
 'Apple Valley': -0.17220701754407142,
 'Applegate': -0.17220

In [50]:
# use px.density_mapbox to show the number of customers in each city, center the map on California
fig = px.density_mapbox(churn_df, lat='Latitude', lon='Longitude', radius=10, zoom=5.5, mapbox_style='stamen-terrain', height=1200, width=900)
fig.show()

In [37]:
np.where(churn_df['Customer Status'] == 1)

(array([   0,    1,    5, ..., 7038, 7041, 7042], dtype=int64),)

In [51]:
df = churn_df.copy()

In [54]:
df['Good'] = np.where(df['Customer Status'] == 0, 1, 0)
df['Bad'] = np.where(df['Customer Status'] == 1, 1, 0)

In [53]:
df['Good']

0       0
1       0
2       1
3       1
4       1
       ..
7038    0
7039    1
7040    0
7041    0
7042    0
Name: Good, Length: 7043, dtype: int32