In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

# Show up to 500 columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 500)
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [7]:
#Base Directories
# os.chdir must be *called*. Assigning a string to it (`os.chdir = "..."`)
# only replaces the function with that string: the working directory never
# changes, the relative './Telecommunications_Industry' paths used below are
# not found, and os.chdir itself is unusable for the rest of the session.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running on another machine.
BASE_DIR = r"C:\Users\wikto\OneDrive - SGH\Documents\SGH\Data_science_II"
os.chdir(BASE_DIR)

In [8]:
#read all the available files / tables and create a set of all the unique columns available
# The path is relative, so this only works when the current working directory
# is the folder that contains 'Telecommunications_Industry'.
list_of_files = os.listdir('./Telecommunications_Industry')
list_of_files

FileNotFoundError: [WinError 3] System nie może odnaleźć określonej ścieżki: './Telecommunications_Industry'

In [9]:
# Collect the union of column names across every Excel workbook in the data
# folder, printing each workbook's columns along the way.
set_of_columns_available = set()

for file in list_of_files:
    # Test the extension itself rather than a substring, and skip the "~$..."
    # lock files Excel creates while a workbook is open (they cannot be read).
    if file.endswith(".xlsx") and not file.startswith("~$"):
        df = pd.read_excel('./Telecommunications_Industry/' + file)
        cols_in_df = df.columns.tolist()

        set_of_columns_available.update(cols_in_df)
        print("Columns in file:", file, "are", cols_in_df)
        print()

print("Total number of attributes / columns available :", len(set_of_columns_available))
print(set_of_columns_available)

NameError: name 'list_of_files' is not defined

In [None]:
# Combine multiple files using Customer ID as primary key
# first read all the tables dataframes

data_dir = './Telecommunications_Industry/'
df = pd.read_excel(data_dir + "Telco_customer_churn.xlsx")

# There are two ways "Customer ID" is written in column names: one with and one without space
# Fix column name to "Customer ID" in "Telco_customer_churn.xlsx" file
df = df.rename(columns = {'CustomerID':'Customer ID'})

list_of_xlsxs = ['CustomerChurn.xlsx',
                'Telco_customer_churn_demographics.xlsx',
                'Telco_customer_churn_location.xlsx',
                'Telco_customer_churn_population.xlsx',
                'Telco_customer_churn_services.xlsx',
                'Telco_customer_churn_status.xlsx']

for file in list_of_xlsxs:
    temp = pd.read_excel(data_dir + file)

    # Tables without a "Customer ID" column are joined on "Zip Code" instead.
    key = "Customer ID" if "Customer ID" in temp.columns else "Zip Code"

    # Bring in only the join key plus the columns df does not have yet.
    # The previous approach merged everything with suffixes=('', '_remove')
    # and dropped the '*remove*' columns afterwards; as soon as the same
    # column appears in a second table that produces two identical
    # '<name>_remove' columns, which pandas >= 2.0 rejects with a MergeError.
    # Filtering up front keeps the first occurrence of every column (the same
    # final result) and cannot drop a genuine column whose name merely
    # contains the text 'remove'.
    new_columns = [c for c in temp.columns if c == key or c not in df.columns]
    df = pd.merge(df, temp[new_columns], on = key, how = "left")

print("Total Number of columns : ", len(df.columns))
print("List of columns :", df.columns.tolist())

In [None]:
# Preview the first rows of the merged table.
df.head()

In [None]:
# Column dtypes and non-null counts of the merged table.
df.info()

In [None]:
# (column name, number of missing values) for every column that has at least
# one missing value, in column order.
null_counts = df.isnull().sum()
columns_with_null_values = [
    (column_name, n_missing)
    for column_name, n_missing in null_counts.items()
    if n_missing > 0
]
columns_with_null_values

In [None]:
# Show the distinct values of each column that contains missing data.
for column_name, _ in columns_with_null_values:
    distinct_values = df[column_name].unique()
    print(column_name, '\n', distinct_values, '\n')

In [None]:
# Placeholder label used for the missing values of each categorical column.
fill_values = {
    "Churn Reason": "Not Churned",
    "Churn Category": "Not Applicable",
    "Internet Type": "Other",
    "Offer": "Offer F",
}

for column_name, placeholder in fill_values.items():
    df[column_name] = df[column_name].fillna(placeholder)

In [None]:
# Print the distinct values of every column.
for column_name, column in df.items():
    print(column_name, column.unique())

In [None]:
# Inspect the raw values of 'Total Charges' before converting the column.
df['Total Charges'].unique()

In [None]:
# Convert to numbers; errors='coerce' turns any value that cannot be parsed
# into NaN instead of raising.
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')

In [None]:
# Check the dtype and non-null count after the conversion.
df['Total Charges'].info()

In [None]:
# Replace the blank 'Total Charges' entries with 0.0.
# The column was already passed through pd.to_numeric(errors='coerce'), so
# blanks are NaN by now: comparing against " " never matches, and mixing the
# string '0.0' with the float column in np.where turned every value into a
# string while the NaNs stayed. Coerce again (a no-op on a numeric column, so
# the cell is safe to re-run) and fill the gaps with a numeric 0.0.
df["Total Charges"] = pd.to_numeric(df["Total Charges"], errors="coerce").fillna(0.0)

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Per-column overview: cardinality, a sample of distinct values and, for
# int64 / float64 columns, the value range.
for col in df.columns:
    column = df[col]
    is_plain_number = column.dtype == 'int64' or column.dtype == 'float64'

    print(col)
    print("Number of unique values:", column.nunique())
    print("Unique Values:", column.unique()[:10])

    if is_plain_number:
        print("max :", column.max())
        print("min :", column.min())

    print()

In [None]:
#for "Multiple Lines" column
# Fold "No phone service" into plain "No"; every other value is kept as is.
df["Multiple Lines"] = df["Multiple Lines"].replace({"No phone service": "No"})
df["Multiple Lines"].value_counts()

In [None]:
#columns in which we want to replace "No internet service" with "No"
cols_to_change = [
    "Online Security",
    "Online Backup",
    "Device Protection",
    "Tech Support",
    "Streaming TV",
    "Streaming Movies",
]
# Only the exact value "No internet service" is rewritten; everything else is untouched.
df[cols_to_change] = df[cols_to_change].replace("No internet service", "No")

In [None]:
# group tenure into bins
# NOTE(review): with pd.cut's default settings each bin excludes its left
# edge, so a tenure of exactly 0 months would end up as NaN - confirm the
# column contains no zeros.
tenure_bin_edges = [0, 12, 24, 48, 60, 72]
df["Tenure Bins"] = pd.cut(df['Tenure in Months'], tenure_bin_edges)
print(df.value_counts("Tenure Bins"))
df["Tenure Bins"].value_counts()

In [None]:
# Store 'Population' as integers.
# NOTE(review): astype(int) raises if the column contains NaN (e.g. a zip code
# with no match in the population table) - confirm there are none.
df['Population'] = df['Population'].astype(int)

## Removing unnecessary columns

In [None]:
# Names of the columns to remove; the cells below append to this list and a
# single df.drop call removes them all at the end of the section.
list_of_columns_to_drop = []

In [None]:
# Peek at the identifier column and count its distinct values.
for column_name in ["Customer ID"]:
    column = df[column_name]
    print(column_name, column.head())
    print("\n Number of Unique Values: ", column.nunique())

In [None]:
# Peek at 'Count' and count its distinct values.
for column_name in ['Count']:
    column = df[column_name]
    print(column_name, column.head())
    print("\n Number of Unique Values: ", column.nunique())

In [None]:
# Queue 'Count' for removal.
list_of_columns_to_drop.append('Count')

In [None]:
# Peek at the geography columns and count their distinct values.
for column_name in ['Country', 'State']:
    column = df[column_name]
    print(column_name, column.head())
    print("\n Number of Unique Values: ", column.nunique())

In [None]:
# Queue both geography columns for removal.
list_of_columns_to_drop.extend(['Country', 'State'])

In [None]:
# Peek at the two textual churn columns and count their distinct values.
for column_name in ['Churn', 'Churn Label']:
    column = df[column_name]
    print(column_name, column.head())
    print("\n Number of Unique Values: ", column.nunique())

In [None]:
# True only if 'Churn' and 'Churn Label' agree on every row.
df["Churn"].eq(df["Churn Label"]).all()

In [None]:
# Queue 'Churn Label' for removal (the previous cell compares it with 'Churn').
list_of_columns_to_drop.append('Churn Label')

In [None]:
# Peek at the location / row identifier columns and count their distinct values.
for column_name in ['Location ID', 'ID']:
    column = df[column_name]
    print(column_name, column.head())
    print("\n Number of Unique Values: ", column.nunique())

In [None]:
# Queue 'Location ID' for removal.
list_of_columns_to_drop.append('Location ID')

In [None]:
# According to the column definition given in the dataset, this column only
# identifies the row number in the population table and carries no
# information about our users.
list_of_columns_to_drop.append("ID")

In [None]:
# Peek at the service / status identifier columns and count their distinct values.
for column_name in ['Service ID', 'Status ID']:
    column = df[column_name]
    print(column_name, column.head())
    print("\n Number of Unique Values: ", column.nunique())

In [None]:
# Queue both identifier columns for removal.
list_of_columns_to_drop.extend(["Service ID", "Status ID"])

In [None]:
# Peek at 'Quarter' and count its distinct values.
for column_name in ['Quarter']:
    column = df[column_name]
    print(column_name, column.head())
    print("\n Number of Unique Values: ", column.nunique())

In [None]:
# Queue 'Quarter' for removal.
list_of_columns_to_drop.append("Quarter")

In [None]:
# Peek at 'LoyaltyID' and count its distinct values.
for column_name in ["LoyaltyID"]:
    column = df[column_name]
    print(column_name, column.head())
    print("\n Number of Unique Values: ", column.nunique())
    

In [None]:
# Queue 'LoyaltyID' for removal.
list_of_columns_to_drop.append("LoyaltyID")

In [None]:
# Queue further columns for removal. 'Tenure in Months' was already binned
# into 'Tenure Bins', and the textual 'Churn' goes because a later cell
# renames 'Churn Value' to 'Churn'.
list_of_columns_to_drop.extend([
    "Tenure",
    "Tenure Months",
    "Churn",
    "Internet Type",
    "Monthly Charge",
    "Tenure in Months",
])

In [None]:
# Queue the location columns for removal.
list_of_columns_to_drop.extend(["Lat Long", "Zip Code", "City"])

In [None]:
# Data leakage
# Columns the author treats as leaking the target; all are queued for removal.
leakage_columns = [
    "Churn Reason",
    "Churn Category",
    "Customer Status",
    "CLTV",
    "Churn Score",
    "Monthly Charges",
    "Total Charges",
]
list_of_columns_to_drop.extend(leakage_columns)

In [None]:
# Display the distinct queued column names. The result is only shown, not
# assigned, so list_of_columns_to_drop itself is unchanged.
list(set(list_of_columns_to_drop))

In [None]:
# Remove every queued column in one go. Re-running this cell raises KeyError,
# because the columns are gone after the first run.
df.drop(columns = list_of_columns_to_drop, inplace = True)

In [None]:
# Categorical copy of 'Satisfaction Score'; the histogram loops in the EDA
# section plot 'category' columns, so this copy is what gets charted.
df['Satisfaction Score Label'] = df['Satisfaction Score'].astype('category')

In [None]:
# Compare the dtypes of the categorical copy and the original score.
df[['Satisfaction Score Label', 'Satisfaction Score']].info()

In [None]:
# From here on the target column is called 'Churn'.
df = df.rename(columns = {'Churn Value' : 'Churn'})

In [None]:
# Save the reduced table next to the source workbooks.
# NOTE(review): the row index is written as an extra unnamed first column
# (to_csv's default); pass index=False if that column is not wanted.
df.to_csv('./Telecommunications_Industry/' + "Selected_columns_customer_churn.csv")

## EDA (Exploratory Data Analysis)

In [None]:
# Dtypes and non-null counts of the columns that remain after the clean-up.
df.info()

In [None]:
# Use the customer identifier as the row index so it is not treated as a feature.
df = df.set_index('Customer ID')

In [None]:
#let's take a look at our target label - "Churn" first
# Share of churned (1) versus retained (0) customers.
n_churned = (df["Churn"] == 1).sum()
n_retained = (df["Churn"] == 0).sum()
counts = [n_churned, n_retained]
plt.pie(counts, labels = ["Churn", "Not Churn"], autopct='%.0f%%')
plt.show()

In [None]:
# Overlaid ("layer") histograms of every object / category column, split by Churn.
for col in df.columns:
    col_dtype = df[col].dtype

    if col != "Churn" and col_dtype == 'object':
        plot_data = df
    elif col_dtype == 'category':
        # Category columns are plotted from a copy with the values cast to strings.
        plot_data = df.copy()
        plot_data[col] = df[col].astype(str)
    else:
        continue

    plt.figure()
    sns.histplot(plot_data, hue="Churn", x = col, shrink=.7, multiple="layer")
    plt.show()

In [None]:
# Same plots as the previous cell, but normalised ("fill") so each bar shows
# the churn proportion within the category.
for col in df.columns:
    col_dtype = df[col].dtype

    if col != "Churn" and col_dtype == 'object':
        plot_data = df
    elif col_dtype == 'category':
        # Category columns are plotted from a copy with the values cast to strings.
        plot_data = df.copy()
        plot_data[col] = df[col].astype(str)
    else:
        continue

    plt.figure()
    sns.histplot(plot_data, hue="Churn", x = col, shrink=.7, multiple="fill")
    plt.show()

In [None]:
# Correlation matrix of the selected numeric features (not of every column).
columns_for_corr = [
    "Population",
    "Avg Monthly Long Distance Charges",
    "Total Refunds",
    "Total Extra Data Charges",
    "Total Long Distance Charges",
    "Total Revenue",
]
df_corr = df[columns_for_corr].corr()

heatmap_ax = sns.heatmap(df_corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Matrix')
plt.show()

## Data Preparation 

In [None]:
# There is Satisfaction score already, so the categorical copy made for the
# plots is dropped, together with 'Total Long Distance Charges'.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
df.drop(["Satisfaction Score Label", 'Total Long Distance Charges'], axis = 1, inplace = True, errors='ignore')

In [None]:
# Binary-encode every text column that contains the value "Yes":
# "Yes" becomes 1 and anything else becomes 0.
columns_to_be_encoded = [
    col
    for col in df.columns
    if df[col].dtype == 'object' and "Yes" in df[col].unique()
]

df[columns_to_be_encoded] = (df[columns_to_be_encoded] == "Yes").astype(int)

In [None]:
# Binary-encode Gender: "Female" becomes 1 and every other value becomes 0.
df["Gender"] = (df["Gender"] == "Female").astype(int)

In [None]:
# One-hot encode the multi-valued categorical columns: each listed column is
# replaced by one indicator column per category.
df = pd.get_dummies(df, columns = ["Tenure Bins", "Offer", "Payment Method", "Contract", "Internet Service"])

In [None]:
# Check the dtypes of the columns after encoding.
df.info()

## Model

In [None]:
# Target vector and feature matrix (every column except the target).
y = df["Churn"]
X = df.drop(columns = ["Churn"])

In [None]:
# 80/20 shuffled split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    shuffle = True,
    stratify = y,
    random_state = 42,
)

In [None]:
#format the features names:

for frame in (X, X_train, X_test):
    frame.index.names = ['Customer_ID']

# Clean the column names character by character: spaces, '(' and ']' become
# underscores, while ')' and ',' are removed.
name_cleanup = str.maketrans({' ': '_', '(': '_', ')': '', ']': '_', ',': ''})
X.columns = [col.translate(name_cleanup) for col in X.columns.tolist()]

# The train / test frames share X's column order, so they take the cleaned names as is.
X_train.columns = X.columns
X_test.columns = X.columns

In [None]:
#Confirm the splitting is correct:
# The target is 0/1, so its sum is the number of churned customers.
train_churn = y_train.sum()
train_total = len(y_train)
print("Shape of Training Data : ", "\nfeatures:", X_train.shape, ", target:", y_train.shape)
print("Target Label Distribution in train set : ", "\nChurn:", train_churn, "Not Churn", train_total - train_churn)
print("Percentage of Churn:", train_churn / train_total * 100)
print()

test_churn = y_test.sum()
test_total = len(y_test)
print("Shape of Test Data : ", "\nfeatures:", X_test.shape, ", target:", y_test.shape)
print("Target Label Distribution in test set : ", "\nChurn:", test_churn, "Not Churn", test_total - test_churn)
print("Percentage of Churn:", test_churn / test_total * 100)

In [None]:
# Final feature names and dtypes that go into the models.
X.info()

In [None]:
# Candidate classifiers, keyed by the name used in the results table.
# Every estimator gets the same seed so runs are repeatable.
SEED = 42

list_of_models = {
    'logistic_regression' : LogisticRegression(max_iter = 10000, random_state = SEED),
    'decision_tree' : DecisionTreeClassifier(max_depth = 5, random_state = SEED),
    'Random_forest' : RandomForestClassifier(
        n_estimators = 150, max_depth = 4, random_state = SEED
    ),
    'GBDT' : GradientBoostingClassifier(
        n_estimators = 150, max_depth = 4, random_state = SEED
    ),
    "XGBoost" : xgb.XGBClassifier(
        n_estimators = 200, max_depth = 5, random_state = SEED
    ),
}

In [None]:
# Per-model scores, appended in the same order as model_names.
f1_train_scores = []
f1_test_scores = []
recall_test_scores = []

# Models to train; each name must be a key of list_of_models.
model_names = ['logistic_regression', 'decision_tree', 'Random_forest', 'GBDT', "XGBoost"]


def _macro_f1_report(y_true, y_pred):
    """Print the macro F1 score, plot the confusion matrix and return the score."""
    score = f1_score(y_true, y_pred, average='macro')
    print("\nMacro F1 Score:", score)

    print("\nConfusion Matrix:")
    matrix = metrics.confusion_matrix(y_true, y_pred)
    display = metrics.ConfusionMatrixDisplay(confusion_matrix = matrix, display_labels = [False, True])
    display.plot()
    plt.show()
    return score


for model in model_names:
    estimator = list_of_models[model]
    print("\nFor Model:", model)

    estimator.fit(X_train, y_train)

    print("\nFor Training Set:")
    f1_train = _macro_f1_report(y_train, estimator.predict(X_train))

    print("For Test Set:")
    y_test_pred = estimator.predict(X_test)
    f1_test = _macro_f1_report(y_test, y_test_pred)
    # Macro recall is recorded for the results table but not printed here.
    recall_test_score = recall_score(y_test, y_test_pred, average='macro')

    f1_train_scores.append(f1_train)
    f1_test_scores.append(f1_test)
    recall_test_scores.append(recall_test_score)

In [None]:
# One row per model: train / test macro F1 and test macro recall.
score_columns = {
    'model_names' : model_names,
    'f1_train_scores' : f1_train_scores,
    'f1_test_scores' : f1_test_scores,
    'recall_test_scores' : recall_test_scores,
}
res = pd.DataFrame(score_columns)
res

In [None]:
# Rank features by the absolute logistic-regression coefficient, rescaled so
# the largest equals 100, and plot the top 20.
# NOTE(review): no feature scaling is applied before fitting in this notebook,
# so coefficient magnitudes also reflect each feature's units.
coefficients = list_of_models['logistic_regression'].coef_[0]
feature_importance = np.abs(coefficients)
feature_importance = 100.0 * (feature_importance / feature_importance.max())

data = pd.DataFrame({"score": feature_importance}, index=X.columns)
data = data.sort_values(by = "score", ascending=False)
data.nlargest(20, columns="score").plot(kind='barh', figsize = (20,10))

## PyCaret

In [None]:
# !pip install pycaret[full]
# ! pip install scipy==1.11.4
# !pip install --pre pycaret

In [None]:
# Import only the PyCaret functions this notebook calls, instead of
# `import *`, which pulls the whole pycaret.classification namespace into the
# notebook and can silently shadow names defined earlier.
from pycaret.classification import compare_models, evaluate_model, setup

# Initialise the PyCaret experiment on the encoded table with 'Churn' as the
# target, 5 cross-validation folds and a fixed session id.
s = setup(df, target = 'Churn', session_id = 123, fold = 5)

In [None]:
# Let PyCaret compare its candidate models; the returned model is kept in `best`.
best = compare_models()

In [None]:
# Show the selected estimator and its parameters.
print(best)

In [None]:
# Open PyCaret's evaluation view for the selected model.
evaluate_model(best)

In [None]:
# https://www.kaggle.com/datasets/hanatuangud/bank-card-churn-rate/data?select=BankChurners.csv