In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to the local `src` modules imported below are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

#### Importing Modules from Local Directory

In [None]:
# Resolve the parent of the current working directory and treat it as the project root.
# NOTE(review): assumes the kernel's working directory sits one level below the
# project root (e.g. a notebooks/ folder) — confirm before running from elsewhere.
project_root = os.path.abspath('..')

# Adding the project root to my system path
# (the membership check keeps re-runs of this cell from appending duplicates)
if project_root not in sys.path:
    sys.path.append(project_root)

# Project-local imports; unless `src` is installed as a package, these rely on
# the sys.path entry added above, so they must stay after it.
from src.data_loader import telco_data_loader
from src.data_processor import telcoDataCleaner

#### Loading Raw Data from Kaggle

In [None]:
# Run the project loader and keep its result as the raw table that later cells
# inspect and clean.
# NOTE(review): presumably a pandas DataFrame (later cells call .shape, .columns
# and .info() on it); the loader's data source is not visible here.
raw_telco_data = telco_data_loader()

## Initial Data Exploration

In [None]:
# Structural overview of the raw table: dimensions, column labels, then the
# per-column dtype / non-null summary.
print(f"Dataframe shape: {raw_telco_data.shape}")
print(f"Dataframe columns: {raw_telco_data.columns}")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
raw_telco_data.info()

Analyzing missing values in the columns of the dataset.

#### Cleaning up the data for visualization

In [None]:
# Instantiate the project's cleaner with its default configuration.
cleaner = telcoDataCleaner()

# clean_data() returns a keyed result; only its 'clean_table' entry is used in
# this notebook. NOTE(review): other keys, and whether raw_telco_data is
# modified in place, are not visible from here — confirm in src.data_processor.
cleaned_data = cleaner.clean_data(raw_telco_data)['clean_table']

In [None]:
# Distinct values of the InternetService column in the raw table.
raw_telco_data['InternetService'].unique()

#### Plotting

In [None]:
# Histogram of the TotalCharges column; the title is set on the Axes object that
# seaborn returns rather than through the pyplot state machine.
total_charges_ax = sns.histplot(x='TotalCharges', data=cleaned_data)
total_charges_ax.set_title("Histogram of Total Charges")
plt.show()

In [None]:
#print(cleaned_data.describe())

In [None]:
# Histogram of the MonthlyCharges column; the title is set on the Axes object
# that seaborn returns rather than through the pyplot state machine.
monthly_charges_ax = sns.histplot(x='MonthlyCharges', data=cleaned_data)
monthly_charges_ax.set_title("Histogram of Monthly Charges")
plt.show()

Appears to be a trimodal distribution: one cluster of monthly charges around \$20, a second around \$50, and a third around \$80.

In [None]:
# Mean of Churn within each TechSupport group (one-column frame indexed by TechSupport).
cleaned_data.groupby('TechSupport')[['Churn']].mean()

In [None]:
# Mean of Churn within each SeniorCitizen group (one-column frame indexed by SeniorCitizen).
cleaned_data.groupby('SeniorCitizen')[['Churn']].mean()

In [None]:
# Mean of MonthlyCharges within each SeniorCitizen group.
cleaned_data.groupby('SeniorCitizen')[['MonthlyCharges']].mean()

In [None]:
# Stacked histogram of MonthlyCharges, with bars split by the SeniorCitizen value.
sns.histplot(data = cleaned_data,
             x = 'MonthlyCharges',
             hue = 'SeniorCitizen',
             multiple = 'stack')
# The title describes what is drawn: a stacked histogram of MonthlyCharges
# (not a strip plot, and not TotalCharges).
plt.title('Stacked Histogram of Monthly Charges by Senior Citizen')
plt.show()

In [None]:
# Mean of tenure within each Churn group.
cleaned_data.groupby('Churn')[['tenure']].mean()

In [None]:
# Mean of Churn within each StreamingMovies group.
cleaned_data.groupby('StreamingMovies')[['Churn']].mean()

In [None]:
# Scatter of tenure against TotalCharges, colored by the Churn value
# (1 -> red, 0 -> light grey). Labels and title are applied through the
# FacetGrid that relplot returns instead of the pyplot state machine.
churn_colors = {1 : 'red', 0 : 'lightgrey'}
tenure_total_grid = sns.relplot(
    data=cleaned_data,
    x='tenure',
    y='TotalCharges',
    hue='Churn',
    palette=churn_colors,
    alpha=0.5,
)
tenure_total_grid.set_axis_labels("Tenure", "Total Charges")
tenure_total_grid.ax.set_title("Tenure vs. Total Charges Colored by Churn")
plt.show()

In [None]:
# Scatter of tenure against MonthlyCharges, colored by the Churn value
# (1 -> red, 0 -> light grey). Labels and title are applied through the
# FacetGrid that relplot returns instead of the pyplot state machine.
churn_colors = {1 : 'red', 0 : 'lightgrey'}
tenure_monthly_grid = sns.relplot(
    data=cleaned_data,
    x='tenure',
    y='MonthlyCharges',
    hue='Churn',
    palette=churn_colors,
    alpha=0.5,
)
tenure_monthly_grid.set_axis_labels("Tenure", "Monthly Charges")
tenure_monthly_grid.ax.set_title("Tenure vs. Monthly Charges Colored by Churn")
plt.show()

In [None]:
# Strip plot of tenure for each InternetService category, colored by Churn.
# NOTE(review): the palette keys assume Churn is encoded as 0/1 in the cleaned
# table — confirm against the cleaner.
sns.stripplot(data = cleaned_data,
              x = 'tenure',
              y = 'InternetService',
              hue = 'Churn',
              palette = {1 : 'red', 0 : 'lightgrey'},
              alpha = 0.5)
plt.title("Tenure Stratified by Internet Type Colored by Churn")
# The y-axis carries the InternetService categories, so it is labelled as such.
plt.ylabel("Internet Service")
plt.xlabel("Tenure")
plt.show()

In [None]:
# Tenure vs. MonthlyCharges scatter: color encodes Churn, marker style encodes
# InternetService. Labels and title are applied through the returned FacetGrid.
# NOTE(review): the title is identical to the earlier tenure/monthly-charges
# figure's; consider mentioning the InternetService marker style.
churn_colors = {1 : 'red', 0 : 'lightgrey'}
tenure_service_grid = sns.relplot(
    data=cleaned_data,
    x='tenure',
    y='MonthlyCharges',
    hue='Churn',
    style='InternetService',
    palette=churn_colors,
    alpha=0.5,
)
tenure_service_grid.set_axis_labels("Tenure", "Monthly Charges")
tenure_service_grid.ax.set_title("Tenure vs. Monthly Charges Colored by Churn")
plt.show()

Observations: 
 - Customers being charged the most are the most likely to churn
 - New customers are the most likely to churn
 - Fiber optic customers are most likely to churn. However, fiber optic customers tend to have the highest monthly charges