# Telco Customer Churn Dataset - Exploratory Data Analysis

Explore the Telco Customer Churn dataset to uncover patterns in the data and correlations between the features and the label.

# Setup Notebook

## Import

In [6]:
# Import Standard Libraries
import pandas as pd

import os
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

from colorama import Style, Fore

# Import Package Modules
from src.general_utils.general_utils import read_configuration

## Setup Plots Characteristics

In [4]:
# Define Seaborn theme parameters (matplotlib rcParams passed to Seaborn through `rc`)
theme_parameters = {
    # Hide the top and right axes borders
    'axes.spines.right': False,
    'axes.spines.top': False,
    'grid.alpha': 0.3,
    'figure.figsize': (16, 6),
    # Ordered fallback list: 'Andale Mono' is not installed on every platform,
    # so fall back to other monospace fonts instead of triggering findfont warnings
    'font.family': ['Andale Mono', 'DejaVu Sans Mono', 'monospace'],
    'axes.titlesize': 24,
    # Use the same background color for the figure and the axes
    'figure.facecolor': '#E5E8E8',
    'axes.facecolor': '#E5E8E8',
}

# Set the theme
sns.set_theme(style='whitegrid',
              palette=sns.color_palette('deep'),
              rc=theme_parameters)

In [5]:
# Define Colors: each one is the bright style prefix followed by a foreground color
black, magenta, red, blue = (
    f'{Style.BRIGHT}{foreground}'
    for foreground in (Fore.BLACK, Fore.MAGENTA, Fore.RED, Fore.BLUE)
)

# Sequence that resets all styles
reset_colors = Style.RESET_ALL

## Define Configuration

In [11]:
# The project root is the parent of the current working directory
root_path = Path.cwd().parents[0]

# Load the configuration stored under <root>/configuration/config.yaml
config = read_configuration(root_path.joinpath('configuration', 'config.yaml'))

# Keep the dataset-related section of the configuration
dataset_config = config['dataset']

[05/25/2024 23:46:55 - general_utils] INFO - read_configuration - Start
[05/25/2024 23:46:55 - general_utils] INFO - read_configuration - Reading /Users/s.porreca/Projects/customer_churn_predictor/configuration/config.yaml
[05/25/2024 23:46:55 - general_utils] INFO - read_configuration - Configuration file /Users/s.porreca/Projects/customer_churn_predictor/configuration/config.yaml read successfully
[05/25/2024 23:46:55 - general_utils] INFO - read_configuration - End


# Read Data

In [14]:
# Read data: the configured dataset path is joined onto the project root
data = pd.read_csv(root_path / dataset_config['path'])

# Exploratory Data Analysis

## General

### Data Types

In [16]:
# Show the index range, the column dtypes and the non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


- Most of the features appear to be categorical &rarr; Encoding techniques (e.g. one-hot encoding) might increase the number of columns and therefore the size of the data

### Sample Data

In [17]:
# Show 5 random rows; the fixed seed keeps the sample identical across re-runs
data.sample(5, random_state=42)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
2382,6114-TCFID,Female,0,No,No,29,No,No phone service,DSL,Yes,...,No,Yes,No,No,One year,No,Credit card (automatic),39.5,1082.75,No
5601,2877-VDUER,Female,0,Yes,Yes,35,No,No phone service,DSL,No,...,No,Yes,Yes,No,One year,No,Mailed check,40.9,1383.6,No
2877,0310-MVLET,Female,0,Yes,Yes,61,Yes,Yes,Fiber optic,No,...,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.15,6010.05,Yes
2659,3166-PNEOF,Female,0,No,No,61,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,Two year,No,Credit card (automatic),86.45,5175.3,No
1243,5552-ZNFSJ,Male,0,No,No,1,Yes,No,DSL,No,...,No,No,No,No,Month-to-month,No,Electronic check,45.3,45.3,Yes


### Shape

In [18]:
# Print shapes information (styles are reset at the end so the colors
# do not carry over into any text printed afterwards)
print(f'{blue}Data Shapes:'
      f'{blue}\n- All Data -> {red}{data.shape}{reset_colors}\n')

[1m[34mData Shapes:[1m[34m
- All Data -> [1m[31m(7043, 21)



### Null Values

In [19]:
# Print the number of columns containing at least one null value (styles are
# reset at the end so the colors do not carry over into any text printed afterwards)
print(f'{blue}Data Columns with Null Values:'
      f'{blue}\n- All Data -> {red}{data.isnull().any().sum()}{reset_colors}\n')

[1m[34mData Columns with Null Values:[1m[34m
- All Data -> [1m[31m0



- No column contains null values &rarr; An imputation step is not necessary