# **1. Import the libraries**

In [2]:
import pandas as pd  # Used for reading and manipulating data in tabular format
import numpy as np   # Provides support for numerical operations on arrays and matrices

# Data visualization libraries
import matplotlib.pyplot as plt  # For creating basic plots
import seaborn as sns         # For creating more advanced and visually appealing plots

# Libraries for building and evaluating models
from sklearn.model_selection import train_test_split  # Splits data into training and testing sets

# Decision Tree model
from sklearn.tree import DecisionTreeClassifier  # Decision tree algorithm

# Random Forest model
from sklearn.ensemble import RandomForestClassifier  # Random Forest algorithm (ensemble of decision trees)

# Libraries for model evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # Functions for evaluating model performance

# **2. Load and process data**



In [3]:
from google.colab import files
uploaded = files.upload()
filename = next(iter(uploaded))

Saving WA_Fn-UseC_-Telco-Customer-Churn.csv to WA_Fn-UseC_-Telco-Customer-Churn.csv


In [4]:
df = pd.read_csv(filename)
print(df.head())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [5]:
print(df.dtypes) #check the type of columns

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


In [6]:
df.columns.values  # Returns an array containing the column names of the DataFrame.


array(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges', 'Churn'], dtype=object)

In [7]:
df.dtypes  # Returns a Series with the data type of each column.

Unnamed: 0,0
customerID,object
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
MultipleLines,object
InternetService,object
OnlineSecurity,object


In [8]:
df = df.drop(['customerID'], axis = 1) # Delete "customerID" from DataFrame df.
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [9]:
df['SeniorCitizen'] = df['SeniorCitizen'].replace({0: 'No', 1: 'Yes'})


In [10]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [11]:
df['TotalCharges'] = pd.to_numeric(df.TotalCharges, errors='coerce')  # Converts the 'TotalCharges' column from string to numeric format.
df.isnull().sum()  # Counts the number of missing values (NaN) in each column of the DataFrame.

Unnamed: 0,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0
OnlineBackup,0


In [12]:
df[np.isnan(df['TotalCharges'])]  # Returns a DataFrame containing all rows where the 'TotalCharges' column has a NaN value.

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,Female,No,Yes,Yes,0,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,Male,No,No,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,Female,No,Yes,Yes,0,Yes,No,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,Male,No,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,Female,No,Yes,Yes,0,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,Male,No,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,Male,No,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,Female,No,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,Male,No,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,Female,No,Yes,Yes,0,Yes,Yes,DSL,No,Yes,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [13]:
df[df['tenure'] == 0].index  # Returns a Series containing the indices of rows where the 'tenure' column has a value of 0.

Index([488, 753, 936, 1082, 1340, 3331, 3826, 4380, 5218, 6670, 6754], dtype='int64')

In [14]:
df.drop(labels=df[df['tenure'] == 0].index, axis=0, inplace=True) # Drops the rows where the 'tenure' column is 0.
df[df['tenure'] == 0].index # Checks again to see if there are any rows with 'tenure' equal to 0.

Index([], dtype='int64')

In [15]:
df.fillna(df["TotalCharges"].mean()) # Fills all NaN values in the DataFrame with the mean of the 'TotalCharges' column.

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.850000,29.850000,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.950000,1889.500000,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.850000,108.150000,Yes
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.300000,1840.750000,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.700000,151.650000,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,No,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.800000,1990.500000,No
7039,Female,No,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.200000,7362.900000,No
7040,Female,No,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.600000,346.450000,No
7041,Male,Yes,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.400000,306.600000,Yes


In [16]:
df.isnull().sum() # Counts the number of missing values (NaN) in each column of the DataFrame.

Unnamed: 0,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0
OnlineBackup,0


In [17]:
df["InternetService"].describe(include=['object', 'bool']) # Provides a descriptive summary of the 'InternetService' column, which is likely categorical.

Unnamed: 0,InternetService
count,7032
unique,3
top,Fiber optic
freq,3096


In [None]:
import scipy.stats as stats
from scipy.stats import chi2_contingency, ttest_ind
import pandas as pd
import numpy as np

# Set the display format for floating-point numbers to 6 decimal places
pd.options.display.float_format = '{:.6f}'.format

# Numerical columns to be analyzed
numerical_cols = ['MonthlyCharges', 'TotalCharges']

# Create a dictionary to store results
results = {}


for col in numerical_cols:
    # Calculate basic statistical values
    mean_val = df[col].mean()
    median_val = df[col].median()
    mode_val = df[col].mode().iloc[0] if not df[col].mode().empty else np.nan
    range_val = df[col].max() - df[col].min()
    variance_val = df[col].var()
    std_dev_val = df[col].std()

    # Chi-square test (data divided into bins)
    bins = np.linspace(df[col].min(), df[col].max(), 10)
    binned_data = pd.cut(df[col], bins=bins).value_counts()
    observed = binned_data.values

    # Calculate expected frequencies assuming a uniform distribution
    expected = np.full_like(observed, observed.sum() / len(observed), dtype=float)
    expected *= observed.sum() / expected.sum()

    # Chi-square funct
    def perform_chi_square_test(df, col1, col2):
        contingency_table = pd.crosstab(df[col1], df[col2])
        chi2, p, dof, expected = chi2_contingency(contingency_table)
        significant = p < 0.05 
        return chi2, p, significant

    chi2, p, significant = perform_chi_square_test(df, 'TotalCharges', 'MonthlyCharges')

    # T-test
    t_stat, p_value = ttest_ind(df['MonthlyCharges'], df['TotalCharges'], equal_var=False)


    # Save the results in dictionary
    results[col] = {
        'Mean': mean_val,
        'Median': median_val,
        'Mode': mode_val,
        'Range': range_val,
        'Variance': variance_val,
        'Standard Deviation': std_dev_val,
    }

# Convert the dictionary to a DataFrame
results_df = pd.DataFrame(results)

# Print
print(results_df)

alpha = 0.05
print(f"T-Statistic: {t_stat}")
print(f"P-Value: {p_value}")

# Conclusion 
if p_value < 0.05:
    print("There is a statistically significant difference between MonthlyCharges and TotalCharges.")
else:
    print("There is no statistically significant difference between MonthlyCharges and TotalCharges.")

# Print Chi-square
print("\nChi-square Test Results:")
print("Chi-square:", chi2)
print("P-value:", p)
print("Significant:", significant)

                    MonthlyCharges   TotalCharges
Mean                     64.798208    2283.300441
Median                   70.350000    1397.475000
Mode                     20.050000      20.200000
Range                   100.500000    8666.000000
Variance                905.165825 5138252.407054
Standard Deviation       30.085974    2266.771362
T-Statistic: -82.06412329277843
P-Value: 0.0
There is a statistically significant difference between MonthlyCharges and TotalCharges.

Chi-square Test Results:
Chi-square: 10776240.735234559
P-value: 0.0
Significant: True
