### Import all basic libraries

In [26]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


### Import Dataset

In [27]:
# Location of the churn CSV, relative to the notebook's working directory.
path = "dataset.csv"
df = pd.read_csv(path)

# Report whether the parsed frame holds any data (DataFrame.empty is True
# when the frame has no rows or no columns).
print("Dataset not loaded" if df.empty else "Dataset loaded successfully")

print("Path to dataset files:", path)

# Preview the first rows as the cell's rich output.
df.head()

Dataset loaded successfully
Path to dataset files: dataset.csv


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Clean Data

In [28]:
# Clean the raw churn frame so that every remaining column is numeric.
# NOTE: this cell rebinds `df` to the cleaned frame. Re-running it without
# first re-running the load cell fails, because customerID is already gone.

# Count NaN cells. Blank strings (e.g. TotalCharges == " ") are NOT NaN, so
# they are not counted here; they are handled explicitly below.
print("NaN cells before cleaning:", int(df.isnull().sum().sum()))

# Drop the identifier column (explicit rebinding instead of inplace=True).
df = df.drop(columns=["customerID"])

# Convert TotalCharges to float: single-space entries become 0, and any other
# non-numeric value becomes NaN through errors="coerce".
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"].replace(" ", "0"), errors="coerce")

# Encode two-level categorical columns as 1/0.
# Series.map turns any value missing from the mapping into NaN, so an
# unexpected category shows up as a lower non-null count in df.info() below.
yes_no = {"Yes": 1, "No": 0}

df["gender"] = df["gender"].map({"Male": 1, "Female": 0})

for col in ["Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn"]:
    df[col] = df[col].map(yes_no)

# "No phone service" is treated the same as "No".
df["MultipleLines"] = df["MultipleLines"].map({**yes_no, "No phone service": 0})

# Internet add-ons: "No internet service" is treated the same as "No".
internet_addon_map = {**yes_no, "No internet service": 0}
for col in [
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]:
    df[col] = df[col].map(internet_addon_map)

# One-hot encode the multi-level categorical columns. drop_first=True removes
# the FIRST level of each column to avoid redundancy; that level becomes the
# implicit baseline (all of the column's dummies are 0). Here the baselines are
# "DSL", "Month-to-month" and "Bank transfer (automatic)".
# dtype=int keeps the dummies as 0/1 integers; without it pandas >= 2.0
# produces bool columns.
df = pd.get_dummies(
    df,
    columns=["InternetService", "Contract", "PaymentMethod"],
    drop_first=True,
    dtype=int,
)

# Final check: every column should be numeric with 7043 non-null values.
# df.info() prints its report itself and returns None, so it is not wrapped
# in print().
df.info()
df.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   gender                                 7043 non-null   int64  
 1   SeniorCitizen                          7043 non-null   int64  
 2   Partner                                7043 non-null   int64  
 3   Dependents                             7043 non-null   int64  
 4   tenure                                 7043 non-null   int64  
 5   PhoneService                           7043 non-null   int64  
 6   MultipleLines                          7043 non-null   int64  
 7   OnlineSecurity                         7043 non-null   int64  
 8   OnlineBackup                           7043 non-null   int64  
 9   DeviceProtection                       7043 non-null   int64  
 10  TechSupport                            7043 non-null   int64  
 11  Stre

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,MonthlyCharges,TotalCharges,Churn,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,0,0,1,0,...,29.85,29.85,0,0,0,0,0,0,1,0
1,1,0,0,0,34,1,0,1,0,1,...,56.95,1889.5,0,0,0,1,0,0,0,1
2,1,0,0,0,2,1,0,1,1,0,...,53.85,108.15,1,0,0,0,0,0,0,1
3,1,0,0,0,45,0,0,1,0,1,...,42.3,1840.75,0,0,0,1,0,0,0,0
4,0,0,0,0,2,1,0,0,0,0,...,70.7,151.65,1,1,0,0,0,0,1,0


0         29.85
1        1889.5
2        108.15
3       1840.75
4        151.65
         ...   
7038     1990.5
7039     7362.9
7040     346.45
7041      306.6
7042     6844.5
Name: TotalCharges, Length: 7043, dtype: object
