In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the raw Telco CSV, relative to the working directory.
file_path = "data/RawTelcoData.csv"

# Quick first look: read the file and print the column / dtype summary.
df = pd.read_csv(filepath_or_buffer=file_path)
df.info()

In [None]:
#DATA LOADER
def load_data(filepath) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame and print its shape.

    input: filepath - path of the CSV file (anything pd.read_csv accepts)
    output: pandas dataframe with the file contents

    Errors raised by pd.read_csv (e.g. a missing file) propagate to the
    caller; this function never returns None.
    """
    df = pd.read_csv(filepath)
    # The closing parenthesis was missing from this message.
    print(f"Shape of DF: {df.shape} (rows, cols)")
    return df

#DATA EXPLORATION
def explore_data(df) -> None:
    """
    Print a quick diagnostic report for the dataframe.

    checks for:
    1. info (column names, dtypes, non-null counts)
    2. missing values per column
    3. target balance (share of each value in "Churn")
    4. specific data type check ("TotalCharges" should be numeric)

    input: pandas dataframe (not modified)
    output: None, everything is printed
    """
    # info, summary of the table
    # df.info() prints by itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    print("\nDATA INFO:")
    df.info()

    # missing values
    print("\nMISSING VALUES:")
    print(df.isnull().sum())

    # balance of target variable (churn)
    # important as we need to know if no. of 'yes' or 'no' is equal
    print("\nTARGET BALANCE (CHURN)")
    if "Churn" in df.columns:
        print(df["Churn"].value_counts(normalize=True))
    else:
        # Guarded the same way as TotalCharges below, instead of a KeyError.
        print("Churn column not found")

    # check specific data (total charges)
    # TotalCharges should be float; if it was read as text, report how
    # many rows cannot be parsed as numbers.
    print("\nSPECIFIC DATA TYPE CHECK")
    if "TotalCharges" in df.columns:
        total_charges = df["TotalCharges"]
        # "Not numeric" instead of `dtype == "object"` so that text columns
        # stored with a dedicated string dtype are detected as well.
        if not pd.api.types.is_numeric_dtype(total_charges):
            print("TotalCharges is an object")
            # convert to numeric, count how many fail to convert
            # (values that were already missing are counted too)
            non_numeric_count = (
                pd.to_numeric(total_charges, errors="coerce").isnull().sum()
            )
            print(f"Count of non-numeric rows: {non_numeric_count}")
        else:
            print("TotalCharges is numeric")

In [None]:
"""
From data exploration: no missing values were found.
Churn is imbalanced (73.462% No, 26.537% Yes), so class weights will have to be adjusted later on.
TotalCharges has 11 non-numeric rows.
"""

In [None]:
#DATA CLEANING
def _collapse_no_service(series, phrase):
    """Return series with values matching phrase (case/whitespace-insensitive) set to "No"."""
    matches = series.astype(str).str.strip().str.lower().eq(phrase)
    if not matches.any():
        return series
    return series.mask(matches, "No")


def clean_data(df):
    """
    cleans df and returns a new, fully numeric (float) dataframe:
    1. changes TotalCharges to numeric (unparseable values become 0)
    2. drops irrelevant cols (customerID)
    3. collapses "No internet service" / "No phone service" into "No"
    4. encodes Yes/No columns as 1/0
    5. one-hot encodes the remaining categorical columns (drop_first=True)

    input: pandas dataframe (left unmodified)
    output: new pandas dataframe where every column has dtype float

    Every column is optional: steps for columns that are absent are skipped.
    """
    # Work on a copy so the caller's dataframe is never modified.
    df = df.copy()

    # fix TotalCharges: coerce turns non-numeric text into NaN, then NaN -> 0
    if "TotalCharges" in df.columns:
        df["TotalCharges"] = pd.to_numeric(
            df["TotalCharges"], errors="coerce"
        ).fillna(0)

    # drop customerID (noise)
    df = df.drop(columns=["customerID"], errors="ignore")

    # simplify "No internet service" to "No"
    # Matching ignores case and surrounding whitespace, so both
    # "No Internet Service" and "No internet service" are handled.
    internet_cols = [
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
    ]
    for col in internet_cols:
        if col in df.columns:
            df[col] = _collapse_no_service(df[col], "no internet service")

    # simplify "No phone service" to "No" (same relaxed matching)
    if "MultipleLines" in df.columns:
        df["MultipleLines"] = _collapse_no_service(
            df["MultipleLines"], "no phone service"
        )

    # binary encoding (yes no -> 1 0)
    yes_no_map = {"Yes": 1, "No": 0}
    yes_no_cols = [
        "Partner",
        "Dependents",
        "PhoneService",
        "MultipleLines",
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
        "PaperlessBilling",
        "Churn",
    ]
    for col in yes_no_cols:
        if col not in df.columns:
            continue
        values = df[col]
        # Encode only when the column is purely Yes/No (missing allowed).
        # The explicit map + cast gives a numeric dtype directly, instead of
        # relying on replace() to downcast an object column; a column left
        # as object would be one-hot encoded below and renamed (e.g.
        # "Churn_1"). Columns with other values are left for get_dummies.
        if (values.isin(list(yes_no_map)) | values.isna()).all():
            df[col] = values.map(yes_no_map).astype(float)

    # one-hot encode the remaining categoricals
    df = pd.get_dummies(df, drop_first=True)

    # dummy columns may come back as bool/int; make everything float
    return df.astype(float)

In [None]:
if __name__ == "__main__":
    file_path = "data/RawTelcoData.csv"

    # 1. load
    # load_data either returns a DataFrame or lets pd.read_csv's exception
    # propagate, so no None check is needed before the next steps.
    df = load_data(file_path)

    # 2. explore (prints a diagnostic report)
    explore_data(df)

    # 3. clean
    df_clean = clean_data(df)

    # show the first rows and confirm the resulting column dtypes
    print(df_clean.head())
    print(df_clean.dtypes)