In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load dataset
df = pd.read_csv("../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [6]:
# Identify categorical and numerical features
cat_features = df.select_dtypes(include="object").columns
num_features = df.select_dtypes(include=["int64", "float64"]).columns

In [7]:
# Load project modules and ensure project root is in the Python path

import sys
from pathlib import Path


project_root = Path("C:/Users/rizos/Desktop/customer-churn-end2end")
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

    
from importlib import reload
import src.preprocessing
reload(src.preprocessing)    



# Load preprocessing utilities from src
from src.preprocessing import (
    handle_missing_values,
    encode_categorical_features,
    scale_numeric_features,
    convert_to_numeric
)


# Convert selected columns to numeric format
# Empty or invalid values are coerced to NaN
df = convert_to_numeric(df, columns=["TotalCharges"])

# Drop rows with missing values
# Missing TotalCharges rows are negligible (<0.2% of the dataset)
df = handle_missing_values(df)

# Convert categorical features to numeric using one-hot encoding
df_encoded = encode_categorical_features(df, cat_features)

# Scale numerical features for model compatibility
df_scaled = scale_numeric_features(df_encoded, num_features)

# Verify dataset after preprocessing
df_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Columns: 7062 entries, SeniorCitizen to Churn_Yes
dtypes: bool(7058), float64(4)
memory usage: 47.6 MB


In [8]:
# Test preprocessing on a small sample
df_sample = df.sample(100, random_state=42)

# Take a look at small sample before preprocessing
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 2481 to 3819
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        100 non-null    object 
 1   gender            100 non-null    object 
 2   SeniorCitizen     100 non-null    int64  
 3   Partner           100 non-null    object 
 4   Dependents        100 non-null    object 
 5   tenure            100 non-null    int64  
 6   PhoneService      100 non-null    object 
 7   MultipleLines     100 non-null    object 
 8   InternetService   100 non-null    object 
 9   OnlineSecurity    100 non-null    object 
 10  OnlineBackup      100 non-null    object 
 11  DeviceProtection  100 non-null    object 
 12  TechSupport       100 non-null    object 
 13  StreamingTV       100 non-null    object 
 14  StreamingMovies   100 non-null    object 
 15  Contract          100 non-null    object 
 16  PaperlessBilling  100 non-null    object 
 17

In [9]:
# Preprocessing of small sample data
df_sample = convert_to_numeric(df_sample, columns=["TotalCharges"])
df_sample = handle_missing_values(df_sample)
df_sample = encode_categorical_features(df_sample, cat_features)
df_sample = scale_numeric_features(df_sample, num_features)

# Verify sample dataset after preprocessing
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 2481 to 3819
Columns: 130 entries, SeniorCitizen to Churn_Yes
dtypes: bool(126), float64(4)
memory usage: 16.2 KB


In [10]:
# After preprocessing, the dataset is fully numeric and model-ready.
# Categorical features were expanded via one-hot encoding, increasing dimensionality.

In [11]:
# Save cleaned and preprocessed dataset for modeling
df_scaled.to_csv("../data/processed/clean_customer_churn.csv", index=False)