Step 1: Load and Clean Dataset

In [1]:
import os

import numpy as np
import pandas as pd

# Load raw data.
# The file is located relative to the project root rather than through a
# machine-specific absolute path, so the notebook runs on any checkout.
# Assumes the kernel's working directory is a direct child of the project root
# (the same assumption the save step makes when it builds project_root).
RAW_DATA_PATH = os.path.join(
    os.path.abspath(os.path.join(os.getcwd(), '..')),
    'data', 'raw', 'WA_Fn-UseC_-Telco-Customer-Churn.csv',
)
df = pd.read_csv(RAW_DATA_PATH)

# Convert TotalCharges to numeric (unparseable values such as blank strings → NaN)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Fill missing TotalCharges with MonthlyCharges * tenure
df['TotalCharges'] = df['TotalCharges'].fillna(df['MonthlyCharges'] * df['tenure'])

# Drop rows where TotalCharges is still missing, i.e. the fallback product
# above was itself NaN
df = df.dropna(subset=['TotalCharges'])

Step 2: Feature Engineering

In [2]:
# Estimated lifetime value: monthly charge multiplied by tenure
df['estimated_ltv'] = df['MonthlyCharges'] * df['tenure']

# Binary flags for internet and phone
df['has_internet'] = (df['InternetService'] != 'No').astype(int)
df['has_phone'] = (df['PhoneService'] == 'Yes').astype(int)

# Tenure groups.
# pd.cut builds right-closed bins and, by default, excludes the lowest edge, so
# the first bin would be (0, 12] and rows with tenure == 0 would get NaN.
# include_lowest=True closes the first bin on the left so tenure 0 is labelled
# '0-12'. Values above 72 still fall outside the bins and become NaN.
df['tenure_group'] = pd.cut(df['tenure'],
                            bins=[0,12,24,48,60,72],
                            labels=['0-12','12-24','24-48','48-60','60-72'],
                            include_lowest=True)


Step 3: Separate Features and Target

In [3]:
# Binary target: 1 where Churn is 'Yes', 0 otherwise
y = df['Churn'].eq('Yes').astype(int)

# Feature matrix: every column except the identifier and the target
X = df.drop(columns=['customerID', 'Churn'])

# Columns treated as continuous; every other column is handled as categorical
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'estimated_ltv']
categorical_features = [col for col in X.columns if col not in numeric_features]

# Cast the continuous columns to float
X = X.astype({col: float for col in numeric_features})

Step 4: Preprocessing Pipelines

In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric columns: a single standardisation step
num_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns: dense one-hot encoding; handle_unknown='ignore' keeps
# transform from raising on categories that were not present during fit
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_pipeline = Pipeline(steps=[('onehot', onehot_encoder)])

# Route each column list to its pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numeric_features),
        ('cat', cat_pipeline, categorical_features),
    ]
)

Step 5: Train-Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on the target so both splits keep the
# same churn proportion; the fixed seed makes the split repeatable
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

Step 6: Fit & Transform Preprocessor

In [6]:
# Fit on the training rows only, then transform those same rows
# (fit returns the fitted preprocessor, so the calls can be chained)
X_train = preprocessor.fit(X_train_raw).transform(X_train_raw)

# Apply the already-fitted preprocessor to the held-out rows
X_test = preprocessor.transform(X_test_raw)


Step 7: Save Preprocessor and Processed Data

In [19]:
import os
import joblib

# Output locations; the project root is taken to be the parent of the
# current working directory
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
models_path = os.path.join(project_root, 'models')
processed_path = os.path.join(project_root, 'data', 'processed')

# Make sure both output directories exist
for output_dir in (models_path, processed_path):
    os.makedirs(output_dir, exist_ok=True)

# Persist the fitted preprocessor
joblib.dump(preprocessor, os.path.join(models_path, 'preprocessor.joblib'))

# Wrap the transformed arrays and targets in DataFrames.
# NOTE(review): the feature frames get default integer column labels, so the
# saved CSVs carry no feature names; consider
# preprocessor.get_feature_names_out() if downstream code needs them.
X_train_df = pd.DataFrame(X_train)
X_test_df = pd.DataFrame(X_test)
y_train_df = pd.DataFrame(y_train, columns=["Churn"])
y_test_df = pd.DataFrame(y_test, columns=["Churn"])

# Write each frame into processed_path without the index column
for csv_name, frame in (
    ("X_train.csv", X_train_df),
    ("X_test.csv", X_test_df),
    ("y_train.csv", y_train_df),
    ("y_test.csv", y_test_df),
):
    frame.to_csv(os.path.join(processed_path, csv_name), index=False)

print("Preprocessing completed. Processed datasets and preprocessor saved.")

Preprocessing completed. Processed datasets and preprocessor saved.


In [20]:
# Save the full cleaned frame (including the engineered columns) next to the
# other processed outputs, without the index column
processed_csv_path = os.path.join(
    os.path.join(project_root, 'data', 'processed'),
    'cleaned_data.csv',
)
df.to_csv(path_or_buf=processed_csv_path, index=False)
