# **Final Project - Computing for Data Science**

## Group Members:
1. **Tarang Kadyan**  
   <tarang.kadyan@bse.eu>

2. **Deepak Malik**  
   <deepak.malik@bse.eu>

3. **Enzo Infantes**  
   <enzo.infantes@bse.eu>

# **Setup: Libraries**

In [12]:
import sys
import os

# Directory of the local `telco-churn-library` checkout that provides the
# `telco_churn` package imported below.  Set the TELCO_CHURN_LIB_PATH environment
# variable to point at your own checkout; the default is the original author's path.
TELCO_CHURN_LIB_PATH = os.environ.get(
    'TELCO_CHURN_LIB_PATH',
    'c:/Users/Enzo/Documents/BSE/T1/COMPUTING_DS/Final_Project/Final-Project/telco-churn-library',
)

# Guard against duplicates so re-running this cell does not keep growing sys.path.
if TELCO_CHURN_LIB_PATH not in sys.path:
    sys.path.append(TELCO_CHURN_LIB_PATH)

import pandas as pd
import kagglehub
import joblib
import json

from telco_churn.data_preprocessing.data_loader import CSVDataLoader, DataPreparer
from telco_churn.data_preprocessing.preprocessor import HandleMissingValues,NormalizeData, EncodeCategoricalData, HandleOutliers,PreprocessingPipeline
from telco_churn.feature_engineering.feature_engineering import StatisticalFeatures, CategoricalEncoding, InteractionFeatures, TemporalFeatures, DerivedFeatures, FeaturePipeline
from telco_churn.modelling.model import LogisticRegressionModel, HyperparameterTuner, ModelingPipeline, CrossValidator

# Remove the column limit so wide DataFrames are displayed without truncation.
pd.options.display.max_columns = None

## **Step 1: Load Data**

In [68]:
# Download the dataset with kagglehub and build the path to the CSV inside it
path = kagglehub.dataset_download("blastchar/telco-customer-churn")
file_path = os.path.join(path, 'WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Feature subset used for modelling (kept small to make the process faster)
cols = ['gender', 'SeniorCitizen', 'Partner', 'tenure', 'PhoneService',
        'PaperlessBilling', 'MonthlyCharges', 'TotalCharges']
target_col = 'Churn'

# Require every column that is selected below (features + target), so a missing
# column is reported by the loader's validation instead of a later KeyError.
required_columns = cols + [target_col]
csv_loader = CSVDataLoader(required_columns=required_columns)

preparer = DataPreparer(loaders=[csv_loader])

data = preparer.load_and_validate(file_path, loader_type="csv", column_to_convert="TotalCharges")

# Independent copy: later transformations of X cannot write back into `data`.
X = data[cols].copy()

# Binary target: 'Yes' -> 1, 'No' -> 0.  Series.map turns any other label into NaN,
# so fail loudly here rather than passing NaN targets to the model.
y = data[target_col].map({'Yes': 1, 'No': 0})
if y.isna().any():
    unexpected = data.loc[y.isna(), target_col].unique().tolist()
    raise ValueError(f"Unexpected values in '{target_col}' column: {unexpected}")

Data loaded successfully from C:\Users\Enzo\.cache\kagglehub\datasets\blastchar\telco-customer-churn\versions\1\WA_Fn-UseC_-Telco-Customer-Churn.csv.
Column 'TotalCharges' successfully converted to numeric and blanks replaced with NaN.
Data validation successful.


## **Step 2: Preprocessing**

In [70]:
# Preprocessing stages (presumably applied in list order — confirm in PreprocessingPipeline).
# NOTE(review): outlier handling is listed after min-max scaling, and the stages are
# applied to the whole dataset before the modelling step splits it — confirm both are intended.
preprocessing_pipeline = PreprocessingPipeline(preprocessors=[
    HandleMissingValues(strategy='mean'),  # Impute missing values with the mean
    NormalizeData(method='minmax'),        # Normalize data using MinMax scaling
    EncodeCategoricalData(),               # Encode categorical variables
    HandleOutliers(method='iqr'),          # Handle outliers using IQR method
])

# Select the raw feature columns from `data` rather than reading `X`: Step 3 rebinds
# `X` to the engineered features, so reading `X` here would make a re-run of this
# cell preprocess already-engineered data.
processed_data = preprocessing_pipeline.apply(data[cols])

## **Step 3: Feature Engineering**

In [71]:
# Feature-engineering stages handed to the FeaturePipeline.
feature_transformers = [
    StatisticalFeatures(group_by_column=None),
    CategoricalEncoding(),
    InteractionFeatures(),
    TemporalFeatures(),
    DerivedFeatures(),
]

# Dedicated name so this object is not confused with the preprocessing pipeline
# built in Step 2.
feature_pipeline = FeaturePipeline(transformers=feature_transformers)

# `X` is rebound to the engineered feature matrix; Step 4 trains on this `X`.
X = feature_pipeline.apply(processed_data)

## **Step 4: Split, Train, and Evaluate**

In [5]:
# Logistic-regression wrapper configured with the liblinear solver
log_reg_model = LogisticRegressionModel(solver='liblinear')

# Search space for tuning: candidate values for C and max_iter
param_grid = dict(
    C=[0.1, 1, 10],
    max_iter=[100, 200, 300],
)

# The tuner is given the wrapper's `.model` attribute; the pipeline is given the wrapper itself
hyperparameter_tuner = HyperparameterTuner(
    model=log_reg_model.model,
    param_grid=param_grid,
)
modeling_pipeline = ModelingPipeline(
    model=log_reg_model,
    tuner=hyperparameter_tuner,
)

# Run the pipeline on the engineered features and the binary target
metrics = modeling_pipeline.run(X, y)

# Report each returned metric (accuracy, precision, recall, f1_score, roc_auc) to 4 decimals
print("\nModel Performance Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name.capitalize()}: {metric_value:.4f}")

Performing hyperparameter tuning...
Best Parameters: {'C': 10, 'max_iter': 100}
Model Performance:
Accuracy: 0.7913
Precision: 0.6436
Recall: 0.4745
F1_score: 0.5463
Roc_auc: 0.8390

Model Performance Metrics:
Accuracy: 0.7913
Precision: 0.6436
Recall: 0.4745
F1_score: 0.5463
Roc_auc: 0.8390


## **Step 5: Save The Best Model - API**

In [6]:
# Persist the estimator held by the model wrapper so it can be loaded elsewhere.
# NOTE(review): this saves `log_reg_model.model`; confirm that ModelingPipeline.run
# leaves the tuned, fitted estimator there (and not only inside the tuner).
# NOTE(review): only the estimator is saved — the preprocessing and feature pipelines are not.
model_filename = "trained_model.pkl"
best_model = log_reg_model.model
joblib.dump(best_model, model_filename)

['trained_model.pkl']