# Regression models

## Importing Libraries

In [1]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from math import sqrt
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression, r_regression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Load train/test data

In [2]:
# Load the Mixed encoded data 
with open("processed_data/X_train_reg_mixed.pkl", "rb") as f: X_train_mixed = pickle.load(f)
with open("processed_data/X_test_reg_mixed.pkl", "rb") as f: X_test_mixed = pickle.load(f)
with open("processed_data/y_train_reg_mixed.pkl", "rb") as f: y_train_mixed = pickle.load(f)
with open("processed_data/y_test_reg_mixed.pkl", "rb") as f: y_test_mixed = pickle.load(f)
print("Mixed encoded data for tree-based models loaded.")
print(f"X_train_mixed shape: {X_train_mixed.shape}\n")

Mixed encoded data for tree-based models loaded.
X_train_mixed shape: (3944, 24)



In [3]:
X_train_mixed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3944 entries, 1075 to 860
Data columns (total 24 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   InternetService_DSL                      3944 non-null   float64
 1   InternetService_Fiber optic              3944 non-null   float64
 2   InternetService_No                       3944 non-null   float64
 3   PaymentMethod_Bank transfer (automatic)  3944 non-null   float64
 4   PaymentMethod_Credit card (automatic)    3944 non-null   float64
 5   PaymentMethod_Electronic check           3944 non-null   float64
 6   PaymentMethod_Mailed check               3944 non-null   float64
 7   Contract                                 3944 non-null   float64
 8   gender                                   3944 non-null   int64  
 9   Partner                                  3944 non-null   int64  
 10  Dependents                               3944 non-n

In [4]:
X_train_mixed.head(2)

Unnamed: 0,InternetService_DSL,InternetService_Fiber optic,InternetService_No,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Contract,gender,Partner,...,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,SeniorCitizen,Churn,TotalInternetServicesUsed
1075,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1,1,...,0,0,0,1,0,0,0,1,0,1
309,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [5]:
X_train_mixed_original = X_train_mixed.copy()
y_train_mixed_original = y_train_mixed.copy()

## Model Training

#### Evaluation function

In [6]:
def evaluate_regression(y_true, y_pred, model_name="model"):
    """Calculates and prints regression metrics."""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name}\nMAE: {mae:.4f}\nRMSE: {rmse:.4f}\nR²: {r2:.4f}")
    return {"model": model_name, "mae": mae, "rmse": rmse, "r2": r2}

#### Feature Selection

In [7]:
X_train_mixed.columns

Index(['InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'Contract', 'gender', 'Partner', 'Dependents', 'PhoneService',
       'PaperlessBilling', 'Tenure_(12-month_groups)', 'MultipleLines',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'SeniorCitizen', 'Churn',
       'TotalInternetServicesUsed'],
      dtype='object')

In [8]:
features_0 = ['Tenure_(12-month_groups)', 'TotalInternetServicesUsed', 'InternetService_DSL', 'InternetService_Fiber optic', 'PhoneService', 'MultipleLines']
features_1 = ['Tenure_(12-month_groups)', 'TotalInternetServicesUsed', 'InternetService_DSL', 'InternetService_Fiber optic', 'PhoneService', 'Churn']

## 1. Linear Regression

In [9]:
# Linear Regression Model 
lr = LinearRegression()
lr.fit(X_train_mixed[features_0], y_train_mixed)
y_pred_lr = lr.predict(X_test_mixed[features_0])
results_df = pd.DataFrame(data=[["Linear Regression_features_0", *evaluate_regression(y_test_mixed, y_pred_lr)]], 
                          columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'])

model
MAE: 23.2190
RMSE: 28.0507
R²: -0.0030


In [10]:
pickle.dump(lr, open("processed_data/models/regression/linear_reg__features_0.pkl", "wb"))

In [11]:
#  Ridge Regression with Hyperparameter Tuning
ridge = Ridge()
param_grid_ridge = {'alpha': [0.1, 1.0, 10.0, 100.0, 200.0]}

grid_search_ridge = GridSearchCV(ridge, param_grid_ridge, cv=5, scoring='r2', n_jobs=-1, verbose=1)
grid_search_ridge.fit(X_train_mixed[features_0], y_train_mixed)

best_alpha_ridge = grid_search_ridge.best_params_['alpha']
print(f"Best alpha for Ridge: {best_alpha_ridge}")

y_pred_ridge = grid_search_ridge.predict(X_test_mixed[features_0])

results_df_1 = pd.DataFrame(data=[["Linear Ridge Regression_features_0", *evaluate_regression(y_test_mixed, y_pred_ridge)]], 
                          columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'])
results_df = pd.concat([results_df, results_df_1], ignore_index=True)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best alpha for Ridge: 200.0
model
MAE: 23.2062
RMSE: 28.0405
R²: -0.0022


In [12]:
pickle.dump(grid_search_ridge, open("processed_data/models/regression/linear_ridge_features_0.pkl", "wb"))

In [13]:
# Lasso Regression with Hyperparameter Tuning
lasso = Lasso()
param_grid_lasso = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0]}
grid_search_lasso = GridSearchCV(lasso, param_grid_lasso, cv=5, scoring='r2', n_jobs=-1, verbose=1)
grid_search_lasso.fit(X_train_mixed[features_0], y_train_mixed)

best_alpha_lasso = grid_search_lasso.best_params_['alpha']
print(f"Best alpha for Lasso: {best_alpha_lasso}")

y_pred_lasso = grid_search_lasso.predict(X_test_mixed[features_0])
results_df_2 = pd.DataFrame(data=[["Linear Lasso Regression_features_0", *evaluate_regression(y_test_mixed, y_pred_lasso)]], 
                          columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'])
results_df = pd.concat([results_df, results_df_2], ignore_index=True)



Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best alpha for Lasso: 1.0
model
MAE: 23.1776
RMSE: 28.0235
R²: -0.0010


In [14]:
pickle.dump(grid_search_lasso, open("processed_data/models/regression/linear_lasso_features_0.pkl", "wb"))

## 2. Random Forest

Random Forest without Hyperparameter tuning

In [15]:
rf = RandomForestRegressor(n_estimators=300, max_depth=10, min_samples_split=2, min_samples_leaf=2, max_features='sqrt', random_state=42, criterion='absolute_error')
rf.fit(X_train_mixed[features_0], y_train_mixed)

In [16]:
y_pred_rf = rf.predict(X_test_mixed[features_0])
results_df_3 = pd.DataFrame(data=[["Random Forest_features_0", *evaluate_regression(y_test_mixed, y_pred_rf)]], 
                          columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'])
results_df = pd.concat([results_df, results_df_3], ignore_index=True)


model
MAE: 23.8206
RMSE: 28.7122
R²: -0.0508


In [17]:
pickle.dump(rf, open("processed_data/models/regression/rf_model_features_0.pkl", "wb"))

Random Forest with Hyperparameter tuning

In [18]:
rf_tuned = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 500],        # number of trees
    'max_depth': [5, 10, 20],               # tree depth
    'min_samples_split': [2, 5, 10],        # min samples to split
    'min_samples_leaf': [1, 2, 4],          # min samples in a leaf
    'max_features': ['sqrt', 'log2']  # feature selection per split
}

grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='r2', cv=3,n_jobs=-1,verbose=2)
grid_search_rf.fit(X_train_mixed[features_0], y_train_mixed)

print("Best Parameters:", grid_search_rf.best_params_)
print("Best R2 Score:", grid_search_rf.best_score_)

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 500}
Best R2 Score: -0.011561817865485194


In [19]:
y_pred_rf_tuned = grid_search_rf.predict(X_test_mixed[features_0])
results_df_4 = pd.DataFrame(data=[["Random Forest Tuned_features_0", *evaluate_regression(y_test_mixed, y_pred_rf_tuned)]], 
                          columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'])
results_df = pd.concat([results_df, results_df_4], ignore_index=True)

model
MAE: 23.3326
RMSE: 28.1908
R²: -0.0130


In [None]:
pickle.dump(grid_search_rf, open("processed_data/models/regression/rf_tuned_model_features_0.pkl", "wb"))

## Wrong data analysis

In [30]:
df = X_test_mixed.copy()
df['y_true'] = y_test_mixed
df['y_pred'] = y_pred_rf
df['diff'] = y_test_mixed - y_pred_rf

##### **User with PhoneSerivce and Total 4 InternetServices with fiber optic or DSL has (avg 71 rupees) monthly charges**

In [None]:
correct_charges = df[
    (df['InternetService_No'] == 0) &
    (df['MultipleLines'] == 0) &
    (df['PhoneService'] == 1) &
    (df['TotalInternetServicesUsed'] == 4) 
]
correct_charges

Unnamed: 0,InternetService_DSL,InternetService_Fiber optic,InternetService_No,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Contract,gender,Partner,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,SeniorCitizen,Churn,TotalInternetServicesUsed,y_true,y_pred,diff
3247,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1,0,...,1,0,1,1,0,0,4,79.72,75.761217,3.958783
4522,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1,1,...,1,1,1,1,1,0,4,81.9,63.38405,18.51595
23,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1,...,1,0,1,0,0,1,4,116.64,69.935633,46.704367
179,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1,1,...,1,0,1,1,1,0,4,95.65,75.761217,19.888783
4099,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,...,1,0,0,1,0,1,4,83.77,67.823517,15.946483
90,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,0,1,...,0,1,1,0,0,0,4,18.0,62.927333,-44.927333
1168,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1,1,...,1,1,0,0,1,1,4,77.65,62.927333,14.722667
2566,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1,0,...,0,1,1,1,1,1,4,50.62,62.927333,-12.307333
3529,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,...,1,1,1,0,1,0,4,41.42,55.319383,-13.899383
3425,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0,1,...,0,0,1,1,0,1,4,66.31,62.927333,3.382667


In [101]:
mean_correct = correct_charges['y_true'].mean()
print(mean_correct)

70.6938095238095


##### **User have no services used but high (avg 71 rupees) monthly charges**

In [None]:
# so here when user is not using any service still monthlycharges avg is 71 which confirms wrong data
high_charges_when_nothing_used = df[
    (df['InternetService_No'] == 1) &
    (df['MultipleLines'] == 0) &
    (df['PhoneService'] == 0) &
    (df['TotalInternetServicesUsed'] == 0) 
    # (df['y_true'] > 50) 
]
high_charges_when_nothing_used
# 172 records

Unnamed: 0,InternetService_DSL,InternetService_Fiber optic,InternetService_No,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Contract,gender,Partner,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,SeniorCitizen,Churn,TotalInternetServicesUsed,y_true,y_pred,diff
84,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1,0,...,0,0,0,0,1,1,0,89.24,66.815067,22.424933
3526,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0,0,...,0,0,0,0,0,0,0,44.37,70.048783,-25.678783
2104,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,1,1,...,0,0,0,0,1,0,0,24.65,70.521333,-45.871333
287,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,1,0,...,0,0,0,0,1,1,0,44.71,69.138833,-24.428833
3920,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,1,1,...,0,0,0,0,0,1,0,61.46,70.048783,-8.588783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1684,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1,0,...,0,0,0,0,1,1,0,49.36,69.138833,-19.778833
2470,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,1,...,0,0,0,0,1,0,0,87.06,63.268033,23.791967
1918,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0,0,...,0,0,0,0,1,1,0,59.28,70.521333,-11.241333
31,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,1,0,...,0,0,0,0,0,1,0,63.10,69.138833,-6.038833


In [67]:
mean = high_charges_when_nothing_used['y_true'].mean()
print(mean)

71.05296511627907


So, When user is not using any service still their monthlycharges avg is 71 and there are 172 records like this which shows original data is wrong.

##### **User with fiber optic or DSL and only 1 service used but high (avg 70 rupees) monthly charges**

In [77]:
# so here when user is not using any service still monthlycharges avg is 71 which confirms wrong data
high_charges_when_1service_used = df[
    (df['InternetService_No'] == 0) &
    (df['MultipleLines'] == 0) &
    (df['PhoneService'] == 0) &
    (df['TotalInternetServicesUsed'] == 1) 
]
high_charges_when_1service_used
# 90 records

Unnamed: 0,InternetService_DSL,InternetService_Fiber optic,InternetService_No,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Contract,gender,Partner,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,SeniorCitizen,Churn,TotalInternetServicesUsed,y_true,y_pred,diff
2822,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1,...,0,0,0,0,0,1,1,108.11,64.102450,44.007550
1499,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,...,0,0,0,0,1,0,1,38.64,66.782317,-28.142317
3126,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,1,1,18.00,65.505583,-47.505583
1436,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0,1,...,0,0,0,0,1,0,1,69.63,65.009550,4.620450
3375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1,1,...,0,1,0,0,0,0,1,18.00,67.701333,-49.701333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0,0,...,0,0,1,0,1,0,1,46.19,64.102450,-17.912450
2388,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0,0,...,0,0,0,1,0,0,1,105.70,78.774733,26.925267
1129,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0,1,...,0,0,0,0,0,0,1,119.74,62.329883,57.410117
1485,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,1,...,0,0,1,0,1,0,1,37.06,72.430900,-35.370900


In [75]:
mean_2 = high_charges_when_1service_used['y_true'].mean()
print(mean_2)

70.12188888888889


So, When user is using 1 Internet based service & Internetservice still their monthlycharges avg is 70 and there are 90 records like this which shows original data is wrong.


##### **User with fiber optic and 3 service used but low (avg 29 rupees) monthlycharges**

In [None]:
# user with fiber optic and 3 service used but low charges
low_charges_when_4service_used = df[
    (df['InternetService_Fiber optic'] == 1) &
    (df['MultipleLines'] == 0) &
    (df['PhoneService'] == 0) &
    (df['TotalInternetServicesUsed'] == 3) &
    (df['y_true'] < 40)
]
low_charges_when_4service_used
# 7 records

Unnamed: 0,InternetService_DSL,InternetService_Fiber optic,InternetService_No,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Contract,gender,Partner,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,SeniorCitizen,Churn,TotalInternetServicesUsed,y_true,y_pred,diff
3705,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,...,1,0,1,1,0,0,3,18.0,76.008367,-58.008367
414,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,1,1,...,1,0,0,1,1,0,3,32.14,67.431433,-35.291433
1630,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1,0,...,1,1,0,1,1,0,3,28.74,76.008367,-47.268367
4813,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1,1,...,0,1,1,0,1,0,3,37.56,76.12145,-38.56145
1135,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1,0,...,1,0,1,1,1,0,3,38.23,74.97095,-36.74095
4092,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1,1,...,1,1,0,0,0,0,3,30.28,67.431433,-37.151433
1592,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0,1,...,0,0,1,1,0,0,3,18.0,68.071367,-50.071367


In [94]:
mean_3 = low_charges_when_4service_used['y_true'].mean()
print(mean_3)

28.99285714285714


When user use Fiber optic with 3 services still their avg monthlycharges are 28 which is low according to services used, it should be greater.

## 3. XGBoost

In [22]:
xgb = XGBRegressor(n_estimators=500, learning_rate=0.1, max_depth=5, subsample=0.8, colsample_bytree=0.8, random_state=42)

In [23]:
xgb.fit(X_train_mixed[features_0], y_train_mixed)
y_pred_xgb = xgb.predict(X_test_mixed[features_0])

In [24]:
results_df_5 = pd.DataFrame(data=[["XGBoost_features_0", *evaluate_regression(y_test_mixed, y_pred_xgb)]], 
                          columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'])
results_df = pd.concat([results_df, results_df_5], ignore_index=True)

model
MAE: 23.6865
RMSE: 28.6001
R²: -0.0427


In [25]:
pickle.dump(xgb, open("processed_data/models/regression/xgb_model_features_0.pkl", "wb"))

XGB with Hyperparameter tuning

In [26]:
param_grid = {
    'n_estimators': [200, 500],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='r2', cv=3, verbose=2, n_jobs=-1)

grid_search_xgb.fit(X_train_mixed[features_0], y_train_mixed)

print(f"Best Parameters: {grid_search_xgb.best_params_}")
print(f"Best R² Score from CV: {grid_search_xgb.best_score_}")

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best Parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}
Best R² Score from CV: -0.00941831128871525


In [27]:
# Evaluate tuned model
best_xgb = grid_search_xgb.best_estimator_
y_pred_xgb_tuned = best_xgb.predict(X_test_mixed[features_0])

In [28]:
results_df_6 = pd.DataFrame(data=[["XGBoost tuned_features_0", *evaluate_regression(y_test_mixed, y_pred_xgb_tuned)]], 
                          columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square'])
results_df = pd.concat([results_df, results_df_6], ignore_index=True)

model
MAE: 23.2286
RMSE: 28.0642
R²: -0.0039


In [29]:
pickle.dump(best_xgb, open("processed_data/models/regression/xgb_tuned_model_features_0.pkl", "wb"))