# Model Building

---

1. Import packages
2. Load data
3. Modelling

---

## 1.) Import packages

In [29]:
import warnings
warnings.filterwarnings("ignore")

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from math import sqrt

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Shows plots in jupyter notebook
%matplotlib inline

---
## 2.) Loading data with Pandas

In [31]:
df = pd.read_csv('../data/raw_data/Future_Value_Insights_data.csv')

#### Show Top 5 Records

In [32]:
df.head()

Unnamed: 0,id,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy,cltv
0,1,Male,Urban,Bachelor,5L-10L,1,5,5790,More than 1,A,Platinum,64308
1,2,Male,Rural,High School,5L-10L,0,8,5080,More than 1,A,Platinum,515400
2,3,Male,Urban,Bachelor,5L-10L,1,8,2599,More than 1,A,Platinum,64212
3,4,Female,Rural,High School,5L-10L,0,7,0,More than 1,A,Platinum,97920
4,5,Male,Urban,High School,More than 10L,1,6,3508,More than 1,A,Gold,59736


--------------------------------------------------
# 3. Data Preprocessing & Feature Engineering
--------------------------------------------------

In [33]:
df = pd.DataFrame(df)
# Drop unnecessary columns
df = df.drop(['id'], axis=1)

## Encoding the Data as per thier Type

In [34]:
# 1. Map 'num_policies' to numerical values
df['num_policies'] = df['num_policies'].map({'More than 1': 2, '1': 1})

# 2. Map 'income' to numerical values
income_mapping = {'<=2L': 200000, '2L-5L': 350000, '5L-10L': 750000, 'More than 10L': 1500000}
df['income'] = df['income'].map(income_mapping)

# 3. Label encode binary categorical features
binary_features = ['gender', 'area', 'marital_status']
label_encoder = LabelEncoder()
for feature in binary_features:
    df[feature] = label_encoder.fit_transform(df[feature])

# 4. One-hot encode multi-category features
multi_category_features = ['qualification', 'policy', 'type_of_policy']
df = pd.get_dummies(df, columns=multi_category_features, drop_first=True)

# 5. Scale numerical features
numerical_features = ['vintage', 'claim_amount', 'income']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [35]:
# Display encoded data
df.head()

Unnamed: 0,gender,area,income,marital_status,vintage,claim_amount,num_policies,cltv,qualification_High School,qualification_Others,policy_B,policy_C,type_of_policy_Platinum,type_of_policy_Silver
0,1,1,-0.023734,1,0.176531,0.44094,2,64308,False,False,False,False,True,False
1,1,0,-0.023734,0,1.486327,0.223305,2,515400,True,False,False,False,True,False
2,1,1,-0.023734,1,1.486327,-0.537192,2,64212,False,False,False,False,True,False
3,0,0,-0.023734,0,1.049728,-1.333859,2,97920,True,False,False,False,True,False
4,1,1,2.05598,1,0.613129,-0.258557,2,59736,True,False,False,False,False,False


In [36]:
# Convert boolean columns to integers
df= df.astype(int)
df.head()

Unnamed: 0,gender,area,income,marital_status,vintage,claim_amount,num_policies,cltv,qualification_High School,qualification_Others,policy_B,policy_C,type_of_policy_Platinum,type_of_policy_Silver
0,1,1,0,1,0,0,2,64308,0,0,0,0,1,0
1,1,0,0,0,1,0,2,515400,1,0,0,0,1,0
2,1,1,0,1,1,0,2,64212,0,0,0,0,1,0
3,0,0,0,0,1,-1,2,97920,1,0,0,0,1,0
4,1,1,2,1,0,0,2,59736,1,0,0,0,0,0


In [37]:
# Split data
X = df.drop(['cltv'], axis=1)
y = df['cltv']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [38]:
#  Feature Scaling: Standardization(StandardScaler) is used to scale the data to have a mean of 0 and a variance of 1
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

--------------------------------------------------
# 4. Model Training & Tuning
--------------------------------------------------

In [39]:
models = {
    'Linear Regression': LinearRegression(),
    'KNN': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42)
}

params = {
    'KNN': {'n_neighbors': [3, 5, 7, 9, 11], 'weights': ['uniform', 'distance']},
    'Decision Tree': {'max_depth': [5, 10, 15, 20]},
    'Random Forest': {'n_estimators': [100, 200], 'max_depth': [10, 15, None]}
}

# Modified Model Training Section
best_models = {}
model_results = []

for name in models:
    print(f'\nTraining {name}...')
    
    # Always use processed data for all models
    if name in params:
        grid = GridSearchCV(models[name], params[name], cv=3, 
                           scoring='neg_mean_squared_error', n_jobs=-1)
        grid.fit(X_train, y_train)
        best_models[name] = grid.best_estimator_
        print(f'Best params: {grid.best_params_}')
    else:
        models[name].fit(X_train, y_train)
        best_models[name] = models[name]

    # Make predictions using processed test data
    y_pred = best_models[name].predict(X_test)

    # Calculate metrics
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    mae_val = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    model_results.append({
        'Model': name,
        'RMSE': rmse,
        'MAE': mae_val,
        'R2': r2
    })


Training Linear Regression...

Training KNN...
Best params: {'n_neighbors': 11, 'weights': 'uniform'}

Training Decision Tree...
Best params: {'max_depth': 5}

Training Random Forest...
Best params: {'max_depth': 10, 'n_estimators': 200}


--------------------------------------------------
# 5. Model Evaluation & Selection
--------------------------------------------------

In [40]:
results_df = pd.DataFrame(model_results).sort_values(by='RMSE')
print('\nModel Performance Comparison:')
print(results_df)

# Select best model
best_model_name = results_df.iloc[0]['Model']
best_model = best_models[best_model_name]
print(f'\nBest Model: {best_model_name}')


Model Performance Comparison:
               Model          RMSE           MAE        R2
2      Decision Tree  82594.774778  50334.283493  0.152284
3      Random Forest  82658.164984  50355.444173  0.150983
0  Linear Regression  82704.657668  51065.524883  0.150027
1                KNN  85997.401283  52555.750669  0.081000

Best Model: Decision Tree
