### Baseline model and evaluation metrics
- Using LinearRegression (from sklearn.linear_model package) as Base model.
- Fitting train data and checking for performance metrices on train and test data

## Table of Contents
- [Loading dataset & checking high-level details](#Step-1:-Loading-Dataset-and-Checking-details) <br>
- [Data Dictionary](#Data-Dictionary)
- [EDA](#EDA)
- [Data Cleaning](#Data-Cleaning)
- [Feature Engineering](#Feature-Engineering)


In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Load data from train and test files, which is saved after performing EDA

In [4]:
from google.colab import drive
drive.mount('/content/driver')

Mounted at /content/driver


In [6]:
df_sample = pd.read_csv('/content/driver/MyDrive/Capstone_Code_DB_GoogleCoLab/DataSets/sample_after_eda_WithoutLog.csv', index_col='Row_ID', low_memory=False)
df_sample.head()

Unnamed: 0_level_0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,NVVar1,NVVar2,...,NVCat_F,NVCat_G,NVCat_H,NVCat_I,NVCat_J,NVCat_K,NVCat_L,NVCat_M,NVCat_N,NVCat_O
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48,-0.392708,0.394228,0.426171,0.316372,-0.210252,-0.275857,-1.021362,-0.188486,-0.23153,2.783616,...,0,0,0,0,0,0,0,0,1,0
102,-0.665897,-0.816152,-0.823407,-0.749192,-0.198358,-0.650165,-0.861623,-0.527934,-0.23153,2.783616,...,0,0,0,0,0,0,0,0,0,0
155,2.130275,0.826507,1.629469,1.162057,2.067504,1.162886,1.374723,0.452562,-0.23153,1.563723,...,0,0,1,0,0,0,0,0,0,0
212,-0.754282,-1.680709,-1.101091,-1.679445,-0.971487,-1.405797,-0.837048,-1.193267,-0.23153,-0.266117,...,0,0,1,0,0,0,0,0,0,0
352,3.456046,0.307772,1.606328,0.40094,2.626535,1.360567,1.423873,-0.479145,-0.23153,-0.266117,...,0,0,0,0,0,0,0,1,0,0


###  Splitting data into Train and Test datasets using 80 - 20 ratio

In [7]:
X = df_sample.drop(columns=['Claim_Amount'])
y = df_sample['Claim_Amount']

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [18]:
print(f"Shape of Train data: {X_train.shape}, Shape of Test data: {X_test.shape}")

Shape of Train data: (76237, 157), Shape of Test data: (19060, 157)


In [19]:
print(f"Shape of Train data: {X_train.info()}, Shape of Test data: {X_test.shape}")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76237 entries, 8036582 to 10626120
Columns: 157 entries, Var1 to NVCat_O
dtypes: float64(12), int64(145)
memory usage: 91.9 MB
Shape of Train data: None, Shape of Test data: (19060, 157)


### 1. Baseline mode: LinearRegression

#### Creating Pipeline for applying scaling on data using StandartScalar and then creating LinearRegression model

In [11]:
#!pip install numpy==1.23.5

#import numpy as np

np.__version__

'1.23.5'

In [24]:
!pip install pandas --upgrade

pd.__version__



'1.5.3'

In [25]:
!pip install lightgbm --upgrade



In [26]:
!pip install xgboost --upgrade




In [15]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
#from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

In [27]:
# Create models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    #'LGBM': LGBMRegressor(),
    'XGBoost': XGBRegressor()
    #'Gradient Boosting': GradientBoostingRegressor()
}

# Create a dictionary of hyperparameters for grid search
param_grid = {
    'Linear Regression': {
        'scaler': [StandardScaler(), MinMaxScaler()],
        'dim_reducer__n_components':[8,10,15]
    },
    'Decision Tree':{
        'scaler': [StandardScaler(), MinMaxScaler()],
        'dim_reducer__n_components':[8,10,15],
        'regressor__random_state': [42],
    },
    'Random Forest': {
        'regressor__n_estimators': [10, 50, 100],
        'regressor__max_depth': [10, 20, 30],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'regressor__random_state': [42],
        'dim_reducer__n_components':[8,10,15]
    },
    #'LGBM': {
    #    'regressor__n_estimators': [10, 50, 100],
    #    'regressor__max_depth': [10, 20, 30],
    #    'regressor__num_leaves': [31, 50, 100],
    #    'scaler': [StandardScaler(), MinMaxScaler()],
    #    'regressor__random_state': [42]
    #},
    'XGBoost': {
        'regressor__n_estimators': [10, 50, 100],
        'regressor__max_depth': [10, 20, 30],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'dim_reducer__n_components':[8,10,15],
        'regressor__random_state': [42]
    }
    #,
    #'Gradient Boosting': {
    #    'regressor__n_estimators': [10, 50, 100],
    #    'regressor__max_depth': [10, 20, 30],
    #    'scaler': [StandardScaler(), MinMaxScaler()],
    #    'regressor__random_state': [42]
    #}
}

# Dictionary to store regression metrics
metrics = {
    'Model': [],
    'MSE': [],
    'R2 Score': []
}


In [28]:
# Evaluate each model using a pipeline with GridSearchCV

for model_name, model in models.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('dim_reducer', PCA()),
        ('regressor', model)
    ])
    grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    metrics['Model'].append(model_name)
    metrics['MSE'].append(mse)
    metrics['R2 Score'].append(r2)


# Create a DataFrame to display the metrics
metrics_df = pd.DataFrame(metrics)
print(metrics_df)
# params. best

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

In [21]:
metrics

{'Model': [], 'MSE': [], 'R2 Score': []}

In [None]:
metrics_df = pd.DataFrame(metrics)

In [None]:
metrics_df

Unnamed: 0,Model,MSE,R2 Score
0,Linear Regression,3.119225,0.011823
1,Decision Tree,6.306466,-0.997901
2,Random Forest,3.12011,0.011543


In [None]:
for model_name, model in models.items():
    print(model_name)

Linear Regression
Decision Tree
Random Forest
XGBoost


Training XGBoost model with data

In [None]:
from xgboost import XGBRegressor

estimator_xg = [
    ('scaler',StandardScaler()),
    ('scaler1',MinMaxScaler()),
    ('PCA', PCA(n_components=9)),
    ('XGBoost',XGBRegressor(n_components=8, random_state=42))
]

pipe_XG = Pipeline(estimator_xg)

pipe_XG.fit(X_train, y_train)

train_score = pipe_XG.score(X_train, y_train)
test_score = pipe_XG.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

y_pred = pipe_XG.predict(X_test)

print(f"Train score : {train_score}")
print(f"Test score : {test_score}")
print(f"Test R2 score : {r2}")
print(f"MSE: {mse}")

metrics['Model'].append('XGBoost')
metrics['MSE'].append(mse)
metrics['R2 Score'].append(r2)


Parameters: { "n_components" } are not used.



Train score : 0.14015928602371286
Test score : -0.02419733301836602
Test R2 score : -0.01662295526548463
MSE: 3.209017022187752


In [None]:
metrics_df.to_csv(r'C:\Rutvika\BrainStation\CapstoneProject\AllstateClaimPredictionChallenge\DataSets\NewDataSet\metrics_df_forFirstRun.csv')