<a href="https://colab.research.google.com/github/robitussin/CCADMACL_EXERCISES/blob/main/Exercise2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Exercise 2: Use Gradient Boost for Regression

Instructions:

- Use the Dataset File to train your model
- Use the Test File to generate your results
- Use the Sample Submission file to generate the same format
Submit your results to:
https://www.kaggle.com/competitions/playground-series-s4e12/overview



In [209]:
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_log_error

## Dataset
Train, test and sample submission file can be found in this link
https://www.kaggle.com/competitions/playground-series-s4e12/data

## 1. Load the Data

In [210]:
train = pd.read_csv('train.csv')

In [211]:
test = pd.read_csv('test.csv')

In [212]:
sample_submission = pd.read_csv('sample_submission.csv')

## 2. Perform Data preprocessing

In [213]:
train.isnull().sum()

id                           0
Age                      18705
Gender                       0
Annual Income            44949
Marital Status           18529
Number of Dependents    109672
Education Level              0
Occupation              358075
Health Score             74076
Location                     0
Policy Type                  0
Previous Claims         364029
Vehicle Age                  6
Credit Score            137882
Insurance Duration           1
Policy Start Date            0
Customer Feedback        77824
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64

In [214]:
test.isnull().sum()

id                           0
Age                      12489
Gender                       0
Annual Income            29860
Marital Status           12336
Number of Dependents     73130
Education Level              0
Occupation              239125
Health Score             49449
Location                     0
Policy Type                  0
Previous Claims         242802
Vehicle Age                  3
Credit Score             91451
Insurance Duration           2
Policy Start Date            0
Customer Feedback        52276
Smoking Status               0
Exercise Frequency           0
Property Type                0
dtype: int64

In [215]:
train.fillna({'Age': train['Age'].mode()[0],
              'Annual Income': train['Annual Income'].mode()[0],
              'Marital Status': train['Marital Status'].mode()[0],
              'Number of Dependents': train['Number of Dependents'].mode()[0],
              'Occupation': train['Occupation'].mode()[0],
              'Health Score': train['Health Score'].mode()[0],
              'Previous Claims': train['Previous Claims'].mode()[0],
              'Vehicle Age': train['Vehicle Age'].mode()[0],
              'Credit Score': train['Credit Score'].mode()[0],
              'Insurance Duration': train['Insurance Duration'].mode()[0],
              'Customer Feedback': train['Customer Feedback'].mode()[0],
              }, inplace=True)


In [216]:
test.fillna({'Age': test['Age'].mode()[0],
              'Annual Income': test['Annual Income'].mode()[0],
              'Marital Status': test['Marital Status'].mode()[0],
              'Number of Dependents': test['Number of Dependents'].mode()[0],
              'Occupation': test['Occupation'].mode()[0],
              'Health Score': test['Health Score'].mode()[0],
              'Previous Claims': test['Previous Claims'].mode()[0],
              'Vehicle Age': test['Vehicle Age'].mode()[0],
              'Credit Score': test['Credit Score'].mode()[0],
              'Insurance Duration': test['Insurance Duration'].mode()[0],
              'Customer Feedback': test['Customer Feedback'].mode()[0],
              }, inplace=True)

In [217]:
# Define categorical and numerical features
categorical_features = train.select_dtypes(
   include=["object"]
).columns.tolist()

numerical_features = train.select_dtypes(
   include=["float64", "int64"]
).columns.tolist()

In [218]:
preprocessor = ColumnTransformer(
   transformers=[
       ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
       ("num", StandardScaler(), numerical_features),
   ]
)

In [219]:
categorical_features = test.select_dtypes(
   include=["object"]
).columns.tolist()

numerical_features = test.select_dtypes(
   include=["float64", "int64"]
).columns.tolist()

In [220]:
preprocessor = ColumnTransformer(
   transformers=[
       ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
       ("num", StandardScaler(), numerical_features),
   ]
)

## 3. Create a Pipeline

In [225]:
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
}

In [226]:
pipeline = Pipeline(
   [
       ("preprocessor", preprocessor),
       ("regressor", GradientBoostingRegressor(**params)),
   ]
)

## 4. Train the Model

In [229]:
# Split data into features and target

X = train.drop("Premium Amount", axis=1)
y = train["Premium Amount"]

X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.98, random_state=42
)

In [230]:
# Fit the model on the training data
pipeline.fit(X_train, y_train)

## 5. Evaluate the Model

In [231]:
# Predict on the test set
y_pred = pipeline.predict(X_test)

# Generate classification report
rmse = root_mean_squared_log_error(y_test, y_pred)
print(f"RMSE: {rmse}")


RMSE: 1.1656400575383472


## Generate Submission File

Choose the model that has the best performance to generate a submission file.

In [232]:
id = sample_submission.pop('id')
y_pred = pipeline.predict(test)

# Create a submission DataFrame
submission_df = pd.DataFrame({
    'id': id,
    'Premium Amount': y_pred
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission_file.csv', index=False)
print("Submission file created: submission_file.csv")

Submission file created: submission_file.csv
