In [1]:
import pandas as pd
import numpy as np
from joblib import dump

In [2]:
# Load the data
data = pd.read_csv('data/cleaned/motor_vehicle_insurance_data_feature_selected.csv')
data.head()

Unnamed: 0,Premium,Value_vehicle,Length,Weight,Age,Driving_experience
0,380.2,16030.0,3.999,1105,49,29
1,250.52,9927.0,4.25,1055,57,33
2,340.43,16550.0,4.245,1168,78,24
3,335.82,27330.0,4.547,1310,83,61
4,248.68,8548.0,4.25,1050,73,47


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37810 entries, 0 to 37809
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Premium             37810 non-null  float64
 1   Value_vehicle       37810 non-null  float64
 2   Length              37810 non-null  float64
 3   Weight              37810 non-null  int64  
 4   Age                 37810 non-null  int64  
 5   Driving_experience  37810 non-null  int64  
dtypes: float64(3), int64(3)
memory usage: 1.7 MB


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


X = data.drop('Premium', axis=1)
y = data['Premium']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R2 Score: {r2}')

Mean Squared Error: 16955.52718295451
R2 Score: 0.09904188835165983


In [5]:
from sklearn.ensemble import RandomForestRegressor

# Instantiate the RandomForestRegressor
random_forest_model = RandomForestRegressor(random_state=42)

# Train the model
random_forest_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = random_forest_model.predict(X_test)

# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Print the evaluation metrics
print(f'RandomForestRegressor Mean Squared Error: {mse_rf}')
print(f'RandomForestRegressor R2 Score: {r2_rf}')

# Compare with Linear Regression results
print(f'Linear Regression Mean Squared Error: {mse}')  # Assuming 'mse' is from your previous Linear Regression model
print(f'Linear Regression R2 Score: {r2}')  # Assuming 'r2' is from your previous Linear Regression model

RandomForestRegressor Mean Squared Error: 17252.713371587197
RandomForestRegressor R2 Score: 0.08325044144297
Linear Regression Mean Squared Error: 16955.52718295451
Linear Regression R2 Score: 0.09904188835165983


In [6]:
# Understand the Scale of the Target Variable
print(f"Minimum Premium: {data['Premium'].min()}")
print(f"Maximum Premium: {data['Premium'].max()}")
print(f"Standard Deviation of Premium: {data['Premium'].std()}")

# Baseline Comparison
mean_premium = data['Premium'].mean()
baseline_mse = ((data['Premium'] - mean_premium) ** 2).mean()
print(f"Baseline MSE: {baseline_mse}")

# Your model's MSE
model_mse = 18410.73697975743
print(f"Model MSE: {model_mse}")

# Comparing model MSE with baseline MSE
if model_mse < baseline_mse:
    print("Model is performing better than the baseline.")
else:
    print("Model is not performing better than the baseline.")

Minimum Premium: 81.65
Maximum Premium: 2797.51
Standard Deviation of Premium: 138.3019799244733
Baseline MSE: 19126.931767991882
Model MSE: 18410.73697975743
Model is performing better than the baseline.


In [7]:
# Example input data (replace these values with your actual input)
# Column names: ['Power', CylinderVolume', 'Value', 'Doors', 'Length', 'Weight', 'Age', 'Driving_Experience', 'Diesel', 'Petrol']
# input_data = np.array([[306, 3500, 9800, 4, 4.5, 1800, 29, 10, 0, 1]]) 



column_names = ['Value_vehicle', 'Length', 'Weight', 'Age', 'Driving_experience']
input_data = np.array([[8000, 4.3, 1300, 33, 10]])

input_df = pd.DataFrame(input_data, columns=column_names)

input_df_scaled = scaler.transform(input_df)

# Predict the output
predicted_premium = model.predict(input_df_scaled)

# Print the predicted premium
print(f"Predicted Premium: {predicted_premium[0]}")

Predicted Premium: 286.5523820003237


In [8]:
# Save the model and the scaler
dump(model, 'model.joblib')
dump(scaler, 'scaler.joblib')

['scaler.joblib']