In [43]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error

In [None]:
# Load the raw table that every later cell works on.
data = pd.read_csv('MachineLearningRating_v3.csv')

# Numeric columns: replace missing values with the column mean.
# The filled frame is assigned back instead of calling
# `data[col].fillna(mean, inplace=True)` per column: that form modifies an
# intermediate object (chained assignment), which pandas deprecates and which
# has no effect on `data` when Copy-on-Write is enabled.
numeric_cols = data.select_dtypes(include=['number']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

# Object (text) columns: replace missing values with each column's most
# frequent value.
categorical_cols = data.select_dtypes(include=['object']).columns
categorical_imputer = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = categorical_imputer.fit_transform(data[categorical_cols])

# Remaining null count per column, as a check on the imputation above.
print(data.isnull().sum())

In [None]:
# Year against which vehicle age is measured.
REFERENCE_YEAR = 2024

data['VehicleAge'] = REFERENCE_YEAR - data['RegistrationYear']
data['VehicleValue'] = data['SumInsured']  # Using SumInsured as a proxy

# Convert the term label into a number (Annual -> 12, Monthly -> 1) first, so
# that every feature derived from it is numeric.
data['TermFrequency'] = (
    data['TermFrequency'].replace({'Annual': 12, 'Monthly': 1}).astype(float)
)

# CoverageDuration used to be copied *before* the conversion above, leaving it
# as raw text labels; the later object-to-numeric coercion then turned the
# whole column into zeros. Copying after the conversion keeps a usable value.
# NOTE(review): this makes it numerically identical to TermFrequency —
# confirm whether a distinct duration definition is intended.
data['CoverageDuration'] = data['TermFrequency']
data['PremiumPerMonth'] = data['CalculatedPremiumPerTerm'] / data['TermFrequency']

# Interaction term between vehicle age and insured sum.
data['VehicleAge_SumInsured'] = data['VehicleAge'] * data['SumInsured']

# One-hot encode the two columns, dropping the first level of each.
# NOTE: this removes 'Province' and 'CoverCategory' from `data`, so the cell
# cannot be re-run without reloading the data first.
data = pd.get_dummies(data, columns=['Province', 'CoverCategory'], drop_first=True)
print(data.head())

In [None]:
# Columns to expand into one indicator column per distinct value.
# NOTE(review): unlike the earlier Province/CoverCategory encoding, no level
# is dropped here (`drop_first` is left at its default), and several of these
# columns may have many distinct values — confirm the resulting width is
# acceptable.
categorical_cols = [
    'TransactionMonth',
    'Citizenship',
    'LegalType',
    'Title',
    'Language',
    'Bank',
    'AccountType',
    'MaritalStatus',
    'Country',
    'MainCrestaZone',
    'SubCrestaZone',
    'ItemType',
    'mmcode',
    'VehicleType',
    'make',
    'Model',
    'bodytype',
    'CoverType',
    'CoverGroup',
    'Section',
    'Product',
    'StatutoryClass',
    'StatutoryRiskType',
]

# Replace the listed columns with their indicator columns, named
# "<column>_<value>".
data = pd.get_dummies(
    data,
    columns=categorical_cols,
    prefix_sep='_',
)

print(data.head())

In [28]:
# Hold out 20% of the full frame (features and target together), with a fixed
# seed so the split is repeatable.
# NOTE(review): `data_train` / `data_test` are not referenced again in the
# visible cells; the modelling cell below performs its own X/y split.
data_train, data_test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Force every remaining object-typed column to numbers so the estimators can
# consume the frame.
# NOTE(review): values that cannot be parsed become NaN and are then replaced
# by 0, so a purely textual column ends up as a constant 0 — confirm that
# this loss of information is acceptable.
object_columns = [name for name in data.columns if data[name].dtype == 'object']
for name in object_columns:
    as_number = pd.to_numeric(data[name], errors='coerce')
    data[name] = as_number.fillna(0)

# Target is TotalClaims; every other column is used as a feature.
y = data["TotalClaims"]
X = data.drop("TotalClaims", axis=1)

# 80% train / 20% test, fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and score it on a held-out split.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        X_fit, y_fit: Training features and target.
        X_eval, y_eval: Held-out features and target used for scoring.

    Returns:
        Tuple ``(predictions, mse, rmse, r2, mae)`` for the held-out split.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    mse = mean_squared_error(y_eval, predictions)
    # RMSE is derived as sqrt(MSE) instead of passing `squared=False`: that
    # keyword is deprecated and later removed in newer scikit-learn releases.
    rmse = mse ** 0.5
    r2 = r2_score(y_eval, predictions)
    mae = mean_absolute_error(y_eval, predictions)
    return predictions, mse, rmse, r2, mae


# 1. Linear Regression
lr_model = LinearRegression()
lr_predictions, lr_mse, lr_rmse, lr_r2, lr_mae = _fit_and_score(
    lr_model, X_train, y_train, X_test, y_test
)

# 2. Random Forest
rf_model = RandomForestRegressor(random_state=42)
rf_predictions, rf_mse, rf_rmse, rf_r2, rf_mae = _fit_and_score(
    rf_model, X_train, y_train, X_test, y_test
)

# 3. XGBoost
xgb_model = XGBRegressor(random_state=42)
xgb_predictions, xgb_mse, xgb_rmse, xgb_r2, xgb_mae = _fit_and_score(
    xgb_model, X_train, y_train, X_test, y_test
)

# Evaluate each model
def evaluate_model(model_name, predictions, y_true=None):
    """Print regression metrics for one model's predictions.

    Args:
        model_name: Label printed as the heading for this model.
        predictions: Predicted target values, aligned with ``y_true``.
        y_true: Ground-truth target values. When omitted, the notebook-level
            ``y_test`` is used, so existing two-argument calls keep working.
    """
    if y_true is None:
        y_true = y_test
    mse = mean_squared_error(y_true, predictions)
    print(f"\n{model_name}:")
    print(f"  Mean Squared Error: {mse:.2f}")
    # RMSE is derived as sqrt(MSE) instead of passing `squared=False`: that
    # keyword is deprecated and later removed in newer scikit-learn releases.
    print(f"  Root Mean Squared Error: {mse ** 0.5:.2f}")
    print(f"  R-squared: {r2_score(y_true, predictions):.2f}")
    print(f"  Mean Absolute Error: {mean_absolute_error(y_true, predictions):.2f}")

# Report the metrics for each fitted model, in the order they were trained.
for label, preds in (
    ("Linear Regression", lr_predictions),
    ("Random Forest", rf_predictions),
    ("XGBoost", xgb_predictions),
):
    evaluate_model(label, preds)


In [None]:
# Work on a 7% random sample to keep the random forest fit fast.
# NOTE: this rebinds `data` to the sample, so re-running the cell shrinks the
# frame again; reload the data before re-running.
data = data.sample(frac=0.07, random_state=42)
X = data.drop("TotalClaims", axis=1)  # features: every column except the target
y = data["TotalClaims"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the model.
# The target is continuous, so regression metrics are reported. The previous
# accuracy_score / classification_report calls are classification metrics and
# raise ValueError when given continuous predictions.
y_pred = rf_model.predict(X_test)
sample_mse = mean_squared_error(y_test, y_pred)
print(f"R-squared: {r2_score(y_test, y_pred):.2f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"Root Mean Squared Error: {sample_mse ** 0.5:.2f}")

# Get feature importance scores
feature_importances = rf_model.feature_importances_
feature_names = X.columns

# Create a DataFrame for easier viewing, most important feature first
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df)