<font size = '5' ><b>Data preparetion</b></font>

In [4]:
import pandas as pd
data = pd.read_csv('C:/Users/User/Desktop/10/MachineLearningRating_v3/MachineLearningRating_v3.txt', delimiter='|', low_memory=False)

print("\nData Types of Each Column:")
print(data.dtypes)




Data Types of Each Column:
UnderwrittenCoverID           int64
PolicyID                      int64
TransactionMonth             object
IsVATRegistered                bool
Citizenship                  object
LegalType                    object
Title                        object
Language                     object
Bank                         object
AccountType                  object
MaritalStatus                object
Gender                       object
Country                      object
Province                     object
PostalCode                    int64
MainCrestaZone               object
SubCrestaZone                object
ItemType                     object
mmcode                      float64
VehicleType                  object
RegistrationYear              int64
make                         object
Model                        object
Cylinders                   float64
cubiccapacity               float64
kilowatts                   float64
bodytype                     object


<font size = '5'><b>Handling Missing Data</b></font>

In [5]:
# Fill missing categorical data with a placeholder
categorical_cols = ['Bank', 'AccountType', 'VehicleType', 'make', 'Model', 'bodytype', 
                     'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding', 
                     'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 
                     'TermFrequency', 'ExcessSelected', 'CoverCategory', 'CoverType', 
                     'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType']

data[categorical_cols] = data[categorical_cols].fillna('Unknown')


In [6]:
# Fill missing numerical data with the median
numerical_cols = ['mmcode', 'Cylinders', 'cubiccapacity', 'kilowatts', 'NumberOfDoors', 
                   'CustomValueEstimate', 'NumberOfVehiclesInFleet', 'SumInsured', 
                   'CalculatedPremiumPerTerm', 'TotalPremium', 'TotalClaims']

data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())


In [7]:
data['TransactionMonth'] = pd.to_datetime(data['TransactionMonth'], errors='coerce')
data['VehicleIntroDate'] = pd.to_datetime(data['VehicleIntroDate'], errors='coerce')

data['TransactionMonth'].fillna(pd.Timestamp('1900-01-01'), inplace=True)
data['VehicleIntroDate'].fillna(pd.Timestamp('1900-01-01'), inplace=True)


<font size = '5'><b>Feature Engineering</b></font>

In [8]:
import numpy as np

data['CustomValuePerDoor'] = data['CustomValueEstimate'] / (data['NumberOfDoors'].replace(0, np.nan))
data['PremiumPerKilowatt'] = data['CalculatedPremiumPerTerm'] / (data['kilowatts'].replace(0, np.nan))
data['ClaimRatio'] = data['TotalClaims'] / (data['TotalPremium'].replace(0, np.nan))

data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)


<font size = '5'><b>Encoding Categorical Data</b></font>

In [9]:
# Convert categorical columns to numeric format using one-hot encoding
data_encoded = pd.get_dummies(data, columns=['Bank', 'AccountType', 'VehicleType', 'make', 'Model', 
                                             'bodytype', 'AlarmImmobiliser', 'TrackingDevice', 
                                             'CapitalOutstanding', 'NewVehicle', 'WrittenOff', 
                                             'Rebuilt', 'Converted', 'CrossBorder', 'TermFrequency', 
                                             'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 
                                             'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType'])


<font size = '5'><b>Train-Test Split</b></font>

In [10]:
from sklearn.model_selection import train_test_split

X = data_encoded.drop(['TotalPremium', 'TotalClaims'], axis=1)
y_premium = data_encoded['TotalPremium']
y_claims = data_encoded['TotalClaims']

X_train, X_test, y_train_premium, y_test_premium = train_test_split(X, y_premium, test_size=0.3, random_state=42)
X_train, X_test, y_train_claims, y_test_claims = train_test_split(X, y_claims, test_size=0.3, random_state=42)


<font size ='5'><b> Modeling Techniques</b></font>

In [11]:
X_train['Year'] = X_train['TransactionMonth'].dt.year
X_train['Month'] = X_train['TransactionMonth'].dt.month
X_train['Day'] = X_train['TransactionMonth'].dt.day

X_test['Year'] = X_test['TransactionMonth'].dt.year
X_test['Month'] = X_test['TransactionMonth'].dt.month
X_test['Day'] = X_test['TransactionMonth'].dt.day

X_train = X_train.drop('TransactionMonth', axis=1)
X_test = X_test.drop('TransactionMonth', axis=1)



In [13]:
X_train = X_train.apply(pd.to_numeric, errors='coerce')  


In [14]:
X_train = X_train.fillna(X_train.mean())  
y_train_premium = y_train_premium.fillna(y_train_premium.mean())  


In [15]:
print("X_train shape:", X_train.shape)
print("y_train_premium shape:", y_train_premium.shape)


X_train shape: (700068, 1296)
y_train_premium shape: (700068,)


In [16]:
X_train = X_train.select_dtypes(include=[np.number])  

In [22]:
import warnings
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

data_subset = data.sample(n=10000, random_state=1)  

X_subset = data_subset.drop(columns=['TotalPremium'])
y_subset_premium = data_subset['TotalPremium']

X_subset = X_subset.apply(pd.to_numeric, errors='coerce')
y_subset_premium = pd.to_numeric(y_subset_premium, errors='coerce')

imputer = SimpleImputer(strategy='mean')
X_subset = imputer.fit_transform(X_subset)

y_subset_premium = y_subset_premium.fillna(y_subset_premium.mean())

X_subset = np.array(X_subset)
y_subset_premium = np.array(y_subset_premium)

X_train_subset, X_test_subset, y_train_premium, y_test_premium = train_test_split(
    X_subset, y_subset_premium, test_size=0.2, random_state=42
)

print(f'X_train_subset shape: {X_train_subset.shape}')
print(f'y_train_premium shape: {y_train_premium.shape}')
print(f'X_test_subset shape: {X_test_subset.shape}')
print(f'y_test_premium shape: {y_test_premium.shape}')

# Linear Regression for TotalPremium
model_lr_premium = LinearRegression()
model_lr_premium.fit(X_train_subset, y_train_premium)
y_pred_lr_premium = model_lr_premium.predict(X_test_subset)
print(f'Linear Regression (Premium) RMSE: {np.sqrt(mean_squared_error(y_test_premium, y_pred_lr_premium))}')
print(f'Linear Regression (Premium) R^2: {r2_score(y_test_premium, y_pred_lr_premium)}')

# Random Forest for TotalPremium
model_rf_premium = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf_premium.fit(X_train_subset, y_train_premium)
y_pred_rf_premium = model_rf_premium.predict(X_test_subset)
print(f'Random Forest (Premium) RMSE: {np.sqrt(mean_squared_error(y_test_premium, y_pred_rf_premium))}')
print(f'Random Forest (Premium) R^2: {r2_score(y_test_premium, y_pred_rf_premium)}')

# XGBoost for TotalPremium
model_xgb_premium = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
model_xgb_premium.fit(X_train_subset, y_train_premium)
y_pred_xgb_premium = model_xgb_premium.predict(X_test_subset)
print(f'XGBoost (Premium) RMSE: {np.sqrt(mean_squared_error(y_test_premium, y_pred_xgb_premium))}')
print(f'XGBoost (Premium) R^2: {r2_score(y_test_premium, y_pred_xgb_premium)}')


X_train_subset shape: (8000, 21)
y_train_premium shape: (8000,)
X_test_subset shape: (2000, 21)
y_test_premium shape: (2000,)
Linear Regression (Premium) RMSE: 111.52849159701782
Linear Regression (Premium) R^2: 0.5035516173508323
Random Forest (Premium) RMSE: 109.04847225272125
Random Forest (Premium) R^2: 0.525384831076496
XGBoost (Premium) RMSE: 112.17332848636933
XGBoost (Premium) R^2: 0.49779427791251074


<font size='5'><b>Feature Importance Analysis</b></font>

In [33]:
feature_names = X_subset.columns if hasattr(X_subset, 'columns') else ['feature_' + str(i) for i in range(X_subset.shape[1])]


In [None]:
import shap

X_test_df = pd.DataFrame(X_test_subset, columns=feature_names)

# For Random Forest
explainer_rf = shap.TreeExplainer(model_rf_premium)
shap_values_rf = explainer_rf.shap_values(X_test_df)

# Summary plot for Random Forest
shap.summary_plot(shap_values_rf, X_test_df, feature_names=feature_names)

# For XGBoost
explainer_xgb = shap.TreeExplainer(model_xgb_premium)
shap_values_xgb = explainer_xgb.shap_values(X_test_df)

# Summary plot for XGBoost
shap.summary_plot(shap_values_xgb, X_test_df, feature_names=feature_names)
