In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder,FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
import numpy as np
import joblib



In [80]:

from scipy import stats
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler


# ---------------------------------------------------------------------------
# Feature groups and the ColumnTransformer that encodes them.
# ---------------------------------------------------------------------------

# Numeric columns: missing values are replaced by the column median.
num_cols = ['Age', 'Number of Dependents', 'Previous Claims', 'Health Score',
            'Credit Score', 'Vehicle Age', 'Insurance Duration', 'Annual Income']

# Categorical columns that are imputed (most frequent value) and then one-hot encoded.
cat_cols = ['Marital Status', 'Occupation', 'Customer Feedback']

# Ordinal columns together with the category order handed to OrdinalEncoder.
# Keeping column and order in one mapping guarantees that `label_cols` and
# `cat_val` can never drift out of step with each other.
ordinal_levels = {
    'Smoking Status': ['No', 'Yes'],
    'Education Level': ['High School', "Bachelor's", "Master's", 'PhD'],
    'Location': ['Rural', 'Suburban', 'Urban'],
    'Policy Type': ['Basic', 'Comprehensive', 'Premium'],
    'Exercise Frequency': ['Rarely', 'Monthly', 'Weekly', 'Daily'],
}
label_cols = list(ordinal_levels)
cat_val = list(ordinal_levels.values())

# Columns for one-hot encoding (no imputation step).
onehot_cols = ['Property Type', 'Gender']

# Despite its name, this pipeline is applied to `cat_cols` below.
# handle_unknown='ignore' matches the stand-alone OneHotEncoder in the
# ColumnTransformer, so a category that was absent at fit time is encoded as
# all zeros at predict time instead of raising.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Yeo-Johnson power transform, applied to 'Annual Income' only (see below).
skewess = Pipeline([
    ('power', PowerTransformer(method='yeo-johnson')),
])

# 'Annual Income' is listed in `num_cols` and is also routed through
# `skewess`, so the transformed matrix carries two columns derived from it:
# a median-imputed copy and a power-transformed copy.
# NOTE(review): the 'skewess' branch has no imputer; PowerTransformer is
# documented to leave NaN in place, so confirm the downstream model accepts
# missing values.
preprocessor = ColumnTransformer([
    ('num', SimpleImputer(strategy='median'), num_cols),
    ('cat', num_pipeline, cat_cols),
    ('label', OrdinalEncoder(categories=cat_val), label_cols),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), onehot_cols),
    ('skewess', skewess, ['Annual Income']),
], remainder='passthrough')

# # pipeline steps 

# full_pipeline = Pipeline([
#     ('preprocessing', preprocessor),
#     ('model', RandomForestRegressor(n_estimators=100,max_depth=10,n_jobs=-1)),
    # ('scaler', StandardScaler(),num_cols)
# ])





In [81]:
from xgboost import XGBRegressor

# Gradient-boosted regressor that consumes the preprocessed feature matrix.
xgb_regressor = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=50,
    learning_rate=0.1,
    max_depth=5,
)

# End-to-end estimator: column preprocessing followed by the regressor.
full_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', xgb_regressor),
])


### Splitting the dataset into training and testing data

In [82]:

from scipy import stats

# Seed for the train/test split so the reported metrics are reproducible.
SPLIT_SEED = 42

# Load the raw training data.
df = pd.read_csv(r"C:\Users\MY Laptop\Desktop\guvi_class\smart premium\train.csv")

# Cap extreme values at the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# Missing values are left untouched by `clip`.
# NOTE(review): the fences (and the Box-Cox lambda below) are estimated on the
# full frame before the split, so held-out rows influence the preprocessing;
# the target column itself is also capped.
outliers = ['Annual Income', 'Previous Claims', 'Premium Amount']
for col in outliers:
    Q1 = df[col].quantile(0.25)  # 25th percentile
    Q3 = df[col].quantile(0.75)  # 75th percentile
    IQR = Q3 - Q1  # Interquartile range

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

# These two columns are not used as model inputs.
df = df.drop(columns=['Policy Start Date', 'id'])

# Box-Cox transform the target. The fitted lambda is kept (it used to be
# discarded) because it is required to map predictions back to the original
# premium scale, e.g. with scipy.special.inv_boxcox(pred, premium_lambda).
premium_boxcox, premium_lambda = stats.boxcox(df['Premium Amount'])
df['Premium Amount'] = premium_boxcox

# Separate features and target, then hold out 20% of the rows for testing.
X = df.drop(columns=['Premium Amount'])
y = df['Premium Amount']
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

# Fit the pipeline
full_pipeline.fit(x_train, y_train)


In [83]:
import joblib
# Serialise the fitted pipeline (preprocessing + model) to the working directory.
joblib.dump(value=full_pipeline, filename="ml_model.pkl")

['ml_model.pkl']

In [84]:
# Read test.csv and remove the same two columns that were dropped from the
# training frame before fitting.
test_csv_path = r"C:\Users\MY Laptop\Desktop\guvi_class\smart premium\test.csv"
test_df = pd.read_csv(test_csv_path).drop(columns=['Policy Start Date', 'id'])

# Reload the persisted pipeline and score the rows.
# NOTE(review): the pipeline trained in this notebook is fitted on a
# Box-Cox-transformed 'Premium Amount', so these predictions are on that
# transformed scale, not in original premium units.
model = joblib.load("ml_model.pkl")
ntrain_pred = model.predict(test_df)

In [58]:
# Show the predictions for test.csv (NumPy abbreviates long arrays with "...").
# NOTE(review): this cell's execution count is lower than its neighbours', so
# the output displayed here was presumably produced by an earlier run; re-run
# the notebook top to bottom to refresh it.
ntrain_pred

array([1342.1508, 1093.9669, 1076.0277, ..., 1074.3962, 1105.503 ,
       1046.1344], dtype=float32)

In [85]:

# Predict on both partitions with the reloaded pipeline.
train_pred = model.predict(x_train)
test_pred = model.predict(x_test)

# All metrics below are computed on the scale of `y_train` / `y_test` as they
# were passed to `fit`, i.e. on the transformed target.
print("**** # Evaluation *******")

# The value printed is mean_absolute_error, so it is labelled MAE (the label
# previously read "MSE", which did not match the metric).
print(f"MAE for Training Data : {mean_absolute_error(y_train,train_pred)}")
print(f"MAE for Test data: {mean_absolute_error(y_test,test_pred)}")

# Root of the mean squared error.
print("RMSE for training data", np.sqrt(mean_squared_error(y_train, train_pred)))
print("RMSE for test data", np.sqrt(mean_squared_error(y_test, test_pred)))

# Coefficient of determination.
print(f"R2 score for train data: {r2_score(y_train,train_pred)}")
print(f"R2 score for test data: {r2_score(y_test,test_pred)}")


**** # Evaluation *******
MSE for Training Data : 13.01249997318544
MSE for Test data: 12.977932495603353
RMSE for training data 16.597660547322214
RMSE for test data 16.56462931033195
R2 score for train data: 0.06178932394306913
R2 score for test data: 0.061542876091059684
