In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder,FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
import numpy as np
import joblib



In [None]:
import xgboost as xgb

# --- Feature groups ---------------------------------------------------------
# Each list names the input columns routed to one branch of the
# ColumnTransformer defined below.

# Numeric columns: missing values are filled with the column median.
num_cols = ['Age', 'Number of Dependents', 'Previous Claims', 'Health Score',
            'Credit Score', 'Vehicle Age', 'Insurance Duration']
# Categorical columns that are imputed (most frequent value) and then one-hot encoded.
cat_cols = ['Marital Status', 'Occupation', 'Customer Feedback']
# Two-level columns encoded as 0/1 using an explicit category order.
label_cols = ['Gender', 'Smoking Status']
# Columns for one-hot encoding (no imputation step)
onehot_cols = ['Property Type', 'Education Level', 'Location', 'Policy Type', 'Exercise Frequency']

# NOTE: despite its name, this pipeline is applied to the categorical
# `cat_cols`; the name is kept so other cells that reference it keep working.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' matches the 'onehot' branch below: a category
    # that was not seen during fit is encoded as all zeros instead of raising
    # an error at predict time.
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Columns that are not listed in any branch are forwarded unchanged
# (remainder='passthrough').
preprocessor = ColumnTransformer([
    ('num', SimpleImputer(strategy='median'), num_cols),
    ('cat', num_pipeline, cat_cols),
    # Fixed order: Female -> 0, Male -> 1 and No -> 0, Yes -> 1.
    ('label', OrdinalEncoder(categories=[['Female', 'Male'], ['No', 'Yes']]), label_cols),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), onehot_cols),
], remainder='passthrough')

# --- Full pipeline: preprocessing followed by the regressor -----------------
full_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    # random_state pins the forest's bootstrap/feature sampling so that
    # re-running the notebook yields the same fitted model.
    ('model', RandomForestRegressor(n_estimators=100, max_depth=10, n_jobs=-1,
                                    random_state=42)),
])

# Alternative model that was tried (XGBoost); kept for reference.
# full_pipeline = Pipeline([
#     ('preprocessing', preprocessor),
#     ('model', xgb.XGBRegressor(objective='reg:squarederror', n_estimators=40, learning_rate=0.1, max_depth=10))
# ])


['ml_model.pkl']

### Splitting the dataset into training and test data

In [None]:

# Load the raw training data.
df = pd.read_csv(r"C:\Users\MY Laptop\Desktop\guvi_class\smart premium\train.csv")

# Outlier filter: keep rows whose 'Annual Income' lies within 3 standard
# deviations of the mean. Rows with a missing 'Annual Income' have a NaN
# z-score, fail the range test and are therefore removed as well.
z = (df['Annual Income'] - df['Annual Income'].mean()) / df['Annual Income'].std()

# Filter with a boolean mask (no temporary 'z score' column) and drop the
# columns that are not used as features. A non-inplace drop avoids mutating
# a filtered slice of the original frame.
df = df[z.between(-3, 3)].drop(columns=['Policy Start Date', 'id'])

# Separate the features from the target.
X = df.drop(columns=['Premium Amount'])
y = df['Premium Amount']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split (and the metrics computed from it) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline
full_pipeline.fit(x_train, y_train)


In [5]:
# Persist the fitted pipeline (preprocessing + model) to disk so it can be
# reloaded later without refitting.
joblib.dump(value=full_pipeline, filename="ml_model.pkl")

['ml_model.pkl']

In [7]:
# Read the test file and discard the date column, which was also removed
# from the training frame before fitting.
# NOTE(review): 'id' was dropped from the training frame but is not dropped
# here — confirm the fitted ColumnTransformer ignores the extra column.
test_df = pd.read_csv(
    r"C:\Users\MY Laptop\Desktop\guvi_class\smart premium\test.csv"
).drop(columns=['Policy Start Date'])

# Restore the pipeline that was saved with joblib.dump.
model = joblib.load("ml_model.pkl")

# Predictions for every row of the test file.
ntrain_pred = model.predict(test_df)

In [8]:
# Show the predictions computed for test.csv by the reloaded model.
ntrain_pred

array([1450.08614662, 1113.93031079, 1079.99863532, ..., 1081.32213977,
       1107.57712864, 1067.40426537])

In [None]:

# Predict on both splits so that training and held-out errors can be compared.
train_pred = full_pipeline.predict(x_train)
test_pred = full_pipeline.predict(x_test)


print("**** # Evaluation *******")
# mean_absolute_error returns the MAE, so these lines are labelled MAE
# (they were previously mislabelled as MSE).
print(f"MAE for Training Data : {mean_absolute_error(y_train, train_pred)}")
print(f"MAE for Test data: {mean_absolute_error(y_test, test_pred)}")

# RMSE is the square root of the mean squared error.
print("RMSE for training data", np.sqrt(mean_squared_error(y_train, train_pred)))
print("RMSE for test data", np.sqrt(mean_squared_error(y_test, test_pred)))

print(f"R2 score for train data: {r2_score(y_train, train_pred)}")
print(f"R2 score for test data: {r2_score(y_test, test_pred)}")


**** # Evaluation *******
MSE for Training Data : 634.4397761645677
MSE for Test data: 636.7629809366913
RMSE for training data 834.1867198647358
RMSE for test data 838.5386809940351
R2 score for train data: 0.05073267078714627
R2 score for test data: 0.035966528545358756
