In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import shap
from imblearn.over_sampling import SMOTE
import joblib
import logging
from sklearn.impute import KNNImputer,SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from collections import defaultdict
from sklearn.metrics import accuracy_score,recall_score,precision_score,log_loss
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [9]:
class Health_insurance_claim_prediction:
    def __init__(self,random_state=43):
        self.preprocessor=None
        self.model=None
    def preprocess_data(self,df):
        try:
            print('preprocessing started')
            df.drop(columns='ID',inplace=True)
            numeric_features = list(df.select_dtypes(include=['int64','float64']).columns)
            numeric_features = numeric_features[1:]
            categorical_features = list(df.select_dtypes(include=['object']).columns)
            numeric_transformer = Pipeline(steps=[
                ('imputer',KNNImputer(n_neighbors=5))
            ])
            categorical_transformer = Pipeline(steps=[
                ('imputer',SimpleImputer(strategy='most_frequent')),
                ('categorical encoding',TargetEncoder())
            ])
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num',numeric_transformer,numeric_features),
                    ('cat',categorical_transformer,categorical_features)
                ])
            X= df.drop('target',axis=1)
            y=df['target']
            features = list(X.columns)
            print('transformation started')
            X_transformed =preprocessor.fit_transform(X,y)
            print('transformation ended')
            X_transformed_df = pd.DataFrame(X_transformed,columns=features)
            print('preprocessing ended')
            return X_transformed_df,y
        except Exception as e:
            print(e)
            
    def train_model(self,x,y):
        try:
            smote = SMOTE(sampling_strategy='minority',random_state=42)
            x_resampled,y_resampled = smote.fit_resample(x,y.ravel())
            x_train,x_test,y_train,y_test=train_test_split(x_resampled,y_resampled,random_state=23,test_size=0.3)
            mlp_model = Sequential()
            mlp_model.add(Dense(32,activation='relu',input_dim = x_train.shape[1]))
            mlp_model.add(Dense(16,activation='relu'))
            mlp_model.add(Dense(8,activation='relu'))
            mlp_model.add(Dense(1,activation='sigmoid'))
            mlp_model.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])
            history = mlp_model.fit(x_train,y_train,batch_size=500,epochs=100,verbose=0,validation_split=0.2)
            print('model training started')
            y_pred_probs = mlp_model.predict(x_test).flatten()
            y_pred = mlp_model.predict(x_test)
            y_pred = (y_pred > 0.5).astype(int) 
            accuracy = accuracy_score(y_test,y_pred)
            precision_score1= precision_score(y_pred,y_test)
            recall_score1 = recall_score(y_pred,y_test)
            logloss = log_loss(y_test,y_pred_probs)
            print('accuracy_score',accuracy)
            print('precision',precision_score1)
            print('recall',recall_score1)
            print('logloss',logloss)
            print('model training ended')
            return mlp_model
        except Exception as e:
            print(e)
                
    def save_model(self,filepath):
        try:
            model_data = {
                'model':self.model,
                'preprocessor':self.preprocessor
            }
            joblib.dump(model_data,filepath)
            print('model saved')
        except Exception as e:
            print(e)
    
 
    def load_model(self,filepath):
        try:
            model_data = joblib.load(filepath)
            self.model = model_data['model']
            self.preprocessor = model_data['preprocessor']
        except Exception as e:
            print(e)
    def predict(self,X):
        try:
            x_transformed = self.preprocessor.transform(X)
            y_prob = self.model.predict(x_transformed)[:,1]
            return y_prob
        except Exception as e:
            print(e)

In [10]:
def main():
    train_data = pd.read_csv(r"C:\ML\datasets\bnp paribas\train.csv")
    # train_data = train_data.iloc[:2000,:]
    health_insurance_claim_system = Health_insurance_claim_prediction(random_state=43)
    x,y = health_insurance_claim_system.preprocess_data(train_data)
    health_insurance_claim_system.train_model(x,y)
    # print('\ncross_validation_results',cross_validation_results)
    
    health_insurance_claim_system.save_model('health_model.pkl')
    # health_insurance_claim_system.load_model('health_model.pkl')
    #sample_data = pd.read_csv(r"C:\ML\datasets\bnp paribas\test.csv")
    #sample_data.drop('ID',inplace=True)
    
if __name__=='__main__':
    main()

preprocessing started
transformation started
transformation ended
preprocessing ended


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


model training started
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
accuracy_score 0.6925601750547046
precision 0.600418410041841
recall 0.7612732095490716
logloss 0.568133153255109
model training ended
model saved
