# Modelling

In [29]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier

In [30]:
df = pd.read_csv('featuure_engineered.csv')
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,...,policy_bind_date_year,policy_bind_date_cosine_month,policy_bind_date_sine_month,policy_bind_date_cosine_day,policy_bind_date_sine_day,incident_date_year,incident_date_cosine_month,incident_date_sine_month,incident_date_cosine_day,incident_date_sine_day
0,328,48,521585,OH,250/500,1000,1406.91,0,466132,MALE,...,2014,0.5,-0.8660254,-0.8660254,0.5,2015,0.866025,0.5,0.8660254,0.5
1,228,42,342868,IN,250/500,2000,1197.22,5000000,468176,MALE,...,2006,-1.0,1.224647e-16,5.510911e-16,1.0,2015,0.866025,0.5,-4.286264e-16,-1.0
2,134,29,687698,OH,100/300,2000,1413.14,5000000,430632,FEMALE,...,2000,-1.83697e-16,-1.0,-1.0,1.224647e-16,2015,0.5,0.866025,0.5,-0.866025
3,256,41,227811,IL,250/500,2000,1415.74,6000000,608117,FEMALE,...,1990,-0.8660254,0.5,0.8660254,0.5,2015,0.866025,0.5,0.5,-0.866025
4,228,44,367455,IL,500/1000,1000,1583.91,6000000,610706,MALE,...,2014,-1.0,1.224647e-16,-1.0,1.224647e-16,2015,0.5,0.866025,-0.8660254,0.5


In [31]:
print('Duplicated rows:', df.duplicated().sum())
print('Missing values:', df.isna().sum().sum())

Duplicated rows: 0
Missing values: 0


In [32]:

from sklearn.pipeline import Pipeline
import numpy as np

In [33]:
from utils.modelling import get_preprocessor
preprocessor = get_preprocessor(df, 'fraud_reported')

## Splitting to train and test data

In [34]:
# Defining X and y
X = df.drop(columns=['fraud_reported']) 
y = df['fraud_reported']

#performing LabelEncoding
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
y = lb.fit_transform(y)
y.shape = (-1,)


# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=42)

In [35]:
# preprocessing the features
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [36]:
lb.classes_

array(['N', 'Y'], dtype='<U1')

`0 represents N: not fraud case`

`1 represents Y: fraud case`

In [37]:
df['fraud_reported'].value_counts(normalize=True)

fraud_reported
N    0.753
Y    0.247
Name: proportion, dtype: float64

Noted some class imbalance in the target column. 

In [38]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.metrics import (
    recall_score,
    precision_score,
    f1_score,
    accuracy_score
)

In [39]:
class Modeling:
    """Class to carry out model fiting and cross validation
    Attributes:
        model: The model 
        X_train: training features
        y_train: target
    """
    def __init__(self, model, preprocessor, X_train, y_train):
        self.model = model
        self.X_train = X_train
        self.y_train = y_train
        self.preprocessor = preprocessor
        self.fitted_base_model = None
        self.fitted_smote_model = None

    def train_base(self):
        """Fif with only preprocessing which only includes scaling and ohe"""
        # Fit the model
        model_pipe = Pipeline([
            ('preprocessing', clone(self.preprocessor)),
            ('model', clone(self.model))
        ])
        score = cross_val_score(model_pipe,
                                self.X_train,
                                self.y_train,
                                scoring='accuracy',
                                ).mean() 
        print('**************************************************************************')
        print("\n"f"The model(with normal preprocessing) has an accuracy of {score*100:.2f}%")
        self.fitted_base_model = model_pipe
        self.fitted_base_model.fit(self.X_train, self.y_train)
        return self.fitted_base_model
        
    def train_with_smote(self):
        """Includes oversampling with smote to reduce class imbalance"""
        # Perform custom Cross-Validation to include smote
        scores = []
        # splits for cross validation
        skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)
        for i, (train_idx, test_idx) in enumerate(skf.split(self.X_train, self.y_train)):
            # train and test data for the validation
            cv_X_train = self.X_train.iloc[train_idx, :]
            cv_X_test = self.X_train.iloc[test_idx, :]
            cv_y_train = self.y_train[train_idx]
            cv_y_test = self.y_train[test_idx]
            
            preprocessor_clone = clone(self.preprocessor)
            
            # preprocessing the features
            cv_X_train_preprocessed = preprocessor_clone.fit_transform(cv_X_train)
            cv_X_test_preprocessed = preprocessor_clone.transform(cv_X_test)
            
            
            # cloning the model
            model_clone = clone(self.model)
            
            # oversampling with smote to fix class imbalance
            X_oversampled, y_oversampled = (
                SMOTE(random_state=42).fit_resample(cv_X_train_preprocessed,
                                                    cv_y_train))
            
            # convert to dataframe
            X_oversampled = pd.DataFrame(X_oversampled, 
                                         columns=preprocessor_clone.get_feature_names_out())
            
            # fit and obtain score for the current fold
            model_clone.fit(X_oversampled, y_oversampled)
            score = model_clone.score(cv_X_test_preprocessed, cv_y_test)
            print(f"Fold {i+1}: Accuracy = {score * 100:.2f}%")
            scores.append(score)
        
        # define model to return
        
        # preprocess on whole train data 
        cv_X_train_preprocessed = preprocessor.fit_transform(self.X_train)
        # oversample whole training data
        X_oversampled, y_oversampled = (
            SMOTE(random_state=42).fit_resample(cv_X_train_preprocessed,
                                                self.y_train))
        
        print('**********************************************************************')
        print("\n"f"The model(with SMOTE) has an accuracy of {np.mean(scores)*100:.2f}%")
        
        # fit and return the model
        self.fitted_smote_model = clone(self.model)
        self.fitted_smote_model.fit(X_oversampled, y_oversampled)
        return self.fitted_smote_model
    
    def evaluate_on_test(self, X_test, y_test):
        """evaluates the model on unseen test data"""
        if self.fitted_base_model:
            print('Without SMOTE:')
            y_pred = self.fitted_base_model.predict(X_test)
            print('\tAccuracy score:', accuracy_score(y_test, y_pred))
            print('\tRecall Score:', recall_score(y_test, y_pred))
            print('\tPrecision Score:', precision_score(y_test, y_pred))
            print('\tF1 Score:', f1_score(y_test, y_pred))
            
        if self.fitted_smote_model:
            print('\n With SMOTE:')
            X_test_preprocessed = self.preprocessor.transform(X_test)
            y_pred = self.fitted_smote_model.predict(X_test_preprocessed)
            print('\tAccuracy score:', accuracy_score(y_test, y_pred))
            print('\tRecall Score:', recall_score(y_test, y_pred))
            print('\tPrecision Score:', precision_score(y_test, y_pred))
            print('\tF1 Score:', f1_score(y_test, y_pred))    
            

In [40]:
log_reg = LogisticRegression(solver='liblinear')

model_1 = Modeling(log_reg, preprocessor, X_train, y_train)
print('Logistic Regression model')
fitted_model_1 = model_1.train_base()

Logistic Regression model
**************************************************************************

The model(with normal preprocessing) has an accuracy of 81.73%


In [41]:
fitted_model_1_smote = model_1.train_with_smote()



Fold 1: Accuracy = 80.00%




Fold 2: Accuracy = 75.20%
Fold 3: Accuracy = 84.80%




Fold 4: Accuracy = 83.20%
Fold 5: Accuracy = 84.00%
Fold 6: Accuracy = 81.60%
**********************************************************************

The model(with SMOTE) has an accuracy of 81.47%




Fold 1: Accuracy = 80.00%
Fold 2: Accuracy = 75.20%
Fold 3: Accuracy = 84.80%
Fold 4: Accuracy = 83.20%
Fold 5: Accuracy = 84.00%
Fold 6: Accuracy = 81.60%
**********************************************************************

The model(with SMOTE) has an accuracy of 81.47%


In [42]:
rf_clf = RandomForestClassifier()
model_2 = Modeling(rf_clf, preprocessor, X_train, y_train)

In [43]:
print('Random Forest model')
fitted_model_2 = model_2.train_base()

Random Forest model
**************************************************************************

The model(with normal preprocessing) has an accuracy of 76.80%


In [44]:
fitted_model_2_smote = model_2.train_with_smote()



Fold 1: Accuracy = 78.40%




Fold 2: Accuracy = 77.60%




Fold 3: Accuracy = 76.80%




Fold 4: Accuracy = 79.20%




Fold 5: Accuracy = 82.40%




Fold 6: Accuracy = 77.60%
**********************************************************************

The model(with SMOTE) has an accuracy of 78.40%


In [45]:
xgb = XGBClassifier(
    colsample_bytree=1.0,
    gamma=0,
    learning_rate=0.2,
    max_depth=7,
    n_estimators=100,
    subsample=1.0,
    scale_pos_weight=3,
    eval_metric='logloss',  # Optional, based on your metric of choice
    #use_label_encoder=False
)
model_3 = Modeling(xgb, preprocessor, X_train, y_train)

In [46]:
print('XGBoost classifier model')
fitted_model_3 = model_3.train_base()

XGBoost classifier model
**************************************************************************

The model(with normal preprocessing) has an accuracy of 83.73%


In [47]:
fitted_model_3_smote = model_3.train_with_smote()

Fold 1: Accuracy = 80.80%
Fold 2: Accuracy = 83.20%
Fold 3: Accuracy = 87.20%
Fold 4: Accuracy = 87.20%
Fold 5: Accuracy = 84.80%
Fold 6: Accuracy = 87.20%
**********************************************************************

The model(with SMOTE) has an accuracy of 85.07%


In [48]:
fitted_model_3_smote = model_3.train_with_smote()

Fold 1: Accuracy = 80.80%
Fold 2: Accuracy = 83.20%
Fold 3: Accuracy = 87.20%
Fold 4: Accuracy = 87.20%
Fold 5: Accuracy = 84.80%
Fold 6: Accuracy = 87.20%
**********************************************************************

The model(with SMOTE) has an accuracy of 85.07%


In [49]:
ada = AdaBoostClassifier()
model_4 = Modeling(ada, preprocessor, X_train, y_train)

In [50]:
print('AdaBoost model')
fitted_model_4 = model_4.train_base()

AdaBoost model
**************************************************************************

The model(with normal preprocessing) has an accuracy of 80.27%




In [51]:
fitted_model_4_smote = model_4.train_with_smote()



Fold 1: Accuracy = 83.20%




Fold 2: Accuracy = 76.80%




Fold 3: Accuracy = 80.80%




Fold 4: Accuracy = 81.60%




Fold 5: Accuracy = 80.80%




Fold 6: Accuracy = 84.00%
**********************************************************************

The model(with SMOTE) has an accuracy of 81.20%


The logistic regression model and the xgboost are the best performing with higher accuracy.

# Evaluating on unseen test Data

In [52]:
print('*********Logistic Regression******************')
model_1.evaluate_on_test(X_test, y_test)

*********Logistic Regression******************
Without SMOTE:
	Accuracy score: 0.716
	Recall Score: 0.3283582089552239
	Precision Score: 0.4583333333333333
	F1 Score: 0.3826086956521739

 With SMOTE:
	Accuracy score: 0.748
	Recall Score: 0.6716417910447762
	Precision Score: 0.5232558139534884
	F1 Score: 0.5882352941176472


In [53]:
print('*********XGBoost Classifier*****************')
model_3.evaluate_on_test(X_test, y_test)

*********XGBoost Classifier*****************
Without SMOTE:
	Accuracy score: 0.804
	Recall Score: 0.6268656716417911
	Precision Score: 0.6363636363636364
	F1 Score: 0.6315789473684211

 With SMOTE:
	Accuracy score: 0.804
	Recall Score: 0.6567164179104478
	Precision Score: 0.6285714285714286
	F1 Score: 0.6423357664233577


# Evaluation

After the modelling process and evaluation on unseen test data, the XGBoost classifier has higher accuracy but we want a sensitive model and therefore we choose the `Logistic regression model `as the final model due the the high sesnitivity(recall = 67.16%).

In [78]:
#Gridsearch
log_reg =LogisticRegression()
# Define parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10,100,1000],          # Regularization strength
    'penalty': ['l1', 'l2'],  # Regularization type
    'solver': ['saga', 'liblinear']  # Solvers compatible with chosen penalties
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, scoring='accuracy', n_jobs=-1, verbose=1)

X_train_oversampled ,y_train_oversampled =SMOTE(random_state=42).fit_resample(X_train_preprocessed,y_train)

# Fit GridSearchCV to the data
grid_search.fit(X_train_oversampled , y_train_oversampled)

# Get best parameters and recall score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation recall score: {:.2f}".format(grid_search.best_score_))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters found:  {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation recall score: 0.90


In [83]:
final_model = LogisticRegression(C=1 ,penalty ='l1',solver='liblinear')
final_model.fit(X_train_oversampled,y_train_oversampled)


## Testing on Unseen Data .

In [84]:
y_preds = final_model.predict(X_test_preprocessed)
recall_score(y_test,y_preds)


0.7313432835820896