# Modelling

In [32]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.model_selection import GridSearchCV

In [33]:
df = pd.read_csv('featuure_engineered.csv')
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,...,policy_bind_date_year,policy_bind_date_cosine_month,policy_bind_date_sine_month,policy_bind_date_cosine_day,policy_bind_date_sine_day,incident_date_year,incident_date_cosine_month,incident_date_sine_month,incident_date_cosine_day,incident_date_sine_day
0,328,48,521585,OH,250/500,1000,1406.91,0,466132,MALE,...,2014,0.5,-0.8660254,-0.8660254,0.5,2015,0.866025,0.5,0.8660254,0.5
1,228,42,342868,IN,250/500,2000,1197.22,5000000,468176,MALE,...,2006,-1.0,1.224647e-16,5.510911e-16,1.0,2015,0.866025,0.5,-4.286264e-16,-1.0
2,134,29,687698,OH,100/300,2000,1413.14,5000000,430632,FEMALE,...,2000,-1.83697e-16,-1.0,-1.0,1.224647e-16,2015,0.5,0.866025,0.5,-0.866025
3,256,41,227811,IL,250/500,2000,1415.74,6000000,608117,FEMALE,...,1990,-0.8660254,0.5,0.8660254,0.5,2015,0.866025,0.5,0.5,-0.866025
4,228,44,367455,IL,500/1000,1000,1583.91,6000000,610706,MALE,...,2014,-1.0,1.224647e-16,-1.0,1.224647e-16,2015,0.5,0.866025,-0.8660254,0.5


In [34]:
print('Duplicated rows:', df.duplicated().sum())
print('Missing values:', df.isna().sum().sum())

Duplicated rows: 0
Missing values: 91


In [35]:

from sklearn.pipeline import Pipeline
import numpy as np

In [36]:
#Select categorical columns for one-hot encoding and numeric columns for scaling
numeric_features = df.select_dtypes(exclude='object').columns.tolist()
categorical_features = df.select_dtypes(include='object').columns.tolist()

# Remove target feature from categorica list
categorical_features.remove('fraud_reported')
# Preprocessing the features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_encoder', OneHotEncoder(drop='first', sparse_output=False),
         categorical_features),
        ('scaler', StandardScaler(), numeric_features)
    ]
)

In [37]:
# Defining X and y
X = df.drop(columns=['fraud_reported']) 
y = df['fraud_reported']

#performing LabelEncoding
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
y = lb.fit_transform(y)


# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [38]:
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [39]:
lb.classes_

array(['N', 'Y'], dtype='<U1')

`0 represents N: not fraud case`

`1 represents Y: fraud case`

In [40]:
df['fraud_reported'].value_counts(normalize=True)

fraud_reported
N    0.753
Y    0.247
Name: proportion, dtype: float64

Noted some class imbalance in the target column. 

In [41]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn.metrics import recall_score, precision_score, f1_score

In [42]:
class Modeling:
    """Class to carry out model fiting and cross validation
    Attributes:
        model: The model 
        X_train: training features
        y_train: target
    """
    def __init__(self, model, preprocessor, X_train, y_train):
        self.model = model
        self.X_train = X_train
        self.y_train = y_train
        self.preprocessor = preprocessor

    def train_base(self):
        """Fif with only preprocessing which only includes scaling and ohe"""
        # Fit the model
        model_pipe = Pipeline([
            ('preprocessing', self.preprocessor),
            ('model', self.model)
        ])
        score = cross_val_score(model_pipe,
                                self.X_train,
                                self.y_train,
                                scoring='accuracy',
                                ).mean() 
        
        print("\n"f"The model(with normal preprocessing) has an accuracy of {score*100:.2f}%")
        self.model = model_pipe
        self.model.fit(self.X_train, self.y_train)
        return self.model
        
    def train_with_smote(self):
        """Includes oversampling with smote to reduce class imbalance"""
        # Perform Cross-Validation
        scores = []
        # splits for cross validation
        skf = StratifiedKFold(n_splits=3, random_state=42)
        for i, (train_idx, test_idx) in enumerate(skf.split(self.X_train, self.y_train)):
            # train and test data for the validation
            cv_X_train = self.X_train.iloc[train_idx, :]
            cv_X_test = self.X_train.iloc[test_idx, :]
            cv_y_train = self.y_train[train_idx]
            cv_y_test = self.y_train[test_idx]
            
            preprocessor_clone = clone(preprocessor)
            
            # preprocessing the features
            cv_X_train_preprocessed = preprocessor_clone.fit_transform(cv_X_train)
            cv_X_test_preprocessed = preprocessor_clone.transform(cv_X_test)
            
            # Retrieve column names from each transformer
            if i == 0:
                column_names = []
                for name, transformer, columns in preprocessor_clone.transformers_:
                    if hasattr(transformer, 'get_feature_names_out'):
                        column_names.extend(transformer.get_feature_names_out(columns))
                    else:
                        column_names.extend(columns) 
            
            # cloning the model
            model_clone = clone(self.model)
            print(column_names)
            return
            
            # oversampling with smote to fix class imbalance
            X_oversampled, y_oversampled = SMOTE().fit_resample(cv_X_train_preprocessed, cv_y_train)
            X_oversampled = pd.DataFrame(X_oversampled, columns=column_names)
            model_clone.fit(X_oversampled, y_oversampled)
            score = model_clone.score(cv_X_test_preprocessed, cv_y_test)
            scores.append(score)
        
        cv_X_train_preprocessed = preprocessor.fit_transform(self.X_train)
        
        X_oversampled, y_oversampled = SMOTE().fit_resample(cv_X_train_preprocessed, self.y_train)
        
        print("\n"f"The model(with SMOTE) has an accuracy of {np.mean(scores)*100:.2f}%")
        self.model.fit(X_oversampled, y_oversampled)
        return self.model
    
    def evaluate_on_test(self, X_test, y_test):
        """evaluates the model on unseen test data"""
        y_pred = self.model.predict(X_test)
        print('Recall Score:', recall_score(y_test, y_pred))
        print('Precision Score:', precision_score(y_test, y_pred))
        print('F1 Score:', f1_score(y_test, y_pred))
            
            

In [43]:
log_reg = LogisticRegression(solver='liblinear')

model_1 = Modeling(log_reg, preprocessor, X_train, y_train)

fitted_model_1 = model_1.train_base()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



The model(with normal preprocessing) has an accuracy of 81.73%


  y = column_or_1d(y, warn=True)


In [44]:
# fitted_model_1_smote = model_1.train_with_smote()

In [45]:
rf_clf = RandomForestClassifier()
model_2 = Modeling(rf_clf, preprocessor, X_train, y_train)

In [46]:
fitted_model_2 = model_2.train_base()

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)



The model(with normal preprocessing) has an accuracy of 76.80%


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [47]:
# fitted_model_2_smote = model_2.train_with_smote()

In [58]:
xgb = XGBClassifier(
    colsample_bytree=1.0,
    gamma=0,
    learning_rate=0.2,
    max_depth=7,
    n_estimators=100,
    subsample=1.0,
    scale_pos_weight=3,
    eval_metric='logloss',  # Optional, based on your metric of choice
    #use_label_encoder=False
)
model_3 = Modeling(xgb, preprocessor, X_train, y_train)

In [49]:
# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(xgb, param_grid=param_grid, 
                           scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

# Fit GridSearchCV to the data
grid_search.fit(X_train_preprocessed, y_train)

# Get the best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best parameters found:  {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}
Best cross-validation score: 0.83


In [59]:
fitted_model_3 = model_3.train_base()


The model(with normal preprocessing) has an accuracy of 83.73%


In [51]:
#fitted_model_3_smote = model_3.train_with_smote()

In [52]:
from sklearn.ensemble import AdaBoostClassifier


In [53]:
ada = AdaBoostClassifier()
model_4 = Modeling(ada, preprocessor, X_train, y_train)

In [54]:
fitted_model_4 = model_4.train_base()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



The model(with normal preprocessing) has an accuracy of 80.27%


  y = column_or_1d(y, warn=True)


In [55]:
# fitted_model_4_smote = model_4.train_with_smote()

The logistic regression model and the xgboost are the best performing with an accuracy of about 82%.

In [56]:
model_1.evaluate_on_test(X_test, y_test)

Recall Score: 0.3283582089552239
Precision Score: 0.4583333333333333
F1 Score: 0.3826086956521739


In [57]:
model_3.evaluate_on_test(X_test, y_test)

Recall Score: 0.5074626865671642
Precision Score: 0.576271186440678
F1 Score: 0.5396825396825398
