In [437]:
# importing modules 
import pandas as pd
import numpy as np

In [438]:
df = pd.read_csv('insurance_claims.csv')
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,insured_occupation,insured_hobbies,insured_relationship,capital-gains,capital-loss,incident_date,incident_type,collision_type,incident_severity,authorities_contacted,incident_state,incident_city,incident_location,incident_hour_of_the_day,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,_c39
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,MALE,MD,craft-repair,sleeping,husband,53300,0,2015-01-25,Single Vehicle Collision,Side Collision,Major Damage,Police,SC,Columbus,9935 4th Drive,5,1,YES,1,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y,
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,MALE,MD,machine-op-inspct,reading,other-relative,0,0,2015-01-21,Vehicle Theft,?,Minor Damage,Police,VA,Riverwood,6608 MLK Hwy,8,1,?,0,0,?,5070,780,780,3510,Mercedes,E400,2007,Y,
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,FEMALE,PhD,sales,board-games,own-child,35100,0,2015-02-22,Multi-vehicle Collision,Rear Collision,Minor Damage,Police,NY,Columbus,7121 Francis Lane,7,3,NO,2,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N,
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,FEMALE,PhD,armed-forces,board-games,unmarried,48900,-62400,2015-01-10,Single Vehicle Collision,Front Collision,Major Damage,Police,OH,Arlington,6956 Maple Drive,5,1,?,1,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y,
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,MALE,Associate,sales,board-games,unmarried,66000,-46000,2015-02-17,Vehicle Theft,?,Minor Damage,,NY,Arlington,3041 3rd Ave,20,1,NO,0,1,NO,6500,1300,650,4550,Accura,RSX,2009,N,


In [439]:
df.isna().sum().sort_values(ascending=False)

_c39                           1000
authorities_contacted            91
fraud_reported                    0
auto_year                         0
incident_state                    0
incident_city                     0
incident_location                 0
incident_hour_of_the_day          0
number_of_vehicles_involved       0
property_damage                   0
bodily_injuries                   0
witnesses                         0
age                               0
total_claim_amount                0
injury_claim                      0
property_claim                    0
vehicle_claim                     0
auto_make                         0
auto_model                        0
police_report_available           0
months_as_customer                0
collision_type                    0
insured_zip                       0
policy_number                     0
policy_bind_date                  0
policy_state                      0
policy_csl                        0
policy_deductable           

In [440]:
df.drop('_c39', axis=1, inplace=True)

# Identifying the numeric and categorical features.

We start by identifying the numericc and the categorical features

## 1. Numeric features

In [441]:
numeric_df = df.select_dtypes(exclude='object')
numeric_df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_year
0,328,48,521585,1000,1406.91,0,466132,53300,0,5,1,1,2,71610,6510,13020,52080,2004
1,228,42,342868,2000,1197.22,5000000,468176,0,0,8,1,0,0,5070,780,780,3510,2007
2,134,29,687698,2000,1413.14,5000000,430632,35100,0,7,3,2,3,34650,7700,3850,23100,2007
3,256,41,227811,2000,1415.74,6000000,608117,48900,-62400,5,1,1,2,63400,6340,6340,50720,2014
4,228,44,367455,1000,1583.91,6000000,610706,66000,-46000,20,1,0,1,6500,1300,650,4550,2009


From this we note that some of the numeric features should be treated as categorical features. This include;
- policy_number
- insured_zip

In [442]:
df['policy_number'] = df['policy_number'].astype('object')
df['insured_zip'] = df['insured_zip'].astype('object')

In [443]:
df['incident_hour_of_the_day'].value_counts().sort_index().index

Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23],
      dtype='int64', name='incident_hour_of_the_day')

We note that for incident hour of the day, the hours range from 0 to 23.

Since this feature is more cyclic than linear, we can introduce sine_hour and cosine_hour to capture the cyclic behaviour.

### Cyclical Encoding for Hour of the Day

To represent the hour of the day as a cyclical feature, we use the following transformations:

1. **Sine of the hour**:
   $$\text{sine\_hour} = \sin\left(\frac{2 \pi \times \text{hour}}{24}\right)$$

2. **Cosine of the hour**:
   $$\text{cosine\_hour} = \cos\left(\frac{2 \pi \times \text{hour}}{24}\right)$$

In [444]:
df['incident_sine_hour'] = df['incident_hour_of_the_day'].map(
    lambda x: np.sin(2 * np.pi * x / 24)
)

df['incident_cosine_hour'] = df['incident_hour_of_the_day'].map(
    lambda x: np.cos(2 * np.pi * x / 24)
)
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,insured_occupation,insured_hobbies,insured_relationship,capital-gains,capital-loss,incident_date,incident_type,collision_type,incident_severity,authorities_contacted,incident_state,incident_city,incident_location,incident_hour_of_the_day,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,incident_sine_hour,incident_cosine_hour
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,MALE,MD,craft-repair,sleeping,husband,53300,0,2015-01-25,Single Vehicle Collision,Side Collision,Major Damage,Police,SC,Columbus,9935 4th Drive,5,1,YES,1,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y,0.965926,0.258819
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,MALE,MD,machine-op-inspct,reading,other-relative,0,0,2015-01-21,Vehicle Theft,?,Minor Damage,Police,VA,Riverwood,6608 MLK Hwy,8,1,?,0,0,?,5070,780,780,3510,Mercedes,E400,2007,Y,0.866025,-0.5
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,FEMALE,PhD,sales,board-games,own-child,35100,0,2015-02-22,Multi-vehicle Collision,Rear Collision,Minor Damage,Police,NY,Columbus,7121 Francis Lane,7,3,NO,2,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N,0.965926,-0.258819
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,FEMALE,PhD,armed-forces,board-games,unmarried,48900,-62400,2015-01-10,Single Vehicle Collision,Front Collision,Major Damage,Police,OH,Arlington,6956 Maple Drive,5,1,?,1,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y,0.965926,0.258819
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,MALE,Associate,sales,board-games,unmarried,66000,-46000,2015-02-17,Vehicle Theft,?,Minor Damage,,NY,Arlington,3041 3rd Ave,20,1,NO,0,1,NO,6500,1300,650,4550,Accura,RSX,2009,N,-0.866025,0.5


In [445]:
# dropping the incident_hour _of_day dolumn
df.drop('incident_hour_of_the_day', axis=1, inplace=True)

## 2. Categorical features

In [446]:
pd.set_option('display.max_columns', None)
categorical_df =  df.select_dtypes(include='object')
categorical_df.head()

Unnamed: 0,policy_number,policy_bind_date,policy_state,policy_csl,insured_zip,insured_sex,insured_education_level,insured_occupation,insured_hobbies,insured_relationship,incident_date,incident_type,collision_type,incident_severity,authorities_contacted,incident_state,incident_city,incident_location,property_damage,police_report_available,auto_make,auto_model,fraud_reported
0,521585,2014-10-17,OH,250/500,466132,MALE,MD,craft-repair,sleeping,husband,2015-01-25,Single Vehicle Collision,Side Collision,Major Damage,Police,SC,Columbus,9935 4th Drive,YES,YES,Saab,92x,Y
1,342868,2006-06-27,IN,250/500,468176,MALE,MD,machine-op-inspct,reading,other-relative,2015-01-21,Vehicle Theft,?,Minor Damage,Police,VA,Riverwood,6608 MLK Hwy,?,?,Mercedes,E400,Y
2,687698,2000-09-06,OH,100/300,430632,FEMALE,PhD,sales,board-games,own-child,2015-02-22,Multi-vehicle Collision,Rear Collision,Minor Damage,Police,NY,Columbus,7121 Francis Lane,NO,NO,Dodge,RAM,N
3,227811,1990-05-25,IL,250/500,608117,FEMALE,PhD,armed-forces,board-games,unmarried,2015-01-10,Single Vehicle Collision,Front Collision,Major Damage,Police,OH,Arlington,6956 Maple Drive,?,NO,Chevrolet,Tahoe,Y
4,367455,2014-06-06,IL,500/1000,610706,MALE,Associate,sales,board-games,unmarried,2015-02-17,Vehicle Theft,?,Minor Damage,,NY,Arlington,3041 3rd Ave,NO,NO,Accura,RSX,N


We start by combining the make and model as a single feature

In [447]:
df['auto_make_model'] = df['auto_make'] + '_' + df['auto_model']
df.drop(['auto_make', 'auto_model'], axis=1, inplace=True)
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,insured_occupation,insured_hobbies,insured_relationship,capital-gains,capital-loss,incident_date,incident_type,collision_type,incident_severity,authorities_contacted,incident_state,incident_city,incident_location,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_year,fraud_reported,incident_sine_hour,incident_cosine_hour,auto_make_model
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,MALE,MD,craft-repair,sleeping,husband,53300,0,2015-01-25,Single Vehicle Collision,Side Collision,Major Damage,Police,SC,Columbus,9935 4th Drive,1,YES,1,2,YES,71610,6510,13020,52080,2004,Y,0.965926,0.258819,Saab_92x
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,MALE,MD,machine-op-inspct,reading,other-relative,0,0,2015-01-21,Vehicle Theft,?,Minor Damage,Police,VA,Riverwood,6608 MLK Hwy,1,?,0,0,?,5070,780,780,3510,2007,Y,0.866025,-0.5,Mercedes_E400
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,FEMALE,PhD,sales,board-games,own-child,35100,0,2015-02-22,Multi-vehicle Collision,Rear Collision,Minor Damage,Police,NY,Columbus,7121 Francis Lane,3,NO,2,3,NO,34650,7700,3850,23100,2007,N,0.965926,-0.258819,Dodge_RAM
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,FEMALE,PhD,armed-forces,board-games,unmarried,48900,-62400,2015-01-10,Single Vehicle Collision,Front Collision,Major Damage,Police,OH,Arlington,6956 Maple Drive,1,?,1,2,NO,63400,6340,6340,50720,2014,Y,0.965926,0.258819,Chevrolet_Tahoe
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,MALE,Associate,sales,board-games,unmarried,66000,-46000,2015-02-17,Vehicle Theft,?,Minor Damage,,NY,Arlington,3041 3rd Ave,1,NO,0,1,NO,6500,1300,650,4550,2009,N,-0.866025,0.5,Accura_RSX


## Handling dates

In [448]:
date_columns = ['policy_bind_date', 'incident_date']

# converting to datetime
df[date_columns] = df[date_columns].apply(pd.to_datetime)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 39 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   months_as_customer           1000 non-null   int64         
 1   age                          1000 non-null   int64         
 2   policy_number                1000 non-null   object        
 3   policy_bind_date             1000 non-null   datetime64[ns]
 4   policy_state                 1000 non-null   object        
 5   policy_csl                   1000 non-null   object        
 6   policy_deductable            1000 non-null   int64         
 7   policy_annual_premium        1000 non-null   float64       
 8   umbrella_limit               1000 non-null   int64         
 9   insured_zip                  1000 non-null   object        
 10  insured_sex                  1000 non-null   object        
 11  insured_education_level      1000 non-null  

In [449]:
# function to split date to year month and day
def split_date_col(df, date_cols):
    """Splits the dates of the specified columns 
    and adds them as new features also deletes original date column
    
    Parameters:
        df(Pd.DataFrame): the dataframe containing the features
        date_cols([str]): list of the date columns
    Returns:
        Pd.Dataframe: edited df with new features
    """
    
    # iterating through the cols
    for col in date_cols:
        # convert to datetime
        date = df[col].apply(pd.to_datetime)
        # split the date
        df[f'{col}_year'] = df[col].dt.year
        # make month and day as cosine and sine to show the cyclic nature
        df[f'{col}_cosine_month'] = df[col].dt.month.apply(
            lambda x: np.cos(2 * np.pi * x / 12)
        )
        df[f'{col}_sine_month'] = df[col].dt.month.apply(
            lambda x: np.sin(2 * np.pi * x / 12)
        )
        df[f'{col}_cosine_day'] = df[col].dt.day.apply(
            lambda x: np.cos(2 * np.pi * x / 12)
        )
        df[f'{col}_sine_day'] = df[col].dt.day.apply(
            lambda x: np.sin(2 * np.pi * x / 12)
        )
        
        
        # drop original_date
        df.drop(col, axis=1, inplace=True)
        
    return df

In [450]:
df = split_date_col(df, date_columns)
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,insured_occupation,insured_hobbies,insured_relationship,capital-gains,capital-loss,incident_type,collision_type,incident_severity,authorities_contacted,incident_state,incident_city,incident_location,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_year,fraud_reported,incident_sine_hour,incident_cosine_hour,auto_make_model,policy_bind_date_year,policy_bind_date_cosine_month,policy_bind_date_sine_month,policy_bind_date_cosine_day,policy_bind_date_sine_day,incident_date_year,incident_date_cosine_month,incident_date_sine_month,incident_date_cosine_day,incident_date_sine_day
0,328,48,521585,OH,250/500,1000,1406.91,0,466132,MALE,MD,craft-repair,sleeping,husband,53300,0,Single Vehicle Collision,Side Collision,Major Damage,Police,SC,Columbus,9935 4th Drive,1,YES,1,2,YES,71610,6510,13020,52080,2004,Y,0.965926,0.258819,Saab_92x,2014,0.5,-0.8660254,-0.8660254,0.5,2015,0.866025,0.5,0.8660254,0.5
1,228,42,342868,IN,250/500,2000,1197.22,5000000,468176,MALE,MD,machine-op-inspct,reading,other-relative,0,0,Vehicle Theft,?,Minor Damage,Police,VA,Riverwood,6608 MLK Hwy,1,?,0,0,?,5070,780,780,3510,2007,Y,0.866025,-0.5,Mercedes_E400,2006,-1.0,1.224647e-16,5.510911e-16,1.0,2015,0.866025,0.5,-4.286264e-16,-1.0
2,134,29,687698,OH,100/300,2000,1413.14,5000000,430632,FEMALE,PhD,sales,board-games,own-child,35100,0,Multi-vehicle Collision,Rear Collision,Minor Damage,Police,NY,Columbus,7121 Francis Lane,3,NO,2,3,NO,34650,7700,3850,23100,2007,N,0.965926,-0.258819,Dodge_RAM,2000,-1.83697e-16,-1.0,-1.0,1.224647e-16,2015,0.5,0.866025,0.5,-0.866025
3,256,41,227811,IL,250/500,2000,1415.74,6000000,608117,FEMALE,PhD,armed-forces,board-games,unmarried,48900,-62400,Single Vehicle Collision,Front Collision,Major Damage,Police,OH,Arlington,6956 Maple Drive,1,?,1,2,NO,63400,6340,6340,50720,2014,Y,0.965926,0.258819,Chevrolet_Tahoe,1990,-0.8660254,0.5,0.8660254,0.5,2015,0.866025,0.5,0.5,-0.866025
4,228,44,367455,IL,500/1000,1000,1583.91,6000000,610706,MALE,Associate,sales,board-games,unmarried,66000,-46000,Vehicle Theft,?,Minor Damage,,NY,Arlington,3041 3rd Ave,1,NO,0,1,NO,6500,1300,650,4550,2009,N,-0.866025,0.5,Accura_RSX,2014,-1.0,1.224647e-16,-1.0,1.224647e-16,2015,0.5,0.866025,-0.8660254,0.5


From the eda, the total claim amount was hihgly corelated with all the other claim amount features.
We remove it to avoid collinearity.

In [451]:
df.drop('total_claim_amount', axis=1, inplace=True)

## Data cleaning

In [452]:
df['collision_type'].value_counts()

collision_type
Rear Collision     292
Side Collision     276
Front Collision    254
?                  178
Name: count, dtype: int64

Notice that some categorical features contain '?' as a catrgory, We can replace ? with 'unknown'

In [453]:
df.replace('?', 'unknown', inplace=True)

In [454]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 46 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   months_as_customer             1000 non-null   int64  
 1   age                            1000 non-null   int64  
 2   policy_number                  1000 non-null   object 
 3   policy_state                   1000 non-null   object 
 4   policy_csl                     1000 non-null   object 
 5   policy_deductable              1000 non-null   int64  
 6   policy_annual_premium          1000 non-null   float64
 7   umbrella_limit                 1000 non-null   int64  
 8   insured_zip                    1000 non-null   object 
 9   insured_sex                    1000 non-null   object 
 10  insured_education_level        1000 non-null   object 
 11  insured_occupation             1000 non-null   object 
 12  insured_hobbies                1000 non-null   ob

Since incident location is unique  for every row we drop it .

In [455]:
df.drop('incident_location' ,axis=1,inplace=True)

In [456]:
# convert the dataframe to a csv
# df.to_csv('featuure_engineered.csv', index=False) 

## One hot encoding and scaling

In [457]:

df_data = pd.read_csv('featuure_engineered.csv')

In [458]:
#Select categorical columns for one-hot encoding and numeric columns for scaling
numeric_features = df_data.select_dtypes(exclude='object').columns.tolist()
categorical_features = df_data.select_dtypes(include='object').columns.tolist()


In [459]:
# Remove target feature
categorical_features.remove('fraud_reported')

In [460]:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Set up ColumnTransformer for OHE and Scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)


In [461]:
df_data.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,insured_occupation,insured_hobbies,insured_relationship,capital-gains,capital-loss,incident_type,collision_type,incident_severity,authorities_contacted,incident_state,incident_city,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,injury_claim,property_claim,vehicle_claim,auto_year,fraud_reported,incident_sine_hour,incident_cosine_hour,auto_make_model,policy_bind_date_year,policy_bind_date_cosine_month,policy_bind_date_sine_month,policy_bind_date_cosine_day,policy_bind_date_sine_day,incident_date_year,incident_date_cosine_month,incident_date_sine_month,incident_date_cosine_day,incident_date_sine_day
0,328,48,521585,OH,250/500,1000,1406.91,0,466132,MALE,MD,craft-repair,sleeping,husband,53300,0,Single Vehicle Collision,Side Collision,Major Damage,Police,SC,Columbus,1,YES,1,2,YES,6510,13020,52080,2004,Y,0.965926,0.258819,Saab_92x,2014,0.5,-0.8660254,-0.8660254,0.5,2015,0.866025,0.5,0.8660254,0.5
1,228,42,342868,IN,250/500,2000,1197.22,5000000,468176,MALE,MD,machine-op-inspct,reading,other-relative,0,0,Vehicle Theft,unknown,Minor Damage,Police,VA,Riverwood,1,unknown,0,0,unknown,780,780,3510,2007,Y,0.866025,-0.5,Mercedes_E400,2006,-1.0,1.224647e-16,5.510911e-16,1.0,2015,0.866025,0.5,-4.286264e-16,-1.0
2,134,29,687698,OH,100/300,2000,1413.14,5000000,430632,FEMALE,PhD,sales,board-games,own-child,35100,0,Multi-vehicle Collision,Rear Collision,Minor Damage,Police,NY,Columbus,3,NO,2,3,NO,7700,3850,23100,2007,N,0.965926,-0.258819,Dodge_RAM,2000,-1.83697e-16,-1.0,-1.0,1.224647e-16,2015,0.5,0.866025,0.5,-0.866025
3,256,41,227811,IL,250/500,2000,1415.74,6000000,608117,FEMALE,PhD,armed-forces,board-games,unmarried,48900,-62400,Single Vehicle Collision,Front Collision,Major Damage,Police,OH,Arlington,1,unknown,1,2,NO,6340,6340,50720,2014,Y,0.965926,0.258819,Chevrolet_Tahoe,1990,-0.8660254,0.5,0.8660254,0.5,2015,0.866025,0.5,0.5,-0.866025
4,228,44,367455,IL,500/1000,1000,1583.91,6000000,610706,MALE,Associate,sales,board-games,unmarried,66000,-46000,Vehicle Theft,unknown,Minor Damage,,NY,Arlington,1,NO,0,1,NO,1300,650,4550,2009,N,-0.866025,0.5,Accura_RSX,2014,-1.0,1.224647e-16,-1.0,1.224647e-16,2015,0.5,0.866025,-0.8660254,0.5


# Performing train test split

In [462]:

from sklearn.model_selection import train_test_split
# Defining X and y
X = df_data.drop(columns=['fraud_reported']) 
y = df_data['fraud_reported']

#performing LabelEncoding
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
y = lb.fit_transform(y)


# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [463]:

# Fit and transform the training data, transform the test data
X_train_preprocessed= preprocessor.fit_transform(X_train)
X_test_preprocessed= preprocessor.transform(X_test)

In [464]:
lb.get_params()

{'neg_label': 0, 'pos_label': 1, 'sparse_output': False}

In [465]:
lb.classes_

array(['N', 'Y'], dtype='<U1')

N - 0
Y - 1

## Modelling .

Importing the necessary libraries to use in this part

In [466]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.model_selection import GridSearchCV

Lets create a 'modeling' class which provides a structure for training and evaluating machine learning models, specifically using a classifiers such as 
* RandomForest
* XGboost

In [467]:
class Modeling:
    def __init__(self):
        pass

    def models(self,classifier,X_train_preprocessed,y_train,X_test_preprocessed,y_test):
            # Fit the model
            classifier.fit(X_train_preprocessed,y_train)

            #Make predictions
            y_hat_train = classifier.predict(X_train_preprocessed)
            y_hat_test = classifier.predict(X_test_preprocessed)  

            #Print the accuracy scores for the model test
            train_acc = accuracy_score(y_train,y_hat_train)
            test_acc = accuracy_score(y_test,y_hat_test)
            print("\n"f"The model has an accuracy of {test_acc*100:.2f}% on the test test")
            print(f"The Model has an accuracy of {train_acc*100:.2f}% on the train test")
            
            return classifier.score(X_test_preprocessed, y_test)
        
    def cross_val(self, classifier, X_train, y_train):
        # Perform Cross-Validation
        cv = KFold(n_splits=5, shuffle=True, random_state=42)

        # Evaluate the model using cross-validation
        cv_results = cross_val_score(classifier, X_train, y_train, cv=cv, scoring='accuracy')

        print(f'Cross-Validation Accuracy Scores: {cv_results}')
        print(f'Mean Accuracy: {cv_results.mean()}')
        print(f'Standard Deviation: {cv_results.std()}')          


In [468]:
modeler =Modeling()

   ### (1)Baseline Models

### I.Logistic Regression

In [469]:

# Initialize the model
logreg = LogisticRegression(class_weight='balanced',
                            penalty='l2',
                            fit_intercept=False,
                            C=1e12, solver='liblinear') 



In [470]:
modeler.models(logreg, X_train_preprocessed, y_train, X_test_preprocessed, y_test)


The model has an accuracy of 74.50% on the test test
The Model has an accuracy of 91.50% on the train test


  y = column_or_1d(y, warn=True)


0.745

In [471]:
modeler.cross_val(logreg,X_train_preprocessed, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Cross-Validation Accuracy Scores: [0.6875  0.75625 0.78125 0.8     0.79375]
Mean Accuracy: 0.7637500000000002
Standard Deviation: 0.04096492402043485


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


It looks like Our logistic regression model achieved an overall accuracy of 74%, which is decent but could potentially be improved, especially for the minority class (1), as seen in the metrics.

## II)RandomForest

In [472]:
'''
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}
'''
forest_model= RandomForestClassifier(n_estimators=100 ,
                                max_depth = 5,
                                class_weight='balanced_subsample',
                                )



In [473]:
# # Use GridSearchCV to test different combinations
# grid_search = GridSearchCV(forest_model, param_grid=param_grid, cv=2, scoring='accuracy',n_jobs=-1)
# grid_search.fit(X_train_preprocessed, y_train.ravel())


In [474]:
modeler.models(forest_model, X_train_preprocessed, y_train, X_test_preprocessed, y_test)


  classifier.fit(X_train_preprocessed,y_train)



The model has an accuracy of 80.00% on the test test
The Model has an accuracy of 90.50% on the train test


0.8

In [475]:
modeler.cross_val(forest_model,X_train_preprocessed, y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Cross-Validation Accuracy Scores: [0.825   0.81875 0.8     0.85625 0.825  ]
Mean Accuracy: 0.825
Standard Deviation: 0.018114220932736774


In [476]:
from sklearn.ensemble import AdaBoostClassifier
aboost = AdaBoostClassifier()

In [477]:
modeler.models(aboost, X_train_preprocessed, y_train, X_test_preprocessed, y_test)

  y = column_or_1d(y, warn=True)



The model has an accuracy of 72.50% on the test test
The Model has an accuracy of 89.38% on the train test


0.725

In [478]:
Xg = XGBClassifier()


In [479]:
modeler.models(Xg, X_train_preprocessed, y_train, X_test_preprocessed, y_test) 


The model has an accuracy of 77.50% on the test test
The Model has an accuracy of 100.00% on the train test


0.775

#### Feedforward neural network (FFNN)

In [480]:
import tensorflow
from tensorflow.keras import models
from tensorflow.keras import layers


In [482]:
ts_model = models.Sequential()
ts_model.add(layers.Dense(10,input_shape=(X_train_preprocessed.shape[1],)))
ts_model.add(layers.Dense(1, activation='sigmoid'))
ts_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [486]:
ts_model.fit(X_train_preprocessed,y_train,epochs=100)


Epoch 1/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9236 - loss: 0.2147
Epoch 2/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9105 - loss: 0.2280  
Epoch 3/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9132 - loss: 0.2188
Epoch 4/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9150 - loss: 0.2404 
Epoch 5/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9060 - loss: 0.2391 
Epoch 6/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9227 - loss: 0.2050
Epoch 7/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9229 - loss: 0.2283
Epoch 8/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9042 - loss: 0.2432
Epoch 9/100
[1m25/25[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x157d4497d90>

In [487]:
ts_model.evaluate(X_train_preprocessed,y_train)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8992 - loss: 0.2549


[0.2089017629623413, 0.925000011920929]

In [488]:
ts_model.evaluate(X_test_preprocessed,y_test)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7332 - loss: 0.7471 


[0.8079506158828735, 0.7250000238418579]