In [593]:
import pandas as pd
import numpy as np

In [594]:
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv")
variable_data = pd.read_csv("VariableDescription.csv")

In [595]:
train_data.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0


In [596]:
test_data.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code
0,H11920,2013,1.0,0,V,N,O,R,300.0,1,1960.0,3,3310
1,H11921,2016,0.997268,0,V,N,O,R,300.0,1,1960.0,3,3310
2,H9805,2013,0.369863,0,V,V,V,U,790.0,1,1960.0,.,3310
3,H7493,2014,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321
4,H7494,2016,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321


In [597]:
train_data.columns

Index(['Customer Id', 'YearOfObservation', 'Insured_Period', 'Residential',
       'Building_Painted', 'Building_Fenced', 'Garden', 'Settlement',
       'Building Dimension', 'Building_Type', 'Date_of_Occupancy',
       'NumberOfWindows', 'Geo_Code', 'Claim'],
      dtype='object')

In [598]:
test_data.columns

Index(['Customer Id', 'YearOfObservation', 'Insured_Period', 'Residential',
       'Building_Painted', 'Building_Fenced', 'Garden', 'Settlement',
       'Building Dimension', 'Building_Type', 'Date_of_Occupancy',
       'NumberOfWindows', 'Geo_Code'],
      dtype='object')

In [599]:
test_data.shape, train_data.shape

((3069, 13), (7160, 14))

In [600]:
variable_data

Unnamed: 0,Variable,Description
0,Customer Id,Identification number for the Policy holder
1,YearOfObservation,year of observation for the insured policy
2,Insured_Period,duration of insurance policy in Olusola Insura...
3,Residential,is the building a residential building or not
4,Building_Painted,"is the building painted or not (N-Painted, V-N..."
5,Building_Fenced,"is the building fence or not (N-Fenced, V-Not ..."
6,Garden,building has garden or not (V-has garden; O-no...
7,Settlement,Area where the building is located. (R- rural ...
8,Building Dimension,Size of the insured building in m2
9,Building_Type,"The type of building (Type 1, 2, 3, 4)"


In [601]:
# first we want to correct the columns so that they become lower cased and remove spaces
train_data.columns = train_data.columns.str.lower().str.replace(" ", "_")

test_data.columns = test_data.columns.str.lower().str.replace(" ", "_")

# check if thy are corrected
train_data.columns

Index(['customer_id', 'yearofobservation', 'insured_period', 'residential',
       'building_painted', 'building_fenced', 'garden', 'settlement',
       'building_dimension', 'building_type', 'date_of_occupancy',
       'numberofwindows', 'geo_code', 'claim'],
      dtype='object')

In [602]:
# we describe our data to check if there any missing values
train_data.describe()

Unnamed: 0,yearofobservation,insured_period,residential,building_dimension,building_type,date_of_occupancy,claim
count,7160.0,7160.0,7160.0,7054.0,7160.0,6652.0,7160.0
mean,2013.669553,0.909758,0.305447,1883.72753,2.186034,1964.456404,0.228212
std,1.383769,0.239756,0.460629,2278.157745,0.940632,36.002014,0.419709
min,2012.0,0.0,0.0,1.0,1.0,1545.0,0.0
25%,2012.0,0.997268,0.0,528.0,2.0,1960.0,0.0
50%,2013.0,1.0,0.0,1083.0,2.0,1970.0,0.0
75%,2015.0,1.0,1.0,2289.75,3.0,1980.0,0.0
max,2016.0,1.0,1.0,20940.0,4.0,2016.0,1.0


In [603]:
# we use is null to see
train_data.isnull().sum()

customer_id             0
yearofobservation       0
insured_period          0
residential             0
building_painted        0
building_fenced         0
garden                  7
settlement              0
building_dimension    106
building_type           0
date_of_occupancy     508
numberofwindows         0
geo_code              102
claim                   0
dtype: int64

In [604]:
# we can also check the datatype to see the column of the missing values this will help on imputation method
train_data.dtypes

customer_id            object
yearofobservation       int64
insured_period        float64
residential             int64
building_painted       object
building_fenced        object
garden                 object
settlement             object
building_dimension    float64
building_type           int64
date_of_occupancy     float64
numberofwindows        object
geo_code               object
claim                   int64
dtype: object

In [605]:
# for garden and geocode we going to use countfrequencey encoder
from feature_engine.imputation import CategoricalImputer

ci = CategoricalImputer(imputation_method="frequent")
train_data[["garden", "geo_code"]] = ci.fit_transform(
    train_data[["garden", "geo_code"]]
)
test_data[["garden", "geo_code"]] = ci.fit_transform(test_data[["garden", "geo_code"]])

# for date and bulding we can use mean median imputter
from feature_engine.imputation import MeanMedianImputer

mmi = MeanMedianImputer(imputation_method="median")
train_data[["building_dimension", "date_of_occupancy"]] = mmi.fit_transform(
    train_data[["building_dimension", "date_of_occupancy"]]
)
test_data[["building_dimension", "date_of_occupancy"]] = mmi.fit_transform(
    test_data[["building_dimension", "date_of_occupancy"]]
)

In [606]:
# check if there still any missing
train_data.isnull().sum()

customer_id           0
yearofobservation     0
insured_period        0
residential           0
building_painted      0
building_fenced       0
garden                0
settlement            0
building_dimension    0
building_type         0
date_of_occupancy     0
numberofwindows       0
geo_code              0
claim                 0
dtype: int64

In [607]:
test_data.shape, train_data.shape

((3069, 13), (7160, 14))

In [608]:
# we need to rename columns year and windows

train_data.rename(
    columns={
        "yearofobservation": "year_of_observation",
        "numberofwindows": "number_of_windows",
    },
    inplace=True,
)
test_data.rename(
    columns={
        "yearofobservation": "year_of_observation",
        "numberofwindows": "number_of_windows",
    },
    inplace=True,
)
# check
test_data.columns

Index(['customer_id', 'year_of_observation', 'insured_period', 'residential',
       'building_painted', 'building_fenced', 'garden', 'settlement',
       'building_dimension', 'building_type', 'date_of_occupancy',
       'number_of_windows', 'geo_code'],
      dtype='object')

In [609]:
train_data.number_of_windows = train_data.number_of_windows.str.replace(
    "   .", "0"
).str.replace(">=10", "10")

test_data.number_of_windows = test_data.number_of_windows.str.replace(
    "   .", "0"
).str.replace(">=10", "10")

  train_data.number_of_windows = train_data.number_of_windows.str.replace(
  test_data.number_of_windows = test_data.number_of_windows.str.replace(


In [610]:
# check if windows is now corrected
train_data.number_of_windows.unique()

array(['0', '4', '3', '2', '5', '10', '6', '7', '9', '8', '1'],
      dtype=object)

In [611]:
train_data.date_of_occupancy=train_data.date_of_occupancy.astype(int)
test_data.date_of_occupancy=test_data.date_of_occupancy.astype(int)

In [612]:
# set date as integer
train_data[["year_of_observation", "date_of_occupancy"]] = train_data[
    ["year_of_observation", "date_of_occupancy"]
].astype('object')
test_data[["year_of_observation", "date_of_occupancy"]] = test_data[
    ["year_of_observation", "date_of_occupancy"]
].astype('object')
train_data.dtypes

customer_id             object
year_of_observation     object
insured_period         float64
residential              int64
building_painted        object
building_fenced         object
garden                  object
settlement              object
building_dimension     float64
building_type            int64
date_of_occupancy       object
number_of_windows       object
geo_code                object
claim                    int64
dtype: object

In [613]:
test_data.shape, train_data.shape

((3069, 13), (7160, 14))

In [614]:
# i will assume everything happen on the fist day of the month
train_data[["year_of_observation", "date_of_occupancy"]] = train_data[
    ["year_of_observation", "date_of_occupancy"]
].apply(lambda x: pd.to_datetime(x, format="%Y", errors="coerce"))
test_data[["year_of_observation", "date_of_occupancy"]] = test_data[
    ["year_of_observation", "date_of_occupancy"]
].apply(lambda x: pd.to_datetime(x, format="%Y", errors="coerce"))
test_data.year_of_observation.dtypes

dtype('<M8[ns]')

In [615]:
train_data.dropna(axis=0,inplace=True)
test_data.dropna(axis=0,inplace=True)

In [616]:
train_data.isnull().sum()

customer_id            0
year_of_observation    0
insured_period         0
residential            0
building_painted       0
building_fenced        0
garden                 0
settlement             0
building_dimension     0
building_type          0
date_of_occupancy      0
number_of_windows      0
geo_code               0
claim                  0
dtype: int64

In [617]:
train_data.head()

Unnamed: 0,customer_id,year_of_observation,insured_period,residential,building_painted,building_fenced,garden,settlement,building_dimension,building_type,date_of_occupancy,number_of_windows,geo_code,claim
0,H14663,2013-01-01,1.0,0,N,V,V,U,290.0,1,1960-01-01,0,1053,0
1,H2037,2015-01-01,1.0,0,V,N,O,R,490.0,1,1850-01-01,4,1053,0
2,H3802,2014-01-01,1.0,0,N,V,V,U,595.0,1,1960-01-01,0,1053,0
3,H3834,2013-01-01,1.0,0,V,V,V,U,2840.0,1,1960-01-01,0,1053,0
4,H5053,2014-01-01,1.0,0,V,N,O,R,680.0,1,1800-01-01,3,1053,0


In [618]:
train_data.columns


Index(['customer_id', 'year_of_observation', 'insured_period', 'residential',
       'building_painted', 'building_fenced', 'garden', 'settlement',
       'building_dimension', 'building_type', 'date_of_occupancy',
       'number_of_windows', 'geo_code', 'claim'],
      dtype='object')

In [619]:
#we need to encode the categorical values
#for painted,fenced,garden,settlement we use categorical encode
from feature_engine.encoding import OrdinalEncoder

oe = OrdinalEncoder(encoding_method='arbitrary')

train_data[
    [
        'building_painted', 'building_fenced', 'garden', 'settlement'
    ]
] = oe.fit_transform(train_data[
    [
        'building_painted', 'building_fenced', 'garden', 'settlement'
    ]
])

test_data[
    [
        'building_painted', 'building_fenced', 'garden', 'settlement'
    ]
] = oe.fit_transform(test_data[
    [
        'building_painted', 'building_fenced', 'garden', 'settlement'
    ]
])

test_data.head()

Unnamed: 0,customer_id,year_of_observation,insured_period,residential,building_painted,building_fenced,garden,settlement,building_dimension,building_type,date_of_occupancy,number_of_windows,geo_code
0,H11920,2013-01-01,1.0,0,0,0,0,0,300.0,1,1960-01-01,3,3310
1,H11921,2016-01-01,0.997268,0,0,0,0,0,300.0,1,1960-01-01,3,3310
2,H9805,2013-01-01,0.369863,0,0,1,1,1,790.0,1,1960-01-01,0,3310
3,H7493,2014-01-01,1.0,0,0,0,0,0,1405.0,1,2004-01-01,3,3321
4,H7494,2016-01-01,1.0,0,0,0,0,0,1405.0,1,2004-01-01,3,3321


In [620]:
test_data.shape, train_data.shape

((3069, 13), (7152, 14))

In [621]:
#discretization of the period
from feature_engine.discretisation import EqualFrequencyDiscretiser

ef =EqualFrequencyDiscretiser()

train_data[['insured_period']] = ef.fit_transform(train_data[['insured_period']])

test_data[['insured_period']] = ef.fit_transform(test_data[['insured_period']])

In [622]:
test_data.shape, train_data.shape

((3069, 13), (7152, 14))

In [623]:
variable_data

Unnamed: 0,Variable,Description
0,Customer Id,Identification number for the Policy holder
1,YearOfObservation,year of observation for the insured policy
2,Insured_Period,duration of insurance policy in Olusola Insura...
3,Residential,is the building a residential building or not
4,Building_Painted,"is the building painted or not (N-Painted, V-N..."
5,Building_Fenced,"is the building fence or not (N-Fenced, V-Not ..."
6,Garden,building has garden or not (V-has garden; O-no...
7,Settlement,Area where the building is located. (R- rural ...
8,Building Dimension,Size of the insured building in m2
9,Building_Type,"The type of building (Type 1, 2, 3, 4)"


In [624]:
#need to create new features
train_data = train_data.assign(
    year_occupied_before_insurance = train_data.year_of_observation.dt.year - train_data.date_of_occupancy.dt.year
)

test_data = test_data.assign(
    year_occupied_before_insurance = test_data.year_of_observation.dt.year - test_data.date_of_occupancy.dt.year
)
test_data.head()

Unnamed: 0,customer_id,year_of_observation,insured_period,residential,building_painted,building_fenced,garden,settlement,building_dimension,building_type,date_of_occupancy,number_of_windows,geo_code,year_occupied_before_insurance
0,H11920,2013-01-01,2,0,0,0,0,0,300.0,1,1960-01-01,3,3310,53
1,H11921,2016-01-01,1,0,0,0,0,0,300.0,1,1960-01-01,3,3310,56
2,H9805,2013-01-01,0,0,0,1,1,1,790.0,1,1960-01-01,0,3310,53
3,H7493,2014-01-01,2,0,0,0,0,0,1405.0,1,2004-01-01,3,3321,10
4,H7494,2016-01-01,2,0,0,0,0,0,1405.0,1,2004-01-01,3,3321,12


In [625]:
test_data.head()

Unnamed: 0,customer_id,year_of_observation,insured_period,residential,building_painted,building_fenced,garden,settlement,building_dimension,building_type,date_of_occupancy,number_of_windows,geo_code,year_occupied_before_insurance
0,H11920,2013-01-01,2,0,0,0,0,0,300.0,1,1960-01-01,3,3310,53
1,H11921,2016-01-01,1,0,0,0,0,0,300.0,1,1960-01-01,3,3310,56
2,H9805,2013-01-01,0,0,0,1,1,1,790.0,1,1960-01-01,0,3310,53
3,H7493,2014-01-01,2,0,0,0,0,0,1405.0,1,2004-01-01,3,3321,10
4,H7494,2016-01-01,2,0,0,0,0,0,1405.0,1,2004-01-01,3,3321,12


In [626]:
train_data.isnull().sum()

customer_id                       0
year_of_observation               0
insured_period                    0
residential                       0
building_painted                  0
building_fenced                   0
garden                            0
settlement                        0
building_dimension                0
building_type                     0
date_of_occupancy                 0
number_of_windows                 0
geo_code                          0
claim                             0
year_occupied_before_insurance    0
dtype: int64

In [627]:
train_data['year_occupied_before_insurance'] = train_data['year_occupied_before_insurance'].fillna(0)
test_data['year_occupied_before_insurance'] = test_data['year_occupied_before_insurance'].fillna(0)

In [628]:
len(train_data.geo_code.unique())

1305

In [629]:
#using log transformer on numeric variables
from feature_engine.transformation import PowerTransformer
lt = PowerTransformer()
train_data[
    [
       'building_dimension','year_occupied_before_insurance'
    ]
] = lt.fit_transform(
   train_data[
    [
       'building_dimension','year_occupied_before_insurance'
    ]
] 
)

test_data[
    [
       'building_dimension','year_occupied_before_insurance'
    ]
] = lt.fit_transform(
   test_data[
    [
       'building_dimension','year_occupied_before_insurance'
    ]
] 
)
test_data.head()


Unnamed: 0,customer_id,year_of_observation,insured_period,residential,building_painted,building_fenced,garden,settlement,building_dimension,building_type,date_of_occupancy,number_of_windows,geo_code,year_occupied_before_insurance
0,H11920,2013-01-01,2,0,0,0,0,0,17.320508,1,1960-01-01,3,3310,7.28011
1,H11921,2016-01-01,1,0,0,0,0,0,17.320508,1,1960-01-01,3,3310,7.483315
2,H9805,2013-01-01,0,0,0,1,1,1,28.106939,1,1960-01-01,0,3310,7.28011
3,H7493,2014-01-01,2,0,0,0,0,0,37.48333,1,2004-01-01,3,3321,3.162278
4,H7494,2016-01-01,2,0,0,0,0,0,37.48333,1,2004-01-01,3,3321,3.464102


In [630]:
test_data.shape, train_data.shape

((3069, 14), (7152, 15))

In [631]:
train_data.isnull().sum()

customer_id                       0
year_of_observation               0
insured_period                    0
residential                       0
building_painted                  0
building_fenced                   0
garden                            0
settlement                        0
building_dimension                0
building_type                     0
date_of_occupancy                 0
number_of_windows                 0
geo_code                          0
claim                             0
year_occupied_before_insurance    0
dtype: int64

In [632]:
train_data.dtypes
train_data.number_of_windows = train_data.number_of_windows.astype(int)
test_data.number_of_windows = test_data.number_of_windows.astype(int)

In [633]:
train_data.isnull().sum()

customer_id                       0
year_of_observation               0
insured_period                    0
residential                       0
building_painted                  0
building_fenced                   0
garden                            0
settlement                        0
building_dimension                0
building_type                     0
date_of_occupancy                 0
number_of_windows                 0
geo_code                          0
claim                             0
year_occupied_before_insurance    0
dtype: int64

In [634]:
train_data.shape,test_data.shape

((7152, 15), (3069, 14))

In [635]:
from feature_engine.datetime import DatetimeFeatures
def create_date_features_and_concat(
        train:pd.DataFrame,

):
    dtf = DatetimeFeatures(features_to_extract = ["month","day_of_month"],drop_original=False)
    train = dtf.fit_transform(train)
    return train

In [636]:
train_data.columns

Index(['customer_id', 'year_of_observation', 'insured_period', 'residential',
       'building_painted', 'building_fenced', 'garden', 'settlement',
       'building_dimension', 'building_type', 'date_of_occupancy',
       'number_of_windows', 'geo_code', 'claim',
       'year_occupied_before_insurance'],
      dtype='object')

In [637]:

train_data = create_date_features_and_concat(train_data
)

test_data = create_date_features_and_concat(test_data
)
train_data.drop(
    columns=['year_of_observation','date_of_occupancy'],inplace=True
)
test_data.drop(
    columns=['year_of_observation','date_of_occupancy'],inplace=True
)



In [638]:
test_data

Unnamed: 0,customer_id,insured_period,residential,building_painted,building_fenced,garden,settlement,building_dimension,building_type,number_of_windows,geo_code,year_occupied_before_insurance,year_of_observation_month,year_of_observation_day_of_month,date_of_occupancy_month,date_of_occupancy_day_of_month
0,H11920,2,0,0,0,0,0,17.320508,1,3,3310,7.280110,1,1,1,1
1,H11921,1,0,0,0,0,0,17.320508,1,3,3310,7.483315,1,1,1,1
2,H9805,0,0,0,1,1,1,28.106939,1,0,3310,7.280110,1,1,1,1
3,H7493,2,0,0,0,0,0,37.483330,1,3,3321,3.162278,1,1,1,1
4,H7494,2,0,0,0,0,0,37.483330,1,3,3321,3.464102,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3064,H11583,2,0,0,1,1,1,30.000000,4,0,6083,10.723805,1,1,1,1
3065,H11720,2,0,0,1,1,1,30.000000,2,0,6083,8.000000,1,1,1,1
3066,H11721,2,0,0,1,1,1,30.000000,2,0,6083,4.358899,1,1,1,1
3067,H12408,2,0,0,1,1,1,30.000000,1,0,6083,14.594520,1,1,1,1


In [639]:
train_data.shape,test_data.shape

((7152, 17), (3069, 16))

In [640]:
train_data.isnull().sum()

customer_id                         0
insured_period                      0
residential                         0
building_painted                    0
building_fenced                     0
garden                              0
settlement                          0
building_dimension                  0
building_type                       0
number_of_windows                   0
geo_code                            0
claim                               0
year_occupied_before_insurance      0
year_of_observation_month           0
year_of_observation_day_of_month    0
date_of_occupancy_month             0
date_of_occupancy_day_of_month      0
dtype: int64

In [641]:
'''
train_data[
    [
        'year_of_observation_year', 'date_of_occupancy_year
    ]
] = oe.fit_transform(train_data[
    [
        'year_of_observation_year', 'date_of_occupancy_year'
    ]
].astype(str))
test_data[
    [
        'year_of_observation_year', 'date_of_occupancy_year'
    ]
] = oe.fit_transform(test_data[
    [
        'year_of_observation_year', 'date_of_occupancy_year'
    ]
].astype(str))'''

"\ntrain_data[\n    [\n        'year_of_observation_year', 'date_of_occupancy_year\n    ]\n] = oe.fit_transform(train_data[\n    [\n        'year_of_observation_year', 'date_of_occupancy_year'\n    ]\n].astype(str))\ntest_data[\n    [\n        'year_of_observation_year', 'date_of_occupancy_year'\n    ]\n] = oe.fit_transform(test_data[\n    [\n        'year_of_observation_year', 'date_of_occupancy_year'\n    ]\n].astype(str))"

In [642]:
train_data

Unnamed: 0,customer_id,insured_period,residential,building_painted,building_fenced,garden,settlement,building_dimension,building_type,number_of_windows,geo_code,claim,year_occupied_before_insurance,year_of_observation_month,year_of_observation_day_of_month,date_of_occupancy_month,date_of_occupancy_day_of_month
0,H14663,2,0,0,0,0,0,17.029386,1,0,1053,0,7.280110,1,1,1,1
1,H2037,2,0,1,1,1,1,22.135944,1,4,1053,0,12.845233,1,1,1,1
2,H3802,2,0,0,0,0,0,24.392622,1,0,1053,0,7.348469,1,1,1,1
3,H3834,2,0,1,0,0,0,53.291650,1,0,1053,0,7.280110,1,1,1,1
4,H5053,2,0,1,1,1,1,26.076810,1,3,1053,0,14.628739,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7155,H5290,2,1,1,0,0,0,32.908965,1,0,6088,0,3.316625,1,1,1,1
7156,H5926,2,0,1,0,0,0,32.908965,2,0,6088,1,5.744563,1,1,1,1
7157,H6204,0,0,1,0,0,0,32.908965,1,0,6088,0,4.898979,1,1,1,1
7158,H6537,2,0,1,0,0,0,32.908965,1,0,6088,0,6.403124,1,1,1,1


In [643]:
train_data.columns

Index(['customer_id', 'insured_period', 'residential', 'building_painted',
       'building_fenced', 'garden', 'settlement', 'building_dimension',
       'building_type', 'number_of_windows', 'geo_code', 'claim',
       'year_occupied_before_insurance', 'year_of_observation_month',
       'year_of_observation_day_of_month', 'date_of_occupancy_month',
       'date_of_occupancy_day_of_month'],
      dtype='object')

In [644]:
#feature creation
from feature_engine.creation import CyclicalFeatures
cf = CyclicalFeatures(variables=['year_occupied_before_insurance'])
train_data = cf.fit_transform(train_data)
test_data = cf.fit_transform(test_data)

In [645]:
from feature_engine.timeseries.forecasting import LagFeatures
#lf = LagFeatures(periods=[1,2],variables=['insured_period','year_occupied_before_insurance','number_of_windows','building_dimension'])
#train_data = lf.fit_transform(train_data)
#test_data = lf.fit_transform(test_data)

In [646]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline

In [647]:
X=train_data.drop(["claim","customer_id"],axis=1)
X_t=test_data.drop(["customer_id",],axis=1)
y=train_data["claim"]

In [648]:
X.shape , X_t.shape

((7152, 17), (3069, 17))

In [649]:
#encode the geocode feature
from feature_engine.encoding import MeanEncoder
woe = MeanEncoder()
X.drop(columns='geo_code',inplace=True)
X_t.drop(columns='geo_code',inplace=True)
#X[['geo_code']] =woe.fit_transform(X[['geo_code']],X[['settlement']])
#X_t[['geo_code']] = woe.fit_transform(X_t[['geo_code']],X_t[['settlement']])

In [650]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2, random_state=42)

In [651]:
X_train

Unnamed: 0,insured_period,residential,building_painted,building_fenced,garden,settlement,building_dimension,building_type,number_of_windows,year_occupied_before_insurance,year_of_observation_month,year_of_observation_day_of_month,date_of_occupancy_month,date_of_occupancy_day_of_month,year_occupied_before_insurance_sin,year_occupied_before_insurance_cos
2849,2,1,0,0,0,0,54.772256,3,0,5.744563,1,1,1,1,0.896204,-0.443642
578,1,1,1,0,0,0,46.754679,4,0,6.000000,1,1,1,1,0.852554,-0.522640
60,2,0,1,0,0,0,42.918527,1,0,5.196152,1,1,1,1,0.964878,-0.262700
3171,2,0,1,0,0,0,26.267851,1,0,5.656854,1,1,1,1,0.909525,-0.415650
789,2,0,1,1,1,1,43.817805,1,4,6.082763,1,1,1,1,0.836902,-0.547352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3779,0,0,1,1,1,1,20.000000,2,4,5.099020,1,1,1,1,0.973326,-0.229425
5199,2,1,1,1,1,1,30.822070,4,4,7.937254,1,1,1,1,0.329823,-0.944043
5234,2,1,0,0,0,0,42.895221,4,0,6.480741,1,1,1,1,0.751894,-0.659284
5398,0,1,0,0,0,0,58.480766,2,0,5.291503,1,1,1,1,0.955478,-0.295063


In [652]:
parameters = {
            'nthread':4,
            'n_estimators':50,
            'learning_rate':0.036,
            'num_leaves':30,
            'colsample_bytree':0.9497036,
            'subsample':0.8715623,
            'max_depth':0,
            'reg_alpha':0,
            'reg_lambda':0.0735294,
            'min_split_gain':0.0222415,
            'min_child_weight':39.3259775,
            'silent':-1,
            'verbose':-1,
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'auc'}

In [653]:

train_data_ = lgb.Dataset(X_train, label=y_train)
test_data_ = lgb.Dataset(X_test, label=y_test)


param = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 30,
    "n_estimators":50,
    'learning_rate': 0.036,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'reg_alpha': 1,
    'reg_lambda': 0,
    "max_depth":0,
    "lambda_l2":.999
}
clf =lgb.train(params=parameters, 
                train_set=train_data_,
                valid_sets=[test_data_])

# Make predictions on the test set
y_pred = clf.predict(X_test, num_iteration=clf.best_iteration)

# Evaluate the model using ROC AUC score
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_test, y_pred)
print(f'ROC AUC score: {roc_auc:.4f}')

ROC AUC score: 0.7124




In [654]:
'''from sklearn.model_selection import KFold
# Define the number of folds for cross-validation
num_folds = 5

# Create a KFold object
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize an empty list to store the ROC AUC scores
roc_auc_scores = []

# Perform cross-validation
for train_idx, val_idx in kf.split(X):
    #print(f'Fold {fold+1}')

    # Get the training and validation data for this fold
    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]

    X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

    # Create the LightGBM datasets for this fold
    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold)


    # Train the LightGBM model
    clf_fold = lgb.train(params=param,
                    train_set=train_data,
                    valid_sets=[val_data])

    # Make predictions on the validation set
    y_pred_fold = clf_fold.predict(X_val_fold, num_iteration=clf_fold.best_iteration)

    # Calculate the ROC AUC score for this fold and append to the list
    roc_auc_fold = roc_auc_score(y_val_fold, y_pred_fold)
    roc_auc_scores.append(roc_auc_fold)

# Calculate the mean and standard deviation of the ROC AUC scores
roc_auc_mean = np.mean(roc_auc_scores)
roc_auc_std = np.std(roc_auc_scores)

print(f'ROC AUC mean score: {roc_auc_mean:.4f} +/- {roc_auc_std:.4f}')'''

"from sklearn.model_selection import KFold\n# Define the number of folds for cross-validation\nnum_folds = 5\n\n# Create a KFold object\nkf = KFold(n_splits=num_folds, shuffle=True, random_state=42)\n\n# Initialize an empty list to store the ROC AUC scores\nroc_auc_scores = []\n\n# Perform cross-validation\nfor train_idx, val_idx in kf.split(X):\n    #print(f'Fold {fold+1}')\n\n    # Get the training and validation data for this fold\n    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]\n\n    X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]\n\n    # Create the LightGBM datasets for this fold\n    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)\n    val_data = lgb.Dataset(X_val_fold, label=y_val_fold)\n\n\n    # Train the LightGBM model\n    clf_fold = lgb.train(params=param,\n                    train_set=train_data,\n                    valid_sets=[val_data])\n\n    # Make predictions on the validation set\n    y_pred_fold = clf_fold.predict(X_val

In [655]:
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Define your hyperparameters
param = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 30,
    'n_estimators': 50,
    'learning_rate': 0.03599,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'reg_alpha': 1,
    'reg_lambda': 0,
    'max_depth': 0,
    'lambda_l2': 0.999
}

# Initialize K-fold cross-validation
n_splits = 15  # You can adjust the number of folds as needed
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize an array to store cross-validation results
roc_auc_scores = []

# Perform cross-validation
for train_index, val_index in cv.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    train_data_ = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data_ = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data_)

    clf_fold = lgb.train(params=param, train_set=train_data_, valid_sets=[val_data_])

    y_pred = clf_fold.predict(X_val_fold, num_iteration=clf_fold.best_iteration)

    roc_auc = roc_auc_score(y_val_fold, y_pred)
    roc_auc_scores.append(roc_auc)

# Calculate the mean and standard deviation of ROC AUC scores
mean_roc_auc = np.mean(roc_auc_scores)
std_roc_auc = np.std(roc_auc_scores)

print(f'Mean ROC AUC score: {mean_roc_auc:.4f}')
print(f'Standard Deviation of ROC AUC scores: {std_roc_auc:.4f}')






Mean ROC AUC score: 0.7092
Standard Deviation of ROC AUC scores: 0.0269


In [656]:
# Make predictions on the test set using the best LightGBM model
y_pred_c = clf.predict(X_t, num_iteration=clf.best_iteration)


In [657]:
test_data

Unnamed: 0,customer_id,insured_period,residential,building_painted,building_fenced,garden,settlement,building_dimension,building_type,number_of_windows,geo_code,year_occupied_before_insurance,year_of_observation_month,year_of_observation_day_of_month,date_of_occupancy_month,date_of_occupancy_day_of_month,year_occupied_before_insurance_sin,year_occupied_before_insurance_cos
0,H11920,2,0,0,0,0,0,17.320508,1,3,3310,7.280110,1,1,1,1,0.320588,-0.947219
1,H11921,1,0,0,0,0,0,17.320508,1,3,3310,7.483315,1,1,1,1,0.245243,-0.969462
2,H9805,0,0,0,1,1,1,28.106939,1,0,3310,7.280110,1,1,1,1,0.320588,-0.947219
3,H7493,2,0,0,0,0,0,37.483330,1,3,3321,3.162278,1,1,1,1,0.940079,0.340955
4,H7494,2,0,0,0,0,0,37.483330,1,3,3321,3.464102,1,1,1,1,0.973388,0.229162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3064,H11583,2,0,0,1,1,1,30.000000,4,0,6083,10.723805,1,1,1,1,-0.844343,-0.535802
3065,H11720,2,0,0,1,1,1,30.000000,2,0,6083,8.000000,1,1,1,1,0.047948,-0.998850
3066,H11721,2,0,0,1,1,1,30.000000,2,0,6083,4.358899,1,1,1,1,0.993417,-0.114552
3067,H12408,2,0,0,1,1,1,30.000000,1,0,6083,14.594520,1,1,1,1,-0.596743,0.802432


In [658]:
test_id = test_data['customer_id']


In [659]:
d = {"customer_id": test_id, 'claim':y_pred_c}
test_predictions = pd.DataFrame(data=d)
test_predictions = test_predictions[["customer_id", 'claim']]

In [660]:
test_predictions.to_csv('2023_first_submission_13.csv', index=False)