In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error
import warnings

# NOTE(review): this suppresses every warning category for the rest of the
# session, including deprecation notices from the libraries used below;
# consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings('ignore')

In [2]:
# Load the training and test CSVs, using the 'id' column as the index and the
# pyarrow parsing engine.
train_df = pd.read_csv('train.csv', index_col='id', engine='pyarrow')
test_df = pd.read_csv('test.csv', index_col='id', engine='pyarrow')

In [3]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0_level_0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [4]:
# Preview the first rows of the test frame (same columns, minus the target).
test_df.head()

Unnamed: 0_level_0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [5]:
def date_separator(x):
    """Split a date-like value into its day, month and year components.

    Parameters
    ----------
    x : object exposing ``day``, ``month`` and ``year`` attributes
        (for example a ``pandas.Timestamp``).

    Returns
    -------
    pandas.Series
        Three values in the order day, month, year, on the default 0..2 index.
    """
    day, month, year = x.day, x.month, x.year
    return pd.Series([day, month, year])

# Derive day / month / year features from the policy start timestamp.
# The vectorised `.dt` accessor replaces a row-wise `apply` that built one
# pandas Series per row, which is very slow on frames of this size.
# Column names, order and values are unchanged; the integer width of the new
# columns may differ from the apply-based version depending on pandas version.
train_df['day'] = train_df['Policy Start Date'].dt.day
train_df['month'] = train_df['Policy Start Date'].dt.month
train_df['year'] = train_df['Policy Start Date'].dt.year
test_df['day'] = test_df['Policy Start Date'].dt.day
test_df['month'] = test_df['Policy Start Date'].dt.month
test_df['year'] = test_df['Policy Start Date'].dt.year

In [6]:
# Print column dtypes and non-null counts for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1200000 entries, 0 to 1199999
Data columns (total 23 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   Age                   1181295 non-null  float64       
 1   Gender                1200000 non-null  object        
 2   Annual Income         1155051 non-null  float64       
 3   Marital Status        1181471 non-null  object        
 4   Number of Dependents  1090328 non-null  float64       
 5   Education Level       1200000 non-null  object        
 6   Occupation            841925 non-null   object        
 7   Health Score          1125924 non-null  float64       
 8   Location              1200000 non-null  object        
 9   Policy Type           1200000 non-null  object        
 10  Previous Claims       835971 non-null   float64       
 11  Vehicle Age           1199994 non-null  float64       
 12  Credit Score          1062118 non-null  float64

In [7]:
# Count missing values per column in the training frame.
train_df.isna().sum()

Age                      18705
Gender                       0
Annual Income            44949
Marital Status           18529
Number of Dependents    109672
Education Level              0
Occupation              358075
Health Score             74076
Location                     0
Policy Type                  0
Previous Claims         364029
Vehicle Age                  6
Credit Score            137882
Insurance Duration           1
Policy Start Date            0
Customer Feedback        77824
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
day                          0
month                        0
year                         0
dtype: int64

In [8]:
# Name of the column to predict; it is present in train_df only.
target = 'Premium Amount'

In [9]:
# Names of all numeric predictor columns (the target is excluded first).
numerical_features = (
    train_df
    .drop(columns=[target])
    .select_dtypes(include=np.number)
    .columns
    .values
)
numerical_features

array(['Age', 'Annual Income', 'Number of Dependents', 'Health Score',
       'Previous Claims', 'Vehicle Age', 'Credit Score',
       'Insurance Duration', 'day', 'month', 'year'], dtype=object)

In [10]:
# Names of all object-dtype (string) predictor columns, target excluded.
categorical_features = (
    train_df
    .drop(columns=[target])
    .select_dtypes(include='object')
    .columns
    .values
)
categorical_features

array(['Gender', 'Marital Status', 'Education Level', 'Occupation',
       'Location', 'Policy Type', 'Customer Feedback', 'Smoking Status',
       'Exercise Frequency', 'Property Type'], dtype=object)

In [11]:
# Count fully duplicated rows in the training frame.
train_df.duplicated().sum()

0

In [12]:
# Summary statistics for the numeric predictors, one row per feature.
# `np.float64` is used instead of the `np.float_` alias, which was removed in
# NumPy 2.0 and raises AttributeError there; the two are the same dtype.
train_df[numerical_features].astype(np.float64).describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1181295.0,41.145563,13.53995,18.0,30.0,41.0,53.0,64.0
Annual Income,1155051.0,32745.217777,32179.506124,1.0,8001.0,23911.0,44634.0,149997.0
Number of Dependents,1090328.0,2.009934,1.417338,0.0,1.0,2.0,3.0,4.0
Health Score,1125924.0,25.613908,12.203462,2.012237,15.918959,24.578648,34.527209,58.975914
Previous Claims,835971.0,1.002689,0.98284,0.0,0.0,1.0,2.0,9.0
Vehicle Age,1199994.0,9.569889,5.776189,0.0,5.0,10.0,15.0,19.0
Credit Score,1062118.0,592.92435,149.981945,300.0,468.0,595.0,721.0,849.0
Insurance Duration,1199999.0,5.018219,2.594331,1.0,3.0,5.0,7.0,9.0
day,1200000.0,15.706179,8.811727,1.0,8.0,16.0,23.0,31.0
month,1200000.0,6.492671,3.432689,1.0,4.0,6.0,9.0,12.0


In [13]:
# Summary (count / unique / top / freq) of the object-dtype columns, one row per column.
train_df.describe(include='O').T

Unnamed: 0,count,unique,top,freq
Gender,1200000,2,Male,602571
Marital Status,1181471,3,Single,395391
Education Level,1200000,4,Master's,303818
Occupation,841925,3,Employed,282750
Location,1200000,3,Suburban,401542
Policy Type,1200000,3,Premium,401846
Customer Feedback,1122176,3,Average,377905
Smoking Status,1200000,2,Yes,601873
Exercise Frequency,1200000,4,Weekly,306179
Property Type,1200000,3,House,400349


In [14]:
from sklearn.preprocessing import StandardScaler, FunctionTransformer, LabelEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
import category_encoders as ce

# Preprocessing applied to the predictor columns; any column not listed in
# either feature group is dropped.
#   num: fill missing values with the column mean, then standardise.
#   cat: fill missing values with the most frequent level, then CatBoost
#        target-encode (this is why `y` must be passed to fit_transform).
# The bare `FunctionTransformer()` that used to sit between the imputer and
# the scaler had no `func`, i.e. it was an identity step, so it is removed.
preprocessing = ColumnTransformer(
    [
        (
            'num',
            make_pipeline(SimpleImputer(strategy='mean'), StandardScaler()),
            numerical_features,
        ),
        (
            'cat',
            make_pipeline(
                SimpleImputer(strategy='most_frequent'),
                ce.cat_boost.CatBoostEncoder(),
            ),
            categorical_features,
        ),
    ],
    remainder='drop',
)

In [15]:
# Separate predictors from the target; train_df itself is left untouched.
X = train_df.drop(columns=[target])
# Train on log1p(target); predictions are mapped back with expm1 later on.
y = np.log1p(train_df[target])

In [16]:
# Fit the preprocessing on the training predictors (y is passed through for
# the target-based categorical encoder) and apply the fitted transform to the
# test frame.
# NOTE(review): X is rebound from the raw frame to the transformed output, so
# this cell cannot be re-run on its own without first re-running the cell
# that builds X and y.
X = preprocessing.fit_transform(X, y)
testProcessed = preprocessing.transform(test_df)

In [17]:
from lightgbm import LGBMRegressor

In [18]:
# LightGBM regressor: 1000 gradient-boosted trees, seeded for reproducibility.
lgb_model = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    num_leaves=31,
    max_depth=-1,
    boosting_type='gbdt',
    random_state=42,
)

In [19]:
from xgboost import XGBRegressor

In [20]:
# XGBoost regressor: 1000 trees of depth 6 at a 0.05 learning rate, seeded,
# with n_jobs=-1 for parallel training.
xgb_model = XGBRegressor(
    max_depth=6,
    learning_rate=0.05,
    n_estimators=1000,
    n_jobs=-1,
    random_state=42,
)

In [21]:
from sklearn.ensemble import VotingRegressor

In [22]:
# Ensemble that combines the LightGBM and XGBoost regressors defined earlier.
model = VotingRegressor(
    estimators=[
        ('LGB', lgb_model),
        ('XGB', xgb_model),
    ]
)

In [23]:
# Fit the ensemble on the full preprocessed training set against the
# log1p-transformed target.
# NOTE(review): no hold-out or cross-validation score is computed anywhere in
# this notebook, so model quality is only observable after submission.
model.fit(X,y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051617 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3462
[LightGBM] [Info] Number of data points in the train set: 1200000, number of used features: 21
[LightGBM] [Info] Start training from score 6.593889


In [24]:
# Predict on the preprocessed test set; values are on the log1p scale used for training.
y_pred = model.predict(testProcessed)

In [27]:
# Build the submission directly from the test index so that every prediction
# is paired with the id of the row it was computed from. Assigning y_pred
# positionally onto sample_submission.csv silently depended on that file's
# row order (and row count) matching test.csv.
# expm1 undoes the log1p transform applied to the target before training.
sub = pd.DataFrame(
    {target: np.expm1(y_pred)},
    index=test_df.index,  # index is named 'id' because of index_col='id' at load time
).reset_index()
sub.to_csv("submission.csv", index=False)

In [33]:
# pd.read_csv('submission.csv')