In [1]:
import pandas as pd 
import os
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import xgboost as XGB
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.metrics import root_mean_squared_log_error
from lightgbm import LGBMRegressor

## Обработка данных (очистка, преобразования)

In [2]:
df_test = pd.read_csv('kaggle competitions download -c playground-series-s4e12')
df_train = pd.read_csv('kaggle competitions download -c playground-series-s4e12')

In [None]:
df_test.head(5)

In [None]:
df_train.head(5)

In [12]:
df_train.isna().mean() *100

Age                     0.0
Gender                  0.0
Annual Income           0.0
Marital Status          0.0
Number of Dependents    0.0
Education Level         0.0
Health Score            0.0
Location                0.0
Policy Type             0.0
Vehicle Age             0.0
Credit Score            0.0
Insurance Duration      0.0
Policy Start Date       0.0
Customer Feedback       0.0
Smoking Status          0.0
Exercise Frequency      0.0
Property Type           0.0
Premium Amount          0.0
dtype: float64

In [11]:
df_test.isna().mean() * 100

id                      0.0
Age                     0.0
Gender                  0.0
Annual Income           0.0
Marital Status          0.0
Number of Dependents    0.0
Education Level         0.0
Health Score            0.0
Location                0.0
Policy Type             0.0
Vehicle Age             0.0
Credit Score            0.0
Insurance Duration      0.0
Policy Start Date       0.0
Customer Feedback       0.0
Smoking Status          0.0
Exercise Frequency      0.0
Property Type           0.0
dtype: float64

In [3]:
#Очищаем пропуски
df_train = df_train.dropna(subset=['Age', 'Insurance Duration', 'Vehicle Age', 'Marital Status'])

In [4]:
#Заменяем значения Material Status на mode. Заменяем значения в col на median
col = ['Age', 'Insurance Duration', 'Vehicle Age']

df_test['Marital Status'] = df_test['Marital Status'].fillna(df_test['Marital Status'].mode()[0])
df_test[col] = df_test[col].fillna(df_test[col].median())

In [5]:
#Заменяем пропуски новой категорие "Unknown"
df_train['Customer Feedback'] = df_train['Customer Feedback'].fillna('Unknown')
df_test['Customer Feedback'] = df_test['Customer Feedback'].fillna('Unknown')

In [6]:
#Заменяем nan значения на медиану  в пределах групп, определенных по столбцу 'Premium Amount'

df_train['Annual Income'] = df_train['Annual Income'].fillna(df_train.groupby('Premium Amount')['Annual Income'].transform('median'))
df_test['Annual Income'] = df_test['Annual Income'].fillna(df_test['Annual Income'].median())

# Заменяем nan на медиану без групировки. Так как 7 значений остались nan.
df_train['Annual Income'] = df_train['Annual Income'].fillna(df_train['Annual Income'].median())

In [7]:
#Заменяем nan значения на медиану каждого столбца 
columns = ['Number of Dependents', 'Health Score', 
                                 'Credit Score',]

df_train.loc[:,columns] = df_train[columns].fillna(df_train[columns].median()) 
df_test.loc[:,columns] = df_test[columns].fillna(df_test[columns].median()) 

In [8]:
#Удаляем столбцы которые содержат большое количество пропусков. 
df_train = df_train.drop(['Occupation', 'Previous Claims', 'id'], axis = 1)
df_test = df_test.drop(['Occupation', 'Previous Claims'], axis = 1)

In [9]:
#Определяем уникальные значения категориальных данных
columns_enc = ['Marital Status', 'Education Level', 'Gender', 'Location', 'Policy Type',
               'Smoking Status', 'Exercise Frequency', 'Property Type', 'Customer Feedback']
for col in columns_enc:
    print(df_train[col].unique())

['Married' 'Divorced' 'Single']
["Bachelor's" "Master's" 'High School' 'PhD']
['Female' 'Male']
['Urban' 'Rural' 'Suburban']
['Premium' 'Comprehensive' 'Basic']
['No' 'Yes']
['Weekly' 'Monthly' 'Daily' 'Rarely']
['House' 'Apartment' 'Condo']
['Poor' 'Average' 'Good' 'Unknown']


In [10]:
#Заменяем категориальные данные на численные с помощью LableEncoding
#Выводим каким образом он преобразовал данные 

encoder = LabelEncoder()

for column in columns_enc:
    df_train[column] = encoder.fit_transform(df_train[column])
    df_test[column] = encoder.transform(df_test[column])
    print(column)
    print(dict(zip(encoder.classes_, range(len(encoder.classes_)))), '\n')





Marital Status
{'Divorced': 0, 'Married': 1, 'Single': 2} 

Education Level
{"Bachelor's": 0, 'High School': 1, "Master's": 2, 'PhD': 3} 

Gender
{'Female': 0, 'Male': 1} 

Location
{'Rural': 0, 'Suburban': 1, 'Urban': 2} 

Policy Type
{'Basic': 0, 'Comprehensive': 1, 'Premium': 2} 

Smoking Status
{'No': 0, 'Yes': 1} 

Exercise Frequency
{'Daily': 0, 'Monthly': 1, 'Rarely': 2, 'Weekly': 3} 

Property Type
{'Apartment': 0, 'Condo': 1, 'House': 2} 

Customer Feedback
{'Average': 0, 'Good': 1, 'Poor': 2, 'Unknown': 3} 



In [16]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1163057 entries, 0 to 1199999
Data columns (total 18 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Age                   1163057 non-null  float64
 1   Gender                1163057 non-null  int64  
 2   Annual Income         1163057 non-null  float64
 3   Marital Status        1163057 non-null  int64  
 4   Number of Dependents  1163057 non-null  float64
 5   Education Level       1163057 non-null  int64  
 6   Health Score          1163057 non-null  float64
 7   Location              1163057 non-null  int64  
 8   Policy Type           1163057 non-null  int64  
 9   Vehicle Age           1163057 non-null  float64
 10  Credit Score          1163057 non-null  float64
 11  Insurance Duration    1163057 non-null  float64
 12  Policy Start Date     1163057 non-null  object 
 13  Customer Feedback     1163057 non-null  int64  
 14  Smoking Status        1163057 non-null 

In [None]:
sns.boxplot(x='Annual Income', data = df_train)

In [None]:
sns.histplot(df_train['Annual Income'], bins=50)

In [11]:
# Преобразуем 'Policy Start Date' в формат datetime
df_train['Policy Start Date'] = pd.to_datetime(df_train['Policy Start Date']).dt.floor('D')
df_test['Policy Start Date'] = pd.to_datetime(df_test['Policy Start Date']).dt.floor('D')

In [None]:
df_train['Policy Start Date'].astype

In [12]:
#Округляем значения в столбце Health Score.
df_train['Health Score']  = round(df_train['Health Score'], 2)
df_test['Health Score'] = round(df_test['Health Score'], 2)

In [19]:
df_test.isna().sum()

id                      0
Age                     0
Gender                  0
Annual Income           0
Marital Status          0
Number of Dependents    0
Education Level         0
Health Score            0
Location                0
Policy Type             0
Vehicle Age             0
Credit Score            0
Insurance Duration      0
Policy Start Date       0
Customer Feedback       0
Smoking Status          0
Exercise Frequency      0
Property Type           0
dtype: int64

In [20]:
# Просмотр данных по возрасту
print("Mean age",  df_train['Age'].mean().round(2))
print("Mode age",  df_train['Age'].mode())
print("Max and min age", df_train['Age'].max(),'   ',  df_train['Age'].min())

Mean age 41.15
Mode age 0    53.0
Name: Age, dtype: float64
Max and min age 64.0     18.0


In [21]:
df_train['Age'].nunique()

47

In [None]:
#Построим тепловую карту для отображения кореляций по методу пирсона  
plt.figure(figsize=(16, 8))
sns.heatmap(data= df_test.corr(method = 'kendall').round(2), annot= True, cmap="coolwarm")
plt.title('Heat map train')


In [13]:
#Преобразуем время в тайм степ. Для корректной работы модели 
df_train["Policy Start Date"] = df_train["Policy Start Date"].astype(np.int64)
df_test["Policy Start Date"] = df_test["Policy Start Date"].astype(np.int64)

In [14]:
#Уменьшаем размер данных 
df_train[columns_enc] = df_train[columns_enc].astype('int32')
df_test[columns_enc] = df_test[columns_enc].astype('int32')

float_col = df_train.select_dtypes(include=['float64']).columns
df_train[float_col] = df_train[float_col].astype('float32')

float_col = df_test.select_dtypes(include=['float64']).columns
df_test[float_col] = df_test[float_col].astype('float32')

## Разделение данных и обучение моделей на валидационных данных

In [15]:
#Производим разделение данных на валидационную и тренировочную
x = df_train .drop(['Premium Amount'], axis = 1)
y = df_train ['Premium Amount']

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42, shuffle= True)
print(x_train.shape)
print(x_val.shape) 



(930445, 17)
(232612, 17)


In [16]:
#Создаем и обучаем модель случайного леса
model = RandomForestRegressor(
    n_estimators= 100,
    max_depth = 5, 
    max_features='sqrt',
    random_state= 42
)
model.fit(x_train, y_train)

In [17]:
#Делаем предсказание на валидационной выборке
val_pred_RF = model.predict(x_val)

In [18]:
#Находим среднеквадратичную ошибку, среднюю абсолютную ошибку, коэф. детерминации для RandomForest
rf_mape = mean_absolute_percentage_error(y_val, val_pred_RF)
rf_mse = mean_squared_error(y_val, val_pred_RF)
rf_r2 = r2_score(y_val, val_pred_RF)
rf_rmsle = root_mean_squared_log_error(y_val, val_pred_RF)

print(f"RandomForest MAPE: {rf_mape:.2f}")
print(f"RandomForest MSE: {rf_mse:.2f}")
print(f"RandomForest R2_Score: {rf_r2:.2f}")
print(f'RamdomForest RMSLE:  {rf_rmsle:.2f}')

RandomForest MAPE: 3.02
RandomForest MSE: 738370.00
RandomForest R2_Score: 0.01
RamdomForest RMSLE:  1.17


In [19]:
#Создаем и обучаем модель LGBMR
lgbm_model = LGBMRegressor(
    num_leaves = 71,
    learning_rate = 0.05412467152424433,
    n_estimators =  595,
    max_depth = 12,
    min_data_in_leaf = 97,
    bagging_fraction = 0.5200288825838669,
    feature_fraction = 0.9881738491942492,
)
lgbm_model.fit(x_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012594 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1127
[LightGBM] [Info] Number of data points in the train set: 930445, number of used features: 17
[LightGBM] [Info] Start training from score 1100.929006


In [20]:
#Делаем предсказание на валидационной выборке
val_pred_LGBMR = lgbm_model.predict(x_val)



In [21]:
#Находим среднеквадратичную ошибку, среднюю абсолютную ошибку, коэф. детерминации для Light Gradient Boosting Machine
lgbm_mape = mean_absolute_percentage_error(y_val, val_pred_LGBMR)
lgbm_mse = mean_squared_error(y_val, val_pred_LGBMR)
lgbm_r2 = r2_score(y_val, val_pred_LGBMR)
lgbm_rmsle = root_mean_squared_log_error(y_val, val_pred_LGBMR)

print(f"LightGBM MAPE: {lgbm_mape :.2f}")
print(f"LightGBM MSE: {lgbm_mse:.2f}")
print(f"LightGBM R2_Score: {lgbm_r2:.2f}")
print(f'LightGBM RMSLE:  {lgbm_rmsle:.2f}')

LightGBM MAPE: 2.97
LightGBM MSE: 721626.91
LightGBM R2_Score: 0.03
LightGBM RMSLE:  1.15


## Обучаем модель на всех данных.

In [29]:
# Очищаем test от id
df_id = df_test['id'].copy()
df_test.drop(columns = 'id', axis = 1, inplace=True)


In [41]:
lgbm_model.fit(x, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011606 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1128
[LightGBM] [Info] Number of data points in the train set: 1163057, number of used features: 17
[LightGBM] [Info] Start training from score 1100.835166


In [42]:
test_predict = lgbm_model.predict(df_test)



In [46]:
output = pd.DataFrame({'id': df_id,
                       'Premium Amount': test_predict})

In [48]:
output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              800000 non-null  int64  
 1   Premium Amount  800000 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 12.2 MB


In [52]:
output.to_csv('d:\path\Insurance Dataset.csv', index= False)