# 0.0 Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

from xgboost import XGBRFClassifier

from boruta import BorutaPy

from imblearn.over_sampling import SMOTE

import pickle

import time
import warnings

# Tempo de processamento
time_agora = time.time()

###### 0.1 Colecting data

In [3]:
df = pickle.load(open('D:\\My Drive\\Pessoal\\Projetos\\insurance_sales_predict\\insurance_sales_predict\\data\\interim\\df_to_data_preparation.pkl', 'rb'))
df.head()

Unnamed: 0,gender,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,vintage,response
0,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


# 1.0 Data Preparation

In [4]:
# backup
df1 = df.copy()

# Spliting Validation
X = df1.drop('response', axis=1)
y = df1['response'].copy()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=101)

df1 = pd.concat([X_train, y_train], axis=1)

## 1.1 Normalization

In [5]:
# annual_premium
ss = StandardScaler()
ss_annual_premium = ss.fit(df1[['annual_premium']].values)
df1['annual_premium'] = ss_annual_premium.transform(df1[['annual_premium']].values)

annual_premium foi transformada pelo método standardscaler porque tem distribuição parecida com uma normal

## 1.2 Rescaling

In [6]:
# age by the method minmaxscaler
mm = MinMaxScaler()
mm_age = mm.fit(df1[['age']].values)
df1['age'] = mm_age.transform(df1[['age']].values)

age foi transformada pelo método minmaxscaler porque não parece com uma normal, e não houve necessidade de utilizar um método robusto contra outliers

## 1.3 Transformation

### 1.3.1 Encoding

In [7]:
# vehicle_age by the method labelencoding
map_vehicle_age = {'< 1 Year': 1, '1-2 Year': 2, '> 2 Years': 3}
df1['vehicle_age'] = df1['vehicle_age'].map(map_vehicle_age)

# region_code by the method target_encoding
map_region_code = df1.groupby('region_code').agg({'response':'mean'})['response']
df1['region_code'] = df1['region_code'].map(map_region_code)

# policy_sales_channel
map_policy_sales_channel = df1.groupby('policy_sales_channel').agg({'response':'mean'})['response']
df1['policy_sales_channel'] = df1['policy_sales_channel'].map(map_policy_sales_channel)

### 1.3.2 Nature Transformation

In [16]:
# vintage
cicle = df1['vintage'].max() - df1['vintage'].min()
df1['vintage_sin'] = df1['vintage'].apply(lambda x: np.sin(x* (2*np.pi/cicle)))
df1['vintage_cos'] = df1['vintage'].apply(lambda x: np.cos(x* (2*np.pi/cicle)))

## 1.4 Balancing

In [7]:
smote = SMOTE()

# 2 Feature Selection

## 2.1 Importances

###### Random Forest

In [8]:
# model definition
rf_model = RandomForestClassifier(n_estimators=100, min_samples_leaf=1000, random_state=101)

# fiting model
rf_fit = rf_model.fit(X_train, y_train)

# preditions
rf_predict = rf_fit.predict(X_val)

# metrics
print(classification_report(y_val, rf_predict))

# importances
ranking = pd.DataFrame()
ranking['importance_rf'] = rf_model.feature_importances_

              precision    recall  f1-score   support

           0       0.88      1.00      0.94     60298
           1       0.00      0.00      0.00      8302

    accuracy                           0.88     68600
   macro avg       0.44      0.50      0.47     68600
weighted avg       0.77      0.88      0.82     68600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


###### XGRFBoost

In [9]:
# fiting model
xgbrf_model = XGBRFClassifier(use_label_encoder=False, eval_metric='error')
xgbrf_fit = xgbrf_model.fit(X_train, y_train)

# predictions
xgbrf_predict = xgbrf_fit.predict(X_val)

# metrics
print(classification_report(y_val, xgbrf_predict))

# importances
ranking['importance_xgb'] = xgbrf_model.feature_importances_

              precision    recall  f1-score   support

           0       0.88      1.00      0.94     60298
           1       0.00      0.00      0.00      8302

    accuracy                           0.88     68600
   macro avg       0.44      0.50      0.47     68600
weighted avg       0.77      0.88      0.82     68600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 3.0 Exporting data

###### 3.1 Features

In [12]:
# annual_premium
pickle.dump(ss_annual_premium, open('D:\\My Drive\\Pessoal\\Projetos\\insurance_sales_predict\\insurance_sales_predict\\src\\features\\ss_annual_premium.pkl', 'wb'))

# age
pickle.dump(mm_age, open('D:\\My Drive\\Pessoal\\Projetos\\insurance_sales_predict\\insurance_sales_predict\\src\\features\\mm_age.pkl', 'wb'))

# vehicle_age
pikcle.dump(map_vehicle_age, open('D:\\My Drive\\Pessoal\\Projetos\\insurance_sales_predict\\insurance_sales_predict\\src\\features\\map_vehicle_age', 'wb'))

# region_code
pikcle.dump(map_region_code, open('D:\\My Drive\\Pessoal\\Projetos\\insurance_sales_predict\\insurance_sales_predict\\src\\features\\map_region_code', 'wb'))

# policy_sales_channel
pikcle.dump(map_policy_sales_channel, open('D:\\My Drive\\Pessoal\\Projetos\\insurance_sales_predict\\insurance_sales_predict\\src\\features\\map_policy_sales_channel', 'wb'))

NameError: name 'picle' is not defined

###### 3.1 Dataframes

# Time

In [11]:
print(f'O tempo de processamento do projeto foi de: {int(round(time.time()-time_agora, 2)/60)} minutos')

O tempo de processamento do projeto foi de: 0 minutos
