# Set up

### Libraries

In [1]:
import pandas as pd
import inflection
import numpy as np
import seaborn as sns

from matplotlib             import pyplot as plt
from IPython.display        import HTML

### Functions

In [2]:




def jupyter_settings():
    %matplotlib inline
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    sns.set()

In [3]:
# Apply the notebook-wide plotting and pandas display settings
jupyter_settings()

### Import Data

In [4]:
# Load the training data. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, so each column's dtype is inferred from all rows.
df_train_raw = pd.read_csv('train.csv', low_memory=False)

## ETL

### Data Raw status

In [5]:
# Work on a copy so df_train_raw keeps the data exactly as it was loaded
df1 = df_train_raw.copy()

In [6]:
# Column labels as they come from the raw file
df1.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

- Id: identificador único do cliente.
- Gender: gênero do cliente.
- Age: idade do cliente.
- Driving License: 0, o cliente não tem permissão para dirigir e 1, o cliente tem permissão para dirigir ( CNH – Carteira Nacional de Habilitação )
- Region Code: código da região do cliente.
- Previously Insured: 0, o cliente não tem seguro de automóvel e 1, o cliente já tem seguro de automóvel.
- Vehicle Age: idade do veículo.
- Vehicle Damage: 0, cliente nunca teve seu veículo danificado no passado e 1, cliente já teve seu veículo danificado no passado.
- Annual Premium: quantia que o cliente pagou à empresa pelo seguro de saúde anual.
- Policy sales channel: código anônimo para o canal de contato com o cliente.
- Vintage: número de dias desde que o cliente se associou à empresa através da compra do seguro de saúde.
- Response: resposta do cliente à pesquisa prévia sobre a intenção de comprar um seguro de carro (0, sem interesse; 1, com interesse).

In [7]:
df1.head(5)

# Notes:
# Vehicle_Age: split the "Year"/"Years" text out of the column
# Vehicle_Damage: could be encoded as 1 = yes, 0 = no
# Vintage: does this need to be converted to some kind of date type?


Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [8]:
# dtype pandas inferred for each raw column
df1.dtypes

id                        int64
Gender                   object
Age                       int64
Driving_License           int64
Region_Code             float64
Previously_Insured        int64
Vehicle_Age              object
Vehicle_Damage           object
Annual_Premium          float64
Policy_Sales_Channel    float64
Vintage                   int64
Response                  int64
dtype: object

In [9]:
# Number of missing values in each column
df1.isna().sum()

id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

### Rename Columns

In [10]:
# Take the current labels from the frame itself rather than from a hard-coded
# list: a typed-out list breaks as soon as the frame gains a column (length
# mismatch on re-run) and would silently mislabel data if the file's column
# order ever differed.
old_columns = df1.columns.tolist()

# inflection.underscore turns 'Driving_License' into 'driving_license' and
# leaves labels that are already snake_case untouched.
snakecase = inflection.underscore
cols_new = [ snakecase( col ) for col in old_columns ]

# Rename
df1.columns = cols_new

df1.columns

Index(['id', 'gender', 'age', 'driving_license', 'region_code',
       'previously_insured', 'vehicle_age', 'vehicle_damage', 'annual_premium',
       'policy_sales_channel', 'vintage', 'response'],
      dtype='object')

### Change Data Types

In [11]:
# Encode vehicle_damage as 0/1: 'No' -> 0, anything else -> 1.
# Values that are already 0 are kept as 0, so re-running this cell is
# harmless; a plain comparison against 'No' would turn every value into 1
# on a second run. The vectorised test also avoids a Python-level call per row.
df1['vehicle_damage'] = ( ~df1['vehicle_damage'].isin( ['No', 0] ) ).astype( 'int64' )

# Strip the ' Years' / ' Year' suffix into a new column; vehicle_age itself is
# left unchanged. ' Years' is removed first so no stray 's' is left behind.
df1['vehicle_age_year'] = (
    df1['vehicle_age']
    .str.replace( ' Years', '', regex=False )
    .str.replace( ' Year', '', regex=False )
)

# Cast the float columns to integers in a single pass
# (astype(int) drops any fractional part).
df1 = df1.astype( {
    'annual_premium': int,
    'policy_sales_channel': int,
    'region_code': int,
} )



In [12]:
# Check the column dtypes after the type conversions
df1.dtypes

id                       int64
gender                  object
age                      int64
driving_license          int64
region_code              int32
previously_insured       int64
vehicle_age             object
vehicle_damage           int64
annual_premium           int32
policy_sales_channel     int32
vintage                  int64
response                 int64
vehicle_age_year        object
dtype: object

# Descriptive Statistics

### Data Dimensions

In [13]:
# Report the size of the working frame
n_rows, n_cols = df1.shape
print(f'Number of Rows {n_rows}')
print(f'Number of Columns {n_cols}')

Number of Rows 381109
Number of Columns 13


### Basic Statistics

In [14]:
# Split numerical / categorical attributes

# Numerical: every integer or float column (int32 is listed explicitly
# because the casts in this notebook can produce it)
num_attributes = df1.select_dtypes(
    include=['int64', 'float64', 'int32']
)

# Categorical: the remaining string (object) columns
cat_attributes = df1.select_dtypes(
    include=['object']
)

In [15]:
# Each statistic is computed column by column and stored as a one-row frame
# whose columns are the numerical attributes.

# Central tendency - mean, median
ct1 = num_attributes.apply( np.mean ).to_frame().T
ct2 = num_attributes.apply( np.median ).to_frame().T

# Dispersion - std, min, max, range, skew, kurtosis
# np.std runs with its default ddof=0, so 'std' is the population standard deviation
d1 = num_attributes.apply( np.std ).to_frame().T
d2 = num_attributes.apply( np.min ).to_frame().T
d3 = num_attributes.apply( np.max ).to_frame().T
d4 = num_attributes.apply( lambda col: col.max() - col.min() ).to_frame().T
d5 = num_attributes.apply( pd.Series.skew ).to_frame().T
d6 = num_attributes.apply( pd.Series.kurtosis ).to_frame().T

# Stack the rows in display order, then transpose so each attribute is a row
stat_rows = [ d2, d3, d4, ct1, ct2, d1, d5, d6 ]
stat_names = [ 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis' ]

table_m = pd.concat( stat_rows ).T.reset_index()
table_m.columns = ['attributes'] + stat_names

table_m

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,id,1.0,381109.0,381108.0,190555.0,190555.0,110016.69187,9.443274e-16,-1.2
1,age,20.0,85.0,65.0,38.822584,36.0,15.511591,0.672539,-0.565655
2,driving_license,0.0,1.0,1.0,0.997869,1.0,0.046109,-21.59518,464.354302
3,region_code,0.0,52.0,52.0,26.388807,28.0,13.229871,-0.1152664,-0.867857
4,previously_insured,0.0,1.0,1.0,0.45821,0.0,0.498251,0.1677471,-1.971871
5,vehicle_damage,0.0,1.0,1.0,0.504877,1.0,0.499976,-0.01950724,-1.99963
6,annual_premium,2630.0,540165.0,537535.0,30564.389581,31669.0,17213.132474,1.766087,34.004569
7,policy_sales_channel,1.0,163.0,162.0,112.034295,133.0,54.203924,-0.9000081,-0.97081
8,vintage,10.0,299.0,289.0,154.347397,154.0,83.671194,0.003029517,-1.200688
9,response,0.0,1.0,1.0,0.122563,0.0,0.327935,2.301906,3.298788


# Feature Engineering

### Mental Map

In [16]:
df2 = df1.copy()

### Hypotheses List

- Clientes que já pagaram seguro previamente comprariam novo seguro
- Mulheres compram mais seguros do que homens
- Clientes que vivem em regiões mais perigosas compram mais
- Clientes com mais idade compram mais
- Clientes que não possuem CNH não compram
- Carros mais danificados precisam de seguro
- Carros mais velhos precisam de seguro
- Carros que já possuem seguros não precisam de novo seguro

### Exploratory Data Analysis

#### Univariate Analysis

#### Numerical Variable

#### Categorical Variable

#### Bivariate Analysis

#### More Relevant

##### 1

#### Less Relevant

##### 1

### Multivariate Analysis

#### Numerical Attributes

#### Categorical Attributes

# Data Preparation