# 0.0 Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

# 0.1 Loading Data

In [2]:
df_raw = pd.read_csv('../data/raw/train.csv')

# 1.0 Data Description

In [3]:
df1 = df_raw.copy()

In [4]:
cols_old = ['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response']
cols_new = [i.lower() for i in cols_old]
df1.columns = cols_new

## 1.1 Data Dimension

In [5]:
print('Number of rows {}'.format(df1.shape[0]))
print('Number of columns {}'.format(df1.shape[1]))

Number of rows 381109
Number of columns 12


## 1.2 Data Types

In [6]:
df1.dtypes

id                        int64
gender                   object
age                       int64
driving_license           int64
region_code             float64
previously_insured        int64
vehicle_age              object
vehicle_damage           object
annual_premium          float64
policy_sales_channel    float64
vintage                   int64
response                  int64
dtype: object

## 1.3 Check N/A

In [7]:
df1.isna().sum()

id                      0
gender                  0
age                     0
driving_license         0
region_code             0
previously_insured      0
vehicle_age             0
vehicle_damage          0
annual_premium          0
policy_sales_channel    0
vintage                 0
response                0
dtype: int64

## 1.4 Data Descriptive

In [8]:
num_attributes = df1.select_dtypes(include = ['int', 'float64'])
cat_attributes = df1.select_dtypes(exclude = ['int', 'float64', 'datetime64[ns]'])

In [9]:
# Central Tendency - mean, median
ct1 = pd.DataFrame( num_attributes.apply(np.mean) ).T
ct2 = pd.DataFrame( num_attributes.apply(np.median) ).T

# Dispersion - std, min, max, range, skew, kurtosis
d1 = pd.DataFrame( num_attributes.apply(np.std) ).T
d2 = pd.DataFrame( num_attributes.apply(np.min) ).T
d3 = pd.DataFrame( num_attributes.apply(np.max) ).T
d4 = pd.DataFrame( num_attributes.apply( lambda x: x.max() - x.min()) ).T
d5 = pd.DataFrame( num_attributes.apply( lambda x: x.skew()) ).T
d6 = pd.DataFrame( num_attributes.apply( lambda x: x.kurtosis()) ).T

# Concatenate
m = pd.concat([d2, d3, d4, ct1, ct2, d1, d5, d6]).T.reset_index()
m.columns = ['attributes', 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis']
m

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,id,1.0,381109.0,381108.0,190555.0,190555.0,110016.69187,9.443274e-16,-1.2
1,age,20.0,85.0,65.0,38.822584,36.0,15.511591,0.672539,-0.565655
2,driving_license,0.0,1.0,1.0,0.997869,1.0,0.046109,-21.59518,464.354302
3,region_code,0.0,52.0,52.0,26.388807,28.0,13.229871,-0.1152664,-0.867857
4,previously_insured,0.0,1.0,1.0,0.45821,0.0,0.498251,0.1677471,-1.971871
5,annual_premium,2630.0,540165.0,537535.0,30564.389581,31669.0,17213.132474,1.766087,34.004569
6,policy_sales_channel,1.0,163.0,162.0,112.034295,133.0,54.203924,-0.9000081,-0.97081
7,vintage,10.0,299.0,289.0,154.347397,154.0,83.671194,0.003029517,-1.200688
8,response,0.0,1.0,1.0,0.122563,0.0,0.327935,2.301906,3.298788


# 2.0 Feature Engineering

In [10]:
# vehicle age
df2['vehicle_age'] = df2['vehicle_age'].apply(lambda x: 'over_2_years' if x == '> 2 Years' else 'between_1_2_years'
                                                                       if x == '1-2 Year'  else 'below_1_year' )
# vehicle damage
df2['vehicle_damage'] = df2['vehicle_damage'].apply(lambda x: 1 if x=='Yes' else 0)

NameError: name 'df2' is not defined

# 3.0 Data Filtering

In [None]:
df3 = df2.copy()

# 4.0 EDA

In [None]:
df4 = df3.copy()

## 4.1 Univariate Analysis

In [None]:
# 'age'
sns.boxplot(x = 'response', y = 'age', data = df4)

In [None]:
aux00 = df4.loc[df4['response'] == 0, 'age']
sns.histplot(aux00)

In [None]:
aux00 = df4.loc[df4['response'] == 1, 'age']
sns.histplot(aux00)

In [None]:
# 'annual_premium'


# 5.0 Data Preparation

# 6.0 Feature Selection

# 7.0 Machine Learning