In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
%matplotlib inline

In [5]:
# Load the train and test splits of the Porto Seguro safe-driver-prediction
# data. Both paths are relative ('../data/raw/...'), so they resolve against
# the notebook's current working directory.
train = pd.read_csv('../data/raw/porto-seguro-safe-driver-prediction/train.csv')
test = pd.read_csv('../data/raw/porto-seguro-safe-driver-prediction/test.csv')

In [6]:
# Report (rows, columns) for both splits, one line per split.
print(
    f"Train shape: {train.shape}",
    f"Test shape: {test.shape}",
    sep="\n",
)

Train shape: (595212, 59)
Test shape: (892816, 58)


In [7]:
# Preview the first five rows of the training frame (rich display as the cell's last expression).
train.head(n=5)

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [8]:
# Class-imbalance check on `target`: absolute counts per class first,
# then the same counts normalised to proportions.
print(
    train['target'].value_counts(),
    f"\nClass imbalance: {train['target'].value_counts(normalize=True)}",
    sep="\n",
)

target
0    573518
1     21694
Name: count, dtype: int64

Class imbalance: target
0    0.963552
1    0.036448
Name: proportion, dtype: float64


#### Class imbalance: ~96.4% of rows are class 0 (only ~3.6% have target = 1)

In [11]:
# Missing-value audit.
# 1) Total number of NaN cells across the whole frame (expected to be 0 here,
#    because this dataset marks missing entries with -1 rather than NaN).
# 2) Per-column count of the -1 placeholder, ten most affected columns first.
print("Missing values:")
print(train.isna().sum().sum())
print("\n-1 values (Porto Seguro uses -1 for missing):")
print(train.eq(-1).sum().sort_values(ascending=False).iloc[:10])

Missing values:
0

-1 values (Porto Seguro uses -1 for missing):
ps_car_03_cat    411231
ps_car_05_cat    266551
ps_reg_03        107772
ps_car_14         42620
ps_car_07_cat     11489
ps_ind_05_cat      5809
ps_car_09_cat       569
ps_ind_02_cat       216
ps_car_01_cat       107
ps_ind_04_cat        83
dtype: int64


In [12]:
# Feature types
# Porto Seguro naming convention:
# - `_bin` suffix = binary feature
# - `_cat` suffix = categorical feature
# - no suffix     = continuous or ordinal feature
# - ind / reg / car / calc in the name = feature group

In [13]:
# Split the columns by the Porto Seguro naming convention.
# Matching on the `_bin` / `_cat` *suffix* (instead of a substring anywhere in
# the name) keeps a column that merely contains the letters "bin" or "cat"
# from being misclassified.
binary_features = [col for col in train.columns if col.endswith('_bin')]
categorical_features = [col for col in train.columns if col.endswith('_cat')]
# Everything that is neither the row id, the label, nor a suffixed feature.
# The exclusion is a direct test per column, so no helper list is rebuilt on
# every iteration; column order follows `train.columns`.
continuous_features = [
    col for col in train.columns
    if col not in ('id', 'target') and not col.endswith(('_bin', '_cat'))
]

In [14]:
# Summarise how many columns landed in each feature-type list.
print(
    f"Binary features: {len(binary_features)}",
    f"Categorical features: {len(categorical_features)}",
    f"Continuous features: {len(continuous_features)}",
    sep="\n",
)

Binary features: 17
Categorical features: 14
Continuous features: 26


In [15]:
# Quick look at the linear correlation between each non-binary,
# non-categorical feature and the target, sorted from most positive downwards.
# NOTE(review): the -1 missing-value placeholders are not masked before
# `.corr()`, so they enter the calculation as ordinary values; and because the
# sort is signed (not by absolute value), negative correlations never appear
# in this "top" listing.
correlations = (
    train[[*continuous_features, 'target']]
    .corr()
    .loc[:, 'target']
    .sort_values(ascending=False)
)
print("Top 10 correlations with target:")
print(correlations.iloc[:11])  # 11 rows: `target` itself (r = 1.0) plus ten features

Top 10 correlations with target:
target        1.000000
ps_car_13     0.053899
ps_car_12     0.038790
ps_reg_02     0.034800
ps_reg_03     0.030888
ps_car_15     0.027667
ps_reg_01     0.022888
ps_ind_01     0.018570
ps_ind_03     0.008360
ps_ind_14     0.007443
ps_calc_03    0.001907
Name: target, dtype: float64
