# Deep Dive Into Data

In this notebook, we go through each column (where applicable) to determine what we can learn from it. The end goal is a feature engineering procedure that handles every column in the data.

In [1]:
import pandas as pd 
import numpy as np

from scipy.stats import spearmanr
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the training CSV into the notebook-wide `data` frame and preview
# its first five rows.
data = pd.read_csv('data/train.csv')
data.head(n=5)

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,81,289,16,121,4,1.777778,1.0,121.0,289,4
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4


In [3]:
# Replacement table: the literal strings "yes"/"no" become 1/0.
mapping = {"yes": 1, "no": 0}

# Apply the replacement to each affected column, then cast it to float64.
for col in ('dependency', 'edjefa', 'edjefe'):
    data[col] = data[col].replace(mapping).astype(np.float64)

# Summary statistics of the three converted columns.
data[['dependency', 'edjefa', 'edjefe']].describe()

Unnamed: 0,dependency,edjefa,edjefe
count,9557.0,9557.0,9557.0
mean,1.14955,2.89683,5.096788
std,1.605993,4.612056,5.246513
min,0.0,0.0,0.0
25%,0.333333,0.0,0.0
50%,0.666667,0.0,6.0
75%,1.333333,6.0,9.0
max,8.0,21.0,21.0


In [30]:
# Pairwise correlation matrix over the numeric (and boolean) columns only.
# The column selection is explicit because string columns such as `Id`
# cannot be correlated, and recent pandas versions raise on them instead of
# silently skipping them.
corr_matrix = data.select_dtypes(include=[np.number, 'bool']).corr()

# Keep only the strict upper triangle (k=1 drops the diagonal) so each pair
# of columns is considered once. The builtin `bool` replaces the `np.bool`
# alias, which no longer exists in current NumPy releases.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A column is flagged when its absolute correlation with any column that
# precedes it in the matrix is greater than 0.95.
to_drop = [column for column in upper.columns if (upper[column].abs() > 0.95).any()]


In [31]:
# Allow up to 150 rows to be printed before pandas truncates the output.
pd.set_option("display.max_rows", 150)

In [32]:
# Correlation of every column in the matrix with 'coopele'.
corr_matrix.loc[:, 'coopele']

v2a1              -0.121159
hacdor            -0.016256
rooms             -0.042660
hacapo             0.002100
v14a               0.020980
refrig            -0.011786
v18q              -0.071448
v18q1             -0.089603
r4h1               0.017304
r4h2               0.008631
r4h3               0.017434
r4m1               0.002534
r4m2              -0.015276
r4m3              -0.010362
r4t1               0.012920
r4t2              -0.003682
r4t3               0.004646
tamhog             0.005266
tamviv            -0.003584
escolari          -0.021639
rez_esc           -0.023067
hhsize             0.005266
paredblolad       -0.019048
paredzocalo        0.046894
paredpreb         -0.002665
pareddes          -0.014769
paredmad           0.003707
paredzinc         -0.017167
paredfibras        0.012626
paredother        -0.013521
pisomoscer        -0.054673
pisocemento        0.096099
pisoother         -0.010838
pisonatur         -0.011425
pisonotiene       -0.019398
pisomadera        -0

In [6]:
# Show the columns flagged for removal: each one has an absolute correlation
# above 0.95 with at least one column that precedes it in the correlation matrix.
to_drop

['tamhog',
 'hhsize',
 'coopele',
 'female',
 'hogar_total',
 'area2',
 'SQBage',
 'SQBhogar_total',
 'agesq']

In [8]:
# Row-wise total of every column whose name starts with 'instl', stored as
# 'inst'; then preview the first rows.
instl_mask = data.columns.str.startswith('instl')
data['inst'] = data.loc[:, instl_mask].sum(axis=1)
data['inst'].head()

0    1
1    1
2    1
3    1
4    1
Name: inst, dtype: int64

In [9]:
# Summary statistics of the 'inst' column.
data.loc[:, 'inst'].describe()

count    9557.000000
mean        0.999686
std         0.017716
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: inst, dtype: float64

In [12]:
# Overwrite 'inst' with the position (0-based) of the largest value among the
# 'instl*' columns in each row.
# NOTE(review): argmax returns 0 when every indicator in a row is 0, so such
# rows are indistinguishable from rows flagged in the first column - confirm
# whether that is acceptable.
instl_cols = [name for name in data.columns if name.startswith('instl')]
data['inst'] = data[instl_cols].values.argmax(axis=1)
data['inst'].describe()

count    9557.000000
mean        2.877577
std         2.244121
min         0.000000
25%         1.000000
50%         2.000000
75%         4.000000
max         8.000000
Name: inst, dtype: float64

In [13]:
# Collapse the 'lugar*' indicator columns into a single 'lugar' column holding
# the position (0-based) of the largest indicator in each row.
# The derived 'lugar' column also starts with 'lugar', so it is excluded
# explicitly; otherwise re-running this cell would treat its own previous
# output as one more indicator column and produce different values.
lugar_cols = [c for c in data if c.startswith('lugar') and c != 'lugar']
data['lugar'] = np.argmax(np.array(data[lugar_cols]), axis = 1)
data['lugar'].describe()

count    9557.000000
mean        1.244533
std         1.748453
min         0.000000
25%         0.000000
50%         0.000000
75%         3.000000
max         5.000000
Name: lugar, dtype: float64

In [14]:
import seaborn as sns

In [15]:
# Share of each Target value within every 'lugar' group.
target_by_lugar = data.groupby('lugar')['Target']
target_by_lugar.value_counts(normalize=True)

lugar  Target
0      4         0.698540
       2         0.133903
       3         0.109687
       1         0.057870
1      4         0.599323
       2         0.181716
       3         0.132054
       1         0.086907
2      4         0.458054
       2         0.214765
       3         0.187919
       1         0.139262
3      4         0.513274
       2         0.221239
       3         0.141593
       1         0.123894
4      4         0.488294
       2         0.237458
       3         0.159420
       1         0.114827
5      4         0.551232
       2         0.217899
       3         0.141375
       1         0.089494
Name: Target, dtype: float64

In [16]:
# 'area' is 0 when 'area1' holds the larger value and 1 when 'area2' does
# (ties resolve to 0); then show the share of each Target value per area.
area_flags = data[['area1', 'area2']].values
data['area'] = area_flags.argmax(axis=1)
target_by_area = data.groupby('area')['Target']
target_by_area.value_counts(normalize=True)

area  Target
0     4         0.658369
      2         0.154049
      3         0.114365
      1         0.073217
1     4         0.549853
      2         0.199780
      3         0.156891
      1         0.093475
Name: Target, dtype: float64

In [17]:
# Raw (un-normalized) number of rows per Target value within each area.
target_by_area = data.groupby('area')['Target']
target_by_area.value_counts()

area  Target
0     4         4496
      2         1052
      3          781
      1          500
1     4         1500
      2          545
      3          428
      1          255
Name: Target, dtype: int64

In [19]:
# True only if 'r4h3' equals 'r4h2' + 'r4h1' in every row.
(data['r4h2'] + data['r4h1']).eq(data['r4h3']).all()

True

In [20]:
# True only if 'r4m3' equals 'r4m2' + 'r4m1' in every row.
(data['r4m2'] + data['r4m1']).eq(data['r4m3']).all()

True

In [21]:
# True only if 'r4t3' equals 'r4t2' + 'r4t1' in every row.
(data['r4t2'] + data['r4t1']).eq(data['r4t3']).all()

True

In [22]:
# Pearson correlation between 'hhsize' and 'tamhog'.
hhsize = data['hhsize']
hhsize.corr(data['tamhog'], method='pearson')

1.0

In [23]:
# Pearson correlation between 'hhsize' and 'tamviv'.
hhsize = data['hhsize']
hhsize.corr(data['tamviv'], method='pearson')

0.9436399930083661

In [24]:
# Pearson correlation between 'hhsize' and 'r4t3'.
hhsize = data['hhsize']
hhsize.corr(data['r4t3'], method='pearson')

0.9981074966165117