In [16]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt

#Import cranial database
df = pd.read_csv('../../datasources/fdb/cranial.csv')
#Shuffle the rows. The fixed seed keeps the row order identical across
#Restart & Run All, so downstream outputs can be reproduced.
df = df.sample(frac=1, random_state=42)

#Drop the columns that are not used as measurements; we focus only on
#measurement data. Ethnicity and Age are probably really important but are
#left out here. MOW is basically empty and disturbs standardization.
cols_to_drop = [
    'DB', 'Item', 'ID', 'ContNum', 'FDN', 'Pop', 'PopSex',
    'Ethnicity', 'BirthYear', 'Age', 'Comments',
    'MOW',
]
#A single drop call raises KeyError if any listed column is absent, just as
#the individual deletions would.
df = df.drop(cols_to_drop, axis=1)

rows, cols = df.shape
print("Starting with", rows, "individuals and", cols, "features.")

Starting with 5342 individuals and 110 features.


In [17]:
#Look at the target column, Sex: tally every distinct entry, missing ones included.
print("Unique entries.")
df.loc[:, 'Sex'].value_counts(dropna=False)

Unique entries.


M      3187
F      2095
N        58
NaN       2
Name: Sex, dtype: int64

In [18]:
#Keep only the rows whose Sex is exactly one of the labels we can encode.
#This removes the 'N' and NaN entries, and also guards against any other
#unexpected label, which .map() below would otherwise turn into a NaN target.
sex_codes = {'M': 1.0, 'F': 0.}
#.copy() makes the filtered frame independent, so the column assignment
#below does not write into a slice of the previous frame.
df = df[df['Sex'].isin(list(sex_codes))].copy()

#Move from object to float.
df['Sex'] = df['Sex'].map(sex_codes)
df.describe()

Unnamed: 0,Sex,GOL,NOL,BNL,BBH,XCB,XFB,WFB,ZYB,AUB,...,MLN,MAN,BABR,BANA,BAPR,UFHT,UFBR,ORBR,BIOB,INTB
count,5282.0,5200.0,4396.0,5177.0,5182.0,5196.0,4395.0,2622.0,5096.0,5106.0,...,519.0,518.0,636.0,607.0,574.0,2115.0,1605.0,453.0,604.0,413.0
mean,0.60337,180.3975,177.689945,100.143906,133.846584,137.568129,114.752673,94.898932,129.449372,120.85566,...,77.046243,124.990347,136.886792,101.494234,97.031359,70.201891,103.535826,41.00883,96.253311,20.552058
std,0.489244,8.838111,8.23857,5.764435,7.336864,6.811824,6.477442,4.952665,7.313785,6.763771,...,7.211489,7.455211,6.889893,5.523686,6.481473,4.934519,4.827782,2.493338,4.400408,3.388348
min,0.0,131.0,150.0,79.0,107.0,116.0,93.0,79.0,90.0,98.0,...,54.0,90.0,84.0,85.0,73.0,47.0,87.0,34.0,81.0,0.0
25%,0.0,174.0,172.0,96.0,129.0,133.0,110.0,92.0,124.0,116.0,...,73.0,120.25,133.0,98.0,93.0,67.0,100.0,39.0,93.0,19.0
50%,1.0,180.0,178.0,100.0,134.0,137.0,115.0,95.0,129.0,121.0,...,77.0,125.0,137.0,101.0,97.0,70.0,104.0,41.0,96.0,21.0
75%,1.0,187.0,183.0,104.0,139.0,142.0,119.0,98.0,134.0,125.0,...,81.0,130.0,142.0,105.0,101.0,74.0,107.0,42.0,99.0,22.0
max,1.0,211.0,207.0,120.0,157.0,167.0,145.0,119.0,158.0,154.0,...,120.0,147.0,156.0,130.0,116.0,91.0,122.0,51.0,112.0,29.0


In [19]:
#Separate the label column (kept as a one-column frame) from the feature columns.
target_col = 'Sex'
y = df.loc[:, [target_col]]
X = df.drop(target_col, axis=1)

In [20]:
#Split train test.
#train_test_split lives in sklearn.model_selection in current scikit-learn;
#fall back to the legacy sklearn.cross_validation module on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
#Hold out 1% of the rows for testing. The fixed seed makes the split
#reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.01, random_state=42)

In [21]:
#Total number of missing entries across the whole training frame.
X_train.isnull().values.sum()

200836

In [22]:
#Shapes of the four splits, in the order: X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((5229, 109), (53, 109), (5229, 1), (53, 1))

In [23]:
#Standardize by hand: scikit's scale can't be used because it doesn't like NaN.
#Both statistics are taken from the training set before it is overwritten.
X_train_mean, X_train_std = X_train.mean(), X_train.std()
X_train = X_train.sub(X_train_mean, axis='columns').div(X_train_std, axis='columns')
#Intentionally using train stats for the test set as well.
X_test = X_test.sub(X_train_mean, axis='columns').div(X_train_std, axis='columns')
X_train.describe()

Unnamed: 0,GOL,NOL,BNL,BBH,XCB,XFB,WFB,ZYB,AUB,ASB,...,MLN,MAN,BABR,BANA,BAPR,UFHT,UFBR,ORBR,BIOB,INTB
count,5149.0,4354.0,5125.0,5130.0,5145.0,4354.0,2595.0,5044.0,5056.0,4356.0,...,511.0,511.0,626.0,597.0,565.0,2093.0,1593.0,444.0,593.0,405.0
mean,-7.484788e-16,1.464913e-15,-9.256985e-16,-1.634993e-15,9.568418e-16,1.205538e-15,3.566404e-16,1.287674e-15,-2.54675e-16,7.072065e-16,...,-3.802134e-18,-1.051562e-16,1.215199e-15,-8.594465e-16,3.8808680000000006e-17,-1.183371e-15,6.183238e-16,-1.26888e-15,-1.021855e-15,1.436437e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-5.592392,-3.366252,-3.673726,-3.658135,-3.165778,-3.358941,-3.213419,-5.395259,-3.378691,-3.360803,...,-3.172503,-4.675418,-7.680145,-2.985927,-3.71607,-4.70062,-3.428648,-2.802876,-3.457685,-6.33166
25%,-0.7252169,-0.6924329,-0.7195752,-0.6591148,-0.6688461,-0.7325772,-0.5828118,-0.7456611,-0.7172185,-0.7291317,...,-0.5535223,-0.6650019,-0.557737,-0.6291916,-0.6273174,-0.6489788,-0.7328785,-0.7982417,-0.7292656,-0.4910794
50%,-0.04607622,0.03679056,-0.02448088,0.02248069,-0.08133265,0.03988263,0.02425133,-0.06189667,0.02207951,-0.071214,...,-0.1399991,0.003400875,0.02368409,-0.08532956,-0.009566932,-0.04123266,0.09658895,0.003611953,-0.04716071,0.1237186
75%,0.7462545,0.6444768,0.6706134,0.7040762,0.6530592,0.6578505,0.6313145,0.6218677,0.6135179,0.5867037,...,0.5492063,0.6718036,0.7504605,0.6398198,0.6081835,0.7690956,0.7186895,0.4045388,0.6349442,0.4311176
max,3.462817,3.561371,3.450991,3.15782,4.325018,4.674642,4.880756,3.903937,4.901446,3.547333,...,5.925008,2.944373,2.785434,5.172004,2.924748,4.21299,3.829192,4.01288,3.590732,2.58291


In [24]:
#The imputer. This algo hasn't been looked at closely yet; try other variations.
from fancyimpute import KNN, SoftImpute
#Alternative that was tried, kept for reference:
# X_train_filled = pd.DataFrame(KNN(k=3).complete(X_train))
# X_test_filled = pd.DataFrame(KNN(k=3).complete(X_test))
#Each split gets its own fresh SoftImpute instance, train first and then test.
#NOTE(review): the test split is therefore imputed independently of train - confirm this is intended.
X_train_filled, X_test_filled = (
    SoftImpute().complete(split) for split in (X_train, X_test)
)

[SoftImpute] Max Singular Value of X_init = 279.492252
[SoftImpute] Iter 1: observed MAE=0.061960 rank=106
[SoftImpute] Iter 2: observed MAE=0.062372 rank=103
[SoftImpute] Iter 3: observed MAE=0.062646 rank=102
[SoftImpute] Iter 4: observed MAE=0.062866 rank=101
[SoftImpute] Iter 5: observed MAE=0.063029 rank=101
[SoftImpute] Iter 6: observed MAE=0.063177 rank=101
[SoftImpute] Iter 7: observed MAE=0.063287 rank=100
[SoftImpute] Iter 8: observed MAE=0.063375 rank=99
[SoftImpute] Iter 9: observed MAE=0.063450 rank=99
[SoftImpute] Iter 10: observed MAE=0.063514 rank=99
[SoftImpute] Iter 11: observed MAE=0.063568 rank=99
[SoftImpute] Iter 12: observed MAE=0.063613 rank=99
[SoftImpute] Iter 13: observed MAE=0.063643 rank=98
[SoftImpute] Iter 14: observed MAE=0.063660 rank=97
[SoftImpute] Iter 15: observed MAE=0.063677 rank=97
[SoftImpute] Iter 16: observed MAE=0.063692 rank=97
[SoftImpute] Iter 17: observed MAE=0.063706 rank=97
[SoftImpute] Iter 18: observed MAE=0.063717 rank=97
[SoftImpute

In [25]:
#Number of entries that are exactly zero in each imputed matrix: (train, test).
tuple(
    (filled == 0).astype(int).sum().sum()
    for filled in (X_train_filled, X_test_filled)
)

(1962, 0)

In [26]:
#Impute zeros with the column mean. Could do this better.
#The imputer is fitted on the training set ONLY and then reused for the test
#set, so test zeros are filled with training means (the same "train stats for
#the test set" policy used when standardizing). Reusing the fitted imputer
#also keeps train and test on the same set of output columns.
try:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values=0, strategy='mean', axis=0)
except ImportError:
    #Newer scikit-learn releases no longer ship Imputer; SimpleImputer is the
    #column-wise replacement.
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=0, strategy='mean')
X_train_filled = pd.DataFrame(imp.fit_transform(X_train_filled))
#transform (not fit_transform): do not re-fit the imputer on the test data.
X_test_filled = pd.DataFrame(imp.transform(X_test_filled))

In [27]:
#Re-check after mean imputation: no entry should be exactly zero any more.
tuple(
    (filled == 0).astype(int).sum().sum()
    for filled in (X_train_filled, X_test_filled)
)

(0, 0)

In [28]:
#After imputation the training (and testing) set is no longer standardized;
#the summary statistics below make that visible.
train_filled_summary = X_train_filled.describe()
train_filled_summary

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99,100,101,102,103,104,105,106,107,108
count,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,...,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0
mean,0.002345,0.025971,0.002379,0.00219,0.00095,0.010316,-0.035145,-0.002372,-0.002103,0.010693,...,-0.004966,-0.004903,-0.010775,-0.002039,-0.003513,-0.032981,-0.002816,0.005193,-0.002352,0.000726
std,0.99451,0.976106,0.994934,0.99302,0.993087,0.938528,0.80058,0.988857,0.98811,0.934795,...,0.403905,0.34651,0.601012,0.624881,0.652541,0.897025,0.811173,0.453427,0.638914,0.383673
min,-5.592392,-3.366252,-3.673726,-3.658135,-3.165778,-3.358941,-3.213419,-5.395259,-3.378691,-3.360803,...,-3.172503,-4.675418,-7.680145,-2.985927,-3.71607,-4.70062,-3.428648,-2.802876,-3.457685,-6.33166
25%,-0.725217,-0.612526,-0.719575,-0.659115,-0.668846,-0.578085,-0.540047,-0.745661,-0.717218,-0.564652,...,-0.196844,-0.108455,-0.391959,-0.41442,-0.42749,-0.611771,-0.525512,-0.263802,-0.415893,-0.18368
50%,-0.046076,0.036791,-0.024481,0.022481,-0.081333,0.039883,-0.050121,-0.061897,0.02208,0.010693,...,-0.004912,-0.003,-0.010579,-0.014333,-0.009567,-0.041233,-0.002816,0.003612,-0.015786,0.000676
75%,0.746255,0.644477,0.670613,0.704076,0.653059,0.611372,0.42896,0.621868,0.613518,0.586704,...,0.179587,0.101647,0.376633,0.395908,0.406248,0.566514,0.511323,0.25904,0.400575,0.182503
max,3.462817,3.561371,5.605809,3.15782,4.325018,4.674642,4.880756,3.903937,4.901446,3.547333,...,5.925008,2.944373,2.785434,5.172004,2.924748,4.21299,3.829192,4.01288,3.590732,2.58291


In [29]:
#Should we standardize again?
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
#Learn the scaling from the imputed training set and rescale it in one step.
X_train_filled = pd.DataFrame(scaler.fit_transform(X_train_filled))
#The test set is rescaled with the statistics learned from the training set.
X_test_filled = pd.DataFrame(scaler.transform(X_test_filled))

In [30]:
#Save to disk.
#np.ravel flattens each label frame, which strips the column name and reindexes.
outputs = [
    ("data/X_train_preproc.csv", X_train_filled),
    ("data/y_train_preproc.csv", pd.DataFrame(np.ravel(y_train))),
    ("data/X_test_preproc.csv", X_test_filled),
    ("data/y_test_preproc.csv", pd.DataFrame(np.ravel(y_test))),
]
#Write the files in the listed order, without the index column.
for path, frame in outputs:
    frame.to_csv(path, index=False)