In [25]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt

#Import cranial database
df = pd.read_csv('../../datasources/fdb/cranial.csv')
#Shuffle the rows. A fixed random_state keeps the shuffled order the same
#from one run of this cell to the next.
df = df.sample(frac=1, random_state=0)

#Drop the non-measurement columns. We'll focus only on measurement data.
#Ethnicity and Age are probably really important, but are left out for now.
#MOW is basically empty and disturbs standardization, so it is dropped too.
dropped_cols = [
    'DB', 'Item', 'ID', 'ContNum', 'FDN', 'Pop', 'PopSex',
    'Ethnicity', 'BirthYear', 'Age', 'Comments',
    'MOW',
]
df = df.drop(dropped_cols, axis=1)

rows, cols = df.shape
print("Starting with", rows, "individuals and", cols, "features.")

Starting with 5342 individuals and 110 features.


In [26]:
#Tally the labels found in our target col, Sex.
#dropna=False makes missing values show up as their own row in the tally.
print("Unique entries.")
df.Sex.value_counts(dropna=False)

Unique entries.


M      3187
F      2095
N        58
NaN       2
Name: Sex, dtype: int64

In [27]:
#Keep only the rows whose Sex is one of the labels we can encode. This removes
#the "N" and NaN entries in one step, and also guarantees that no unexpected
#label can slip through and silently turn into NaN in the mapping below.
sex_codes = {'M': 1.0, 'F': 0.}
labelled = df[df['Sex'].isin(list(sex_codes))]

#Move from object to float. assign() builds a new frame instead of writing
#into the filtered slice, which avoids pandas' SettingWithCopyWarning.
df = labelled.assign(Sex=labelled['Sex'].map(sex_codes))
df.describe()



Unnamed: 0,Sex,GOL,NOL,BNL,BBH,XCB,XFB,WFB,ZYB,AUB,...,MLN,MAN,BABR,BANA,BAPR,UFHT,UFBR,ORBR,BIOB,INTB
count,5282.0,5200.0,4396.0,5177.0,5182.0,5196.0,4395.0,2622.0,5096.0,5106.0,...,519.0,518.0,636.0,607.0,574.0,2115.0,1605.0,453.0,604.0,413.0
mean,0.60337,180.3975,177.689945,100.143906,133.846584,137.568129,114.752673,94.898932,129.449372,120.85566,...,77.046243,124.990347,136.886792,101.494234,97.031359,70.201891,103.535826,41.00883,96.253311,20.552058
std,0.489244,8.838111,8.23857,5.764435,7.336864,6.811824,6.477442,4.952665,7.313785,6.763771,...,7.211489,7.455211,6.889893,5.523686,6.481473,4.934519,4.827782,2.493338,4.400408,3.388348
min,0.0,131.0,150.0,79.0,107.0,116.0,93.0,79.0,90.0,98.0,...,54.0,90.0,84.0,85.0,73.0,47.0,87.0,34.0,81.0,0.0
25%,0.0,,,,,,,,,,...,,,,,,,,,,
50%,1.0,,,,,,,,,,...,,,,,,,,,,
75%,1.0,,,,,,,,,,...,,,,,,,,,,
max,1.0,211.0,207.0,120.0,157.0,167.0,145.0,119.0,158.0,154.0,...,120.0,147.0,156.0,130.0,116.0,91.0,122.0,51.0,112.0,29.0


In [28]:
#Separate the labels from the data.
#y stays a one-column DataFrame (not a Series); X is every other column.
y = df.loc[:, ['Sex']]
X = df.drop('Sex', axis=1)

In [29]:
#Split train test
#train_test_split lives in sklearn.model_selection in current scikit-learn;
#the old sklearn.cross_validation module is only used as a fallback for
#installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
#Hold out 1% of the rows for testing. The fixed random_state makes the split
#repeatable for a given input row order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.01, random_state=0
)

In [30]:
#Total number of missing cells across the whole training feature table.
X_train.isnull().values.sum()

201219

In [31]:
#Shapes of the four pieces, in the order: X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((5229, 109), (53, 109), (5229, 1), (53, 1))

In [32]:
#Standardize by hand; scikit's scale can't be used here because it doesn't like Nan.
#The per-column mean and std are taken from the training set only.
X_train_mean = X_train.mean()
X_train_std = X_train.std()
X_train = X_train.sub(X_train_mean, axis=1).div(X_train_std, axis=1)
#The test set is intentionally standardized with the train stats as well.
X_test = X_test.sub(X_train_mean, axis=1).div(X_train_std, axis=1)
X_train.describe()



Unnamed: 0,GOL,NOL,BNL,BBH,XCB,XFB,WFB,ZYB,AUB,ASB,...,MLN,MAN,BABR,BANA,BAPR,UFHT,UFBR,ORBR,BIOB,INTB
count,5148.0,4349.0,5125.0,5130.0,5144.0,4347.0,2595.0,5046.0,5054.0,4349.0,...,512.0,511.0,633.0,604.0,571.0,2094.0,1584.0,450.0,601.0,411.0
mean,7.591269000000001e-17,-4.987196e-16,-2.2182800000000002e-17,9.834022e-16,-1.217619e-15,-8.140103e-16,1.505967e-16,-1.396866e-15,-8.533824e-16,-1.744089e-16,...,-5.2041700000000004e-18,2.077051e-16,-9.204503e-16,1.195512e-15,-9.768407e-16,-3.562893e-16,-9.420074e-16,-2.822434e-16,1.075863e-15,-2.895764e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-5.582534,-3.357156,-3.670704,-3.654984,-3.163812,-3.359802,-3.205032,-3.355656,-3.376289,-3.34944,...,-3.186018,-4.703,-7.670821,-2.982493,-3.697707,-4.697471,-3.430601,-2.800909,-3.459169,-6.055747
25%,,,,,,,,,,,...,,,,,,,,,,
50%,,,,,,,,,,,...,,,,,,,,,,
75%,,,,,,,,,,,...,,,,,,,,,,
max,3.457765,3.552178,3.449335,3.152281,4.313258,4.669028,4.862341,3.91322,4.892685,3.534176,...,5.961906,2.944199,2.77545,5.160567,2.922038,4.207823,3.830318,3.996982,3.575078,2.493163


In [33]:
#Fill in the missing values. I haven't looked closely at this algo; other
#variations are worth trying. One alternative that was tried here is
#KNN(k=3).complete(...) on each set, with the result wrapped in pd.DataFrame.
from fancyimpute import KNN, SoftImpute
#A fresh SoftImpute is run on each set separately: train first, then test.
X_train_filled, X_test_filled = (
    SoftImpute().complete(part) for part in (X_train, X_test)
)

[SoftImpute] Max Singular Value of X_init = 279.713738
[SoftImpute] Iter 1: observed MAE=0.062063 rank=106
[SoftImpute] Iter 2: observed MAE=0.062472 rank=103
[SoftImpute] Iter 3: observed MAE=0.062746 rank=102
[SoftImpute] Iter 4: observed MAE=0.062965 rank=101
[SoftImpute] Iter 5: observed MAE=0.063128 rank=101
[SoftImpute] Iter 6: observed MAE=0.063278 rank=101
[SoftImpute] Iter 7: observed MAE=0.063387 rank=100
[SoftImpute] Iter 8: observed MAE=0.063480 rank=99
[SoftImpute] Iter 9: observed MAE=0.063554 rank=99
[SoftImpute] Iter 10: observed MAE=0.063617 rank=99
[SoftImpute] Iter 11: observed MAE=0.063671 rank=99
[SoftImpute] Iter 12: observed MAE=0.063716 rank=99
[SoftImpute] Iter 13: observed MAE=0.063746 rank=97
[SoftImpute] Iter 14: observed MAE=0.063762 rank=97
[SoftImpute] Iter 15: observed MAE=0.063779 rank=97
[SoftImpute] Iter 16: observed MAE=0.063794 rank=97
[SoftImpute] Iter 17: observed MAE=0.063808 rank=97
[SoftImpute] Iter 18: observed MAE=0.063819 rank=97
[SoftImpute

In [34]:
#How many entries are exactly zero after imputation: (train, test).
tuple(
    (filled == 0).astype(int).sum().sum()
    for filled in (X_train_filled, X_test_filled)
)

(1853, 0)

In [35]:
#Impute zeros with the column mean. Could do this better.
#The imputer is fitted on the training set only and then reused, unchanged,
#on the test set. Fitting a second time on the test set would fill it with
#test means and could drop a column that has no usable values in the test set,
#leaving train and test with different columns.
try:
    #Current scikit-learn.
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=0, strategy='mean')
except ImportError:
    #Older scikit-learn that still ships preprocessing.Imputer.
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values=0, strategy='mean', axis=0)
X_train_filled = pd.DataFrame(imp.fit_transform(X_train_filled))
X_test_filled = pd.DataFrame(imp.transform(X_test_filled))

In [36]:
#Re-count the exact zeros, (train, test); none should be left now.
tuple(
    (filled == 0).astype(int).sum().sum()
    for filled in (X_train_filled, X_test_filled)
)

(0, 0)

In [37]:
#Notice that the training (and testing) set is no longer standardized:
#once the missing values have been filled in, the per-column mean and std
#reported by describe() are no longer exactly 0 and 1.
X_train_filled.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99,100,101,102,103,104,105,106,107,108
count,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,...,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0,5229.0
mean,0.002446,0.025758,0.002377,0.002173,0.000909,0.010368,-0.034736,-0.002499,-0.002054,0.011452,...,-0.005339,-0.005193,-0.011222,-0.002438,-0.003152,-0.033384,-0.002273,0.005356,-0.002538,0.001519
std,0.994437,0.975632,0.994933,0.993018,0.99299,0.938159,0.800749,0.989001,0.988037,0.934575,...,0.403877,0.346891,0.604945,0.627776,0.654798,0.897506,0.812758,0.455549,0.643631,0.376582
min,-5.582534,-3.357156,-3.670704,-3.654984,-3.163812,-3.359802,-3.205032,-3.355656,-3.376289,-3.34944,...,-3.186018,-4.703,-7.670821,-2.982493,-3.697707,-4.697471,-3.430601,-2.800909,-3.459169,-6.055747
25%,-0.723373,-0.622953,-0.718493,-0.659787,-0.671455,-0.580592,-0.535161,-0.749833,-0.718405,-0.563215,...,-0.196726,-0.111094,-0.395434,-0.418413,-0.429821,-0.609212,-0.526233,-0.263521,-0.422802,-0.175777
50%,-0.045351,0.036903,-0.023855,0.020939,-0.085019,0.037011,-0.048235,-0.06409,0.019896,0.011452,...,-0.005339,-0.005193,-0.011222,-0.017277,-0.003152,-0.042431,0.000465,-0.001777,-0.016299,0.001519
75%,0.745675,0.644798,0.670783,0.701666,0.648028,0.613739,0.425286,0.621653,0.610537,0.584055,...,0.178733,0.102175,0.379351,0.399659,0.408231,0.564748,0.511041,0.260416,0.398322,0.173197
max,3.457765,3.552178,5.606616,3.152281,4.313258,4.669028,4.862341,3.91322,4.892685,3.534176,...,5.961906,2.944199,2.77545,5.160567,2.922038,4.207823,3.830318,3.996982,3.575078,2.493163


In [38]:
#Should we standardize again? For now we do.
from sklearn import preprocessing
#The scaler learns its statistics from the filled training set only.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_filled)
train_scaled = scaler.transform(X_train_filled)
#The test set is rescaled with those same train statistics.
test_scaled = scaler.transform(X_test_filled)
X_train_filled = pd.DataFrame(train_scaled)
X_test_filled = pd.DataFrame(test_scaled)

In [39]:
#Write the four preprocessed tables to disk, without the index column.
#The labels go through np.ravel first, which flattens them to a plain 1-D
#array: the rebuilt frame has no 'Sex' col name and a fresh 0..n-1 index.
outputs = [
    (X_train_filled, "data/X_train_preproc.csv"),
    (pd.DataFrame(np.ravel(y_train)), "data/y_train_preproc.csv"),
    (X_test_filled, "data/X_test_preproc.csv"),
    (pd.DataFrame(np.ravel(y_test)), "data/y_test_preproc.csv"),
]
for frame, path in outputs:
    frame.to_csv(path, index=False)