In [199]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt

#Import cranial database
df = pd.read_csv('../../datasources/fdb/cranial.csv')
#Shuffle the rows. The fixed seed keeps the row order (and everything
#derived from it) identical across kernel restarts.
df = df.sample(frac=1, random_state=42)

#Drop categorical data. We'll focus only on measurement data.
#Ethnicity and Age are probably really important, but are left out here.
#MOW is basically empty and disturbs standardization.
dropped_cols = [
    'DB', 'Item', 'ID', 'ContNum', 'FDN', 'Pop', 'PopSex',
    'Ethnicity', 'BirthYear', 'Age', 'Comments',
    'MOW',
]
df = df.drop(dropped_cols, axis=1)

rows, cols = df.shape
print("Starting with", rows, "individuals and", cols, "features.")

Starting with 5342 individuals and 110 features.


In [200]:
#Look at the label distribution of our target col, Sex (NaN counted too).
print("Unique entries.")
sex_col = df['Sex']
sex_col.value_counts(dropna=False)

Unique entries.


M      3187
F      2095
N        58
NaN       2
Name: Sex, dtype: int64

In [201]:
#Clean N and NaN entries: keep only rows whose label is exactly 'M' or 'F'.
#An explicit whitelist avoids substring matching on "N" and also guarantees
#that the mapping below cannot produce NaN labels.
known_sex = df['Sex'].isin(['M', 'F'])
#Fail loudly instead of silently emptying df, e.g. if this cell is re-run
#after Sex has already been converted to floats.
assert known_sex.any(), "no 'M'/'F' labels in df['Sex'] - was this cell already run?"
#.copy() gives an independent frame, so the column assignment below does not
#write into a slice of the previous df.
df = df[known_sex].copy()

#Move from object to float: M -> 1.0, F -> 0.0.
df['Sex'] = df['Sex'].map({'M':1.0, 'F':0.})
df.describe()

Unnamed: 0,Sex,GOL,NOL,BNL,BBH,XCB,XFB,WFB,ZYB,AUB,...,MLN,MAN,BABR,BANA,BAPR,UFHT,UFBR,ORBR,BIOB,INTB
count,5282.0,5200.0,4396.0,5177.0,5182.0,5196.0,4395.0,2622.0,5096.0,5106.0,...,519.0,518.0,636.0,607.0,574.0,2115.0,1605.0,453.0,604.0,413.0
mean,0.60337,180.3975,177.689945,100.143906,133.846584,137.568129,114.752673,94.898932,129.449372,120.85566,...,77.046243,124.990347,136.886792,101.494234,97.031359,70.201891,103.535826,41.00883,96.253311,20.552058
std,0.489244,8.838111,8.23857,5.764435,7.336864,6.811824,6.477442,4.952665,7.313785,6.763771,...,7.211489,7.455211,6.889893,5.523686,6.481473,4.934519,4.827782,2.493338,4.400408,3.388348
min,0.0,131.0,150.0,79.0,107.0,116.0,93.0,79.0,90.0,98.0,...,54.0,90.0,84.0,85.0,73.0,47.0,87.0,34.0,81.0,0.0
25%,0.0,174.0,172.0,96.0,129.0,133.0,110.0,92.0,124.0,116.0,...,73.0,120.25,133.0,98.0,93.0,67.0,100.0,39.0,93.0,19.0
50%,1.0,180.0,178.0,100.0,134.0,137.0,115.0,95.0,129.0,121.0,...,77.0,125.0,137.0,101.0,97.0,70.0,104.0,41.0,96.0,21.0
75%,1.0,187.0,183.0,104.0,139.0,142.0,119.0,98.0,134.0,125.0,...,81.0,130.0,142.0,105.0,101.0,74.0,107.0,42.0,99.0,22.0
max,1.0,211.0,207.0,120.0,157.0,167.0,145.0,119.0,158.0,154.0,...,120.0,147.0,156.0,130.0,116.0,91.0,122.0,51.0,112.0,29.0


In [202]:
#Separate the labels from the measurement data.
y = df[['Sex']]  #double brackets keep y as a one-column DataFrame
X = df.drop('Sex', axis=1)

In [203]:
#Split train test
#sklearn.cross_validation was deprecated and later removed from scikit-learn;
#sklearn.model_selection is its replacement. Fall back only for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
#Fixed seed so the same rows land in train and test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [204]:
#Total number of missing entries in the training features.
X_train.isnull().values.sum()

163680

In [205]:
#Shapes of the four splits, in order: X_train, X_test, y_train, y_test.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((4225, 109), (1057, 109), (4225, 1), (1057, 1))

In [206]:
#Standardize df. Can't use scikit's scale because it doesn't like Nan.
#The per-column statistics come from the training set only.
X_train_mean, X_train_std = X_train.mean(), X_train.std()
#Intentionally using train stats for test set.
X_test = X_test.sub(X_train_mean).div(X_train_std)
X_train = X_train.sub(X_train_mean).div(X_train_std)
X_train.describe()

Unnamed: 0,GOL,NOL,BNL,BBH,XCB,XFB,WFB,ZYB,AUB,ASB,...,MLN,MAN,BABR,BANA,BAPR,UFHT,UFBR,ORBR,BIOB,INTB
count,4158.0,3490.0,4134.0,4142.0,4158.0,3493.0,2102.0,4068.0,4082.0,3490.0,...,417.0,416.0,514.0,490.0,466.0,1708.0,1294.0,367.0,487.0,332.0
mean,-1.109689e-15,1.479619e-15,8.855734e-16,1.397884e-15,1.121598e-15,2.797648e-16,1.172336e-15,-2.673214e-16,-8.273637e-16,6.065826e-16,...,-1.332534e-16,-7.10436e-16,-1.5663e-15,-5.102494e-16,3.533178e-16,-2.537328e-16,1.228624e-16,1.398821e-15,1.521028e-15,-2.006427e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-5.578697,-3.367987,-3.655352,-3.488171,-3.159272,-3.35461,-3.230522,-5.390989,-3.37582,-3.050422,...,-3.224532,-3.20974,-7.555541,-2.939213,-3.664889,-4.692913,-3.487215,-2.825116,-2.777565,-6.03816
25%,-0.7245847,-0.6948488,-0.7194567,-0.6568415,-0.6716779,-0.7357615,-0.5950658,-0.7452954,-0.7173088,-0.7370141,...,-0.5841882,-0.6406478,-0.5553256,-0.6269014,-0.5926848,-0.6436604,-0.7620486,-0.8142208,-0.7451666,-0.4720759
50%,-0.04726674,0.03418887,-0.02865796,0.01728445,-0.0863616,0.03448813,0.01311652,0.0745329,0.02116649,-0.07604039,...,-0.0283263,0.03542903,0.01612056,-0.09329106,0.02175595,-0.03627258,0.07646406,-0.009862702,-0.06770026,0.1138276
75%,0.7429376,0.6417202,0.6621408,0.6914104,0.6452838,0.6506879,0.6212988,0.6210851,0.6119467,0.5849333,...,0.5275356,0.7115058,0.7304282,0.6181894,0.6361967,0.5711153,0.7053486,0.3923163,0.609766,0.6997312
max,3.45221,3.557871,3.425336,3.118264,4.303511,3.885736,4.878575,3.900398,4.895103,3.559315,...,5.947189,3.010167,2.444767,5.064942,2.94035,4.215442,3.849771,4.011928,3.545453,2.457442


In [207]:
#The imputer. I haven't looked closely at this algo. Try other variations.
from fancyimpute import KNN

def _knn_complete(frame):
    """Fill the missing entries of frame with a fresh KNN(k=3) imputer.

    The result is wrapped in a new DataFrame built from the raw array, so
    the original column names and index are not carried over.
    """
    return pd.DataFrame(KNN(k=3).complete(frame))

#Train and test are completed independently of each other.
X_train_filled = _knn_complete(X_train)
X_test_filled = _knn_complete(X_test)

Computing pairwise distances between 4225 samples
Computing distances for sample #1/4225, elapsed time: 1.981
Computing distances for sample #101/4225, elapsed time: 2.229
No samples have sufficient overlap with sample 139
Computing distances for sample #201/4225, elapsed time: 2.484
Computing distances for sample #301/4225, elapsed time: 2.717
No samples have sufficient overlap with sample 396
Computing distances for sample #401/4225, elapsed time: 2.959
No samples have sufficient overlap with sample 409
No samples have sufficient overlap with sample 421
Computing distances for sample #501/4225, elapsed time: 3.211
Computing distances for sample #601/4225, elapsed time: 3.457
Computing distances for sample #701/4225, elapsed time: 3.695
No samples have sufficient overlap with sample 710
Computing distances for sample #801/4225, elapsed time: 3.929
Computing distances for sample #901/4225, elapsed time: 4.173
No samples have sufficient overlap with sample 904
Computing distances for sa

In [208]:
#Num zero entries, as (train, test)
tuple((filled == 0).values.sum() for filled in (X_train_filled, X_test_filled))

(6457, 588)

In [209]:
#Impute zeros with the column mean. Could do this better.
#The imputer is fit on the training set only and then applied to both sets.
#This matches the standardization step (train stats are used for the test
#set) and avoids the bug where refitting on the test set could discard a
#different set of all-zero columns than the training fit did.
try:
    #Newer scikit-learn: Imputer was replaced by SimpleImputer (column-wise).
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=0, strategy='mean')
except ImportError:
    #Older scikit-learn without sklearn.impute.
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values=0, strategy='mean', axis=0)
X_train_filled = pd.DataFrame(imp.fit_transform(X_train_filled))
#transform (not fit_transform): fill test zeros with the training means.
X_test_filled = pd.DataFrame(imp.transform(X_test_filled))

In [210]:
#Check that no zero values remain, as (train, test)
tuple((filled == 0).values.sum() for filled in (X_train_filled, X_test_filled))

(0, 0)

In [211]:
#Notice that the training (and testing) set is no longer standardized:
#after the missing entries were filled in, the per-column mean and std
#are no longer exactly 0 and 1.
X_train_filled.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99,100,101,102,103,104,105,106,107,108
count,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,...,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0
mean,0.001656,0.032549,0.003616,0.003588,0.001804,0.019404,-0.093083,-0.008431,0.00104,0.028736,...,0.070353,0.227656,-0.268149,-0.059113,0.14003,-0.191663,-0.140561,0.085212,-0.007881,-0.03494
std,0.993224,0.971927,0.99264,0.993499,0.993506,0.950067,0.894621,0.988405,0.99017,0.957859,...,0.5341,0.535358,0.842188,0.82397,0.757669,0.854835,0.736098,0.685014,0.796483,0.613454
min,-5.578697,-3.367987,-3.655352,-3.488171,-3.159272,-3.35461,-3.230522,-5.390989,-3.37582,-3.050422,...,-3.224532,-3.20974,-7.555541,-2.939213,-3.664889,-4.692913,-3.487215,-2.825116,-2.777565,-6.03816
25%,-0.724585,-0.576141,-0.719457,-0.656842,-0.671678,-0.581712,-0.735038,-0.745295,-0.717309,-0.571771,...,-0.213388,0.002472,-0.856247,-0.54238,-0.30788,-0.70844,-0.55242,-0.408506,-0.57071,-0.432821
50%,-0.047267,0.034189,-0.028658,0.017284,-0.086362,0.034488,-0.174989,-0.008431,0.021166,0.0449,...,0.070353,0.234274,-0.28593,-0.135187,0.131579,-0.164918,-0.166376,0.033226,-0.098223,-0.023698
75%,0.742938,0.64172,0.662141,0.69141,0.645284,0.650688,0.457463,0.621085,0.611947,0.639815,...,0.396314,0.541004,0.311427,0.440319,0.613307,0.298536,0.266185,0.530029,0.556258,0.39567
max,3.45221,3.557871,3.425336,3.118264,4.303511,3.885736,4.878575,3.900398,4.895103,3.559315,...,5.947189,3.010167,2.444767,5.064942,2.94035,4.215442,3.849771,4.011928,3.545453,2.457442


In [212]:
#Should we standardize again?
from sklearn import preprocessing
#Fit the scaler on the filled training set only.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_filled)
train_scaled = scaler.transform(X_train_filled)
#The test set is scaled with the same scaler, i.e. with train stats.
test_scaled = scaler.transform(X_test_filled)
X_train_filled = pd.DataFrame(train_scaled)
X_test_filled = pd.DataFrame(test_scaled)

In [213]:
#Save to disk
#np.ravel flattens the one-column label frames, which strips the col name
#and reindexes.
for csv_path, csv_frame in [
    ("data/X_train_preproc.csv", X_train_filled),
    ("data/y_train_preproc.csv", pd.DataFrame(np.ravel(y_train))),
    ("data/X_test_preproc.csv", X_test_filled),
    ("data/y_test_preproc.csv", pd.DataFrame(np.ravel(y_test))),
]:
    csv_frame.to_csv(csv_path, index=False)