In [156]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt

#Import cranial database 
df = pd.read_csv('../../datasources/fdb/postcranial.csv')
#This shuffles the rows.
df = df.sample(frac=1)

#Drop categorical data. We'll focus only on measurement data.
del df['Item']
del df['ID']
del df['ContNum']
del df['FDN']
del df['Pop']
del df['PopSex']
del df['BirthYear']
del df['Age'] #Probably really important.
del df['Comments']
del df['DB']

#I don't know what these are.
# del df['FStat']
# del df['CStat']
# del df['MStat']

rows, cols = df.shape 
print("Starting with", rows, "individuals and", cols, "features.")

Starting with 3861 individuals and 48 features.


In [157]:
# df = df.dropna(subset = ['Pop'])
# df['Pop'].value_counts(dropna=False)

In [158]:
#Turn pop to one hot.
# df['Pop'] = df['Pop'].map({'W':0, 'B':1, 'H':2})
# cat_col = pd.DataFrame(df['Pop'])
# from sklearn import preprocessing
# enc = preprocessing.OneHotEncoder()
# enc.fit(cat_col)
# cat_col_oh = pd.DataFrame(enc.transform(cat_col).toarray())
# del df['Pop']
# df = pd.concat([df, cat_col_oh], axis=1)

In [159]:
#Consider our target col, Sex.
print("Unique entries.")
df['Sex'].value_counts(dropna=False)

Unique entries.


M      2973
F       887
NaN       1
Name: Sex, dtype: int64

In [160]:
#Clean N and Nan entries
df = df.dropna(subset = ['Sex'])

#Move from object to float.
df['Sex'] = df['Sex'].map({'M':1.0, 'F':0.})
df.describe()

Unnamed: 0,Sex,FStat,CStat,MStat,claxln,claapd,clavrd,scapht,scapbr,humxln,...,tibxln,tibpeb,tibdeb,tibnfx,tibnft,tibcir,fibxln,fibmdm,calcxl,calcbr
count,3860.0,894.0,1033.0,1211.0,1635.0,1570.0,1568.0,1639.0,1705.0,3572.0,...,3495.0,1602.0,1648.0,1739.0,1721.0,1498.0,3304.0,1522.0,1447.0,1391.0
mean,0.770207,171.201342,168.998064,174.02725,150.682569,12.415924,10.190689,154.868212,103.93607,328.865062,...,385.788555,76.134207,50.015777,34.504313,24.496223,93.779039,377.056598,15.296978,82.938493,42.479511
std,0.420754,9.649151,9.381864,6.279055,11.852631,2.028384,1.794479,13.947257,8.539951,20.914442,...,28.831919,6.111199,4.387955,3.831515,3.02972,9.349371,26.09544,2.008818,6.177163,3.729444
min,0.0,137.0,134.0,156.0,118.0,7.0,4.0,98.0,83.0,236.0,...,268.0,27.0,38.0,22.0,17.0,68.0,276.0,9.0,65.0,28.0
25%,1.0,165.0,163.0,170.0,142.0,11.0,9.0,144.0,97.0,315.0,...,366.0,71.0,47.0,32.0,22.0,87.0,360.0,14.0,78.0,40.0
50%,1.0,171.0,169.0,174.0,151.0,12.0,10.0,157.0,104.0,330.0,...,386.0,77.0,50.0,35.0,24.0,94.0,377.0,15.0,83.0,43.0
75%,1.0,178.0,175.0,178.0,160.0,14.0,11.0,165.0,110.0,343.0,...,405.0,81.0,53.0,37.0,26.0,100.0,394.0,17.0,87.0,45.0
max,1.0,202.0,203.0,191.0,183.0,22.0,20.0,199.0,131.0,404.0,...,483.0,94.0,67.0,55.0,41.0,127.0,469.0,22.0,100.0,83.0


In [161]:
#Split into data and labels.
X = df.drop(['Sex'], axis=1)
y = df[['Sex']]

In [162]:
#Split train test
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01)

In [163]:
X_train.isnull().sum().sum(),X_train.isnull().sum().sum()

(94759, 94759)

In [164]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3821, 47), (39, 47), (3821, 1), (39, 1))

In [165]:
#Standardize df. Can't use scikit's scale because it doesn't like Nan.
X_train_mean = X_train.mean()
X_train_std = X_train.std()
X_train = (X_train - X_train_mean) / (X_train_std)
#Intentionally using train stats for test set.
X_test = ( X_test - X_train_mean) / (X_train_std)
X_train.describe()

Unnamed: 0,FStat,CStat,MStat,claxln,claapd,clavrd,scapht,scapbr,humxln,humebr,...,tibxln,tibpeb,tibdeb,tibnfx,tibnft,tibcir,fibxln,fibmdm,calcxl,calcbr
count,889.0,1023.0,1200.0,1619.0,1556.0,1554.0,1621.0,1687.0,3535.0,1749.0,...,3459.0,1586.0,1635.0,1722.0,1705.0,1488.0,3273.0,1510.0,1437.0,1381.0
mean,1.443914e-15,1.0412e-15,1.256958e-15,-4.952459e-16,-2.465894e-16,2.820209e-16,1.659513e-16,-4.5935720000000007e-17,1.186353e-15,-9.712071000000001e-17,...,-3.08513e-16,-4.034052e-16,3.8262060000000003e-17,-9.051348e-16,-2.176167e-16,7.057527e-16,7.830922e-16,-6.896617e-17,-1.049111e-15,1.614687e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-3.537062,-3.735797,-2.881785,-2.759629,-2.666208,-3.447531,-4.080878,-2.453615,-4.437715,-2.914732,...,-4.078939,-8.026343,-2.737226,-3.259139,-2.467409,-2.752894,-3.864475,-3.139163,-2.904786,-3.87641
25%,-0.6400975,-0.6386059,-0.6453664,-0.729562,-0.6956013,-0.6618284,-0.777599,-0.8095183,-0.6604485,-0.7096454,...,-0.6853093,-0.8360387,-0.6859792,-0.6526754,-0.8206023,-0.7222155,-0.6505924,-0.6414625,-0.7987428,-0.6603966
50%,-0.01931931,0.002192369,-0.006389766,0.03171327,-0.2029497,-0.1046879,0.1559362,0.01253017,0.05675401,0.1723893,...,0.007268159,0.1444573,-0.002230373,0.1292636,-0.1618796,0.02592937,-0.0001636563,-0.1419223,0.01127372,0.1436067
75%,0.7049219,0.6429906,0.6325869,0.7929885,0.7823536,0.4524526,0.7304195,0.7171431,0.6783295,0.7604124,...,0.6652168,0.7981212,0.6815185,0.6505563,0.496843,0.6671964,0.6502651,0.8571579,0.6592869,0.6796089
max,3.188035,3.633382,2.709261,2.73847,4.723567,5.466717,3.171973,3.183289,3.594953,2.377476,...,3.366269,2.922529,3.872346,5.342191,5.437263,3.552898,3.519804,3.354859,2.76533,10.86365


In [166]:
#The imputer. I haven't looked closely at this algo. Try other variations.
from fancyimpute import KNN, SoftImpute, NuclearNormMinimization, MatrixFactorization, BiScaler
if True:
#     X_train_filled = pd.DataFrame(KNN(k=1).complete(X_train))
#     X_test_filled = pd.DataFrame(KNN(k=1).complete(X_test))
    X_train_filled = SoftImpute().complete(X_train)
    X_test_filled = SoftImpute().complete(X_test)
    
else:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    X_train_filled = pd.DataFrame(imp.fit_transform(X_train_filled))
    # X_train_filled = pd.DataFrame(imp.transform(X_train_filled))
    X_test_filled = pd.DataFrame(imp.transform(X_test_filled))

[SoftImpute] Max Singular Value of X_init = 199.921319
[SoftImpute] Iter 1: observed MAE=0.066640 rank=47
[SoftImpute] Iter 2: observed MAE=0.067265 rank=47
[SoftImpute] Iter 3: observed MAE=0.067835 rank=47
[SoftImpute] Iter 4: observed MAE=0.068323 rank=47
[SoftImpute] Iter 5: observed MAE=0.068673 rank=46
[SoftImpute] Iter 6: observed MAE=0.068895 rank=46
[SoftImpute] Iter 7: observed MAE=0.069083 rank=46
[SoftImpute] Iter 8: observed MAE=0.069237 rank=46
[SoftImpute] Iter 9: observed MAE=0.069361 rank=46
[SoftImpute] Iter 10: observed MAE=0.069463 rank=46
[SoftImpute] Iter 11: observed MAE=0.069545 rank=46
[SoftImpute] Iter 12: observed MAE=0.069612 rank=46
[SoftImpute] Iter 13: observed MAE=0.069667 rank=46
[SoftImpute] Iter 14: observed MAE=0.069713 rank=46
[SoftImpute] Iter 15: observed MAE=0.069752 rank=46
[SoftImpute] Iter 16: observed MAE=0.069784 rank=46
[SoftImpute] Iter 17: observed MAE=0.069810 rank=46
[SoftImpute] Iter 18: observed MAE=0.069833 rank=46
[SoftImpute] Iter 

In [167]:
#Num zero entries
(X_train_filled == 0).astype(int).sum().sum(),(X_test_filled == 0).astype(int).sum().sum()

(1692, 0)

In [168]:
#Impute zeros with mean. Could do this better.
#Think this introduces an improbable bug when the train set
#Has representation in a col that the test set doesn't or 
#Vice versa. Ignoring it for now. 
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values=0, strategy='mean', axis=0)
X_train_filled = pd.DataFrame(imp.fit_transform(X_train_filled))
# X_train_filled = pd.DataFrame(imp.transform(X_train_filled))
X_test_filled = pd.DataFrame(imp.transform(X_test_filled))

In [169]:
#No zero values
(X_train_filled == 0).astype(int).sum().sum(), (X_test_filled == 0).astype(int).sum().sum()

(0, 0)

In [170]:
#Notice that the training (and testing) set is no longer standardized
X_train_filled.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
count,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,...,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0
mean,0.022591,0.048783,-0.106478,0.036503,0.015081,0.006759,0.030285,0.035092,-0.007377,0.017911,...,-0.011019,0.013182,0.009214,0.021442,0.021055,0.019792,-0.005648,0.00206,0.02208,0.012263
std,0.693122,0.752153,0.671208,0.704942,0.669746,0.655164,0.697175,0.713221,0.977388,0.702934,...,0.97269,0.692646,0.680736,0.697552,0.690695,0.691849,0.968505,0.644849,0.688484,0.659888
min,-3.537062,-3.735797,-2.881785,-2.759629,-2.666208,-3.447531,-4.080878,-2.453615,-4.437715,-2.914732,...,-4.078939,-8.026343,-2.737226,-3.259139,-2.467409,-2.752894,-4.263434,-3.139163,-2.904786,-3.87641
25%,-0.329708,-0.364269,-0.485622,-0.25215,-0.20295,-0.104688,-0.204822,-0.232716,-0.660448,-0.206239,...,-0.65068,-0.182375,-0.192845,-0.190437,-0.165223,-0.225178,-0.612332,-0.141922,-0.29077,-0.225651
50%,0.078276,0.108992,-0.137903,0.065195,0.032703,0.006759,0.070399,0.074473,0.015449,0.055288,...,-0.009846,0.049666,0.022258,0.048107,0.041973,0.043853,-0.000164,0.008316,0.054821,0.047944
75%,0.418154,0.509173,0.207481,0.370058,0.285569,0.12011,0.360212,0.364837,0.630516,0.319395,...,0.630588,0.307873,0.225686,0.258818,0.214714,0.281761,0.612005,0.118456,0.33528,0.261669
max,3.188035,3.633382,2.709261,2.73847,4.723567,5.466717,3.171973,3.183289,3.594953,2.377476,...,3.366269,2.922529,3.872346,5.342191,5.437263,3.552898,3.519804,3.354859,2.76533,10.863651


In [171]:
#Should we standardize again?
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train_filled)
X_train_filled = pd.DataFrame(scaler.transform(X_train_filled))
#Uses test stats
X_test_filled = pd.DataFrame(scaler.transform(X_test_filled))

In [172]:
#Save to disk
X_train_filled.to_csv("data/X_train_preproc.csv", index=False)
#Ravel strips col name and reindexes
pd.DataFrame(np.ravel(y_train)).to_csv("data/y_train_preproc.csv", index=False)

X_test_filled.to_csv("data/X_test_preproc.csv", index=False)
pd.DataFrame(np.ravel(y_test)).to_csv("data/y_test_preproc.csv", index=False)

In [173]:
X_test_filled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,0.792725,1.853595,0.430666,1.673288,-1.061261,3.231852,2.03453,1.285775,1.631257,1.683905,...,0.98016,2.07724,1.322596,0.902006,1.165861,1.090384,1.309489,1.326217,1.631649,2.636173
1,0.145227,0.867638,1.577293,0.408764,0.018676,0.233235,0.152046,0.163335,0.946292,0.285896,...,1.051372,0.257343,0.30515,0.491054,0.273093,0.529431,0.953901,0.726978,0.785837,0.223101
2,-1.082649,-1.796835,-0.805779,-1.446866,-2.532614,-1.871117,-2.807194,-1.678405,-1.304307,-2.290119,...,-1.263006,-1.462175,-1.691077,-2.087654,-1.695644,-1.446416,-1.219142,-2.316716,-1.462617,-2.033202
3,0.984559,0.637474,1.207177,-0.231065,1.881444,-0.170127,1.622469,0.791745,0.652736,0.42895,...,1.443035,0.661474,-0.686519,3.144252,1.642779,2.480898,1.507039,2.875743,1.396314,1.011435
4,-1.254798,-0.253743,0.300261,1.553282,-1.061261,-2.721611,-3.013225,-1.678405,0.370233,-0.821224,...,-0.260047,-0.965069,-0.262443,-0.278746,-1.027453,-0.45851,-0.206782,-0.588188,-0.551651,-0.852647
5,0.18855,0.203059,0.508308,3.113359,1.145768,1.530862,-0.025778,2.109158,1.826962,-2.917596,...,-0.119785,0.352446,0.389161,-0.129799,0.249748,0.253884,-0.297536,0.553643,0.077064,0.466821
6,-0.396491,-1.072704,0.243569,-1.806884,-1.184819,-0.398716,-0.746886,-1.843081,-0.815047,-0.198527,...,-0.077006,-1.226214,-0.589934,-1.713947,-0.264891,-0.779725,-0.028367,-0.710735,-0.768224,-0.902651
7,-0.239854,0.648102,-0.096206,-0.076266,-0.118508,-0.156495,-0.350314,-0.229255,-0.130081,-0.127842,...,-0.622102,0.062401,0.108086,-0.370784,-0.003001,-0.094506,-0.310415,-0.491967,0.047621,-0.067761
8,0.318811,0.620566,-0.709048,-0.246807,0.615342,1.540914,1.313423,0.133038,-0.032229,1.474745,...,-0.194832,1.133396,0.479732,0.528299,1.165861,0.394693,-0.270906,0.911205,0.424524,1.544334
9,0.536688,0.231781,-0.095984,2.393324,-0.325585,0.680368,0.077238,2.767865,0.359179,0.638109,...,0.303649,0.189552,0.987744,-0.219116,0.212026,0.119429,0.479782,0.551454,0.523997,0.518765
