In [195]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt
from math import floor
from sklearn.preprocessing import PolynomialFeatures


#Import cranial database 
df = pd.read_csv('../../datasources/fdb/cranial.csv')

#Drop categorical data. We'll focus on measurement data.
del df['DB']
del df['Item']
del df['ID']
del df['ContNum']
del df['FDN']
del df['Pop']
del df['PopSex']
del df['Ethnicity'] #Probably really important.
del df['BirthYear']
del df['Age'] #Probably really important.
del df['Comments']

rows, cols = df.shape 
print("Starting with", rows, "individuals and", cols, "features.")

Starting with 5342 individuals and 111 features.


In [196]:
#Consider our target col, Sex.
print("Unique entries.")
df['Sex'].value_counts(dropna=False)

Unique entries.


M      3187
F      2095
N        58
NaN       2
Name: Sex, dtype: int64

In [197]:
#Clean N and Nan entries
df = df[~(df.Sex.str.contains("N") == True)]
df = df.dropna(subset = ['Sex'])

df['Sex'].value_counts(dropna=False)

M    3187
F    2095
Name: Sex, dtype: int64

In [198]:
#Move from object to float
df['Sex'] = df['Sex'].map({'M':1, 'F':0})

In [199]:
#Shuffle rows of df
df = df.iloc[np.random.permutation(len(df))]

#Split into train and test sets
p=0.8
split_pt = floor(len(df)*p)
train, test = df.iloc[0:split_pt], df[split_pt:]

In [200]:
#Use train mean and std on test data.
tr_mean = train.mean()
tr_std  = train.std()
train = ( train - tr_mean ) / tr_std
test  = ( test - tr_mean ) / tr_std

#Clean this up.
train['Sex'] = train['Sex'].apply(np.sign)
test['Sex']  = test['Sex'].apply(np.sign)
train['Sex'] = ( train['Sex'] + 1 ) / 2
test['Sex'] = ( test['Sex'] + 1 ) / 2 

In [201]:
train.describe()



Unnamed: 0,Sex,GOL,NOL,BNL,BBH,XCB,XFB,WFB,ZYB,AUB,...,MAN,BABR,BANA,BAPR,UFHT,UFBR,ORBR,BIOB,INTB,MOW
count,4225.0,4162.0,3516.0,4136.0,4146.0,4164.0,3516.0,2099.0,4079.0,4087.0,...,404.0,512.0,486.0,461.0,1698.0,1269.0,362.0,487.0,332.0,0.0
mean,0.607574,5.514303e-16,-1.386326e-15,-4.260508e-16,-1.29992e-15,1.488829e-15,1.818795e-16,-6.761835e-16,5.504572e-16,8.849187e-16,...,-9.651246e-16,-6.938894e-18,7.310110000000001e-17,-7.32121e-17,-8.92363e-16,1.287124e-15,-1.398513e-16,1.530147e-15,7.490661000000001e-17,
std,0.488349,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
min,0.0,-5.586959,-3.357328,-3.668699,-3.664309,-3.167015,-3.342226,-3.205874,-5.386418,-3.366687,...,-3.314051,-7.476188,-2.934634,-3.702169,-4.680744,-3.43618,-2.708423,-3.461576,-6.07803,
25%,0.0,,,,,,,,,,...,,,,,,,,,,
50%,1.0,,,,,,,,,,...,,,,,,,,,,
75%,1.0,,,,,,,,,,...,,,,,,,,,,
max,1.0,3.456473,3.543261,3.439414,3.162185,4.306704,3.892638,4.87935,3.883785,4.851884,...,2.992997,2.683138,5.049002,2.938086,4.22379,3.831687,3.856223,3.552169,2.513847,


In [202]:
test.describe()



Unnamed: 0,Sex,GOL,NOL,BNL,BBH,XCB,XFB,WFB,ZYB,AUB,...,MAN,BABR,BANA,BAPR,UFHT,UFBR,ORBR,BIOB,INTB,MOW
count,1057.0,1038.0,880.0,1041.0,1036.0,1032.0,879.0,523.0,1017.0,1019.0,...,114.0,124.0,121.0,113.0,417.0,336.0,91.0,117.0,81.0,0.0
mean,0.586566,-0.014641,-0.025483,-0.014968,0.005261,-0.031949,0.031172,0.039037,-0.042204,-0.062062,...,-0.112418,-0.070622,-0.041757,0.045017,0.074844,-0.011754,-0.009577,-0.054292,0.05589,
std,0.492682,0.995806,0.987172,0.997263,1.008955,0.991118,0.985553,1.005782,0.984958,0.961496,...,1.09733,0.848994,0.898191,1.008208,0.991944,1.013358,0.802805,0.980047,1.024651,
min,0.0,-2.87393,-2.873076,-2.975225,-2.84513,-2.873928,-3.034359,-3.003744,-2.796214,-2.926406,...,-4.822258,-2.396525,-2.75722,-2.312348,-3.264114,-3.43618,-2.708423,-1.877827,-6.07803,
25%,0.0,,,,,,,,,,...,,,,,,,,,,
50%,1.0,,,,,,,,,,...,,,,,,,,,,
75%,1.0,,,,,,,,,,...,,,,,,,,,,
max,1.0,3.34343,2.69582,3.439414,2.889125,4.306704,4.662304,3.464436,3.338479,3.531042,...,2.855887,1.836528,2.565204,2.783661,3.211911,3.41638,2.697756,3.325919,2.217575,


In [203]:
#This is fucked. 
#test = test.reindex_axis(sorted(test.count(), reverse=True), axis=1)
#test.reindex_axis(sorted(test.count().sort_values(ascending=False).index), axis=1)
sorted(test.count(),  #.sort_values(ascending=False))

#test = test.reindex_axis(sorted(test.count, axis=1)


Sex     1057
GOL     1038
NOL      880
BNL     1041
BBH     1036
XCB     1032
XFB      879
WFB      523
ZYB     1017
AUB     1019
ASB      876
BPL      961
NPH      929
NLH     1019
JUB      851
NLB     1025
MAB      932
MAL      439
MDH      991
MDB      147
OBH     1027
OBB     1024
DKB      985
NDS      818
WNB      831
SIS      816
ZMB      883
SSS      844
FMB      855
NAS      852
        ... 
CBA       98
BFA       98
BPA       98
FXA       98
RFA      683
RPA      683
ROA      683
BSA      661
SBA      682
SLA      682
TBA      682
GNI      216
HMF      141
TMF      145
GOG      219
CDL      206
WRB      169
XRB       57
XRH       99
MLN      112
MAN      114
BABR     124
BANA     121
BAPR     113
UFHT     417
UFBR     336
ORBR      91
BIOB     117
INTB      81
MOW        0
dtype: int64

In [204]:
# poly = PolynomialFeatures(2)
# test = poly.fit_transform(test)
# test = pd.DataFrame(test)
# train= poly.fit_transform(train)
# train= pd.DataFrame()
#test.count().sort_values(ascending=False).keys()


In [205]:
#Consider imputing with knn
#Consider removing outliers
#Mean imputation
test = test.fillna(0)
train = train.fillna(0)

In [206]:
#Write to CSV
train.to_csv("data/train_baseline.csv",index=False)
test.to_csv("data/test_baseline.csv", index=False) 