In [149]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt
from math import floor
from sklearn.preprocessing import PolynomialFeatures


#Import cranial database 
df = pd.read_csv('../../datasources/fdb/cranial.csv')

#Drop categorical data. We'll focus on measurement data.
del df['DB']
del df['Item']
del df['ID']
del df['ContNum']
del df['FDN']
del df['Pop']
del df['PopSex']
del df['Ethnicity'] #Probably really important.
del df['BirthYear']
del df['Age'] #Probably really important.
del df['Comments']

rows, cols = df.shape 
print("Starting with", rows, "individuals and", cols, "features.")

Starting with 5342 individuals and 111 features.


In [150]:
#Consider our target col, Sex.
print("Unique entries.")
df['Sex'].value_counts(dropna=False)

Unique entries.


M      3187
F      2095
N        58
NaN       2
Name: Sex, dtype: int64

In [151]:
#Clean N and Nan entries
df = df[~(df.Sex.str.contains("N") == True)]
df = df.dropna(subset = ['Sex'])

df['Sex'].value_counts(dropna=False)

M    3187
F    2095
Name: Sex, dtype: int64

In [152]:
#Move from object to float
df['Sex'] = df['Sex'].map({'M':1, 'F':0})

In [153]:
#Shuffle rows of df
df = df.iloc[np.random.permutation(len(df))]

#Split into train and test sets
p=0.8
split_pt = floor(len(df)*p)
train, test = df.iloc[0:split_pt], df[split_pt:]

In [154]:
#Use train mean and std on test data.
tr_mean = train.mean()
tr_std  = train.std()
train = ( train - tr_mean ) / tr_std
test  = ( test - tr_mean ) / tr_std

#Clean this up.
train['Sex'] = train['Sex'].apply(np.sign)
test['Sex']  = test['Sex'].apply(np.sign)
train['Sex'] = ( train['Sex'] + 1 ) / 2
test['Sex'] = ( test['Sex'] + 1 ) / 2 

In [155]:
train.describe()



Unnamed: 0,Sex,GOL,NOL,BNL,BBH,XCB,XFB,WFB,ZYB,AUB,...,MAN,BABR,BANA,BAPR,UFHT,UFBR,ORBR,BIOB,INTB,MOW
count,4225.0,4167.0,3518.0,4145.0,4152.0,4163.0,3516.0,2108.0,4091.0,4098.0,...,427.0,523.0,496.0,472.0,1710.0,1310.0,369.0,494.0,342.0,0.0
mean,0.60426,6.914449e-16,1.154278e-15,7.885396e-17,-1.668543e-15,1.117104e-15,1.057933e-15,3.286429e-16,1.657817e-15,-1.047262e-15,...,-8.278572e-16,1.518225e-15,-1.683241e-16,2.276898e-16,2.389252e-17,-7.024068e-16,3.441992e-16,3.739699e-16,1.194626e-16,
std,0.489067,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
min,0.0,-5.572678,-3.353755,-3.666009,-3.546959,-3.16314,-3.217278,-3.220198,-5.419327,-3.388567,...,-4.647896,-7.54896,-2.765375,-2.971446,-4.731309,-3.449389,-2.453465,-3.478257,-5.755833,
25%,0.0,,,,,,,,,,...,,,,,,,,,,
50%,1.0,,,,,,,,,,...,,,,,,,,,,
75%,1.0,,,,,,,,,,...,,,,,,,,,,
max,1.0,3.450271,3.549429,3.440007,3.156572,4.295415,4.632471,4.860965,3.921911,4.906919,...,2.963808,2.459276,5.038031,2.958222,4.204921,3.835143,3.655365,3.629566,2.392376,


In [156]:
test.describe()



Unnamed: 0,Sex,GOL,NOL,BNL,BBH,XCB,XFB,WFB,ZYB,AUB,...,MAN,BABR,BANA,BAPR,UFHT,UFBR,ORBR,BIOB,INTB,MOW
count,1057.0,1033.0,878.0,1032.0,1030.0,1033.0,879.0,514.0,1005.0,1008.0,...,91.0,113.0,111.0,102.0,405.0,295.0,84.0,110.0,71.0,0.0
mean,0.599811,-0.006488,-0.00134,-0.007029,-0.055199,-0.044685,-0.115481,-0.041581,-0.000616,-0.014558,...,0.140419,0.070458,-0.095536,-0.009672,-0.099714,-0.042415,-0.034174,0.104836,0.108978,
std,0.490168,0.984365,0.989304,0.995826,1.017933,0.980415,0.979968,1.003249,1.024129,1.010215,...,0.971477,0.914824,0.88282,1.067636,1.0086,1.026874,1.086456,1.048582,0.673212,
min,0.0,-2.865793,-2.627104,-2.799422,-3.683766,-2.578155,-3.371195,-2.81614,-3.35876,-3.388567,...,-2.911893,-2.544842,-2.942725,-3.751666,-3.106539,-2.408741,-2.86072,-1.873265,-1.541242,
25%,0.0,,,,,,,,,,...,,,,,,,,,,
50%,1.0,,,,,,,,,,...,,,,,,,,,,
75%,1.0,,,,,,,,,,...,,,,,,,,,,
max,1.0,2.886337,2.822778,2.746737,2.882959,4.295415,2.939388,3.446762,3.097684,3.72185,...,2.563192,2.745226,2.377779,2.958222,2.580152,3.418884,4.06262,3.629566,1.549458,


In [157]:
#This is fucked. 
#test = test.reindex_axis(sorted(test.count(), reverse=True), axis=1)
test


Unnamed: 0,1057,1033,1033.1,1032,1030,1018,1017,1017.1,1013,1009,...,110,110.1,102,93,91,84,84.1,71,58,0
5068,,,,,,,,,,,...,,,,,,,,,,
1157,,,,,,,,,,,...,,,,,,,,,,
2277,,,,,,,,,,,...,,,,,,,,,,
2580,,,,,,,,,,,...,,,,,,,,,,
3517,,,,,,,,,,,...,,,,,,,,,,
1258,,,,,,,,,,,...,,,,,,,,,,
3756,,,,,,,,,,,...,,,,,,,,,,
3249,,,,,,,,,,,...,,,,,,,,,,
1389,,,,,,,,,,,...,,,,,,,,,,
3495,,,,,,,,,,,...,,,,,,,,,,


In [158]:
# poly = PolynomialFeatures(2)
# test = poly.fit_transform(test)
# test = pd.DataFrame(test)
# train= poly.fit_transform(train)
# train= pd.DataFrame()



In [159]:
#Consider imputing with knn
#Consider removing outliers
#Mean imputation
test = test.fillna(0)
train = train.fillna(0)

In [160]:
#Write to CSV
train.to_csv("data/train_baseline.csv",index=False)
test.to_csv("data/test_baseline.csv", index=False) 