In [55]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
# Load the cranial database.
df = pd.read_csv('../../datasources/fdb/cranial.csv')

# Shuffle the rows. The fixed seed makes the shuffled order repeatable;
# without it every run of this cell produces a different row order.
SHUFFLE_SEED = 0
df = df.sample(frac=1, random_state=SHUFFLE_SEED)

# Drop categorical / bookkeeping columns. We'll focus on measurement data.
NON_MEASUREMENT_COLS = [
    'DB',
    'Item',
    'ID',
    'ContNum',
    'FDN',
    'Pop',
    'PopSex',
    'Ethnicity',  # Probably really important.
    'BirthYear',
    'Age',  # Probably really important.
    'Comments',
]
df = df.drop(NON_MEASUREMENT_COLS, axis=1)

rows, cols = df.shape
print("Starting with", rows, "individuals and", cols, "features.")


Starting with 5342 individuals and 111 features.


In [56]:
# Look at the target column, Sex: how often each value occurs,
# with missing entries counted as well.
print("Unique entries.")
sex_column = df['Sex']
sex_column.value_counts(dropna=False)


Unique entries.


M      3187
F      2095
N        58
NaN       2
Name: Sex, dtype: int64

In [57]:
# Keep only rows whose Sex is exactly 'M' or 'F'. This removes both the 'N'
# entries and the missing ones in a single step, using an exact match rather
# than a substring test on the label.
SEX_CODES = {'M': 1.0, 'F': 0.0}
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the previous frame (which can raise
# SettingWithCopyWarning).
df = df[df['Sex'].isin(list(SEX_CODES))].copy()

# Move from object to float: M -> 1.0, F -> 0.0.
df['Sex'] = df['Sex'].map(SEX_CODES)
df.describe()


Unnamed: 0,Sex,GOL,NOL,BNL,BBH,XCB,XFB,WFB,ZYB,AUB,...,MAN,BABR,BANA,BAPR,UFHT,UFBR,ORBR,BIOB,INTB,MOW
count,5282.0,5200.0,4396.0,5177.0,5182.0,5196.0,4395.0,2622.0,5096.0,5106.0,...,518.0,636.0,607.0,574.0,2115.0,1605.0,453.0,604.0,413.0,0.0
mean,0.60337,180.3975,177.689945,100.143906,133.846584,137.568129,114.752673,94.898932,129.449372,120.85566,...,124.990347,136.886792,101.494234,97.031359,70.201891,103.535826,41.00883,96.253311,20.552058,
std,0.489244,8.838111,8.23857,5.764435,7.336864,6.811824,6.477442,4.952665,7.313785,6.763771,...,7.455211,6.889893,5.523686,6.481473,4.934519,4.827782,2.493338,4.400408,3.388348,
min,0.0,131.0,150.0,79.0,107.0,116.0,93.0,79.0,90.0,98.0,...,90.0,84.0,85.0,73.0,47.0,87.0,34.0,81.0,0.0,
25%,0.0,174.0,172.0,96.0,129.0,133.0,110.0,92.0,124.0,116.0,...,120.25,133.0,98.0,93.0,67.0,100.0,39.0,93.0,19.0,
50%,1.0,180.0,178.0,100.0,134.0,137.0,115.0,95.0,129.0,121.0,...,125.0,137.0,101.0,97.0,70.0,104.0,41.0,96.0,21.0,
75%,1.0,187.0,183.0,104.0,139.0,142.0,119.0,98.0,134.0,125.0,...,130.0,142.0,105.0,101.0,74.0,107.0,42.0,99.0,22.0,
max,1.0,211.0,207.0,120.0,157.0,167.0,145.0,119.0,158.0,154.0,...,147.0,156.0,130.0,116.0,91.0,122.0,51.0,112.0,29.0,


In [58]:
# Separate the label column from the feature columns.
y = df[['Sex']]  # double brackets: y stays a one-column DataFrame
X = df.drop('Sex', axis=1)

In [59]:
# Hold out 20% of the rows as a test set.
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20, so it is only a fallback for
# installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# A fixed seed makes the split repeatable for a given row order of X and y.
SPLIT_SEED = 0
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED)

In [60]:
# Total number of missing cells across all training features.
pd.isnull(X_train).sum().sum()

165881

In [61]:
# (rows, columns) of each piece of the split, in the order
# X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4225, 110), (1057, 110), (4225, 1), (1057, 1))

In [None]:
# Standardize every feature column using statistics computed on the training
# split only (pandas skips NaN when computing mean/std).
X_train_mean = X_train.mean()
X_train_std = X_train.std()
X_train = (X_train - X_train_mean) / X_train_std

# Apply the *training* mean/std to the test split as well, so both splits are
# expressed in the same units before any later step (e.g. distance-based
# imputation) and the test data never contributes to the transform itself.
X_test = (X_test - X_train_mean) / X_train_std

X_train.describe()

Unnamed: 0,GOL,NOL,BNL,BBH,XCB,XFB,WFB,ZYB,AUB,ASB,...,MAN,BABR,BANA,BAPR,UFHT,UFBR,ORBR,BIOB,INTB,MOW
count,4161.0,3524.0,4141.0,4146.0,4157.0,3526.0,2097.0,4075.0,4084.0,3526.0,...,413.0,500.0,477.0,451.0,1698.0,1283.0,354.0,472.0,314.0,0.0
mean,-1.06273e-15,1.545113e-15,-7.164839e-16,-1.875356e-15,-1.073954e-15,3.839062e-16,-8.236801e-16,-1.861292e-16,5.532222e-16,-5.869131e-16,...,-5.510792e-16,1.588507e-15,-1.060647e-15,-9.694143e-16,4.206817e-16,-1.237427e-15,-7.479892e-16,5.491135e-16,5.798617e-17,
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
min,-5.575828,-3.351344,-3.663912,-3.632284,-3.173401,-3.230745,-3.172069,-3.369782,-3.397817,-3.355843,...,-4.677018,-7.474356,-2.909331,-3.649186,-4.669619,-3.38486,-2.795292,-3.425997,-6.071619,
25%,-0.7265643,-0.6945041,-0.7146757,-0.6409527,-0.6690618,-0.7379839,-0.7629317,-0.7543729,-0.7150361,-0.7254767,...,-0.6517156,-0.524765,-0.7860351,-0.6132463,-0.6466793,-0.7301021,-0.7881249,-0.7364826,-0.499664,
50%,-0.04992284,0.03008855,-0.0207377,0.03889531,-0.07980547,0.04100402,0.04011399,-0.06610728,0.03018092,-0.06788522,...,0.0191681,0.04254851,-0.07826965,-0.006058416,-0.04323831,0.08674639,0.0147419,-0.0641039,0.08685749,
75%,0.7394922,0.7546812,0.6732003,0.7187433,0.656765,0.6641944,0.6423983,0.6221583,0.6263545,0.5897063,...,0.6900518,0.7516904,0.6294957,0.6011295,0.7110629,0.6993828,0.4161753,0.6082748,0.3801183,
max,3.446058,3.532286,3.448952,3.030226,4.339617,4.714932,4.858388,3.925833,4.203396,3.548868,...,2.971056,2.737288,5.053029,2.878084,4.180849,3.762565,4.029076,3.521916,2.432944,


In [None]:
# Fill missing values with fancyimpute's KNN (k=3). The algorithm itself has
# not been looked at closely here. It should probably run after
# standardization because of scaling concerns.
from fancyimpute import KNN


def _knn_impute(frame):
    """Return a new DataFrame holding `frame` completed by KNN(k=3)."""
    completed = KNN(k=3).complete(frame)
    return pd.DataFrame(completed)


X_train_filled = _knn_impute(X_train)
X_test_filled = _knn_impute(X_test)


Computing pairwise distances between 4225 samples
Computing distances for sample #1/4225, elapsed time: 1.917
No samples have sufficient overlap with sample 58
Computing distances for sample #101/4225, elapsed time: 2.235
Computing distances for sample #201/4225, elapsed time: 2.567
Computing distances for sample #301/4225, elapsed time: 2.968
Computing distances for sample #401/4225, elapsed time: 3.343
Computing distances for sample #501/4225, elapsed time: 3.607
Computing distances for sample #601/4225, elapsed time: 3.844
Computing distances for sample #701/4225, elapsed time: 4.136
Computing distances for sample #801/4225, elapsed time: 4.424
Computing distances for sample #901/4225, elapsed time: 4.722
Computing distances for sample #1001/4225, elapsed time: 5.012
Computing distances for sample #1101/4225, elapsed time: 5.227
Computing distances for sample #1201/4225, elapsed time: 5.440
Computing distances for sample #1301/4225, elapsed time: 5.649
Computing distances for sample

In [None]:
# Number of entries equal to zero in each imputed split: (train, test).
tuple(
    (filled == 0).astype(int).sum().sum()
    for filled in (X_train_filled, X_test_filled)
)

In [None]:
# Replace entries equal to zero with the column mean, separately per split.
# Imputer was removed from sklearn.preprocessing in scikit-learn 0.22; its
# replacement, sklearn.impute.SimpleImputer, always works column-wise and so
# takes no `axis` argument. The original class is preferred when it exists so
# older installations behave exactly as before.
try:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values=0, strategy='mean', axis=0)
except ImportError:
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=0, strategy='mean')

X_train_filled = pd.DataFrame(imp.fit_transform(X_train_filled))
# NOTE(review): the imputer is refit here, so zeros in the test split are
# replaced with test-split means rather than training means -- confirm that
# this is intended.
X_test_filled = pd.DataFrame(imp.fit_transform(X_test_filled))

In [None]:
# Re-count zero entries per split (train, test); none are expected now.
tuple(
    (filled == 0).astype(int).sum().sum()
    for filled in (X_train_filled, X_test_filled)
)

In [None]:
# Standardize the imputed splits; each split is scaled on its own with
# sklearn's preprocessing.scale and wrapped back into a DataFrame.
from sklearn import preprocessing

X_train_preprocessed, X_test_preprocessed = (
    pd.DataFrame(preprocessing.scale(filled))
    for filled in (X_train_filled, X_test_filled)
)

In [None]:
# Train a support vector classifier with default settings on the
# preprocessed training features.
from sklearn import svm

# Flatten the one-column label frame into the 1-D array that fit() receives.
train_labels = np.ravel(y_train)

clf = svm.SVC()
clf.fit(X_train_preprocessed, train_labels)

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20; its functions
# now live in sklearn.model_selection. The `cross_validation` name is kept
# bound either way in case other cells refer to it.
try:
    from sklearn import cross_validation
except ImportError:
    from sklearn import model_selection as cross_validation

# Score the fitted classifier on the held-out test split.
clf.score(X_test_preprocessed, np.ravel(y_test))


In [None]:
from sklearn.metrics import confusion_matrix

# Predicted labels for the held-out test split.
p = clf.predict(X_test_preprocessed)

# Sex was encoded as F -> 0.0 and M -> 1.0. Passing the label order explicitly
# keeps the matrix 2x2 and the tick labels correct even if one class happens
# to be absent from the test split or the predictions.
cm = confusion_matrix(np.ravel(y_test), p, labels=[0.0, 1.0])

# fmt='d' prints the raw integer counts; the default annotation format
# ('.2g') would render a count such as 423 as 4.2e+02.
ax = sns.heatmap(cm, annot=True, fmt='d',
                 xticklabels=['F', 'M'], yticklabels=['F', 'M'])
ax.set(xlabel='Predicted sex', ylabel='True sex',
       title='SVM confusion matrix (test split)');