# Submission File Generator for multiple Random Forest classification and prediction using the probabilities from each classification

In [1]:
# load a bunch of random py libraries
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Set individual file names
mainFolder = '~/PythonProjects/BBDCActivityClassification/bbdc_2019_Bewegungsdaten/'
trainFile = 'train.csv'
testFile = 'challenge.csv'
new_complete_set = 'new_feature_df.csv'
complete_set = 'final_full_feature_set.csv'
interm_complete_set = 'features_dataset_with_test_set_iqr_corr_skew.csv'
original_complete_set = 'features_dataset_with_test_set.csv'
entropy_df = 'entropy_df'

### Pre Processing

In [170]:
#initialize unlabelled df
complete_df = pd.read_csv(new_complete_set)
#complete_df = complete_df[complete_df['Label'] != 'lay']
#complete_df.drop(columns = ['Subject', 'Datafile', 'Label'], inplace=True)

#drop duplicate rows
complete_df.drop_duplicates(subset ='id', inplace = True) 
#show column names
print(list(complete_df.columns))

['id', 'n_rows', 'emg1_min', 'emg2_min', 'emg3_min', 'emg4_min', 'airborne_min', 'acc_u_x_min', 'acc_u_y_min', 'acc_u_z_min', 'gonio_x_min', 'acc_l_x_min', 'acc_l_y_min', 'acc_l_z_min', 'gonio_y_min', 'gyro_u_x_min', 'gyro_u_y_min', 'gyro_u_z_min', 'gyro_l_x_min', 'gyro_l_y_min', 'gyro_l_z_min', 'emg1_max', 'emg2_max', 'emg3_max', 'emg4_max', 'airborne_max', 'acc_u_x_max', 'acc_u_y_max', 'acc_u_z_max', 'gonio_x_max', 'acc_l_x_max', 'acc_l_y_max', 'acc_l_z_max', 'gonio_y_max', 'gyro_u_x_max', 'gyro_u_y_max', 'gyro_u_z_max', 'gyro_l_x_max', 'gyro_l_y_max', 'gyro_l_z_max', 'emg1_std', 'emg2_std', 'emg3_std', 'emg4_std', 'airborne_std', 'acc_u_x_std', 'acc_u_y_std', 'acc_u_z_std', 'gonio_x_std', 'acc_l_x_std', 'acc_l_y_std', 'acc_l_z_std', 'gonio_y_std', 'gyro_u_x_std', 'gyro_u_y_std', 'gyro_u_z_std', 'gyro_l_x_std', 'gyro_l_y_std', 'gyro_l_z_std', 'emg1_count_distinct', 'emg2_count_distinct', 'emg3_count_distinct', 'emg4_count_distinct', 'airborne_count_distinct', 'acc_u_x_count_distinct'

In [171]:
complete_df.shape[0]

8139

In [172]:
complete_df.replace(np.NaN, 0.0, inplace=True)
complete_df.replace(np.inf, 4.0, inplace=True)

#show whether I missed any
complete_df[complete_df.isin([np.nan, np.inf, -np.inf]).any(1)]

Unnamed: 0,id,n_rows,emg1_min,emg2_min,emg3_min,emg4_min,airborne_min,acc_u_x_min,acc_u_y_min,acc_u_z_min,...,cor_gyro_u_y_gyro_u_z,cor_gyro_u_y_gyro_l_x,cor_gyro_u_y_gyro_l_y,cor_gyro_u_y_gyro_l_z,cor_gyro_u_z_gyro_l_x,cor_gyro_u_z_gyro_l_y,cor_gyro_u_z_gyro_l_z,cor_gyro_l_x_gyro_l_y,cor_gyro_l_x_gyro_l_z,cor_gyro_l_y_gyro_l_z


### Train-Test Split

In [174]:
#choose whether to use PCA dataset or original dataset
#traintest_df = final_principal_df
traintest_df = complete_df

# initialize label dataframe
train_labels = pd.read_csv(mainFolder + trainFile)

# merge with training labels to create train_df
train_df = pd.merge(traintest_df, train_labels, left_on='id', right_on = 'Datafile', how = 'inner')
train_df = train_df[train_df['Label'] != 'lay']
train_df.drop(columns = ['Subject', 'Datafile', 'id'], inplace = True)

#initialize dataframe with test observations
test_subjects = pd.read_csv(mainFolder + testFile)

#create test_df by merging with testFile
test_df = pd.merge(test_subjects, traintest_df, left_on='Datafile', right_on= 'id', how='inner')
test_df.drop(columns = ['Label', 'id'], inplace = True)

print(test_df.shape)
print(train_df.shape)

(1738, 266)
(6385, 265)


In [175]:
#check whether lay snuck through
train_df.Label.unique()

array(['jump-two-leg', 'v-cut-right-Lfirst', 'walk', 'sit-to-stand',
       'lateral-shuffle-left', 'curve-right-spin-Rfirst',
       'curve-right-step', 'stand-to-sit', 'stair-down',
       'lateral-shuffle-right', 'jump-one-leg', 'curve-left-step',
       'v-cut-right-Rfirst', 'curve-left-spin-Lfirst', 'run',
       'v-cut-left-Lfirst', 'curve-right-spin-Lfirst',
       'v-cut-left-Rfirst', 'curve-left-spin-Rfirst', 'sit', 'stand',
       'stair-up'], dtype=object)

### Factorize Label

In [176]:
#create new modelling df
model_df = train_df

#factorize label
factor = pd.factorize(train_df['Label'])
model_df.Label = factor[0]

print(factor[0])

#store original labels
definitions = factor[1]
print(factor[1])

#print(model_df.Label.head())
#print(definitions)

[ 0  1  2 ... 15  0  1]
Index(['jump-two-leg', 'v-cut-right-Lfirst', 'walk', 'sit-to-stand',
       'lateral-shuffle-left', 'curve-right-spin-Rfirst', 'curve-right-step',
       'stand-to-sit', 'stair-down', 'lateral-shuffle-right', 'jump-one-leg',
       'curve-left-step', 'v-cut-right-Rfirst', 'curve-left-spin-Lfirst',
       'run', 'v-cut-left-Lfirst', 'curve-right-spin-Lfirst',
       'v-cut-left-Rfirst', 'curve-left-spin-Rfirst', 'sit', 'stand',
       'stair-up'],
      dtype='object')


In [160]:
factor[1]

Index(['curve-left-step', 'stand', 'curve-right-spin-Lfirst', 'jump-one-leg',
       'sit-to-stand', 'walk', 'run', 'curve-left-spin-Lfirst', 'stair-down',
       'stair-up', 'v-cut-left-Rfirst', 'v-cut-right-Lfirst',
       'lateral-shuffle-left', 'curve-left-spin-Rfirst', 'sit',
       'v-cut-right-Rfirst', 'curve-right-spin-Rfirst', 'stand-to-sit',
       'v-cut-left-Lfirst', 'jump-two-leg', 'lateral-shuffle-right',
       'curve-right-step'],
      dtype='object')

### Modelling

In [41]:
# insert random packages here
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [83]:
# initializing X and Y array
X_train = model_df.iloc[:,0:-1].values
y_train = model_df.iloc[:,-1].values

#specify classifier
classifier = RandomForestClassifier(n_estimators = 110, random_state = 7, oob_score = True)

# get the cross-validation accuracy score
#cross_val_score(classifier, X_train, y_train, cv = 5)

In [116]:
#fit model
classifier.fit(X_train, y_train)

#print(classifier.feature_importances_)

#get model score based on full training data
classifier.score(X_train, y_train)

1.0

In [37]:
# try grid search for number of trees
rf = RandomForestClassifier(n_estimators = 100, random_state = 7)

parameters = {'random_state': list(range(100))}

clf = GridSearchCV(rf, param_grid = parameters, cv = 5)
#clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=7, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'random_state': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]},
       pre_dispatch='2*n_jobs', refit=True,

In [14]:
#score_dict = clf.cv_results_

#dict_df = pd.DataFrame(score_dict)
#dict_df.to_csv('gridsearch_dict_df.csv', index=False)
dict_df = pd.read_csv('gridsearch_dict_df.csv')
dict_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,5.604284,0.080083,0.029972,0.001594,0,{'random_state': 0},0.909446,0.792969,0.757433,0.887147,...,0.834456,0.056856,30,1.0,1.000000,1.0,1.0,1.0,1.000000,0.000000
1,5.856136,0.098161,0.032174,0.003252,1,{'random_state': 1},0.902420,0.783594,0.742567,0.887931,...,0.834769,0.061743,25,1.0,1.000000,1.0,1.0,1.0,1.000000,0.000000
2,5.719183,0.200403,0.034827,0.008075,2,{'random_state': 2},0.900078,0.774219,0.776213,0.879310,...,0.832420,0.051664,46,1.0,1.000000,1.0,1.0,1.0,1.000000,0.000000
3,5.764650,0.190640,0.033734,0.004653,3,{'random_state': 3},0.895394,0.788281,0.773083,0.876959,...,0.839937,0.049569,6,1.0,1.000000,1.0,1.0,1.0,1.000000,0.000000
4,5.609518,0.113643,0.033921,0.004567,4,{'random_state': 4},0.888368,0.767188,0.749609,0.877743,...,0.823962,0.056541,100,1.0,1.000000,1.0,1.0,1.0,1.000000,0.000000
5,5.736556,0.067221,0.030907,0.000883,5,{'random_state': 5},0.892272,0.784375,0.770736,0.865204,...,0.834456,0.047992,30,1.0,0.999804,1.0,1.0,1.0,0.999961,0.000078
6,5.706496,0.082117,0.032371,0.002753,6,{'random_state': 6},0.901639,0.781250,0.758216,0.884013,...,0.834612,0.056248,28,1.0,1.000000,1.0,1.0,1.0,1.000000,0.000000
7,5.760128,0.078182,0.031145,0.001238,7,{'random_state': 7},0.906323,0.807813,0.769171,0.884013,...,0.842287,0.049795,2,1.0,1.000000,1.0,1.0,1.0,1.000000,0.000000
8,5.663766,0.059566,0.031399,0.001956,8,{'random_state': 8},0.887588,0.792188,0.761346,0.874608,...,0.834612,0.049214,28,1.0,1.000000,1.0,1.0,1.0,1.000000,0.000000
9,5.841155,0.170224,0.033829,0.004918,9,{'random_state': 9},0.887588,0.793750,0.738654,0.869906,...,0.827408,0.054494,88,1.0,1.000000,1.0,1.0,1.0,1.000000,0.000000


In [15]:
np.argsort(list(dict_df['rank_test_score']))[:20]

array([33,  7, 39, 69, 59,  3, 57, 87, 93, 99, 12, 88, 54, 58, 45, 56, 82,
       66, 74, 64])

### Looping Random Forest to get cum prob

In [84]:
#specify hyperparameters to stay the same during model testing/validation phase and when generating actual predictions

#the list of random states which the classifier uses to generate the Random Forests
#generic:
random_state_range = range(50, 70)
#take top 20(or whatever the last number is) random states from gridsearch
#random_state_range = np.argsort(list(dict_df['rank_test_score']))[:20]

# the number of trees in each Random Forest
n_trees = 100

In [85]:
#train-test split the train set
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(X_train, y_train, 
                                                                            test_size = 0.6, random_state = 2)
#initialize probability array
prob = np.empty((len(X_train_test), 22))

#loop to run Random Forest in different random states and add the probabilities obtained from each to prob
for random_state in random_state_range:
    classifier = RandomForestClassifier(n_estimators = n_trees, random_state = random_state)
    classifier.fit(X_train_train, y_train_train)
    prob += classifier.predict_proba(X_train_test)

#argmax function to choose the label with highest cumulative probability
cum_pred = np.argmax(prob, axis = 1)

#find predicition accuracy by comparing with test set labels
print('Prediction Accuracy=%f' % (sum(1 for i in range(len(cum_pred)) 
                             if cum_pred[i] == y_train_test[i]) / float(len(cum_pred))))

Prediction Accuracy=0.978074


In [27]:
#feature importance for last looped classifier:
feature_importances = pd.DataFrame(classifier.feature_importances_,
                                   index = complete_df.columns[1:],
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances.head(10)

Unnamed: 0,importance
acc_u_y_std,0.018295
cor_gonio_y_gyro_l_z,0.015382
gonio_y_std,0.014474
gonio_y_iqr,0.013924
cor_acc_u_y_gonio_y,0.013527
acc_u_y_iqr,0.012962
cor_gonio_y_gyro_u_z,0.012712
acc_l_z_std,0.012104
acc_u_z_std,0.012042
acc_l_y_count_distinct,0.011981


### Trying SVM

In [11]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

In [14]:
# initialize and score SVC
svc = LinearSVC(penalty = 'l2', C = 0.01, dual = False, multi_class = 'crammer_singer', random_state = 69, max_iter = 1000)

# get cross val score
cross_val_score(svc, X_train, y_train, cv = 5)



array([0.53005464, 0.353125  , 0.47809077, 0.39811912, 0.20787402])

In [44]:
from sklearn.feature_selection import SelectFromModel

# get new X by eliminating variables
lsvc = svc.fit(X_train, y_train)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X_train)
print(X_new.shape)

# get cross val score with X_new
cross_val_score(svc, X_new, y_train, cv = 5)

(6385, 83)


array([0.79000781, 0.57265625, 0.771518  , 0.67868339, 0.54566929])

### Predictions

In [151]:
#specify hyperparameters to stay the same during model testing/validation phase and when generating actual predictions

#the list of random states which the classifier uses to generate the Random Forests
#generic:
random_state_range = range(20, 40)
#take top 20(or whatever the last number is) random states from gridsearch
#random_state_range = np.argsort(list(dict_df['rank_test_score']))[:20]

# the number of trees in each Random Forest
n_trees = 100

In [152]:
test_df.head()

Unnamed: 0,Subject,Datafile,iqr_emg1,iqr_emg2,iqr_emg3,iqr_emg4,iqr_airborne,iqr_acc_upper_x,iqr_acc_upper_y,iqr_acc_upper_z,...,skew_acc_lower_x,skew_acc_lower_y,skew_acc_lower_z,skew_goniometer_y,skew_gyro_upper_x,skew_gyro_upper_y,skew_gyro_upper_z,skew_gyro_lower_x,skew_gyro_lower_y,skew_gyro_lower_z
0,Subject01,Subject01/Subject01_Aufnahme000.csv,284.0,415.5,190.0,305.0,466.0,1205.0,1228.0,832.0,...,1.485057,-0.547205,0.091917,-1.016616,0.408385,-0.120254,-0.624012,1.06883,-0.716702,0.072518
1,Subject01,Subject01/Subject01_Aufnahme001.csv,162.0,359.0,243.0,460.0,812.0,1716.0,3888.0,1970.0,...,-0.00091,-0.611155,0.600841,-0.301062,1.406128,0.1982,-0.71748,2.03935,-0.760093,-0.509044
2,Subject01,Subject01/Subject01_Aufnahme002.csv,116.0,386.0,217.0,364.0,288.0,792.0,896.0,300.0,...,-0.985236,-1.514348,0.499568,-0.813914,-0.46299,0.167053,-0.904684,0.164944,-1.395436,-1.109873
3,Subject01,Subject01/Subject01_Aufnahme003.csv,80.0,175.5,183.5,197.5,316.5,557.0,2783.0,2695.0,...,0.339806,0.562804,-0.810815,0.588814,1.351184,-0.12444,-1.862908,-0.720181,0.445187,0.403319
4,Subject01,Subject01/Subject01_Aufnahme004.csv,310.0,529.0,331.0,528.5,1131.0,1213.0,2834.0,1534.0,...,0.669314,-0.365481,-0.438842,-0.73075,-0.839778,0.430474,-0.617708,0.716052,-0.770605,-0.815882


In [153]:
#initialize train array (go agane)
X_train = model_df.iloc[:,0:-1].values
y_train = model_df.iloc[:,-1].values

#initialize test array
X_test = test_df.iloc[:,2:].values

#same procedure as in testing phase above
pred = np.empty((len(X_test), 22))
for random_state in random_state_range:
    classifier = RandomForestClassifier(n_estimators = n_trees, random_state = random_state)
    classifier.fit(X_train, y_train)
    pred += classifier.predict_proba(X_test)

#generate predicted labels
cum_pred = np.argmax(pred, axis = 1)

In [154]:
# Convert the predictions into labels
reversefactor = dict(zip(range(len(definitions)), definitions))
#print(reversefactor)
pred_label = np.vectorize(reversefactor.get)(cum_pred)

#assign the labels to the test_subjects file
test_subjects['Label'] = pred_label
test_subjects.head()

Unnamed: 0,Subject,Datafile,Label
0,Subject01,Subject01/Subject01_Aufnahme000.csv,v-cut-left-Lfirst
1,Subject01,Subject01/Subject01_Aufnahme001.csv,v-cut-right-Lfirst
2,Subject01,Subject01/Subject01_Aufnahme002.csv,stand-to-sit
3,Subject01,Subject01/Subject01_Aufnahme003.csv,lateral-shuffle-left
4,Subject01,Subject01/Subject01_Aufnahme004.csv,jump-one-leg


In [122]:
pavg_rf_pred = pred.copy()
pavg_rf_pred = pavg_rf_pred / 20

In [155]:
#check similarity with old submissions as a sanity check
best_submission = pd.read_csv('submission_pavg_rf_mimisdataset.csv')

print('similarity=%f' % (sum(1 for i in range(len(pred_label)) 
                             if test_subjects.iloc[i,2] == best_submission.iloc[i,2]) / float(len(pred_label))))

similarity=0.657652


### Generate Submission File

In [46]:
#name submission file
submissionFileName = 'submission_pavg_rf_norm_mimisdataset'

#test_subjects.to_csv(submissionFileName + '.csv',index=False)

### Final Ensembling of Previous Submissions

In [177]:
pavg_rf_norm_pred_df = pd.DataFrame(pavg_rf_norm_pred, columns = factor[1])
pavg_rf_norm_pred_df.head()

Unnamed: 0,jump-two-leg,v-cut-right-Lfirst,walk,sit-to-stand,lateral-shuffle-left,curve-right-spin-Rfirst,curve-right-step,stand-to-sit,stair-down,lateral-shuffle-right,...,v-cut-right-Rfirst,curve-left-spin-Lfirst,run,v-cut-left-Lfirst,curve-right-spin-Lfirst,v-cut-left-Rfirst,curve-left-spin-Rfirst,sit,stand,stair-up
0,0.0080,0.0065,0.2165,0.0010,0.0240,0.0005,0.3505,0.0050,0.0800,0.0260,...,0.0015,0.0030,0.0260,0.0050,0.0035,0.0070,0.0010,0.0000,0.0005,0.0485
1,0.1175,0.2200,0.0000,0.0040,0.0155,0.0125,0.0005,0.0035,0.0035,0.0110,...,0.1370,0.0235,0.0040,0.1690,0.0250,0.1450,0.0560,0.0000,0.0000,0.0010
2,0.0060,0.0225,0.0015,0.0105,0.0025,0.0465,0.0000,0.0210,0.0015,0.0000,...,0.0065,0.3390,0.0000,0.0185,0.1885,0.0070,0.3195,0.0000,0.0000,0.0065
3,0.0155,0.0085,0.0010,0.1045,0.0015,0.0335,0.0005,0.6335,0.0020,0.0010,...,0.0170,0.0355,0.0005,0.0120,0.0500,0.0120,0.0580,0.0025,0.0040,0.0020
4,0.0430,0.0065,0.0770,0.0020,0.1085,0.0000,0.0780,0.0060,0.3050,0.1395,...,0.0035,0.0015,0.0810,0.0045,0.0015,0.0050,0.0005,0.0000,0.0000,0.0340
5,0.0630,0.0575,0.0005,0.0075,0.0150,0.0240,0.0005,0.0055,0.0025,0.0125,...,0.5370,0.0105,0.0005,0.0305,0.0350,0.1235,0.0285,0.0000,0.0000,0.0000
6,0.0115,0.0045,0.0255,0.0005,0.3680,0.0000,0.0165,0.0015,0.0275,0.4625,...,0.0005,0.0010,0.0350,0.0025,0.0005,0.0030,0.0010,0.0000,0.0000,0.0065
7,0.0010,0.0000,0.0000,0.0005,0.0015,0.0000,0.0000,0.0000,0.0390,0.0000,...,0.0000,0.0010,0.0000,0.0000,0.0010,0.0000,0.0000,0.2990,0.6565,0.0000
8,0.0100,0.0225,0.0025,0.0270,0.0035,0.1070,0.0005,0.0165,0.0035,0.0010,...,0.0295,0.1150,0.0005,0.0150,0.5040,0.0150,0.1115,0.0000,0.0005,0.0035
9,0.0045,0.0040,0.2155,0.0010,0.0105,0.0005,0.3610,0.0020,0.1020,0.0195,...,0.0015,0.0020,0.0215,0.0050,0.0020,0.0055,0.0010,0.0000,0.0000,0.0570


In [161]:
pavg_rf_pred_df = pd.DataFrame(pavg_rf_pred, columns = factor[1])
pavg_rf_pred_df.head()

Unnamed: 0,curve-left-step,stand,curve-right-spin-Lfirst,jump-one-leg,sit-to-stand,walk,run,curve-left-spin-Lfirst,stair-down,stair-up,...,lateral-shuffle-left,curve-left-spin-Rfirst,sit,v-cut-right-Rfirst,curve-right-spin-Rfirst,stand-to-sit,v-cut-left-Lfirst,jump-two-leg,lateral-shuffle-right,curve-right-step
0,0.254545,0.000000,0.014091,0.001818,0.000455,0.242273,0.014545,0.003636,0.057727,0.015000,...,0.015000,0.007727,0.000000,0.013182,0.003182,0.001818,0.010909,0.004091,0.014545,0.306818
1,0.000455,0.000000,0.039545,0.027273,0.004545,0.000909,0.003182,0.018636,0.005909,0.000455,...,0.006818,0.063636,0.000000,0.124091,0.011818,0.010000,0.160000,0.077273,0.005909,0.000455
2,0.001818,0.000000,0.158636,0.001364,0.014545,0.000455,0.000000,0.386364,0.000455,0.004091,...,0.002727,0.291818,0.000909,0.005909,0.040455,0.032727,0.021364,0.002727,0.000000,0.001364
3,0.000000,0.019091,0.022273,0.004545,0.087727,0.000909,0.000455,0.020000,0.001364,0.000455,...,0.001364,0.029091,0.024091,0.012727,0.031364,0.702273,0.007727,0.016818,0.000000,0.000000
4,0.079091,0.000000,0.001364,0.014091,0.000909,0.094091,0.081364,0.000909,0.300455,0.034091,...,0.123636,0.000000,0.000000,0.004091,0.000000,0.005000,0.007727,0.020909,0.129545,0.084545
5,0.000455,0.000000,0.057273,0.030455,0.014545,0.000000,0.000909,0.029091,0.003636,0.000455,...,0.005000,0.039545,0.000000,0.374091,0.027273,0.022273,0.034545,0.049545,0.007727,0.000909
6,0.022727,0.000000,0.001364,0.008636,0.000000,0.020909,0.024091,0.002273,0.022727,0.003182,...,0.249545,0.000000,0.000000,0.001818,0.000455,0.000000,0.001818,0.003182,0.609091,0.022727
7,0.000455,0.536364,0.001364,0.000455,0.002273,0.000000,0.000000,0.000909,0.021364,0.000000,...,0.030000,0.000455,0.403182,0.000000,0.000909,0.000455,0.001364,0.000455,0.000000,0.000000
8,0.000455,0.000455,0.627273,0.002727,0.025455,0.000455,0.000000,0.071818,0.001818,0.005909,...,0.001818,0.084545,0.000000,0.020000,0.105455,0.013182,0.009545,0.004091,0.000000,0.000455
9,0.250000,0.000000,0.001364,0.000455,0.000455,0.260909,0.014545,0.001818,0.082273,0.024091,...,0.005455,0.000455,0.000455,0.002273,0.001364,0.000909,0.002273,0.001818,0.011818,0.335000


In [178]:
final_avg = pavg_rf_norm_pred_df + pavg_rf_pred_df
final_avg.head()

Unnamed: 0,curve-left-spin-Lfirst,curve-left-spin-Rfirst,curve-left-step,curve-right-spin-Lfirst,curve-right-spin-Rfirst,curve-right-step,jump-one-leg,jump-two-leg,lateral-shuffle-left,lateral-shuffle-right,...,sit-to-stand,stair-down,stair-up,stand,stand-to-sit,v-cut-left-Lfirst,v-cut-left-Rfirst,v-cut-right-Lfirst,v-cut-right-Rfirst,walk
0,0.006636,0.008727,0.436545,0.017591,0.003682,0.657318,0.005818,0.012091,0.039000,0.040545,...,0.001455,0.137727,0.063500,0.000500,0.006818,0.015909,0.012909,0.019227,0.014682,0.458773
1,0.042136,0.119636,0.000455,0.064545,0.024318,0.000955,0.078773,0.194773,0.022318,0.016909,...,0.008545,0.009409,0.001455,0.000000,0.013500,0.329000,0.258182,0.545909,0.261091,0.000909
2,0.725364,0.611318,0.003818,0.347136,0.086955,0.001364,0.001864,0.008727,0.005227,0.000000,...,0.025045,0.001955,0.010591,0.000000,0.053727,0.039864,0.017000,0.044773,0.012409,0.001955
3,0.055500,0.087091,0.000500,0.072273,0.064864,0.000500,0.009045,0.032318,0.002864,0.001000,...,0.192227,0.003364,0.002455,0.023091,1.335773,0.019727,0.023364,0.014864,0.029727,0.001909
4,0.002409,0.000500,0.172091,0.002864,0.000000,0.162545,0.024091,0.063909,0.232136,0.269045,...,0.002909,0.605455,0.068091,0.000000,0.011000,0.012227,0.009545,0.020136,0.007591,0.171091
5,0.039591,0.068045,0.000955,0.092273,0.051273,0.001409,0.075955,0.112545,0.020000,0.020227,...,0.022045,0.006136,0.000455,0.000000,0.027773,0.065045,0.303045,0.180227,0.911091,0.000500
6,0.003273,0.001000,0.045227,0.001864,0.000455,0.039227,0.018136,0.014682,0.617545,1.071591,...,0.000500,0.050227,0.009682,0.000000,0.001500,0.004318,0.004818,0.008136,0.002318,0.046409
7,0.001909,0.000455,0.000955,0.002364,0.000909,0.000000,0.000455,0.001455,0.031500,0.000000,...,0.002773,0.060364,0.000000,1.192864,0.000455,0.001364,0.000000,0.000000,0.000000,0.000000
8,0.186818,0.196045,0.000955,1.131273,0.212455,0.000955,0.013727,0.014091,0.005318,0.001000,...,0.052455,0.005318,0.009409,0.000955,0.029682,0.024545,0.023636,0.038409,0.049500,0.002955
9,0.003818,0.001455,0.431500,0.003364,0.001864,0.696000,0.002955,0.006318,0.015955,0.031318,...,0.001455,0.184273,0.081091,0.000000,0.002909,0.007273,0.006864,0.004909,0.003773,0.476409


In [180]:
final_avg.idxmax(axis = 1)
#assign the labels to the test_subjects file
test_subjects['Label'] = final_avg.idxmax(axis = 1)
test_subjects.head()

Unnamed: 0,Subject,Datafile,Label
0,Subject01,Subject01/Subject01_Aufnahme000.csv,curve-right-step
1,Subject01,Subject01/Subject01_Aufnahme001.csv,v-cut-right-Lfirst
2,Subject01,Subject01/Subject01_Aufnahme002.csv,curve-left-spin-Lfirst
3,Subject01,Subject01/Subject01_Aufnahme003.csv,stand-to-sit
4,Subject01,Subject01/Subject01_Aufnahme004.csv,stair-down


In [196]:
#name submission file
submissionFileName = 'submission_mode_ plus90sub_mimisdataset'

test_subjects.to_csv(submissionFileName + '.csv',index=False)

In [182]:
sub1 = pd.read_csv('submission_direction_chain.csv')
sub2 = pd.read_csv('submission_ensembled_ensemble_mimisdataset.csv')
sub3 = pd.read_csv('submission_adv_pavg_rf_mimisdataset.csv')
sub4 = pd.read_csv('submission_pavg_rf_mimisdataset.csv')
sub5 = pd.read_csv('submission_pavg_rf_norm_mimisdataset.csv')


In [191]:
mega_sub = pd.DataFrame(sub1.Label)
mega_sub['Label2'] = sub2.Label
mega_sub['Label3'] = sub3.Label
mega_sub['Label4'] = sub4.Label
mega_sub['Label5'] = sub5.Label
mega_sub.mode(axis=1).iloc[:,1].unique()

array([nan, 'curve-right-spin-Rfirst', 'v-cut-right-Lfirst',
       'curve-right-spin-Lfirst'], dtype=object)

In [202]:
mega_sub[mega_sub.mode(axis=1).iloc[:,1] == 'v-cut-right-Lfirst']

Unnamed: 0,Label,Label2,Label3,Label4,Label5
919,lateral-shuffle-left,v-cut-right-Lfirst,v-cut-right-Lfirst,lateral-shuffle-right,lateral-shuffle-left


In [203]:
sub2.iloc[919]

Subject                               Subject14
Datafile    Subject14/Subject14_Aufnahme061.csv
Label                        v-cut-right-Lfirst
Name: 919, dtype: object

In [195]:
test_subjects['Label'] = mega_sub.mode(axis=1).iloc[:, 0]
test_subjects

Unnamed: 0,Subject,Datafile,Label
0,Subject01,Subject01/Subject01_Aufnahme000.csv,curve-right-step
1,Subject01,Subject01/Subject01_Aufnahme001.csv,v-cut-right-Lfirst
2,Subject01,Subject01/Subject01_Aufnahme002.csv,curve-left-spin-Lfirst
3,Subject01,Subject01/Subject01_Aufnahme003.csv,stand-to-sit
4,Subject01,Subject01/Subject01_Aufnahme004.csv,stair-down
5,Subject01,Subject01/Subject01_Aufnahme005.csv,v-cut-right-Rfirst
6,Subject01,Subject01/Subject01_Aufnahme006.csv,lateral-shuffle-right
7,Subject01,Subject01/Subject01_Aufnahme007.csv,stand
8,Subject01,Subject01/Subject01_Aufnahme008.csv,curve-right-spin-Lfirst
9,Subject01,Subject01/Subject01_Aufnahme009.csv,curve-right-step


In [197]:
#check similarity with old submissions as a sanity check
best_submission = pd.read_csv('submission_pavg_rf_mimisdataset.csv')

print('similarity=%f' % (sum(1 for i in range(len(pred_label)) 
                             if sub1.iloc[i,2] == sub2.iloc[i,2]) / float(len(pred_label))))

similarity=0.945915
