In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import time
%matplotlib notebook
import seaborn as sns
from scipy.special import expit as logit
from sklearn.model_selection import train_test_split
from scipy.special import expit as sigmoid # is more stable in case of overflows
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, \
recall_score, precision_score, accuracy_score, confusion_matrix
import math


# Prepare data

In [58]:
# Load the combined patient measurements; the file is semicolon-separated.
df_cleaned = pd.read_csv('/data/ortho/AllPatients.csv', sep=';')


In [59]:
# Compute left/right symmetry: absolute difference per bone and per axis.
# Order (clavicula, scapula, humerus) x (x, y, z) matches the original column order.
for bone in ('clavicula', 'scapula', 'humerus'):
    for axis in ('x', 'y', 'z'):
        left = df_cleaned['%s_l_%s' % (bone, axis)]
        right = df_cleaned['%s_r_%s' % (bone, axis)]
        df_cleaned['%s_%s_dif' % (bone, axis)] = np.absolute(left - right)

# Helper list with every feature column fed to the classifier; edit freely.
# The order determines the order of the fitted coefficients.
param = [
    'humerus_l_x', 'humerus_l_y', 'humerus_l_z', 'humerus_r_x', 'humerus_r_y', 'humerus_r_z',
    'clavicula_l_x', 'clavicula_l_y', 'clavicula_l_z', 'clavicula_r_x', 'clavicula_r_y', 'clavicula_r_z',
    'scapula_l_x', 'scapula_l_y', 'scapula_l_z', 'scapula_r_x', 'scapula_r_y', 'scapula_r_z',
    'clavicula_x_dif', 'clavicula_y_dif', 'clavicula_z_dif',
    'scapula_x_dif', 'scapula_y_dif', 'scapula_z_dif',
    'humerus_x_dif', 'humerus_y_dif', 'humerus_z_dif',
]

# Constant column, used downstream as an explicit bias feature.
df_cleaned['bias'] = 1

# Split the 'Oorsprong' (origin file name) column into its components.
# The part before the first '.' is expected to hold four '_'-separated fields.
# NOTE: tuple-unpacking the `.str` accessor (`a, b = series.str.split(..).str`)
# relied on Series.str.__iter__, which newer pandas no longer supports, so the
# fields are extracted explicitly instead.
origin_stem = df_cleaned['Oorsprong'].str.split('.').str[0]
origin_parts = origin_stem.str.split('_', expand=True)
if origin_parts.shape[1] != 4:
    raise ValueError(
        "expected 4 '_'-separated fields in 'Oorsprong', found %d" % origin_parts.shape[1]
    )

# Strip the textual prefix of each field (3 characters, 6 for the 'meting'
# field) and parse the remainder as an integer.
df_cleaned['cat'] = origin_parts[0].str[3:].astype(int)
df_cleaned['pat'] = origin_parts[1].str[3:].astype(int)
df_cleaned['meting'] = origin_parts[2].str[6:].astype(int)
df_cleaned['oef'] = origin_parts[3].str[3:].astype(int)

# Give every patient a number that is unique across categories.
df_cleaned['pat'] = df_cleaned['cat'] * 1000 + df_cleaned['pat']

# One boolean column per category, based on a literal substring match in 'Oorsprong'.
for flag, label in (('c4', 'Cat4'), ('c3', 'Cat3'), ('c2', 'Cat2'), ('c1', 'Cat1')):
    df_cleaned[flag] = df_cleaned['Oorsprong'].str.contains(label, regex=False)

y = df_cleaned['cat']

# NOTE(review): this split is row-wise, so rows from the same patient ('pat')
# can land in both the train and the test part -- confirm this is intended.
Cleaned_train, Cleaned_test, y_tmp, y_tmp2 = train_test_split(
    df_cleaned, y, test_size=0.2, random_state=42
)




In [None]:
def VisualizeItems(items, x, y, z):
    """Plot value-versus-change curves for three columns of every item.

    For each entry of ``items`` the data is obtained through ``HeaderMaker``
    (defined elsewhere; presumably it loads the file into a DataFrame --
    verify). For each of the three selected columns a new figure is created
    that plots the column values (from the second sample on) against the
    difference between the previous and the next sample. A signed
    rectangle-plus-triangle area is accumulated over the curve and its
    absolute value is written into the figure at (0, 0).

    Parameters
    ----------
    items : iterable of str
        File names/paths; the part before the first '.' is split on '_' to
        obtain the category, patient, measurement and exercise labels used
        in the figure title.
    x, y, z : column keys
        Keys used to select three columns from the loaded data.

    Returns
    -------
    None
        Figures are shown via ``plt.show()`` after all items are processed.
    """
    for i in items:
        # Default every label first, so a name with fewer than four '_' parts
        # cannot leave a label undefined (or stale from the previous item)
        # when the title is built below.
        Cat = pat = meting = oef = 'failed'
        try:
            splitted = i.split('.')[0].split('_')
            Cat = splitted[0]
            pat = splitted[1]
            meting = splitted[2]
            oef = splitted[3]
        except IndexError:
            # Keep the 'failed' placeholder for every label that is missing.
            pass
        print(i)
        data = HeaderMaker(i)

        xlist = data[x]
        ylist = data[y]
        zlist = data[z]

        NewList = []
        for ColumnList in [xlist, ylist, zlist]:
            templist = []
            for index in range(1, len(ColumnList)):
                try:
                    vorige = ColumnList.iloc[index - 1]      # previous sample
                    volgende = ColumnList.iloc[index + 1]    # next sample
                    templist.append(vorige - volgende)
                except IndexError:
                    # The last sample has no successor; use 0 as its change.
                    templist.append(0)

            NewList.append({'x': list(ColumnList[1:]), 'y': templist})

        for item in NewList:
            print('NewItems')
            fig = plt.figure()
            ax = fig.gca()
            # Draw the curve segment by segment (each point joined to the next).
            for start in range(0, len(item['x']), 1):
                plt.plot(item['x'][start:start + 2], item['y'][start:start + 2], 'ro-')
            # Mark the starting point of the curve.
            cricle = plt.Circle((item['x'][0], item['y'][0]), 2, color='y')
            ax.add_artist(cricle)
            plt.title('cat: %s pat: %s oef: %s meting: %s' % (Cat, pat, oef, meting))

            # Loop-invariant baseline, computed once instead of once per point.
            LowestValue = min(item['y'])

            TotaleOppervlakte = 0
            for index, CurrentXValue in enumerate(item['x']):
                if index == 0:
                    continue

                PreviousXValue = item['x'][index - 1]

                # Moving right adds area, moving left subtracts it.
                if CurrentXValue > PreviousXValue:
                    Multiplier = 1
                elif CurrentXValue < PreviousXValue:
                    Multiplier = -1
                else:
                    # Same x value, so this segment contributes no area.
                    continue

                CurrentyValue = item['y'][index]
                PreviousyValue = item['y'][index - 1]

                # Rectangle part of the segment's area.
                breedte = abs(abs(CurrentXValue) - abs(PreviousXValue))
                hoogte = abs(LowestValue) + abs(min([CurrentyValue, PreviousyValue]))
                Oppervlakte = breedte * hoogte

                # Triangle part on top of the rectangle.
                hoogteDrie = abs(CurrentyValue - PreviousyValue)
                Oppervlakte = Oppervlakte + 0.5 * hoogteDrie * breedte

                TotaleOppervlakte += Oppervlakte * Multiplier
            plt.text(0, 0, str(abs(TotaleOppervlakte)))

    plt.tight_layout()
    plt.show()

def CreateList(keywordslist, directory='/data/ortho/Cleaned Train/'):
    """Return the paths of all files in ``directory`` whose name contains every keyword.

    Parameters
    ----------
    keywordslist : iterable of str
        Substrings that must ALL occur in a file name for it to be selected.
        An empty list selects every file.
    directory : str, optional
        Directory to scan. Defaults to the cleaned training-data folder.

    Returns
    -------
    list of str
        ``directory`` joined with each matching file name, in the order
        returned by ``os.listdir``.
    """
    # Local import: the visible import cell does not bring `listdir` into scope.
    import os

    return [
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if all(keyword in name for keyword in keywordslist)
    ]

In [60]:
# Target: the boolean 'c3' flag. Rows flagged c1 or c2 are removed from both
# splits before the feature matrices are built.
trueval = 'c3'

# Feature columns: the constant 'bias' column followed by every entry of `param`.
Xcolumns = ['bias', *param]

# Training split.
Cleaned_train = Cleaned_train[~(Cleaned_train.c1 | Cleaned_train.c2)]
X_train, y_train = Cleaned_train[Xcolumns], Cleaned_train[trueval]

# Test split, filtered the same way.
Cleaned_test = Cleaned_test[~(Cleaned_test.c1 | Cleaned_test.c2)]
X_test, y_test = Cleaned_test[Xcolumns], Cleaned_test[trueval]



In [61]:

# Fit a logistic regression on the training split and predict the test split.
lr = LogisticRegression().fit(X_train, y_train)

y_true, y_pred = y_test, lr.predict(X_test)

# For a binary target the flattened confusion matrix is ordered TN, FP, FN, TP.
TN, FP, FN, TP = confusion_matrix(y_true, y_pred).ravel()

# Coefficients, in the order of the feature columns.
print(lr.coef_[0])

# Confusion matrix laid out as predicted (rows) versus actual (columns).
tab = [["pred pos", TP, FP], ["pred neg", FN, TN]]
confusion_table = pd.DataFrame(tab, columns=["", "pos", "neg"])
print(confusion_table)
print()

scores = [
    ("recall: ", recall_score(y_true, y_pred)),
    ("precision: ", precision_score(y_true, y_pred)),
    ("accuracy: ", accuracy_score(y_true, y_pred.round().astype(bool))),
]
for label, value in scores:
    print(label, value)
print('\n\n')






[ 2.03972004e+00  1.50535538e-02 -8.27857736e-04 -7.50669398e-03
 -7.67129880e-03  8.91702091e-02 -3.70871059e-03  7.09253305e-02
  7.39888994e-04 -3.08675032e-03  1.39829472e-02  3.14405430e-02
  3.54060559e-02  3.38103226e-03 -1.06833729e-01  3.82811680e-02
 -3.11740566e-02 -1.43953321e-01 -9.32446284e-02  4.44109994e-02
 -2.98722088e-02  2.84904766e-02 -9.83266514e-02 -2.54278420e-02
  8.17545855e-02  2.75342589e-02  6.68627741e-03 -2.22519665e-02]
               pos   neg
0  pred pos  11451   681
1  pred neg    316  1400

recall:  0.9731452366788477
precision:  0.9438674579624134
accuracy:  0.928004043905257





In [62]:
# Show the predicted labels for the test split (NumPy abbreviates long arrays).
print(y_pred)

[ True  True  True ...  True  True  True]


In [63]:
# Check the conversion used in the accuracy computation: the predictions are
# already boolean, so rounding and casting to bool displays the same values.
y_pred.round().astype(bool)

array([ True,  True,  True, ...,  True,  True,  True])