In [54]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import accuracy_score
from parse import preprocess

In [30]:
# Load dataset, preprocess rawfile
df = preprocess("rawfile_blood.csv")


####################################################################
Number of Rows of Dataframe:
1123
Number of Columns of Dataframe:
59

####################################################################
Threshold for number of NULLs in a column: 0.1095
Number of Columns before Parsing for Too Many NULLs in a column:
59
Number of Columns after Parsing for Too Many NULLs in a column:
51

Columns Removed:
B1_b5
B4_a1
B4_a3
B4_a4
B4_a6
B4_b1
B4_b3
B5_a1

####################################################################
Number of Rows before Parsing NULLs in data:
1123
Number of Rows after Parsing NULLs in data:
1007


In [31]:
# Initialise counters for each condition
frail = 0
frail_mci = 0
mci = 0
prefrail_mci = 0
prefrail = 0
robust = 0

In [32]:
# Count rows of data for each condition
for i in range(0, len(df)):
	if df.at[i, 'condition'] == 'frail':
		frail += 1
	elif df.at[i, 'condition'] == 'frail_mci':
		frail_mci += 1
	elif df.at[i, 'condition'] == 'mci':
		mci += 1
	elif df.at[i, 'condition'] == 'prefrail_mci':
		prefrail_mci += 1
	elif df.at[i, 'condition'] == 'prefrail':
		prefrail += 1
	elif df.at[i, 'condition'] == 'robust':
		robust += 1
        
# Display number of rows (frequency) for each condition (label)
print("\n####################################################################")
print("Labels with frequencies:")
print("Frail:", frail)
print("Frail + MCI:", frail_mci)
print("MCI:", mci)
print("Prefrail + MCI:", prefrail_mci)
print("Prefrail:", prefrail)
print("Robust:", robust)


####################################################################
Labels with frequencies:
Frail: 7
Frail + MCI: 76
MCI: 133
Prefrail + MCI: 231
Prefrail: 221
Robust: 339


In [35]:
# There are 668 Non-Robust and 339 Robust

# Collect data that is Non-Robust
df1 = df[df.condition != 'robust']
# Collect data that is Robust
df2 = df[df.condition == 'robust']

# Random sample 339 from Non-Robust
df1 = df1.sample(n=339)

# Append 339 Robust to 339 Non-Robust
df = df1.append(df2, ignore_index=True)

# Display Dataframe
print(df)

        mtag     condition  A1_1  A2_1  A3_1  B1_a  B1_a1  B1_a2  B1_a3  \
0    ME06950     frail_mci  1476    13  33.7   133   4.58   0.38     83   
1    MV00408      prefrail   239     9  72.6   142   4.95   0.45     91   
2    ME10236     frail_mci   474    20  38.0   121   3.95   0.37     94   
3    MV00386      prefrail   474    22  39.2   158   6.80   0.49     72   
4    ME03542  prefrail_mci   352    14  29.8   131   5.04   0.41     81   
..       ...           ...   ...   ...   ...   ...    ...    ...    ...   
673  MV00454        robust   220    19  67.5   138   4.66   0.42     91   
674  MV00456        robust   334    18  51.0   139   4.63   0.42     91   
675  MV00460        robust   418    17  61.0   122   4.18   0.38     90   
676  MV00502        robust   393    18  43.1   136   4.57   0.43     94   
677  MV00510        robust   371    24  55.9   127   4.41   0.40     90   

     B1_a4  ...  B2_d6  B2_d7  B2_d8  B2_d9    B3  B4_a2  B4_a5  B5_a2  B5_a3  \
0       26  ...   

In [36]:
# Specify features and labels
y = df['condition']
x = df.drop(['mtag', 'condition'], axis=1)

# Display features and labels
print(x, y)

# Display shape of features and labels
print("\nShape of Features:")
print(x.shape)
print("\nShape of Labels:")
print(y.shape)

     A1_1  A2_1  A3_1  B1_a  B1_a1  B1_a2  B1_a3  B1_a4  B1_a5  B1_a6  ...  \
0    1476    13  33.7   133   4.58   0.38     83     26    316   15.0  ...   
1     239     9  72.6   142   4.95   0.45     91     29    316   13.5  ...   
2     474    20  38.0   121   3.95   0.37     94     30    321   14.5  ...   
3     474    22  39.2   158   6.80   0.49     72     23    322   15.2  ...   
4     352    14  29.8   131   5.04   0.41     81     26    321   14.3  ...   
..    ...   ...   ...   ...    ...    ...    ...    ...    ...    ...  ...   
673   220    19  67.5   138   4.66   0.42     91     30    325   14.1  ...   
674   334    18  51.0   139   4.63   0.42     91     30    330   15.6  ...   
675   418    17  61.0   122   4.18   0.38     90     29    324   13.5  ...   
676   393    18  43.1   136   4.57   0.43     94     30    316   12.5  ...   
677   371    24  55.9   127   4.41   0.40     90     29    320   13.8  ...   

     B2_d6  B2_d7  B2_d8  B2_d9    B3  B4_a2  B4_a5  B5_a2  B5_

In [37]:
# Data Dictionary:
# frail -> 0
# frail_mci -> 0
# mci -> 0
# prefrail_mci -> 0
# prefrail -> 0
# robust -> 1

# Conduct label mapping for conditions
label_mapping = {
    'frail' : 0,
    'frail_mci' : 0,
    'mci' : 0,
    'prefrail_mci' : 0,
    'prefrail' : 0,
    'robust' : 1
}

y = y.map(label_mapping)

# Display label
print(y)

# Display shape of label
print(y.shape)

0      0
1      0
2      0
3      0
4      0
      ..
673    1
674    1
675    1
676    1
677    1
Name: condition, Length: 678, dtype: int64
(678,)


In [40]:
# Conduct train-test split on dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Display x_train, x_test, y_train, y_test
print("\nX Train:")
print(x_train)
print("\nX Test:")
print(x_test)
print("\nY Train:")
print(y_train)
print("\nY Test:")
print(y_test)

# Display shape of train and test sets
print("\nShape of X Train:")
print(x_train.shape)
print("\nShape of X Test:")
print(x_test.shape)
print("\nShape of Y Train:")
print(y_train.shape)
print("\nShape of Y Test:")
print(y_test.shape)


X Train:
     A1_1  A2_1   A3_1  B1_a  B1_a1  B1_a2  B1_a3  B1_a4  B1_a5  B1_a6  ...  \
580   485    18  42.60   153   4.30   0.40     93     32    340   12.9  ...   
173   469    20  47.10   136   4.64   0.40     86     29    341   14.6  ...   
643   231    14   4.07   133   4.71   0.39     83     28    340   14.4  ...   
370   349    16  51.10   146   5.73   0.48     83     27    328   16.4  ...   
327   679    10  41.30   140   4.04   0.36     88     32    360   16.4  ...   
..    ...   ...    ...   ...    ...    ...    ...    ...    ...    ...  ...   
27    259    14  27.20   117   4.19   0.37     89     28    314   14.1  ...   
281   413    15  30.90   125   4.12   0.37     89     30    341   14.1  ...   
396   275    15  68.10   134   6.22   0.40     65     21    317   15.9  ...   
156   239    16  21.00   151   4.42   0.40     91     29    323   13.5  ...   
597   472    17  92.20   136   6.08   0.41     67     22    334   15.4  ...   

     B2_d6  B2_d7  B2_d8  B2_d9   B3  B4_

In [64]:
# Create kNN model
knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [65]:
# Train the model
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [66]:
# Make prediction
prediction = knn.predict(x_test)

In [67]:
# Measure accuracy
accuracy = metrics.accuracy_score(y_test, prediction)

# Display predictions and Accuracy
print("Predictions:", prediction)
print("Actual:", np.array(y_test))
print("Accuracy:", accuracy)

Predictions: [0 1 1 0 1 1 0 0 0 1 1 1 0 0 1 1 1 1 1 1 0 0 1 1 0 0 1 1 1 1 0 1 0 0 0 0 1
 1 1 0 0 1 1 0 1 1 1 0 0 1 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 1 0 1 0 1 0 1 1 0
 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 1 1 0 1 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 1 0 0
 1 0 0 0 0 0 1 0 1 0 0 1 0 1 1 1 1 0 1 0 0 1 1 1 1]
Actual: [0 0 1 0 0 1 0 0 0 0 0 1 1 1 0 1 1 1 0 0 0 1 0 1 1 1 0 1 0 0 1 0 0 1 0 1 1
 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 0 1 0 1 0 0 1 1 0 0 1 0 1 1 1 1 1 0 0 1 0
 0 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1 0 1 0 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 1 1 1
 1 0 1 0 1 0 1 1 0 0 1 0 0 1 0 1 1 1 0 0 1 0 1 0 1]
Accuracy: 0.5441176470588235


In [68]:
# Create linear SVM model
model = svm.SVC(kernel='linear')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", y_test)
print("Accuracy:", acc)

Predictions: [0 0 0 0 0 1 1 0 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 1 1 0
 1 1 1 1 0 0 0 1 1 1 0 1 0 0 1 0 0 1 0 0 0 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0
 0 1 0 1 0 1 1 1 0 0 1 0 0 0 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 0 1 0 0 0 1 0 1
 1 0 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 0 0 0 1 0 0 0]
Actual: 201    0
130    0
611    1
115    0
271    0
      ..
381    1
291    0
511    1
49     0
501    1
Name: condition, Length: 136, dtype: int64
Accuracy: 0.6102941176470589


In [56]:
# Create RBF SVM model
model = svm.SVC(kernel='rbf')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", y_test)
print("Accuracy:", acc)

Predictions: [1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 1 1 0 1 1 0
 1 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 0 0 1 0 1 1 0 1 0 0 1 0 1
 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1]
Actual: 201    0
130    0
611    1
115    0
271    0
      ..
381    1
291    0
511    1
49     0
501    1
Name: condition, Length: 136, dtype: int64
Accuracy: 0.5367647058823529


In [57]:
# Create Sigmoid SVM model
model = svm.SVC(kernel='sigmoid')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", y_test)
print("Accuracy:", acc)

Predictions: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Actual: 201    0
130    0
611    1
115    0
271    0
      ..
381    1
291    0
511    1
49     0
501    1
Name: condition, Length: 136, dtype: int64
Accuracy: 0.49264705882352944


In [58]:
# Create Polynomial SVM model
model = svm.SVC(kernel='poly')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", y_test)
print("Accuracy:", acc)

Predictions: [1 0 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0
 1 1 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0]
Actual: 201    0
130    0
611    1
115    0
271    0
      ..
381    1
291    0
511    1
49     0
501    1
Name: condition, Length: 136, dtype: int64
Accuracy: 0.5735294117647058
