In [34]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import accuracy_score
from parse import preprocess

In [35]:
# Load dataset, preprocess rawfile
df = preprocess("rawfile_blood.csv")


####################################################################
Number of Rows of Dataframe:
1123
Number of Columns of Dataframe:
59

####################################################################
Threshold for number of NULLs in a column: 0.1095
Number of Columns before Parsing for Too Many NULLs in a column:
59
Number of Columns after Parsing for Too Many NULLs in a column:
51

Columns Removed:
B1_b5
B4_a1
B4_a3
B4_a4
B4_a6
B4_b1
B4_b3
B5_a1

####################################################################
Number of Rows before Parsing NULLs in data:
1123
Number of Rows after Parsing NULLs in data:
1007


In [36]:
# Initialise counters for each condition
frail = 0
frail_mci = 0
mci = 0
prefrail_mci = 0
prefrail = 0
robust = 0

In [37]:
# Count rows of data for each condition
for i in range(0, len(df)):
	if df.at[i, 'condition'] == 'frail':
		frail += 1
	elif df.at[i, 'condition'] == 'frail_mci':
		frail_mci += 1
	elif df.at[i, 'condition'] == 'mci':
		mci += 1
	elif df.at[i, 'condition'] == 'prefrail_mci':
		prefrail_mci += 1
	elif df.at[i, 'condition'] == 'prefrail':
		prefrail += 1
	elif df.at[i, 'condition'] == 'robust':
		robust += 1
        
# Display number of rows (frequency) for each condition (label)
print("\n####################################################################")
print("Labels with frequencies:")
print("Frail:", frail)
print("Frail + MCI:", frail_mci)
print("MCI:", mci)
print("Prefrail + MCI:", prefrail_mci)
print("Prefrail:", prefrail)
print("Robust:", robust)


####################################################################
Labels with frequencies:
Frail: 7
Frail + MCI: 76
MCI: 133
Prefrail + MCI: 231
Prefrail: 221
Robust: 339


In [38]:
# There are 76 Frail+MCI

# Collect data that is Frail+MCI
df1 = df[df.condition == 'frail_mci']
# Collect data that is Robust
df2 = df[df.condition == 'robust']

# Random sample 76 from Robust
df2 = df2.sample(n=76)

# Append 76 Frail+MCI to 76 Robust
df = df1.append(df2, ignore_index=True)

# Display Dataframe
print(df)

        mtag  condition  A1_1  A2_1  A3_1  B1_a  B1_a1  B1_a2  B1_a3  B1_a4  \
0    ME01378  frail_mci   241    20  33.5   150   5.25   0.46     87     29   
1    ME02832  frail_mci   444    16  87.0   134   4.65   0.40     85     28   
2    ME02909  frail_mci  1476    16  57.0   119   3.80   0.36     94     31   
3    ME02998  frail_mci   339    18  63.8   135   4.89   0.42     86     28   
4    ME03061  frail_mci   287    20  95.5   146   5.18   0.44     85     28   
..       ...        ...   ...   ...   ...   ...    ...    ...    ...    ...   
147  MV00373     robust   272    16  50.8   133   4.81   0.41     85     28   
148  ME01650     robust   237    14  33.8   125   5.07   0.47     92     30   
149  MV00024     robust   442    21  57.3   145   4.63   0.43     92     31   
150  MV00510     robust   371    24  55.9   127   4.41   0.40     90     29   
151  ME05432     robust   352    18  75.0   149   5.08   0.46     90     29   

     ...  B2_d6  B2_d7  B2_d8  B2_d9    B3  B4_a2  

In [39]:
# Specify features and labels
y = df['condition']
x = df.drop(['mtag', 'condition'], axis=1)

# Display features and labels
print(x, y)

# Display shape of features and labels
print("\nShape of Features:")
print(x.shape)
print("\nShape of Labels:")
print(y.shape)

     A1_1  A2_1  A3_1  B1_a  B1_a1  B1_a2  B1_a3  B1_a4  B1_a5  B1_a6  ...  \
0     241    20  33.5   150   5.25   0.46     87     29    328   12.8  ...   
1     444    16  87.0   134   4.65   0.40     85     28    329   15.2  ...   
2    1476    16  57.0   119   3.80   0.36     94     31    333   12.8  ...   
3     339    18  63.8   135   4.89   0.42     86     28    321   12.4  ...   
4     287    20  95.5   146   5.18   0.44     85     28    332   15.8  ...   
..    ...   ...   ...   ...    ...    ...    ...    ...    ...    ...  ...   
147   272    16  50.8   133   4.81   0.41     85     28    325   15.2  ...   
148   237    14  33.8   125   5.07   0.47     92     30    328   13.3  ...   
149   442    21  57.3   145   4.63   0.43     92     31    340   14.3  ...   
150   371    24  55.9   127   4.41   0.40     90     29    320   13.8  ...   
151   352    18  75.0   149   5.08   0.46     90     29    326   13.9  ...   

     B2_d6  B2_d7  B2_d8  B2_d9    B3  B4_a2  B4_a5  B5_a2  B5_

In [40]:
# Data Dictionary:
# frail_mci -> 0
# robust -> 1

# Conduct label mapping for conditions
label_mapping = {
    'frail_mci' : 0,
    'robust' : 1
}

y = y.map(label_mapping)

# Display label
print(y)

# Display shape of label
print(y.shape)

0      0
1      0
2      0
3      0
4      0
      ..
147    1
148    1
149    1
150    1
151    1
Name: condition, Length: 152, dtype: int64
(152,)


In [41]:
# Conduct train-test split on dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Display x_train, x_test, y_train, y_test
print("\nX Train:")
print(x_train)
print("\nX Test:")
print(x_test)
print("\nY Train:")
print(y_train)
print("\nY Test:")
print(y_test)

# Display shape of train and test sets
print("\nShape of X Train:")
print(x_train.shape)
print("\nShape of X Test:")
print(x_test.shape)
print("\nShape of Y Train:")
print(y_train.shape)
print("\nShape of Y Test:")
print(y_test.shape)


X Train:
     A1_1  A2_1  A3_1  B1_a  B1_a1  B1_a2  B1_a3  B1_a4  B1_a5  B1_a6  ...  \
53    519    26  44.8   131   4.42   0.39     87     29    330   12.5  ...   
41    796    15  42.7   143   4.97   0.42     85     26    310   14.0  ...   
107   342    28  39.8   113   6.05   0.56     92     32    345   13.0  ...   
121   274    18  41.8   134   4.60   0.41     88     29    331   14.5  ...   
2    1476    16  57.0   119   3.80   0.36     94     31    333   12.8  ...   
..    ...   ...   ...   ...    ...    ...    ...    ...    ...    ...  ...   
30    288    16  52.2   135   4.47   0.42     93     30    325   12.8  ...   
130   297    15  57.0   126   5.09   0.45     88     28    324   14.6  ...   
34    348    19  58.5   135   4.44   0.39     87     30    347   12.7  ...   
5     237    28  28.2   117   5.55   0.38     69     21    305   15.8  ...   
131   344    15  60.4   142   4.76   0.43     90     30    332   13.5  ...   

     B2_d6  B2_d7  B2_d8  B2_d9   B3  B4_a2  B4_a5  B

In [42]:
# Create kNN model
knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [43]:
# Train the model
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [44]:
# Make prediction
prediction = knn.predict(x_test)

In [45]:
# Measure accuracy
accuracy = metrics.accuracy_score(y_test, prediction)

# Display predictions and Accuracy
print("Predictions:", prediction)
print("Actual:", np.array(y_test))
print("Accuracy:", accuracy)

Predictions: [1 0 0 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1]
Actual: [0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 1 0 1 0 1]
Accuracy: 0.5161290322580645


In [46]:
# Create kNN model
knn = neighbors.KNeighborsClassifier(n_neighbors=10, weights='uniform')

In [47]:
# Train the model
knn.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [48]:
# Make prediction
prediction = knn.predict(x_test)

In [49]:
# Measure accuracy
accuracy = metrics.accuracy_score(y_test, prediction)

# Display predictions and Accuracy
print("Predictions:", prediction)
print("Actual:", np.array(y_test))
print("Accuracy:", accuracy)

Predictions: [0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 1 1 1 0 1 1 0 0 1 0 1 1 1 0 1 1]
Actual: [0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 1 0 1 0 1]
Accuracy: 0.6451612903225806


In [50]:
# Create linear SVM model
model = svm.SVC(kernel='linear')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", np.array(y_test))
print("Accuracy:", acc)

Predictions: [0 0 0 1 0 1 1 1 0 1 1 0 0 1 0 0 0 1 1 1 1 1 0 0 1 1 1 0 0 0 1]
Actual: [0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 1 0 1 0 1]
Accuracy: 0.8064516129032258


In [54]:
# Create RBF SVM model
model = svm.SVC(kernel='rbf')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", np.array(y_test))
print("Accuracy:", acc)

Predictions: [1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Actual: [0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 1 0 1 0 1]
Accuracy: 0.41935483870967744


In [55]:
# Create Sigmoid SVM model
model = svm.SVC(kernel='sigmoid')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", np.array(y_test))
print("Accuracy:", acc)

Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Actual: [0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 1 0 1 0 1]
Accuracy: 0.3870967741935484


In [56]:
# Create Polynomial SVM model
model = svm.SVC(kernel='poly')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", np.array(y_test))
print("Accuracy:", acc)

Predictions: [1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1]
Actual: [0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 1 0 1 0 1]
Accuracy: 0.45161290322580644
