In [79]:
# Import packages
import pandas as pd 
from matplotlib import pyplot
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from parse import preprocess

In [80]:
df = preprocess("rawfile_blood.csv")


####################################################################
Number of Rows of Dataframe:
1123
Number of Columns of Dataframe:
59

####################################################################
Threshold for number of NULLs in a column: 0.1095
Number of Columns before Parsing for Too Many NULLs in a column:
59
Number of Columns after Parsing for Too Many NULLs in a column:
51

Columns Removed:
B1_b5
B4_a1
B4_a3
B4_a4
B4_a6
B4_b1
B4_b3
B5_a1

####################################################################
Number of Rows before Parsing NULLs in data:
1123
Number of Rows after Parsing NULLs in data:
1007

####################################################################
Number of Columns after dropping A1_2, B1_b4, B2_c3, B4_b2 for inconsistent data types:
47


In [81]:
# Initialise counters for each condition
frail = 0
frail_mci = 0
mci = 0
prefrail_mci = 0
prefrail = 0
robust = 0

# Count rows of data for each condition
for i in range(0, len(df)):
	if df.at[i, 'condition'] == 'frail':
		frail += 1
	elif df.at[i, 'condition'] == 'frail_mci':
		frail_mci += 1
	elif df.at[i, 'condition'] == 'mci':
		mci += 1
	elif df.at[i, 'condition'] == 'prefrail_mci':
		prefrail_mci += 1
	elif df.at[i, 'condition'] == 'prefrail':
		prefrail += 1
	elif df.at[i, 'condition'] == 'robust':
		robust += 1
        
# Display number of rows (frequency) for each condition (label)
print("\n####################################################################")
print("Labels with frequencies:")
print("Frail:", frail)
print("Frail + MCI:", frail_mci)
print("MCI:", mci)
print("Prefrail + MCI:", prefrail_mci)
print("Prefrail:", prefrail)
print("Robust:", robust)


####################################################################
Labels with frequencies:
Frail: 7
Frail + MCI: 76
MCI: 133
Prefrail + MCI: 231
Prefrail: 221
Robust: 339


In [82]:
# Specify features and labels
y = df['condition']
x = df.drop(['mtag', 'condition'], axis=1)

# Display features and labels
print(x, y)

# Display shape of features and labels
print("\nShape of Features:")
print(x.shape)
print("\nShape of Labels:")
print(y.shape)

      A1_1  A2_1  A3_1  B1_a  B1_a1  B1_a2  B1_a3  B1_a4  B1_a5  B1_a6  ...  \
0      196    24  46.5   121   3.93   0.37     95     31    324   13.3  ...   
1      200    23  55.6   142   4.82   0.42     87     30    346   12.8  ...   
2      441    20  76.8   105   4.54   0.41     90     30    330   14.0  ...   
3      265    16  47.2   122   4.53   0.39     86     27    313   14.9  ...   
4      425    14  31.3   124   4.44   0.38     85     28    329   12.6  ...   
...    ...   ...   ...   ...    ...    ...    ...    ...    ...    ...  ...   
1002   220    19  67.5   138   4.66   0.42     91     30    325   14.1  ...   
1003   334    18  51.0   139   4.63   0.42     91     30    330   15.6  ...   
1004   418    17  61.0   122   4.18   0.38     90     29    324   13.5  ...   
1005   393    18  43.1   136   4.57   0.43     94     30    316   12.5  ...   
1006   371    24  55.9   127   4.41   0.40     90     29    320   13.8  ...   

      B2_d6  B2_d7  B2_d8  B2_d9   B3  B4_a2  B4_a5

In [83]:
# Transform the dataset using SMOTE
oversample = SMOTE()
x, y = oversample.fit_resample(x, y)

In [84]:
# Summarise the new class distribution
counter = Counter(y)
print(counter)

Counter({'frail': 339, 'frail_mci': 339, 'mci': 339, 'prefrail_mci': 339, 'prefrail': 339, 'robust': 339})


In [85]:
# Show Dataframes
print(x)
print(y)

      A1_1  A2_1       A3_1  B1_a     B1_a1     B1_a2  B1_a3  B1_a4  B1_a5  \
0      196    24  46.500000   121  3.930000  0.370000     95     31    324   
1      200    23  55.600000   142  4.820000  0.420000     87     30    346   
2      441    20  76.800000   105  4.540000  0.410000     90     30    330   
3      265    16  47.200000   122  4.530000  0.390000     86     27    313   
4      425    14  31.300000   124  4.440000  0.380000     85     28    329   
...    ...   ...        ...   ...       ...       ...    ...    ...    ...   
2029   359    21  31.314081   140  5.034699  0.450128     90     29    321   
2030   472    16  97.426022   143  4.329100  0.391704     90     29    327   
2031   323    26  51.657118   151  4.794706  0.445810     93     30    332   
2032   426    15  40.541701   137  5.567463  0.397034     72     24    344   
2033   346    16  33.987411   133  4.360000  0.390180     89     29    324   

          B1_a6  ...  B2_d6  B2_d7  B2_d8  B2_d9        B3     

In [86]:
# Initialise counters for each condition
frail = 0
frail_mci = 0
mci = 0
prefrail_mci = 0
prefrail = 0
robust = 0

# Count rows of data for each condition
for i in range(0, len(y)):
	if y[i] == 'frail':
		frail += 1
	elif y[i] == 'frail_mci':
		frail_mci += 1
	elif y[i] == 'mci':
		mci += 1
	elif y[i] == 'prefrail_mci':
		prefrail_mci += 1
	elif y[i] == 'prefrail':
		prefrail += 1
	elif y[i] == 'robust':
		robust += 1
        
# Display number of rows (frequency) for each condition (label)
print("\n####################################################################")
print("Labels with frequencies:")
print("Frail:", frail)
print("Frail + MCI:", frail_mci)
print("MCI:", mci)
print("Prefrail + MCI:", prefrail_mci)
print("Prefrail:", prefrail)
print("Robust:", robust)


####################################################################
Labels with frequencies:
Frail: 339
Frail + MCI: 339
MCI: 339
Prefrail + MCI: 339
Prefrail: 339
Robust: 339


In [87]:
# Data Dictionary:
# frail -> 0
# frail_mci -> 1
# mci -> 2
# prefrail_mci -> 3
# prefrail -> 4
# robust -> 5

# Conduct label mapping for conditions
label_mapping = {
    'frail' : 0,
    'frail_mci' : 1,
    'mci' : 2,
    'prefrail_mci' : 3,
    'prefrail' : 4,
    'robust' : 5
}

y = y.map(label_mapping)
y = np.array(y)

# Display label
print(y)

# Display shape of label
print(y.shape)

[0 0 0 ... 3 3 3]
(2034,)


In [88]:
# Conduct train-test split on dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Display x_train, x_test, y_train, y_test
print("\nX Train:")
print(x_train)
print("\nX Test:")
print(x_test)
print("\nY Train:")
print(y_train)
print("\nY Test:")
print(y_test)

# Display shape of train and test sets
print("\nShape of X Train:")
print(x_train.shape)
print("\nShape of X Test:")
print(x_test.shape)
print("\nShape of Y Train:")
print(y_train.shape)
print("\nShape of Y Test:")
print(y_test.shape)


X Train:
      A1_1  A2_1       A3_1  B1_a     B1_a1     B1_a2  B1_a3  B1_a4  B1_a5  \
2002   575    16  36.010380   137  4.764876  0.384016     80     27    346   
229    221    25  55.800000   162  5.290000  0.480000     91     31    337   
1098   443    16  29.127091   125  4.349804  0.375900     85     28    337   
1975   578    24  34.152032   134  4.042324  0.348656     85     29    346   
469    418    15  76.600000   113  5.120000  0.360000     71     23    319   
...    ...   ...        ...   ...       ...       ...    ...    ...    ...   
863    279    13  46.900000   143  3.990000  0.380000     95     32    338   
1583   238    24  30.479631   131  5.420964  0.414410     76     24    314   
1720   561    15  52.046187   141  4.774581  0.413755     85     29    346   
249    371    21  27.200000   143  4.270000  0.380000     89     30    334   
401    317    14  55.600000   111  4.460000  0.410000     91     30    333   

          B1_a6  ...  B2_d6  B2_d7  B2_d8  B2_d9     

In [89]:
# Create linear SVM model
model = svm.SVC(kernel='linear')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", y_test)
print("Accuracy:", acc)

Predictions: [0 4 1 2 0 1 3 2 0 5 0 4 0 5 4 0 0 5 2 5 1 1 1 4 5 5 4 1 1 1 1 3 4 4 2 2 1
 2 1 1 0 5 1 4 2 3 1 3 0 5 4 5 3 5 2 1 4 1 5 1 4 1 3 5 2 1 3 3 3 4 5 5 0 3
 0 3 5 5 2 5 1 1 4 0 2 0 5 5 5 5 5 1 2 4 0 1 1 3 5 2 5 1 3 0 5 1 5 1 0 3 3
 2 3 3 0 2 0 1 2 5 1 5 2 3 1 0 0 4 1 3 5 0 2 4 1 4 1 1 2 1 2 3 2 4 4 0 1 1
 3 5 5 0 0 1 3 2 1 1 2 5 4 3 1 4 3 3 5 5 3 0 0 5 1 1 4 1 1 4 1 2 2 1 1 0 2
 1 5 2 4 5 5 4 1 1 0 1 4 2 0 0 3 3 1 2 1 5 2 3 5 5 0 1 3 3 2 5 0 0 2 4 0 0
 3 5 3 5 5 1 5 0 1 2 4 3 5 1 2 2 3 5 0 1 2 2 5 1 4 3 4 1 1 5 1 5 1 4 0 1 3
 3 1 5 4 5 0 0 3 3 1 2 1 0 0 0 3 2 5 1 2 3 5 1 1 3 1 0 5 5 1 5 4 1 1 1 3 4
 5 5 3 3 4 5 1 2 2 0 0 0 5 1 5 1 2 2 0 0 3 1 0 1 1 5 0 0 5 2 1 1 5 2 3 1 0
 4 1 2 0 2 4 1 1 3 3 1 2 0 1 2 1 0 5 4 2 4 3 1 2 5 1 2 2 4 2 1 1 1 1 2 3 2
 5 1 3 1 0 1 4 4 3 5 5 0 3 4 2 1 0 1 1 5 5 3 0 2 5 4 4 1 2 2 2 3 2 3 1 3 3]
Actual: [0 4 1 2 0 4 3 2 0 4 0 3 0 5 5 0 0 5 2 4 3 1 1 4 5 2 1 1 1 3 3 1 4 4 4 1 3
 1 1 1 0 5 3 4 5 2 5 5 0 4 4 5 2 5 3 3 3 2 5 1 3 3 5 5 3 1 5 5 3 2 5 3 0 3
 0 

In [91]:
# Show confusion matrix
print(confusion_matrix(y_test, predictions))

[[54  0  0  0  0  0]
 [ 0 47  6 13  4  4]
 [ 1 15 35 14  3 16]
 [ 0 24  3 17  9  9]
 [ 2 12  8  3 23 17]
 [ 3  7 11 13  6 28]]
