In [1]:
# Import packages
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from parse import preprocess

In [57]:
df = preprocess("rawfile_blood.csv")


####################################################################
Number of Rows of Dataframe:
1123
Number of Columns of Dataframe:
59

####################################################################
Threshold for number of NULLs in a column: 0.1095
Number of Columns before Parsing for Too Many NULLs in a column:
59
Number of Columns after Parsing for Too Many NULLs in a column:
51

Columns Removed:
B1_b5
B4_a1
B4_a3
B4_a4
B4_a6
B4_b1
B4_b3
B5_a1

####################################################################
Number of Rows before Parsing NULLs in data:
1123
Number of Rows after Parsing NULLs in data:
1007

####################################################################
Number of Columns after dropping A1_2, B1_b4, B2_c3, B4_b2 for inconsistent data types:
47


In [58]:
# Conduct High Correlation Filter

# Conduct mapping for Feature Names
featureName_mapping = {
    "A1_1" : "Vitamin B12 (pmol/L)",
    "A1_2" : "Serum Folate (nmol/L)",
    "A2_1" : "Serum Homocysteine (µmol/L)",
    "A3_1" : "25-hydroxy Vitamin D (nmol/L)",
    "B1_a" : "Haemoglobin (g/L)",
    "B1_a1" : "RBC (/L)",
    "B1_a2" : "PCV (L/L)",
    "B1_a3" : "MCV (fL)",
    "B1_a4" : "MCH (pg)",
    "B1_a5" : "MCHC (g/L)",
    "B1_a6" : "RDW (%)",
    "B1_b" : "White Cell Count (/L)",
    "B1_b1" : "Neutrophils (/L)",
    "B1_b2" : "Lymphocytes (/L)",
    "B1_b3" : "Monocytes (/L)",
    "B1_b4" : "Eosinophils (/L)",
    "B1_b5" : "Basophils (/L)",
    "B1_c" : "Platelets (/L)",
    "B1_d" : "Glucose (mmol/L)",
    "B2_a1" : "Total Cholesterol (mmol/L)",
    "B2_a2" : "Triglyceride (mmol/L)",
    "B2_a3" : "HDL Cholesterol (mmol/L)",
    "B2_a4" : "LDL Cholesterol (mmol/L)",
    "B2_a5" : "Total Cholesterol/HDL Ratio",
    "B2_b1" : "Sodium (mmol/L)",
    "B2_b2" : "Potassium (mmol/L)",
    "B2_b3" : "Chloride (mmol/L)",
    "B2_c1" : 'Urea (mmol/L)',
    "B2_c2" : "Creatinine (umol/L)",
    "B2_c3" : "eGFR (mL/min/1.73m2)",
    "B2_c4" : "Uric Acid (mmol/L)",
    "B2_c5" : "Calcium (mmol/L)",
    "B2_c6" : "Corrected Calcium (mmol/L)",
    "B2_c7" : "Phosphate (mmol/L)",
    "B2_d1" : "Total Protein (g/L)",
    "B2_d2" : "Albumin (g/L)",
    "B2_d3" : "Globulin (g/L)",
    "B2_d4" : "Albumin/Globulin ratio",
    "B2_d5" : "Alkaline Phosphatase (U/L)",
    "B2_d6" : "Total Bilirubin (µmol/L)",
    "B2_d7" : "GGT",
    "B2_d8" : "AST",
    "B2_d9" : "ALT",
    "B3" : "C-Reactive Protein",
    "B4_a1" : "Protein",
    "B4_a2" : "pH",
    "B4_a3" : "Glucose",
    "B4_a4" : "Ketones",
    "B4_a5" : "S.G.",
    "B4_a6" : "Blood",
    "B4_b1" : "Leucocytes (/L)",
    "B4_b2" : "Erythrocytes (/L)",
    "B4_b3" : "Epithelial Cells",
    "B5_a1" : "Free Thyroxine (FT4) (pmol/L)",
    "B5_a2" : "Thyroid Stimulating Hormone (mIU/L)",
    "B5_a3" : "Free Tri-iodothyronine (FT3) (pmol/L)",
    "B6" : "HbA1c"
}

# Remove 'mtag' and label 'condition' to keep only features
df_temp = df.drop(['mtag', 'condition'], 1)

# Show correlation between features
corr = df_temp.corr()

print("#################################################################")
print("Feature Correlation Table:\n")
print(corr)

# Export correlation data to CSV
corr.to_csv("featureCorrelation.csv")

highCorrValue = []
highCorrelationPairs = []
new_tuple = []

for i in range(0, len(corr)):
    for j in range(0, len(corr.columns)):
        if (corr.iat[i,j] != 1.0) and (corr.iat[i,j] >= 0.5):
            highCorrValue.append(corr.iat[i,j])
            highCorrelationPairs.append((corr.columns[i],corr.index[j]))

# Show High Correlation Pairs with respective Correlation value
highCorrelationPairs = list(set([tuple(sorted(i)) for i in highCorrelationPairs]))
highCorrelationPairs = sorted(highCorrelationPairs)
highCorrValue = list(dict.fromkeys(highCorrValue))

print("\n#################################################################")
print("High Correlation Pairs with Correlation Values:")
for i in range(0, len(highCorrValue)):
    tempList.append(highCorrelationPairs[i][0])
    tempList.append(highCorrelationPairs[i][1])
    print(highCorrelationPairs[i][0], "and", highCorrelationPairs[i][1], ":", highCorrValue[i])
print("\nHigh Correlation Pairs with Correlation Values:")

df_dummy = pd.DataFrame(highCorrelationPairs)

df_dummy[0] = df_dummy[0].map(featureName_mapping)
df_dummy[1] = df_dummy[1].map(featureName_mapping)

for i in range(0, len(df_dummy)):
    new_tuple.append((df_dummy.iat[i,0], df_dummy.iat[i,1]))

names = new_tuple

for i in range(0, len(highCorrValue)):
    print(names[i][0], "and", names[i][1], ":", highCorrValue[i])

#################################################################
Feature Correlation Table:

           A1_1      A2_1      A3_1      B1_a     B1_a1     B1_a2     B1_a3  \
A1_1   1.000000 -0.284103  0.062753 -0.059712 -0.102591 -0.091398  0.036896   
A2_1  -0.284103  1.000000 -0.033859 -0.012390  0.022796  0.019866  0.017413   
A3_1   0.062753 -0.033859  1.000000  0.041232 -0.012811  0.069288  0.095420   
B1_a  -0.059712 -0.012390  0.041232  1.000000  0.316936  0.487857  0.126687   
B1_a1 -0.102591  0.022796 -0.012811  0.316936  1.000000  0.621606 -0.505574   
B1_a2 -0.091398  0.019866  0.069288  0.487857  0.621606  1.000000  0.259784   
B1_a3  0.036896  0.017413  0.095420  0.126687 -0.505574  0.259784  1.000000   
B1_a4  0.018811  0.041787  0.120954  0.189485 -0.444586  0.274886  0.892863   
B1_a5 -0.026018  0.053191  0.093920  0.207737  0.024970  0.139553  0.107533   
B1_a6  0.038120  0.077722 -0.065439 -0.154458  0.195576 -0.235808 -0.525790   
B1_b   0.026430  0.104268 -0.010307 -

In [59]:
# Drop one of the feature pairs having high correlation
dropping_list = ['B1_a2', 'B1_a4', 'B1_b', 'B1_d', 'B2_a1', 'B2_a5', 'B2_b3', 'B2_c1', 'B2_c5', 'B2_d1', 'B2_d8']

df = df.drop(dropping_list, axis=1)

# Display final Dataframe
print(df)

         mtag condition  A1_1  A2_1  A3_1  B1_a  B1_a1  B1_a3  B1_a5  B1_a6  \
0     ME02646     frail   196    24  46.5   121   3.93     95    324   13.3   
1     ME03109     frail   200    23  55.6   142   4.82     87    346   12.8   
2     ME06997     frail   441    20  76.8   105   4.54     90    330   14.0   
3     ME07149     frail   265    16  47.2   122   4.53     86    313   14.9   
4     ME07700     frail   425    14  31.3   124   4.44     85    329   12.6   
...       ...       ...   ...   ...   ...   ...    ...    ...    ...    ...   
1002  MV00454    robust   220    19  67.5   138   4.66     91    325   14.1   
1003  MV00456    robust   334    18  51.0   139   4.63     91    330   15.6   
1004  MV00460    robust   418    17  61.0   122   4.18     90    324   13.5   
1005  MV00502    robust   393    18  43.1   136   4.57     94    316   12.5   
1006  MV00510    robust   371    24  55.9   127   4.41     90    320   13.8   

      ...  B2_d5  B2_d6  B2_d7  B2_d9   B3  B4_a2  

In [60]:
#==================================================
# RUN SVM LINEAR CLASSIFICATION (WITH 6 FEATURES) //
#==================================================

# Specify features and labels
y = df['condition']
x = df.drop(['mtag', 'condition'], axis=1)

# Display features and labels
print(x, y)

# Display shape of features and labels
print("\nShape of Features:")
print(x.shape)
print("\nShape of Labels:")
print(y.shape)

# Data Dictionary:
# frail -> 0
# frail_mci -> 1
# mci -> 2
# prefrail_mci -> 3
# prefrail -> 4
# robust -> 5

# Conduct label mapping for conditions
label_mapping = {
    'frail' : 0,
    'frail_mci' : 1,
    'mci' : 2,
    'prefrail_mci' : 3,
    'prefrail' : 4,
    'robust' : 5
}

y = y.map(label_mapping)
y = np.array(y)

# Display label
print(y)

# Display shape of label
print(y.shape)

# Conduct train-test split on dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Display x_train, x_test, y_train, y_test
print("\nX Train:")
print(x_train)
print("\nX Test:")
print(x_test)
print("\nY Train:")
print(y_train)
print("\nY Test:")
print(y_test)

# Display shape of train and test sets
print("\nShape of X Train:")
print(x_train.shape)
print("\nShape of X Test:")
print(x_test.shape)
print("\nShape of Y Train:")
print(y_train.shape)
print("\nShape of Y Test:")
print(y_test.shape)

# Create linear SVM model
model = svm.SVC(kernel='linear')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", y_test)
print("Accuracy:", acc)

      A1_1  A2_1  A3_1  B1_a  B1_a1  B1_a3  B1_a5  B1_a6  B1_b1  B1_b2  ...  \
0      196    24  46.5   121   3.93     95    324   13.3    2.9    2.0  ...   
1      200    23  55.6   142   4.82     87    346   12.8    5.1    2.8  ...   
2      441    20  76.8   105   4.54     90    330   14.0    3.2    1.2  ...   
3      265    16  47.2   122   4.53     86    313   14.9    4.9    2.1  ...   
4      425    14  31.3   124   4.44     85    329   12.6    2.6    2.2  ...   
...    ...   ...   ...   ...    ...    ...    ...    ...    ...    ...  ...   
1002   220    19  67.5   138   4.66     91    325   14.1    3.2    2.3  ...   
1003   334    18  51.0   139   4.63     91    330   15.6    2.5    2.3  ...   
1004   418    17  61.0   122   4.18     90    324   13.5    2.1    1.7  ...   
1005   393    18  43.1   136   4.57     94    316   12.5    2.9    1.9  ...   
1006   371    24  55.9   127   4.41     90    320   13.8    3.1    2.0  ...   

      B2_d5  B2_d6  B2_d7  B2_d9   B3  B4_a2  B4_a5

In [61]:
#==================================================
# RUN SVM LINEAR CLASSIFICATION (WITH 2 FEATURES - FRAIL+MCI vs ROBUST) //
#==================================================

# There are 76 Frail+MCI

# Collect data that is Frail+MCI
df1 = df[df.condition == 'frail_mci']
# Collect data that is Robust
df2 = df[df.condition == 'robust']

# Random sample 76 from Robust
df2 = df2.sample(n=76)

# Append 76 Frail+MCI to 76 Robust
df1 = df1.append(df2, ignore_index=True)

# Display Dataframe
print(df1)

# Specify features and labels
y = df1['condition']
x = df1.drop(['mtag', 'condition'], axis=1)

# Display features and labels
print(x, y)

# Display shape of features and labels
print("\nShape of Features:")
print(x.shape)
print("\nShape of Labels:")
print(y.shape)

# Data Dictionary:
# frail_mci -> 0
# robust -> 1

# Conduct label mapping for conditions
label_mapping = {
    'frail_mci' : 0,
    'robust' : 1
}

y = y.map(label_mapping)

# Display label
print(y)

# Display shape of label
print(y.shape)

# Conduct train-test split on dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Display x_train, x_test, y_train, y_test
print("\nX Train:")
print(x_train)
print("\nX Test:")
print(x_test)
print("\nY Train:")
print(y_train)
print("\nY Test:")
print(y_test)

# Display shape of train and test sets
print("\nShape of X Train:")
print(x_train.shape)
print("\nShape of X Test:")
print(x_test.shape)
print("\nShape of Y Train:")
print(y_train.shape)
print("\nShape of Y Test:")
print(y_test.shape)

# Create linear SVM model
model = svm.SVC(kernel='linear')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", np.array(y_test))
print("Accuracy:", acc)

        mtag  condition  A1_1  A2_1  A3_1  B1_a  B1_a1  B1_a3  B1_a5  B1_a6  \
0    ME01378  frail_mci   241    20  33.5   150   5.25     87    328   12.8   
1    ME02832  frail_mci   444    16  87.0   134   4.65     85    329   15.2   
2    ME02909  frail_mci  1476    16  57.0   119   3.80     94    333   12.8   
3    ME02998  frail_mci   339    18  63.8   135   4.89     86    321   12.4   
4    ME03061  frail_mci   287    20  95.5   146   5.18     85    332   15.8   
..       ...        ...   ...   ...   ...   ...    ...    ...    ...    ...   
147  ME08067     robust   413    11  59.8   145   4.26     95    328   12.7   
148  MV00152     robust   383    19  46.0   155   5.07     89    344   13.5   
149  ME01369     robust   350    17  54.8   135   4.83     88    318   13.2   
150  ME04517     robust   477    16  71.2   152   5.18     90    326   14.1   
151  ME06446     robust   208    20  81.2   142   4.75     92    336   14.0   

     ...  B2_d5  B2_d6  B2_d7  B2_d9    B3  B4_a2  

Predictions: [1 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 1 0 0 0 0 1 1 0 1 1 0 1]
Actual: [1 0 0 0 1 1 0 1 1 0 1 1 1 1 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0]
Accuracy: 0.6774193548387096


In [62]:
#==================================================
# RUN SVM LINEAR CLASSIFICATION (WITH 2 FEATURES - ROBUST vs NON-ROBUST) //
#==================================================

# There are 668 Non-Robust and 339 Robust

# Collect data that is Non-Robust
df1 = df[df.condition != 'robust']
# Collect data that is Robust
df2 = df[df.condition == 'robust']

# Random sample 339 from Non-Robust
df1 = df1.sample(n=339)

# Append 339 Robust to 339 Non-Robust
df1 = df1.append(df2, ignore_index=True)

# Display Dataframe
print(df1)

# Specify features and labels
y = df['condition']
x = df.drop(['mtag', 'condition'], axis=1)

# Display features and labels
print(x, y)

# Display shape of features and labels
print("\nShape of Features:")
print(x.shape)
print("\nShape of Labels:")
print(y.shape)

# Data Dictionary:
# frail -> 0
# frail_mci -> 0
# mci -> 0
# prefrail_mci -> 0
# prefrail -> 0
# robust -> 1

# Conduct label mapping for conditions
label_mapping = {
    'frail' : 0,
    'frail_mci' : 0,
    'mci' : 0,
    'prefrail_mci' : 0,
    'prefrail' : 0,
    'robust' : 1
}

y = y.map(label_mapping)

# Display label
print(y)

# Display shape of label
print(y.shape)

# Conduct train-test split on dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Display x_train, x_test, y_train, y_test
print("\nX Train:")
print(x_train)
print("\nX Test:")
print(x_test)
print("\nY Train:")
print(y_train)
print("\nY Test:")
print(y_test)

# Display shape of train and test sets
print("\nShape of X Train:")
print(x_train.shape)
print("\nShape of X Test:")
print(x_test.shape)
print("\nShape of Y Train:")
print(y_train.shape)
print("\nShape of Y Test:")
print(y_test.shape)

# Create linear SVM model
model = svm.SVC(kernel='linear')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", y_test)
print("Accuracy:", acc)

        mtag     condition  A1_1  A2_1  A3_1  B1_a  B1_a1  B1_a3  B1_a5  \
0    MV00357      prefrail   683    11  59.1   115   5.44     67    316   
1    ME05815      prefrail   254    14  33.0   120   5.41     91    333   
2    ME05188      prefrail   325    15  69.1   152   4.36     93    316   
3    MV00144  prefrail_mci   220    15  63.7   124   4.17     90    331   
4    ME09133      prefrail   409    18  47.4   127   4.65     99    317   
..       ...           ...   ...   ...   ...   ...    ...    ...    ...   
673  MV00454        robust   220    19  67.5   138   4.66     91    325   
674  MV00456        robust   334    18  51.0   139   4.63     91    330   
675  MV00460        robust   418    17  61.0   122   4.18     90    324   
676  MV00502        robust   393    18  43.1   136   4.57     94    316   
677  MV00510        robust   371    24  55.9   127   4.41     90    320   

     B1_a6  ...  B2_d5  B2_d6  B2_d7  B2_d9   B3  B4_a2  B4_a5  B5_a2  B5_a3  \
0     14.8  ...    

Predictions: [0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0
 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 1 0
 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
Actual: 9      0
141    0
194    0
588    0
883    1
      ..
842    1
485    0
581    0
791    1
273    0
Name: condition, Length: 202, dtype: int64
Accuracy: 0.6237623762376238
