In [81]:
# Import packages
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from parse import preprocess

In [82]:
df = preprocess("rawfile_blood.csv")


####################################################################
Number of Rows of Dataframe:
1123
Number of Columns of Dataframe:
59

####################################################################
Threshold for number of NULLs in a column: 0.1095
Number of Columns before Parsing for Too Many NULLs in a column:
59
Number of Columns after Parsing for Too Many NULLs in a column:
51

Columns Removed:
B1_b5
B4_a1
B4_a3
B4_a4
B4_a6
B4_b1
B4_b3
B5_a1

####################################################################
Number of Rows before Parsing NULLs in data:
1123
Number of Rows after Parsing NULLs in data:
1007

####################################################################
Number of Columns after dropping A1_2, B1_b4, B2_c3, B4_b2 for inconsistent data types:
47


In [83]:
# Conduct "Low Variance Filter"

# Check for NaN values in features
print("##########################################")
print("% of Nan Values in each Feature")
print(df.isnull().sum()/len(df)*100)
print("##########################################")

# Show Variance
print("Variance between values in each feature")
print(df.var())

##########################################
% of Nan Values in each Feature
mtag         0.0
condition    0.0
A1_1         0.0
A2_1         0.0
A3_1         0.0
B1_a         0.0
B1_a1        0.0
B1_a2        0.0
B1_a3        0.0
B1_a4        0.0
B1_a5        0.0
B1_a6        0.0
B1_b         0.0
B1_b1        0.0
B1_b2        0.0
B1_b3        0.0
B1_c         0.0
B1_d         0.0
B2_a1        0.0
B2_a2        0.0
B2_a3        0.0
B2_a4        0.0
B2_a5        0.0
B2_b1        0.0
B2_b2        0.0
B2_b3        0.0
B2_c1        0.0
B2_c2        0.0
B2_c4        0.0
B2_c5        0.0
B2_c6        0.0
B2_c7        0.0
B2_d1        0.0
B2_d2        0.0
B2_d3        0.0
B2_d4        0.0
B2_d5        0.0
B2_d6        0.0
B2_d7        0.0
B2_d8        0.0
B2_d9        0.0
B3           0.0
B4_a2        0.0
B4_a5        0.0
B5_a2        0.0
B5_a3        0.0
B6           0.0
dtype: float64
##########################################
Variance between values in each feature
A1_1     38284.192365
A2_1  

In [84]:
# Keeping features only with variance >= 10%

# Assign variable to variance
var = df.var()

# Assign variable to features
features = df.columns
feature = []
removed_features = []

columns = list(var.index)

for i in range(0, len(var)):
    if var[i] >= 10:   # Set the threshold as 10%
        feature.append(columns[i])
    else:
        removed_features.append(columns[i])

# Display features that have variance equal to or exceeding 10%
print("##########################################")
print("Features having >= 10 Variance")
print(feature)
print("Total Features: ", end="")
print(len(feature))
# Display removed features
print("##########################################")
print("Features removed")
print(removed_features)
print("Total Features: ", end="")
print(len(removed_features))
print("##########################################")
print("Dataframe after Low Variance Filter:")

temp_df = df[feature]
temp_df['mtag'] = df['mtag'].values
temp_df['condition'] = df['condition'].values

df = temp_df

# Display Dataframe after Low Variance Filter
print(df)

##########################################
Features having >= 10 Variance
['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a3', 'B1_a5', 'B1_b', 'B1_c', 'B2_c2', 'B2_d1', 'B2_d3', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8', 'B2_d9', 'B3', 'B5_a2']
Total Features: 18
##########################################
Features removed
['B1_a1', 'B1_a2', 'B1_a4', 'B1_a6', 'B1_b1', 'B1_b2', 'B1_b3', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1', 'B2_b2', 'B2_b3', 'B2_c1', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7', 'B2_d2', 'B2_d4', 'B4_a2', 'B4_a5', 'B5_a3', 'B6']
Total Features: 27
##########################################
Dataframe after Low Variance Filter:
      A1_1  A2_1  A3_1  B1_a  B1_a3  B1_a5  B1_b   B1_c  B2_c2  B2_d1  B2_d3  \
0      196    24  46.5   121     95    324   5.6  255.0     63     73     31   
1      200    23  55.6   142     87    346  10.0  219.0    101     76     34   
2      441    20  76.8   105     90    330   8.4  225.0     63     72     29   
3      265    16  47.2   122     8

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['mtag'] = df['mtag'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['condition'] = df['condition'].values


In [85]:
#==================================================
# RUN SVM LINEAR CLASSIFICATION (WITH 6 FEATURES) //
#==================================================

# Specify features and labels
y = df['condition']
x = df.drop(['mtag', 'condition'], axis=1)

# Display features and labels
print(x, y)

# Display shape of features and labels
print("\nShape of Features:")
print(x.shape)
print("\nShape of Labels:")
print(y.shape)

# Data Dictionary:
# frail -> 0
# frail_mci -> 1
# mci -> 2
# prefrail_mci -> 3
# prefrail -> 4
# robust -> 5

# Conduct label mapping for conditions
label_mapping = {
    'frail' : 0,
    'frail_mci' : 1,
    'mci' : 2,
    'prefrail_mci' : 3,
    'prefrail' : 4,
    'robust' : 5
}

y = y.map(label_mapping)
y = np.array(y)

# Display label
print(y)

# Display shape of label
print(y.shape)

# Conduct train-test split on dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Display x_train, x_test, y_train, y_test
print("\nX Train:")
print(x_train)
print("\nX Test:")
print(x_test)
print("\nY Train:")
print(y_train)
print("\nY Test:")
print(y_test)

# Display shape of train and test sets
print("\nShape of X Train:")
print(x_train.shape)
print("\nShape of X Test:")
print(x_test.shape)
print("\nShape of Y Train:")
print(y_train.shape)
print("\nShape of Y Test:")
print(y_test.shape)

# Create linear SVM model
model = svm.SVC(kernel='linear')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", y_test)
print("Accuracy:", acc)

      A1_1  A2_1  A3_1  B1_a  B1_a3  B1_a5  B1_b   B1_c  B2_c2  B2_d1  B2_d3  \
0      196    24  46.5   121     95    324   5.6  255.0     63     73     31   
1      200    23  55.6   142     87    346  10.0  219.0    101     76     34   
2      441    20  76.8   105     90    330   8.4  225.0     63     72     29   
3      265    16  47.2   122     86    313   7.7  312.0     77     74     32   
4      425    14  31.3   124     85    329   6.6  295.0     64     75     30   
...    ...   ...   ...   ...    ...    ...   ...    ...    ...    ...    ...   
1002   220    19  67.5   138     91    325   6.3  291.0     45     78     36   
1003   334    18  51.0   139     91    330   5.3  235.0     65     70     31   
1004   418    17  61.0   122     90    324   4.3  191.0     63     71     30   
1005   393    18  43.1   136     94    316   5.5  298.0     52     72     32   
1006   371    24  55.9   127     90    320   5.5  301.0     59     73     30   

      B2_d5  B2_d6  B2_d7  B2_d8  B2_d9

In [86]:
#==================================================
# RUN SVM LINEAR CLASSIFICATION (WITH 2 FEATURES - FRAIL+MCI vs ROBUST) //
#==================================================

# There are 76 Frail+MCI

# Collect data that is Frail+MCI
df1 = df[df.condition == 'frail_mci']
# Collect data that is Robust
df2 = df[df.condition == 'robust']

# Random sample 76 from Robust
df2 = df2.sample(n=76)

# Append 76 Frail+MCI to 76 Robust
df1 = df1.append(df2, ignore_index=True)

# Display Dataframe
print(df1)

# Specify features and labels
y = df1['condition']
x = df1.drop(['mtag', 'condition'], axis=1)

# Display features and labels
print(x, y)

# Display shape of features and labels
print("\nShape of Features:")
print(x.shape)
print("\nShape of Labels:")
print(y.shape)

# Data Dictionary:
# frail_mci -> 0
# robust -> 1

# Conduct label mapping for conditions
label_mapping = {
    'frail_mci' : 0,
    'robust' : 1
}

y = y.map(label_mapping)

# Display label
print(y)

# Display shape of label
print(y.shape)

# Conduct train-test split on dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Display x_train, x_test, y_train, y_test
print("\nX Train:")
print(x_train)
print("\nX Test:")
print(x_test)
print("\nY Train:")
print(y_train)
print("\nY Test:")
print(y_test)

# Display shape of train and test sets
print("\nShape of X Train:")
print(x_train.shape)
print("\nShape of X Test:")
print(x_test.shape)
print("\nShape of Y Train:")
print(y_train.shape)
print("\nShape of Y Test:")
print(y_test.shape)

# Create linear SVM model
model = svm.SVC(kernel='linear')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", np.array(y_test))
print("Accuracy:", acc)

     A1_1  A2_1  A3_1  B1_a  B1_a3  B1_a5  B1_b   B1_c  B2_c2  B2_d1  B2_d3  \
0     241    20  33.5   150     87    328   7.0  271.0     40     70     28   
1     444    16  87.0   134     85    329   7.4  192.0     54     88     41   
2    1476    16  57.0   119     94    333   5.2  284.0     55     74     30   
3     339    18  63.8   135     86    321   7.8  295.0     72     75     30   
4     287    20  95.5   146     85    332   9.7  287.0     86     77     33   
..    ...   ...   ...   ...    ...    ...   ...    ...    ...    ...    ...   
147   378    18  58.0   134     93    332   5.1  344.0     55     79     34   
148   700    11  35.2   141     91    327   8.5  201.0     82     76     32   
149   613    12  56.5   222     93    334   8.2  277.0     40     76     33   
150   289    10  36.5   146     95    322   7.5  422.0     42     76     32   
151   228    18  82.5   147     92    322   8.1  336.0     78     83     32   

     B2_d5  B2_d6  B2_d7  B2_d8  B2_d9    B3  B5_a2

Predictions: [0 1 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 0 1 1 1 0 0 1 0 1 1 0 1 0 1]
Actual: [1 0 1 0 1 1 0 0 0 1 1 1 1 1 1 0 0 0 0 1 1 0 1 0 0 1 1 1 1 1 1]
Accuracy: 0.6451612903225806


In [87]:
#==================================================
# RUN SVM LINEAR CLASSIFICATION (WITH 2 FEATURES - ROBUST vs NON-ROBUST) //
#==================================================

# There are 668 Non-Robust and 339 Robust

# Collect data that is Non-Robust
df1 = df[df.condition != 'robust']
# Collect data that is Robust
df2 = df[df.condition == 'robust']

# Random sample 339 from Non-Robust
df1 = df1.sample(n=339)

# Append 339 Robust to 339 Non-Robust
df1 = df1.append(df2, ignore_index=True)

# Display Dataframe
print(df1)

# Specify features and labels
y = df['condition']
x = df.drop(['mtag', 'condition'], axis=1)

# Display features and labels
print(x, y)

# Display shape of features and labels
print("\nShape of Features:")
print(x.shape)
print("\nShape of Labels:")
print(y.shape)

# Data Dictionary:
# frail -> 0
# frail_mci -> 0
# mci -> 0
# prefrail_mci -> 0
# prefrail -> 0
# robust -> 1

# Conduct label mapping for conditions
label_mapping = {
    'frail' : 0,
    'frail_mci' : 0,
    'mci' : 0,
    'prefrail_mci' : 0,
    'prefrail' : 0,
    'robust' : 1
}

y = y.map(label_mapping)

# Display label
print(y)

# Display shape of label
print(y.shape)

# Conduct train-test split on dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Display x_train, x_test, y_train, y_test
print("\nX Train:")
print(x_train)
print("\nX Test:")
print(x_test)
print("\nY Train:")
print(y_train)
print("\nY Test:")
print(y_test)

# Display shape of train and test sets
print("\nShape of X Train:")
print(x_train.shape)
print("\nShape of X Test:")
print(x_test.shape)
print("\nShape of Y Train:")
print(y_train.shape)
print("\nShape of Y Test:")
print(y_test.shape)

# Create linear SVM model
model = svm.SVC(kernel='linear')

# Train the model
model.fit(x_train, y_train)

# Make predictions
predictions = model.predict(x_test)

# Measure accuracy
acc = accuracy_score(y_test, predictions)

# Print predictions, actual, and accuracy score
print("Predictions:", predictions)
print("Actual:", y_test)
print("Accuracy:", acc)

     A1_1  A2_1  A3_1  B1_a  B1_a3  B1_a5  B1_b   B1_c  B2_c2  B2_d1  B2_d3  \
0     562    12  40.7   128     87    327   7.1  230.0     45     86     41   
1     435    13  39.4   126     90    323   7.5  357.0     58     75     31   
2     346    19  75.5   125     87    349   7.1  231.0     75     73     29   
3     417    18  25.0   128     87    335   3.7  287.0     44     72     28   
4     309    26  20.5   128     83    325   6.6  351.0     51     79     35   
..    ...   ...   ...   ...    ...    ...   ...    ...    ...    ...    ...   
673   220    19  67.5   138     91    325   6.3  291.0     45     78     36   
674   334    18  51.0   139     91    330   5.3  235.0     65     70     31   
675   418    17  61.0   122     90    324   4.3  191.0     63     71     30   
676   393    18  43.1   136     94    316   5.5  298.0     52     72     32   
677   371    24  55.9   127     90    320   5.5  301.0     59     73     30   

     B2_d5  B2_d6  B2_d7  B2_d8  B2_d9    B3  B5_a2