In [6]:
# Import packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from parse import preprocess

In [7]:
df = preprocess("rawfile_blood.csv")


####################################################################
Number of Rows of Dataframe:
1123
Number of Columns of Dataframe:
59

####################################################################
Threshold for number of NULLs in a column: 0.1095
Number of Columns before Parsing for Too Many NULLs in a column:
59
Number of Columns after Parsing for Too Many NULLs in a column:
51

Columns Removed:
B1_b5
B4_a1
B4_a3
B4_a4
B4_a6
B4_b1
B4_b3
B5_a1

####################################################################
Number of Rows before Parsing NULLs in data:
1123
Number of Rows after Parsing NULLs in data:
1007

####################################################################
Number of Columns after dropping A1_2, B1_b4, B2_c3, B4_b2 for inconsistent data types:
47


In [8]:
# Conduct "Low Variance Filter"

# Check for NaN values in features
print("##########################################")
print("% of Nan Values in each Feature")
print(df.isnull().sum()/len(df)*100)
print("##########################################")

# Show Variance
print("Variance between values in each feature")
print(df.var())

##########################################
% of Nan Values in each Feature
mtag         0.0
condition    0.0
A1_1         0.0
A2_1         0.0
A3_1         0.0
B1_a         0.0
B1_a1        0.0
B1_a2        0.0
B1_a3        0.0
B1_a4        0.0
B1_a5        0.0
B1_a6        0.0
B1_b         0.0
B1_b1        0.0
B1_b2        0.0
B1_b3        0.0
B1_c         0.0
B1_d         0.0
B2_a1        0.0
B2_a2        0.0
B2_a3        0.0
B2_a4        0.0
B2_a5        0.0
B2_b1        0.0
B2_b2        0.0
B2_b3        0.0
B2_c1        0.0
B2_c2        0.0
B2_c4        0.0
B2_c5        0.0
B2_c6        0.0
B2_c7        0.0
B2_d1        0.0
B2_d2        0.0
B2_d3        0.0
B2_d4        0.0
B2_d5        0.0
B2_d6        0.0
B2_d7        0.0
B2_d8        0.0
B2_d9        0.0
B3           0.0
B4_a2        0.0
B4_a5        0.0
B5_a2        0.0
B5_a3        0.0
B6           0.0
dtype: float64
##########################################
Variance between values in each feature
A1_1     38284.192365
A2_1  

In [9]:
# Keeping features only with variance >= 10%

# Assign variable to variance
var = df.var()

# Assign variable to features
features = df.columns
feature = []
removed_features = []

columns = list(var.index)

for i in range(0, len(var)):
    if var[i] >= 10:   # Set the threshold as 10%
        feature.append(columns[i])
    else:
        removed_features.append(columns[i])

# Display features that have variance equal to or exceeding 10%
print("##########################################")
print("Features having >= 10 Variance")
print(feature)
print("Total Features: ", end="")
print(len(feature))
# Display removed features
print("##########################################")
print("Features removed")
print(removed_features)
print("Total Features: ", end="")
print(len(removed_features))
print("##########################################")
print("Dataframe after Low Variance Filter:")

temp_df = df[feature]
temp_df['mtag'] = df['mtag'].values
temp_df['condition'] = df['condition'].values

df = temp_df

# Display Dataframe after Low Variance Filter
print(df)

##########################################
Features having >= 10 Variance
['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a3', 'B1_a5', 'B1_b', 'B1_c', 'B2_c2', 'B2_d1', 'B2_d3', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8', 'B2_d9', 'B3', 'B5_a2']
Total Features: 18
##########################################
Features removed
['B1_a1', 'B1_a2', 'B1_a4', 'B1_a6', 'B1_b1', 'B1_b2', 'B1_b3', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1', 'B2_b2', 'B2_b3', 'B2_c1', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7', 'B2_d2', 'B2_d4', 'B4_a2', 'B4_a5', 'B5_a3', 'B6']
Total Features: 27
##########################################
Dataframe after Low Variance Filter:
      A1_1  A2_1  A3_1  B1_a  B1_a3  B1_a5  B1_b   B1_c  B2_c2  B2_d1  B2_d3  \
0      196    24  46.5   121     95    324   5.6  255.0     63     73     31   
1      200    23  55.6   142     87    346  10.0  219.0    101     76     34   
2      441    20  76.8   105     90    330   8.4  225.0     63     72     29   
3      265    16  47.2   122     8

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['mtag'] = df['mtag'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['condition'] = df['condition'].values


In [10]:
# Conduct High Correlation Filter

# Conduct mapping for Feature Names
featureName_mapping = {
    "A1_1" : "Vitamin B12 (pmol/L)",
    "A1_2" : "Serum Folate (nmol/L)",
    "A2_1" : "Serum Homocysteine (µmol/L)",
    "A3_1" : "25-hydroxy Vitamin D (nmol/L)",
    "B1_a" : "Haemoglobin (g/L)",
    "B1_a1" : "RBC (/L)",
    "B1_a2" : "PCV (L/L)",
    "B1_a3" : "MCV (fL)",
    "B1_a4" : "MCH (pg)",
    "B1_a5" : "MCHC (g/L)",
    "B1_a6" : "RDW (%)",
    "B1_b" : "White Cell Count (/L)",
    "B1_b1" : "Neutrophils (/L)",
    "B1_b2" : "Lymphocytes (/L)",
    "B1_b3" : "Monocytes (/L)",
    "B1_b4" : "Eosinophils (/L)",
    "B1_b5" : "Basophils (/L)",
    "B1_c" : "Platelets (/L)",
    "B1_d" : "Glucose (mmol/L)",
    "B2_a1" : "Total Cholesterol (mmol/L)",
    "B2_a2" : "Triglyceride (mmol/L)",
    "B2_a3" : "HDL Cholesterol (mmol/L)",
    "B2_a4" : "LDL Cholesterol (mmol/L)",
    "B2_a5" : "Total Cholesterol/HDL Ratio",
    "B2_b1" : "Sodium (mmol/L)",
    "B2_b2" : "Potassium (mmol/L)",
    "B2_b3" : "Chloride (mmol/L)",
    "B2_c1" : 'Urea (mmol/L)',
    "B2_c2" : "Creatinine (umol/L)",
    "B2_c3" : "eGFR (mL/min/1.73m2)",
    "B2_c4" : "Uric Acid (mmol/L)",
    "B2_c5" : "Calcium (mmol/L)",
    "B2_c6" : "Corrected Calcium (mmol/L)",
    "B2_c7" : "Phosphate (mmol/L)",
    "B2_d1" : "Total Protein (g/L)",
    "B2_d2" : "Albumin (g/L)",
    "B2_d3" : "Globulin (g/L)",
    "B2_d4" : "Albumin/Globulin ratio",
    "B2_d5" : "Alkaline Phosphatase (U/L)",
    "B2_d6" : "Total Bilirubin (µmol/L)",
    "B2_d7" : "GGT",
    "B2_d8" : "AST",
    "B2_d9" : "ALT",
    "B3" : "C-Reactive Protein",
    "B4_a1" : "Protein",
    "B4_a2" : "pH",
    "B4_a3" : "Glucose",
    "B4_a4" : "Ketones",
    "B4_a5" : "S.G.",
    "B4_a6" : "Blood",
    "B4_b1" : "Leucocytes (/L)",
    "B4_b2" : "Erythrocytes (/L)",
    "B4_b3" : "Epithelial Cells",
    "B5_a1" : "Free Thyroxine (FT4) (pmol/L)",
    "B5_a2" : "Thyroid Stimulating Hormone (mIU/L)",
    "B5_a3" : "Free Tri-iodothyronine (FT3) (pmol/L)",
    "B6" : "HbA1c"
}

# Remove 'mtag' and label 'condition' to keep only features
df_temp = df.drop(['mtag', 'condition'], 1)

# Show correlation between features
corr = df_temp.corr()

print("#################################################################")
print("Feature Correlation Table:\n")
print(corr)

# Export correlation data to CSV
corr.to_csv("featureCorrelation.csv")

highCorrValue = []
highCorrelationPairs = []
new_tuple = []

for i in range(0, len(corr)):
    for j in range(0, len(corr.columns)):
        if (corr.iat[i,j] != 1.0) and (corr.iat[i,j] >= 0.5):
            highCorrValue.append(corr.iat[i,j])
            highCorrelationPairs.append((corr.columns[i],corr.index[j]))

# Show High Correlation Pairs with respective Correlation value
highCorrelationPairs = list(set([tuple(sorted(i)) for i in highCorrelationPairs]))
highCorrelationPairs = sorted(highCorrelationPairs)
highCorrValue = list(dict.fromkeys(highCorrValue))

print("\n#################################################################")
print("High Correlation Pairs with Correlation Values:")
for i in range(0, len(highCorrValue)):
    print(highCorrelationPairs[i][0], "and", highCorrelationPairs[i][1], ":", highCorrValue[i])
print("\nHigh Correlation Pairs with Correlation Values:")

df_dummy = pd.DataFrame(highCorrelationPairs)

df_dummy[0] = df_dummy[0].map(featureName_mapping)
df_dummy[1] = df_dummy[1].map(featureName_mapping)

for i in range(0, len(df_dummy)):
    new_tuple.append((df_dummy.iat[i,0], df_dummy.iat[i,1]))

names = new_tuple

for i in range(0, len(highCorrValue)):
    print(names[i][0], "and", names[i][1], ":", highCorrValue[i])

#################################################################
Feature Correlation Table:

           A1_1      A2_1      A3_1      B1_a     B1_a3     B1_a5      B1_b  \
A1_1   1.000000 -0.284103  0.062753 -0.059712  0.036896 -0.026018  0.026430   
A2_1  -0.284103  1.000000 -0.033859 -0.012390  0.017413  0.053191  0.104268   
A3_1   0.062753 -0.033859  1.000000  0.041232  0.095420  0.093920 -0.010307   
B1_a  -0.059712 -0.012390  0.041232  1.000000  0.126687  0.207737 -0.039956   
B1_a3  0.036896  0.017413  0.095420  0.126687  1.000000  0.107533  0.023018   
B1_a5 -0.026018  0.053191  0.093920  0.207737  0.107533  1.000000  0.023185   
B1_b   0.026430  0.104268 -0.010307 -0.039956  0.023018  0.023185  1.000000   
B1_c   0.030533  0.015486 -0.115349 -0.109395 -0.067699 -0.169209  0.138688   
B2_c2 -0.024759  0.446663  0.067132 -0.051081  0.041217  0.046087  0.073744   
B2_d1  0.010351  0.103980 -0.076349  0.053077 -0.041204  0.066071  0.011854   
B2_d3  0.003759  0.122693 -0.112755 -

NameError: name 'tempList' is not defined