# Preliminaries

In [None]:
!pip install numpy pandas sklearn

In [2]:
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

# Load Data

In [3]:
# Load the data from microbiome file
df = pd.read_csv("./microbiome.csv")
df.drop(columns=["Unnamed: 0"], inplace=True)

# View the top 5 rows
df.head()

Unnamed: 0,msp_0001,msp_0002,msp_0003,msp_0004,msp_0005,msp_0006,msp_0007,msp_0008,msp_0009,msp_0010,...,bmi,country_AUT,country_CAN,country_CHN,country_FRA,country_GER,country_IND,country_ITA,country_JPN,country_USA
0,0.0,0.0,0.0,0.0,3.429033e-06,0.0,3.932075e-06,0.0,4.887927e-07,9.402294e-08,...,22.506925,0,0,0,0,0,0,0,1,0
1,0.0,0.0,1.307809e-07,0.0,1.124743e-08,0.0,2.231319e-07,0.0,5.214318e-08,5.770329e-07,...,26.880952,0,0,0,0,0,0,0,1,0
2,0.0,0.0,5.272835e-09,0.0,9.665916e-07,0.0,1.375124e-06,0.0,8.561444e-08,1.488008e-06,...,26.5625,0,0,0,0,0,0,0,1,0
3,0.0,0.0,0.0,0.0,3.057769e-05,0.0,1.212841e-06,0.0,2.030427e-06,0.0,...,25.0,0,0,0,0,0,0,0,1,0
4,0.0,0.0,0.0,0.0,1.769843e-07,0.0,1.307458e-08,0.0,2.387858e-08,0.0,...,20.173253,0,0,0,0,0,0,0,1,0


In [4]:
# Add a new column with the label name from microbiome_label file, this is what we are going to predict
labels = pd.read_csv("./microbiome_labels.csv")
df["label"] = labels["health_status"]

# View the top 5 rows
df.head()

Unnamed: 0,msp_0001,msp_0002,msp_0003,msp_0004,msp_0005,msp_0006,msp_0007,msp_0008,msp_0009,msp_0010,...,country_AUT,country_CAN,country_CHN,country_FRA,country_GER,country_IND,country_ITA,country_JPN,country_USA,label
0,0.0,0.0,0.0,0.0,3.429033e-06,0.0,3.932075e-06,0.0,4.887927e-07,9.402294e-08,...,0,0,0,0,0,0,0,1,0,1
1,0.0,0.0,1.307809e-07,0.0,1.124743e-08,0.0,2.231319e-07,0.0,5.214318e-08,5.770329e-07,...,0,0,0,0,0,0,0,1,0,1
2,0.0,0.0,5.272835e-09,0.0,9.665916e-07,0.0,1.375124e-06,0.0,8.561444e-08,1.488008e-06,...,0,0,0,0,0,0,0,1,0,0
3,0.0,0.0,0.0,0.0,3.057769e-05,0.0,1.212841e-06,0.0,2.030427e-06,0.0,...,0,0,0,0,0,0,0,1,0,0
4,0.0,0.0,0.0,0.0,1.769843e-07,0.0,1.307458e-08,0.0,2.387858e-08,0.0,...,0,0,0,0,0,0,0,1,0,0


# Create Training And Test Data

In [5]:
# Split your data into train and test dataset (80% train and 20% test)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'label'], df['label'], test_size=0.2, random_state=42)

# Train The Random Forest Classifier


In [6]:
# Create a random forest Classifier.
clf = RandomForestClassifier()

# Train the Classifier with the train dataset
clf.fit(X_train, y_train)

# Hyper Parameter Optimization

In [7]:
# Use grid search to find the optimum number of "n_estimators"
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators':[1, 10]}
model = GridSearchCV(clf, parameters)
model.fit(X_train, y_train)

# Apply Classifier To Test Data

In [8]:
# Apply the best Classifier you found to the test data (which, remember, it has never seen before)
y_pred = model.predict(X_test)

In [9]:
# View the predicted probabilities instead of labels
model.predict_proba(X_test)

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

# Evaluate Classifier


In [10]:
# Calculate Accuracy Area under Curve and f1 score
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

acc = accuracy_score(y_pred, y_test)
auc = roc_auc_score(y_pred, y_test, multi_class='ovr')
f1 = f1_score(y_pred, y_test)
print("Accuracy: \t\t" + str(acc))
print("Area under Curve: \t" + str(auc))
print("F1 Score: \t\t" + str(f1))

Accuracy: 		0.55
Area under Curve: 	0.5606060606060606
F1 Score: 		0.5714285714285715
