# Random Forest

### Import libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [None]:
# Display settings: render every row and every column of a DataFrame
# instead of truncating the output. Passing None removes the limit.
# (The bare `pd.options.display.max_*` reads that used to precede these
# calls only looked the options up and discarded the result.)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

### Import Data

In [None]:
# Load the training file first, then the separate testing file.
# The original author annotated these with 4920 and 42 respectively --
# presumably the row counts; verify against the CSVs.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('Dataset/Training.csv', 'Dataset/Testing.csv')
)

### Preprocess

In [None]:
# Report, column by column, whether at least one value is missing
# (True means the column contains a null).
train.isna().any()

In [None]:
# Count the rows for each prognosis label to see whether the classes
# are balanced.
train.loc[:, 'prognosis'].value_counts()

### Split data

In [None]:
# Separate the label column from the feature columns.
A = train[["prognosis"]]  # diseases (target), kept as a one-column frame
B = train.drop(columns=["prognosis"])  # symptoms (features)
C = test.drop(columns=["prognosis"])  # symptoms of the separate testing file

# Hold out 20% of the training file for evaluation (80% train / 20% test).
# random_state pins the shuffle so the split, and every score computed from
# it, is reproducible from run to run; 42 matches the seed the classifier
# already uses.
x_train, x_test, y_train, y_test = train_test_split(
    B, A, test_size=0.2, random_state=42
)

### Model

In [None]:
# Training the random forest model: 100 trees split on entropy, built with
# up to 5 parallel jobs and a fixed seed.
mod = RandomForestClassifier(
    criterion='entropy',
    n_estimators=100,
    n_jobs=5,
    random_state=42,
)

# y_train is a one-column DataFrame; flatten it to a 1-D label array.
# fit() returns the estimator itself, so `mod` needs no rebinding.
mod.fit(x_train, y_train.to_numpy().ravel())

# Predict labels for the held-out part of the training file.
pred = mod.predict(x_test)

#### Accuracy

In [None]:
# Fraction of held-out samples whose predicted label matches the true one.
metrics.accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Build the classification report as a dict, then show it as a table with
# one row per report entry.
report = classification_report(y_true=y_test, y_pred=pred, output_dict=True)
pd.DataFrame(report).T

In [None]:
# Confusion matrix of the held-out predictions, wrapped in a DataFrame for
# display. Rows and columns carry positional indices, not label names.
cm = confusion_matrix(y_true=y_test, y_pred=pred)
pd.DataFrame(data=cm)

In [None]:
# Compare the model's predictions for the separate testing file with the
# true labels, one row per test sample.
#
# The frame is rebuilt from test[["prognosis"]] rather than joined onto the
# whole of `test`: this cell rebinds `test`, so a second run of a join would
# fail on the already-present "predicted" column. Built this way the cell
# can be re-run safely.
test = test[["prognosis"]].assign(predicted=mod.predict(C))

# Label every row in one vectorised step. The earlier row-by-row
# `test['result'].iloc[i] = ...` was chained assignment: it triggers
# SettingWithCopyWarning and, with pandas Copy-on-Write enabled, writes to
# a temporary object and leaves `test` unchanged.
test["result"] = (test["prognosis"] == test["predicted"]).map(
    {True: "Correct", False: "Incorrect"}
)
test

In [None]:
from sklearn import tree

# Draw one tree of the fitted forest (the estimator at index 8) on a large
# canvas, with the nodes colour-filled.
plt.figure(figsize=(30, 15))
tree.plot_tree(decision_tree=mod.estimators_[8], filled=True)

In [None]:
import numpy as np

# Bar chart with one bar per symptom column.
#
# NOTE(review): each bar's height is the count of that column's MOST COMMON
# value (value_counts() sorts by descending count), not the number of rows
# in which the symptom is present. If the columns are 0/1 indicators, this
# is whichever of the two values dominates -- confirm that this is the
# intended "Frequency"; `B[sym].sum()` would give the presence count.
sym = [col for col in train.columns if col != 'prognosis']
arr = np.array(sym)

# Gather the per-column top counts first and concatenate once; np.append
# inside a loop re-copies the whole array on every iteration. The leading
# empty float array preserves the float64 dtype (and the empty-input
# result) of the accumulator this replaces.
top_counts = [B[col].value_counts().values[:1] for col in sym]
arr2 = np.concatenate([np.array([], dtype=float), *top_counts])

plt.figure(figsize=(30, 15))
plt.xticks(rotation=90)
plt.ylabel("Frequency")
plt.xlabel("Symptoms")
plt.bar(arr, arr2)