#### `Import Libraries`

In [1]:
import pandas as pd
import numpy as np
import os
import six
import wandb
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree, export_graphviz
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Record the interpreter version this notebook was run with (shell escape via `!`).
!python --version

Python 3.7.11


In [None]:
#############################
##### From Command Line #####
#############################
# !pip install wandb
# !wandb login

#### `Loading Dataset`

In [4]:
# Build the CSV path with os.path.join instead of concatenating a hard-coded "/"
# so the path separator is correct on every platform.
filepath = os.path.join(os.getcwd(), "Play_Tennis.csv")

# Load the data and drop the "Day" column, which is not used in this analysis.
df = pd.read_csv(filepath).drop(columns="Day")

# Show the first and last five rows as a quick sanity check of the load.
display(df.head(5))
display(df.tail(5))

Unnamed: 0,Outlook,Temprature,Humidity,Wind,Play_Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


Unnamed: 0,Outlook,Temprature,Humidity,Wind,Play_Tennis
9,Rain,Mild,Normal,Weak,Yes
10,Sunny,Mild,Normal,Strong,Yes
11,Overcast,Mild,High,Strong,Yes
12,Overcast,Hot,Normal,Weak,Yes
13,Rain,Mild,High,Strong,No


#### `Descriptive Stats`

In [5]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called bare: wrapping it in print() emits a stray "None" line.
df.info()

# Per-column count of missing values.
print('\nMissing values in given data :')
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Outlook      14 non-null     object
 1   Temprature   14 non-null     object
 2   Humidity     14 non-null     object
 3   Wind         14 non-null     object
 4   Play_Tennis  14 non-null     object
dtypes: object(5)
memory usage: 688.0+ bytes
None

Missing values in given data :
Outlook        0
Temprature     0
Humidity       0
Wind           0
Play_Tennis    0
dtype: int64


In [6]:
# Summary statistics for each column (count / unique / top / freq for these
# object-typed columns, as shown in the output below).
df.describe()

Unnamed: 0,Outlook,Temprature,Humidity,Wind,Play_Tennis
count,14,14,14,14,14
unique,3,3,2,2,2
top,Rain,Mild,High,Weak,Yes
freq,5,6,7,8,9


#### Dummy Variables Creation <span style='background:yellow'>(sklearn does not work with raw categorical variables directly; it needs encoded category variables in order to compute impurity measures such as entropy / information gain and find the best splits)</span>

In [7]:
# One-hot encode the four categorical predictors; the Play_Tennis target
# column is not listed, so it is carried over unchanged.
df_dummy = pd.get_dummies(
    df,
    columns=['Outlook', 'Temprature', 'Humidity', 'Wind'],
)
df_dummy.head()

Unnamed: 0,Play_Tennis,Outlook_Overcast,Outlook_Rain,Outlook_Sunny,Temprature_Cool,Temprature_Hot,Temprature_Mild,Humidity_High,Humidity_Normal,Wind_Strong,Wind_Weak
0,No,0,0,1,0,1,0,1,0,0,1
1,No,0,0,1,0,1,0,1,0,1,0
2,Yes,1,0,0,0,1,0,1,0,0,1
3,Yes,0,1,0,0,0,1,1,0,0,1
4,Yes,0,1,0,1,0,0,0,1,0,1


#### `Train & Test split`

In [8]:
# Feature matrix: every encoded column except the target.
X = df_dummy.drop(columns=['Play_Tennis'])

# Binary target as a NumPy array: 1 where Play_Tennis is 'Yes', 0 otherwise.
y = np.where(df_dummy['Play_Tennis'] == 'Yes', 1, 0)

In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

In [10]:
# Cast the dummy columns to int64.
# `astype` converts whole columns in one vectorised step; the previous
# element-wise `applymap(lambda x: int(x))` ran a Python call per cell, and
# `DataFrame.applymap` is deprecated in pandas >= 2.1.
# Background: https://stackoverflow.com/questions/46364047/convert-uint8-to-int64-in-python

print("--"*10)
print("Before : DataTypes")
print("--"*10)
display(X_test.dtypes)
X_test = X_test.astype("int64")
print("--"*10)
print("After : DataTypes")
print("--"*10)
display(X_test.dtypes)

# Same conversion for the training split (dtypes not displayed).
X_train = X_train.astype("int64")

--------------------
Before : DataTypes
--------------------


Outlook_Overcast    uint8
Outlook_Rain        uint8
Outlook_Sunny       uint8
Temprature_Cool     uint8
Temprature_Hot      uint8
Temprature_Mild     uint8
Humidity_High       uint8
Humidity_Normal     uint8
Wind_Strong         uint8
Wind_Weak           uint8
dtype: object

--------------------
After : DataTypes
--------------------


Outlook_Overcast    int64
Outlook_Rain        int64
Outlook_Sunny       int64
Temprature_Cool     int64
Temprature_Hot      int64
Temprature_Mild     int64
Humidity_High       int64
Humidity_Normal     int64
Wind_Strong         int64
Wind_Weak           int64
dtype: object

#### `Basic Hyperparameters`

In [11]:
# Small random forest: 19 trees, depth capped at 3, 2 candidate features per split.
# random_state pins the forest's internal randomness so the fitted model and
# every plot derived from it are reproducible across runs; the value matches
# the seed used for the train/test split.
rtree = RandomForestClassifier(
    criterion='gini',
    max_depth=3,
    max_features=2,
    n_estimators=19,
    random_state=101,
)
rtree

RandomForestClassifier(max_depth=3, max_features=2, n_estimators=19)

#### `Initiate WandB`

In [13]:
# Start a Weights & Biases run in the given project under the given entity.
# NOTE(review): the entity is hard-coded to one account; other users will
# presumably need to change it (and be logged in via `wandb login`) — confirm.
wandb.init(project="RandomForest_Classifier_Example", entity="rakeshg")

#### `Model Fit`

In [14]:
# Fit the forest on the training split only.
rtree.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, max_features=2, n_estimators=19)

- `Learning Curve`

In [15]:
# Learning-curve plot via wandb. Note that it is given the full X and y,
# not the train split.
wandb.sklearn.plot_learning_curve(rtree, X, y)

![](Readme_Images/Learning_Curve.png)

- `ROC Curve`

In [19]:
# Class-membership probabilities for the held-out split.
y_probas = rtree.predict_proba(X_test)

# np.unique returns the distinct labels in ascending order, which matches the
# column order of predict_proba (sklearn keeps classes_ sorted). Iterating a
# set() gives no ordering guarantee, so it is not used here.
labels = np.unique(y).tolist()

wandb.sklearn.plot_roc(y_test, y_probas, labels)

![](Readme_Images/ROC_Curve.png)

- `Class Proportions`

In [31]:
# Distinct class labels in ascending order; np.unique guarantees the order,
# whereas iterating a set() does not.
labels = np.unique(y).tolist()

# Compare how the classes are distributed in the train and test splits.
wandb.sklearn.plot_class_proportions(y_train, y_test, labels)

![](Readme_Images/Class_Proportions.png)

- `Precision Recall Curve`

In [32]:
# Precision-recall plot on the test split. Reuses `y_probas` and `labels`
# from the earlier cells, so those cells must have been run first.
wandb.sklearn.plot_precision_recall(y_test, y_probas, labels)

![](Readme_Images/Precision_Recall.png)

- `Feature Importances`

In [33]:
# Feature-importance plot for the fitted forest, labelled with the training
# column names.
wandb.sklearn.plot_feature_importances(rtree, X_train.columns.tolist())

![](Readme_Images/Feature_Importance.png)