# mlInteractive

The purpose of this notebook is to train a random forest classifier on the data generated by the `generateData.ipynb` notebook.  We'll save the trained classifier as a BentoML model.

In [1]:
import bentoml
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from statistics import mean
import pandas as pd

In [2]:
def createCategoricalDummies(dataFrame, categoryList):
    """One-hot encode the selected columns of a DataFrame.

    Args:
        dataFrame: Source frame containing the columns to encode.
        categoryList: List of column names to encode.

    Returns:
        A new DataFrame with one indicator column per (column, value) pair,
        named ``<column>::<value>``. Only the encoded columns are returned;
        the row index of ``dataFrame`` is preserved.
    """
    selected = dataFrame[categoryList]
    encoded = pd.get_dummies(selected, prefix_sep="::")
    return encoded

In [3]:
# Load the generated data, index the rows by their KEY column, and keep only
# the two categorical inputs plus the label column.
data = (
    pd.read_csv("../../data/cat-happiness.csv")
    .set_index("KEY")
    .loc[:, ["Shape", "Softness", "Happiness Classification"]]
)
data

Unnamed: 0_level_0,Shape,Softness,Happiness Classification
KEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
049f391aa77a4f1fbc501a4b4a05b687,Rectangle,Medium,High
65d1ee358a0f4876887c5895897d1df5,Circle,Hard,Lower
8b78257bc9c0465a8e1d7b58463d7943,Rectangle,Soft,High
9dd06e7581d145da9673a005ff7b677e,Square,Soft,Lower
08971c5140bc4d8d9a9e74331d7048a3,Square,Soft,Lower
...,...,...,...
a228c7b2c6b941e4b27b1f2866c32f9e,Circle,Soft,Medium
a0f5490916264e8991354b23882c01db,Square,Medium,Lower
b69ad3c25fb04f49b84141258b453f08,Circle,Soft,Lower
e5f537099b054fc08d48a4b611dd16f7,Rectangle,Hard,Medium


In [4]:
# Label column the classifiers learn to predict.
target_col = "Happiness Classification"
# Raw categorical input columns; these are one-hot encoded before training.
feature_cols = ["Shape", "Softness"]

In [5]:
# Replace the raw categorical columns with their one-hot encoded equivalents:
# everything that is not a feature column stays as-is, and the indicator
# columns are appended to its right.
non_feature_part = data.drop(columns=feature_cols)
encoded_part = createCategoricalDummies(data, feature_cols)
data_v = pd.concat([non_feature_part, encoded_part], axis="columns")
data_v

Unnamed: 0_level_0,Happiness Classification,Shape::Circle,Shape::Rectangle,Shape::Square,Softness::Hard,Softness::Medium,Softness::Soft
KEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
049f391aa77a4f1fbc501a4b4a05b687,High,0,1,0,0,1,0
65d1ee358a0f4876887c5895897d1df5,Lower,1,0,0,1,0,0
8b78257bc9c0465a8e1d7b58463d7943,High,0,1,0,0,0,1
9dd06e7581d145da9673a005ff7b677e,Lower,0,0,1,0,0,1
08971c5140bc4d8d9a9e74331d7048a3,Lower,0,0,1,0,0,1
...,...,...,...,...,...,...,...
a228c7b2c6b941e4b27b1f2866c32f9e,Medium,1,0,0,0,0,1
a0f5490916264e8991354b23882c01db,Lower,0,0,1,0,1,0
b69ad3c25fb04f49b84141258b453f08,Lower,1,0,0,0,0,1
e5f537099b054fc08d48a4b611dd16f7,Medium,0,1,0,1,0,0


In [6]:
# Every column of the encoded frame except the label is a model input.
# The label name comes from `target_col` rather than a second hard-coded copy
# of the string, so the two cannot drift apart.
# NOTE: this rebinds `feature_cols` from the raw column names to the one-hot
# column names, so the cell that builds `data_v` will fail if it is re-run
# after this one without first re-running the cell that defines `feature_cols`.
feature_cols = [col for col in data_v.columns if col != target_col]

In [7]:
# Pull the labels and the model inputs out of the encoded frame as NumPy arrays.
y = data_v[target_col].to_numpy()
X = data_v.loc[:, feature_cols].to_numpy()

In [8]:
# Hold out 25% of the rows for evaluation. A fixed random_state makes the
# split -- and therefore every score computed from it -- reproducible after
# a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Using KNN (for fun)

In [9]:
# KNeighborsClassifier uses its default of 5 neighbours; fit() returns the
# estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier().fit(X_train, y_train)
# Mean accuracy on the held-out split.
knn.score(X_test, y_test)

0.91004

# Random Forest
Random Forest was found to be the best algorithm while I was messing around in Weka (see Weka folder).

In [10]:
# Seed the forest so that, for a given training split, the fitted model
# (which is the one saved to BentoML later) and its score are reproducible.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
# Mean accuracy on the held-out split.
rfc.score(X_test, y_test)

0.91004

In [11]:
# Try cross validation to see if we have differences here.
rfc_cross = RandomForestClassifier()
scores = cross_val_score(rfc_cross, X, y, cv=10)
scores

array([0.9084, 0.9114, 0.9132, 0.9127, 0.91  , 0.9082, 0.9154, 0.9135,
       0.9104, 0.9143])

In [12]:
mean(scores)  # This more closely matches what we got from Weka, which is 91.175 (percent)

0.91175

In [13]:
# Save the random forest fitted above as a BentoML model named "cat_toy".
# Note that `rfc` was fitted on the training split only (X_train, y_train),
# not on the full data set.
saved_model = bentoml.sklearn.save_model("cat_toy", rfc)
print(f"Model saved: {saved_model}")
# Display the returned model object (shows its tag and path).
saved_model

Model saved: Model(tag="cat_toy:4pwiqyarcsq55lg6")


Model(tag="cat_toy:4pwiqyarcsq55lg6", path="/Users/dthole/bentoml/models/cat_toy/4pwiqyarcsq55lg6/")