# Import Packages

In [None]:
import json 
import pandas as pd

# For Splitting Data
from sklearn.model_selection import train_test_split

# Models and Training
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Score the Model 
from sklearn.metrics import ConfusionMatrixDisplay, f1_score, accuracy_score

# Load Data

In [None]:
# Load the raw JSON payload from disk.
data_loc = "./Data/sample.json"

# Read-only mode is all a load needs ("r+" additionally demands write
# permission on the file). The explicit encoding keeps decoding independent
# of the platform's locale default.
# my_data_raw is intentionally not pre-seeded with {}: if the load fails, the
# name should stay undefined rather than silently hold an empty dict.
with open(data_loc, "r", encoding="utf-8") as f:
    my_data_raw = json.load(f)

In [None]:
# Build a DataFrame from the 'Data' entry of the loaded JSON, then preview it.
data_1 = pd.DataFrame.from_records(data=my_data_raw["Data"])
data_1.head()

In [None]:
# Build a DataFrame from the 'Fields' entry of the loaded JSON and
# persist it to disk as a pickle.
data_2 = pd.DataFrame.from_dict(data=my_data_raw["Fields"])
data_2.to_pickle(path="./Data/data.pki")

In [None]:
# Load the pickle stored at ./Data/data.pki (the path the data_2 cell writes)
# and preview its first rows.
re_import = pd.read_pickle("./Data/data.pki")
re_import.head()

In [None]:
# A one-element list holding a single hand-written record, used to try out
# DataFrame construction from records.
unit = [
    {"Timestamp": 123, "Tested_Cell": True},
]

In [None]:
# Display the DataFrame built from the hand-written `unit` records
# (the result is not stored, only echoed as the cell output).
pd.DataFrame.from_records(unit)

In [None]:
# Load the dataset used by the machine-learning cells below and preview it.
# NOTE(review): unpickling executes whatever the file contains, so this
# assumes the .pki file comes from a trusted source — confirm.
data = pd.read_pickle("./Data/Curated_Count_Hit_Milestone_Q2_ML_Ready.pki")
data.head()

# Machine Learning Examples

## Split off Test Data

In [None]:
# Generate Train and Test Data

# Separate the target column from the features.
y = data['label']
X = data.drop(columns=['label'])

# Hold out 33% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=43
)

# 'code' stays in X itself but is removed from everything handed to a model.
X_train, X_test = (part.drop(columns='code') for part in (X_train, X_test))
X_data = X.drop(columns='code')

In [None]:
# Check Size
# Print both shapes explicitly: a notebook cell only echoes its final
# expression, so bare `X.shape` / `y.shape` lines are evaluated and discarded.
print(X.shape)
print(y.shape)
# Features and labels should have the same number of rows.
X.shape[0] == y.shape[0]

In [None]:
# Show the shape of each split, one per line: test features, train features,
# test labels, train labels.
print(X_test.shape, X_train.shape, y_test.shape, y_train.shape, sep="\n")

## Model Training - Random Forest Classifier

In [None]:
# Find the Best Model
# Candidate forest sizes to compare.
param_grid = [
    {
        'n_estimators': [100, 200, 300]
    }
]

# n_jobs=-1 uses every available core instead of a hard-coded worker count.
# random_state reuses the seed of the train/test split so repeated runs of the
# search produce the same forests and therefore the same scores.
forest_class = RandomForestClassifier(n_jobs=-1, random_state=43)

# 5-fold cross-validation, ranking candidates by macro-averaged F1.
grid_search = GridSearchCV(forest_class, param_grid, cv=5, scoring='f1_macro', verbose=5)
grid_search.fit(X_train, y_train)

## Model Training - SVM

In [None]:
# Find the Best Model
# Kernels to compare for the support vector classifier.
param_grid = [
    {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }
]

# Named for what it is: this cell searches over an SVC, not a random forest.
# NOTE(review): SVC is sensitive to feature scale; no scaling step is visible
# in this notebook — confirm the features are already normalised.
svm_class = SVC()

# 5-fold cross-validation, ranking candidates by macro-averaged F1.
# The result is bound to `grid_search`, the name the later
# `grid_search.best_estimator_` lookup reads.
grid_search = GridSearchCV(svm_class, param_grid, cv=5, scoring='f1_macro', verbose=5)
grid_search.fit(X_train, y_train)

In [None]:
# Retrieve that model
# `grid_search` is assigned by both training cells above, so this picks up the
# best estimator of whichever search was run most recently — despite the
# name, that is not necessarily a random forest.
forest_model = grid_search.best_estimator_

## Test Prediction

In [None]:
# Predict labels for the held-out test features with the selected model.
y_pred = forest_model.predict(X_test)

In [None]:
# Show the shape of the prediction array.
print(y_pred.shape)

In [None]:
# Macro-averaged F1 on the test split.
# scikit-learn metrics take the ground truth first, then the predictions.
f1_score(y_test, y_pred, average='macro')

In [None]:
# Accuracy on the test split.
# scikit-learn metrics take the ground truth first, then the predictions.
accuracy_score(y_test, y_pred)

In [None]:
# Plot the confusion matrix for the test split.
# Ground truth goes first: from_predictions puts its first argument on the
# "True label" axis and its second on the "Predicted label" axis, so swapping
# them transposes the matrix and mislabels both axes.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)