# Baseline Classification Models for ObservationScheme Using PyCaret

In [None]:
import pandas as pd
import numpy as np
import re,string
from matplotlib import pyplot as plt
from sklearn import metrics
import seaborn as sns
from pycaret.classification import *

# Directory that holds the processed train/val/test CSV files read below.
processed_data_dir = '/processed'

### Load Data

In [None]:
# Read the three zip-compressed CSV splits in order (train, val, test) and
# drop the 'Unnamed: 0' column from each -- presumably a row index written
# when the files were exported; confirm against the preprocessing step.
train_all, val, test = (
    pd.read_csv(f'{processed_data_dir}/{csv_name}', compression='zip').drop(columns=['Unnamed: 0'])
    for csv_name in ('train_all.csv', 'val_all.csv', 'test_all.csv')
)

In [None]:
# Print (rows, columns) for the train, validation and test frames, in that order.
for split_frame in (train_all, val, test):
    print(split_frame.shape)

In [None]:
# `train` refers to the same DataFrame object as `train_all`; no copy is made.
train = train_all
# Preview the first three rows.
train.head(n=3)

In [None]:
# Distinct obs_scheme values present in the training data.
train_list = train['obs_scheme'].unique().tolist()
# Keep only validation rows whose obs_scheme also occurs in training.
val = val.loc[val['obs_scheme'].isin(train_list)]
# Number of distinct obs_scheme values remaining in the validation frame.
val['obs_scheme'].nunique()

In [None]:
# Print the number of distinct obs_scheme values in each split.
# NOTE(review): only `val` was restricted to classes seen in training above;
# `test` is counted as loaded.
for split_label, split_frame in (("Train", train), ("Val: ", val), ("Test: ", test)):
    print(split_label, split_frame['obs_scheme'].nunique())

## PyCaret Setup

PyCaret's `setup()` function initializes the environment and prepares the data for machine learning modeling and deployment. It has two required parameters: the dataset and the name of the target variable. When it runs, it infers the type of each feature and applies several pre-processing steps to the data.

In [None]:
# Initialise the PyCaret experiment on `train` with 'obs_scheme' as the
# target. Passing `val` as test_data makes it the hold-out set, and
# session_id fixes the random seed for the experiment.
clf = setup(data=train, test_data=val, target='obs_scheme', session_id=1221)


## Models

In [None]:
# Train every available classifier except the excluded ones and rank the
# results by accuracy. With cross_validation=False PyCaret evaluates on the
# hold-out set and ignores any fold setting, which is why the optional
# `fold=3` argument stays disabled here.
compare_models(
    exclude=['gbc', 'qda', 'lightgbm'],
    sort='acc',
    cross_validation=False,
)

In [None]:
# Fit the estimator registered under the PyCaret ID 'lda', without
# cross-validation.
# Disabled options kept for reference: class_weight='balanced', fold=5.
model = create_model(estimator='lda', cross_validation=False)

In [None]:
# tune_model() optimises the model with a random grid search, i.e. it
# evaluates a random sample of hyperparameter combinations. A specific grid
# of values can be supplied through the custom_grid parameter.
# Disabled options kept for reference: optimize='recall', n_iter=20.
tuned_model = tune_model(estimator=model)

## Visualise model results

In [None]:
# Draw the confusion matrix for the tuned model using PyCaret's built-in plot.
plot_model(estimator=tuned_model, plot='confusion_matrix')

### Apply on unseen test set

In [None]:
# Score the test frame with the tuned model; the cells below read the
# predictions from the 'Label' column of the returned frame.
ypred = predict_model(tuned_model, data=test)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true obs_scheme.
accuracy_score(y_true=ypred['obs_scheme'], y_pred=ypred['Label'])

In [None]:
cm = metrics.confusion_matrix(test['obs_scheme'], ypred['Label'])

%matplotlib inline
plt.figure(figsize = (20,10))
sns.heatmap(pd.DataFrame(cm), annot=True)

## Finalize and Save Model

In [None]:
# Refit the tuned model with PyCaret's finalize_model, which trains on the
# full dataset including the hold-out set.
final_model = finalize_model(estimator=tuned_model)

### Save

In [None]:
# Persist the finalized model produced by finalize_model(). Previously this
# cell saved `tuned_model`, so the finalized estimator was never written to
# disk and `final_model` went unused.
save_model(final_model, 'model_name')