# Classification Models for ObservationScheme Using PyCaret
## Subsample: 10,000 rows of randomly concatenated text

In [None]:
import pandas as pd
import numpy as np
import re,string
from matplotlib import pyplot as plt
from sklearn import metrics
import seaborn as sns
from pycaret.classification import *

from os import listdir
from os.path import isfile, join

In [None]:
# Directory that is scanned for the `.npy` embedding arrays and their label arrays.
data_dir = "/NewData_ranCombination/Subsamples/Embeddings/xlmRoberta"

## Load Data

In [None]:
# Collect the base names (extension removed) of the NumPy arrays in `data_dir`.
# Only `.npy` files are kept: blindly dropping the last four characters would
# mangle the names of any other file in the directory and shift the positional
# indices used to pick an array.
npy_suffix = '.npy'
all_files = [
    f[:-len(npy_suffix)]
    for f in listdir(data_dir)
    if isfile(join(data_dir, f)) and f.endswith(npy_suffix)
]
# Label arrays are recognised by the substring 'label' in their name.
label_files = sorted(f for f in all_files if 'label' in f)          # alphabetically ordered
# A set keeps the "not a label file" test cheap regardless of directory size.
label_names = set(label_files)
filenames = sorted(f for f in all_files if f not in label_names)   # alphabetically ordered

In [None]:
# Load the embedding matrix stored in the second (alphabetically) non-label file.
data = pd.DataFrame(np.load(f'{data_dir}/{filenames[1]}.npy'))
print('Shape of data:', data.shape)

# Positional split: first 10,000 rows for training, the next 2,000 for
# validation and the last 2,000 for testing.
# `.copy()` gives every split its own frame, so adding a column to a split
# later does not write into a slice of `data` (avoids SettingWithCopyWarning).
# NOTE(review): if `data` has fewer than 14,000 rows the validation and test
# ranges overlap — confirm against the shape printed above.
train = data.iloc[:10000, :].copy()
val = data.iloc[10000:12000, :].copy()
test = data.iloc[-2000:, :].copy()

print('Train:', train.shape)
print('Validation:', val.shape)
print('Test:', test.shape)

In [None]:
# Labels: load the label array and attach the matching rows to each split
# as the 'obs_scheme' column (same positional ranges as the feature split).
labels = pd.DataFrame(np.load(f'{data_dir}/{label_files[1]}.npy'))
split_rows = (
    (train, slice(None, 10000)),
    (val, slice(10000, 12000)),
    (test, slice(-2000, None)),
)
for split_frame, row_range in split_rows:
    split_frame['obs_scheme'] = labels.iloc[row_range, :]

In [None]:
# Keep only validation rows whose class also occurs in the training split,
# then show how many distinct classes remain in validation.
train_list = train['obs_scheme'].unique().tolist()
seen_in_train = val['obs_scheme'].isin(train_list)
val = val.loc[seen_in_train]
val['obs_scheme'].nunique()

In [None]:
# Report the number of distinct classes present in each split.
for split_tag, split_frame in (("Train", train), ("Val: ", val), ("Test: ", test)):
    print(split_tag, split_frame['obs_scheme'].nunique())

## PyCaret Setup

PyCaret's `setup()` function initializes the environment and prepares the data for modeling and deployment. It has two required parameters: the dataset and the target variable. After the function runs, each feature's type is inferred and several pre-processing tasks are performed on the data.

In [None]:
# Initialise the PyCaret experiment: `train` is the modelling data, `val` is
# passed as the test data, and 'obs_scheme' is the target column.
# Optional switches currently left disabled:
#   use_gpu, feature_selection, remove_outliers,
#   ignore_low_variance, remove_multicollinearity
clf = setup(
    data=train,
    target='obs_scheme',
    test_data=val,
    session_id=1221,
    silent=True,
)


## Models

In [None]:
# Compare the available classifiers without cross-validation, sorted by the
# 'acc' metric; 'gbc', 'qda' and 'lightgbm' are left out of the comparison.
compare_models(
    exclude=['gbc', 'qda', 'lightgbm'],
    cross_validation=False,
    sort='acc',
)

In [None]:
# Fit the 'ridge' estimator once, without cross-validation.
ridge = create_model(estimator='ridge', cross_validation=False)

In [None]:
# Fit the 'lda' estimator once, without cross-validation.
lda = create_model(estimator='lda', cross_validation=False)

In [None]:
# Fit the 'svm' estimator once, without cross-validation.
svm = create_model(estimator='svm', cross_validation=False)

In [None]:
# Fit the 'lr' estimator once, without cross-validation.
lr = create_model(estimator='lr', cross_validation=False)

In [None]:
# Tune the ridge model's hyperparameters with tune_model's default settings.
t_ridge = tune_model(estimator=ridge)

In [None]:
# Tune the LDA model's hyperparameters with tune_model's default settings.
t_lda = tune_model(estimator=lda)

In [None]:
# Tune the SVM model's hyperparameters with tune_model's default settings.
t_svm = tune_model(estimator=svm)

In [None]:
# Tune the 'lr' model's hyperparameters with tune_model's default settings.
t_lr = tune_model(estimator=lr)

## Apply on unseen test set

In [None]:
# Run each tuned model on the test frame, in the order ridge, lda, svm, lr.
ypred_ridge, ypred_lda, ypred_svm, ypred_lr = (
    predict_model(tuned, data=test)
    for tuned in (t_ridge, t_lda, t_svm, t_lr)
)

In [None]:
#ypred.to_csv(f'{processed_data_dir}/')

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of each model: true 'obs_scheme' against the predicted 'Label' column.
scored_frames = (
    ("Ridge:", ypred_ridge),
    ("LDA:", ypred_lda),
    ("SVM:", ypred_svm),
    ("LR:", ypred_lr),
)
for model_tag, scored in scored_frames:
    print(model_tag, accuracy_score(scored['obs_scheme'], scored['Label']))

In [None]:
cm = metrics.confusion_matrix(test['obs_scheme'], ypred_lr['Label'])
cm

%matplotlib inline
plt.figure(figsize = (20,10))
sns.heatmap(pd.DataFrame(cm), annot=True)

## New test data: Standard data

In [None]:
# Root folder of the standard-data embeddings; one sub-folder per embedding type.
standard_data_path = "/Testborger/Subsamples/Embeddings"
# Embedding types, selected by position when building a path.
embeddings = [
    "LaBSE",
    "tfidf",
    "XLM-RoBERTa",
]
# Subsample sizes that appear in the array file names.
subsamples = [10_000, 20_000, 50_000, 100_000]
# Embedding sizes that appear in the array file names.
embed_no = [768, 1024]

In [None]:
# XLM-RoBERTa (roberta-large, 1024) embeddings of the standard data, using the
# first subsample size; the label array shares the file stem with a
# 'labels_' prefix.
embedding_dir = f'{standard_data_path}/{embeddings[2]}'
array_stem = f'StandardData_cleaned_subsamples_{subsamples[0]}_roberta-large_embeddings_{embed_no[1]}'
test = pd.DataFrame(np.load(f'{embedding_dir}/{array_stem}.npy'))
labels = pd.DataFrame(np.load(f'{embedding_dir}/labels_{array_stem}.npy'))
test['obs_scheme'] = labels

In [None]:
# Run each tuned model on the standard-data test frame (ridge, lda, svm, lr).
ypred_ridge, ypred_lda, ypred_svm, ypred_lr = (
    predict_model(tuned, data=test)
    for tuned in (t_ridge, t_lda, t_svm, t_lr)
)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of each model on the standard data: true 'obs_scheme' against the
# predicted 'Label' column.
print("Accuracy for standard data as test set:")
scored_frames = (
    ("Ridge:", ypred_ridge),
    ("LDA:", ypred_lda),
    ("SVM:", ypred_svm),
    ("LR:", ypred_lr),
)
for model_tag, scored in scored_frames:
    print(model_tag, accuracy_score(scored['obs_scheme'], scored['Label']))