# Classification Models for ObservationScheme Using PyCaret
## Subsample: 10,000 samples of randomly concatenated text

In [None]:
import pandas as pd
import numpy as np
import re,string
from matplotlib import pyplot as plt
from sklearn import metrics
import seaborn as sns
from pycaret.classification import *

from os import listdir
from os.path import isfile, join

In [None]:
# Directory that holds the LaBSE embedding files and their label files
# (both are read from here as .npy arrays further down).
# NOTE(review): path is absolute from the filesystem root — confirm it matches the runtime environment.
data_dir = '/NewData_ranCombination/Subsamples/Embeddings/LaBSE'

## Load Data

In [None]:
# Collect the base names (extension stripped) of the .npy files in data_dir.
# Only genuine .npy files are kept: cutting the last four characters off every
# directory entry would mangle the names of files with another extension (or
# none) and make the later np.load(f'{data_dir}/{name}.npy') calls fail.
npy_ext = '.npy'
all_files = [
    f[:-len(npy_ext)]
    for f in listdir(data_dir)
    if f.endswith(npy_ext) and isfile(join(data_dir, f))
]
# Names containing 'label' are the label arrays; every other name is an embedding array.
label_files = sorted(f for f in all_files if 'label' in f)      # alphabetically ordered
filenames = sorted(f for f in all_files if 'label' not in f)    # alphabetically ordered

In [None]:
# Show the embedding file name at index 1 — the file that is loaded as `data` below.
filenames[1]

In [None]:
# Show the label file name at index 1 — the file that is loaded as `labels` below.
# NOTE(review): pairing with filenames[1] relies on both lists sorting into the same order — verify.
label_files[1]

In [None]:
# Load the selected embedding array into a DataFrame
# (presumably one row per sample, one column per embedding dimension — TODO confirm).
data = pd.DataFrame(np.load(f'{data_dir}/{filenames[1]}.npy'))
print('Shape of data:', data.shape)

# Hold out the last `n_test` rows as the test split; everything before is training data.
# `.copy()` makes both splits independent frames, so adding the target column
# later does not write into a slice of `data` (avoids SettingWithCopyWarning).
n_test = 2000
train = data.iloc[:-n_test, :].copy()
test = data.iloc[-n_test:, :].copy()

print('Train:', train.shape)
print('Test:', test.shape)

In [None]:
# Labels
# Load the label array stored at the same list index as the embedding file.
labels = pd.DataFrame(np.load(f'{data_dir}/{label_files[1]}.npy'))

# The target column is attached by index alignment, so a label file with a
# different number of rows would silently produce missing or shifted targets.
if len(labels) != len(train) + len(test):
    raise ValueError(
        f'Label rows ({len(labels)}) do not match data rows ({len(train) + len(test)})'
    )

# `assign` returns a new frame rather than writing a column into `train`/`test`
# in place, so it never modifies a frame that may be a view of another one.
train = train.assign(obs_scheme=labels.iloc[:-2000, :])
test = test.assign(obs_scheme=labels.iloc[-2000:, :])

In [None]:
# Report how many distinct obs_scheme classes occur in each split.
for split_name, split in (("Train", train), ("Test: ", test)):
    print(split_name, split['obs_scheme'].nunique())

## New test data: Standard data

In [None]:
# Root directory of the standard-data embeddings; files are read from one
# sub-folder per embedding name.
standard_data_path = '/Subsamples/Embeddings'

# Values used to build the standard-data file names.
embeddings = ['LaBSE', 'tfidf', 'XLM-RoBERTa']
subsamples = [10_000, 20_000, 50_000, 100_000]
embed_no = [768, 1024]

In [None]:
# LaBSE
# Replace `test` with the standard-data LaBSE embeddings (first subsample size,
# first embedding width) and attach the matching labels as the target column.
features_path = f'{standard_data_path}/{embeddings[0]}/StandardData_cleaned_subsamples_{subsamples[0]}_{embeddings[0]}_embeddings_{embed_no[0]}.npy'
labels_path = f'{standard_data_path}/{embeddings[0]}/labels_StandardData_cleaned_subsamples_{subsamples[0]}_{embeddings[0]}_embeddings_{embed_no[0]}.npy'

test = pd.DataFrame(np.load(features_path))
labels = pd.DataFrame(np.load(labels_path))
test['obs_scheme'] = labels

In [None]:
# Preview the first three rows of the new test frame (features plus obs_scheme).
test.head(3)

## PyCaret Setup

The `setup()` function of PyCaret initializes the environment and prepares the data for modeling and deployment. It has two required parameters: the dataset and the name of the target variable. When the function runs, the type of each feature is inferred and several pre-processing steps are applied to the data.

In [None]:
# Initialise the PyCaret experiment with an explicit train/test pair instead of
# letting PyCaret split the data itself.
clf = setup(
    data=train,
    test_data=test,
    target='obs_scheme',
    silent=True,        # run without the interactive confirmation prompt
    session_id=1221,    # fixed seed for reproducibility
)

## Models

In [None]:
# Ridge classifier, trained once with cross-validation switched off.
ridge = create_model('ridge', cross_validation=False)

In [None]:
# Linear discriminant analysis, trained once with cross-validation switched off.
lda = create_model('lda', cross_validation=False)

In [None]:
# PyCaret's 'svm' model, trained once with cross-validation switched off.
svm = create_model('svm', cross_validation=False)

In [None]:
# Logistic regression, trained once with cross-validation switched off.
lr = create_model('lr', cross_validation=False)

In [None]:
# Persist every trained PyCaret model under its own file stem.
for trained_model, file_stem in (
    (ridge, 'ridge_labse_12000_ranCombination'),
    (lda, 'lda_labse_12000_ranCombination'),
    (svm, 'svm_labse_12000_ranCombination'),
    (lr, 'LR_labse_12000_ranCombination'),
):
    save_model(trained_model, file_stem)

## Apply on unseen test set

In [None]:
# Score the test frame with each trained model; every result is a DataFrame
# whose `Label` column (read in the accuracy cell below) holds the prediction.
ypred_ridge = predict_model(ridge, data=test)
ypred_lda = predict_model(lda, data=test)
ypred_svm = predict_model(svm, data=test)
ypred_lr = predict_model(lr, data=test)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of each PyCaret model: true obs_scheme against the predicted Label column.
for model_tag, scored in (
    ("Ridge:", ypred_ridge),
    ("LDA:", ypred_lda),
    ("SVM:", ypred_svm),
    ("LR:", ypred_lr),
):
    print(model_tag, accuracy_score(scored['obs_scheme'], scored['Label']))

In [None]:
cm = metrics.confusion_matrix(test['obs_scheme'], ypred_lr['Label'])
cm

%matplotlib inline
plt.figure(figsize = (20,10))
sns.heatmap(pd.DataFrame(cm), annot=True)

## sklearn models

In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import pickle

##### Logistic Regression

In [None]:
# Logistic regression with every hyperparameter spelled out explicitly.
LR = LogisticRegression(
    # regularisation
    penalty='l2',
    C=1.0,
    l1_ratio=None,
    dual=False,
    # model form
    fit_intercept=True,
    intercept_scaling=1,
    multi_class='auto',
    class_weight=None,
    # optimisation
    solver='lbfgs',
    max_iter=1000,
    tol=0.0001,
    warm_start=False,
    # runtime
    random_state=1221,
    n_jobs=None,
    verbose=0,
)

In [None]:
# Features are every column except the last one; the target is obs_scheme.
X_train, y_train = train.iloc[:, :-1], train['obs_scheme']
X_test, y_test = test.iloc[:, :-1], test['obs_scheme']

# Fit on the training split and report accuracy on the test frame.
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
print("Accuracy LR:", accuracy_score(y_test, y_pred))

In [None]:
# Display the logistic-regression estimator (its repr) as the cell output.
LR

In [None]:
# Serialise the fitted logistic-regression model to disk.
filename = 'LR_labse_12000_ranCombination_sklearn.sav'
# The context manager closes (and flushes) the file even if pickling fails;
# a bare open() inside the call would leave the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(LR, model_file)

##### Ridge

In [None]:
# Ridge classifier with its hyperparameters spelled out explicitly.
# `normalize` is intentionally not passed: False was its default, and the
# parameter was removed from RidgeClassifier in scikit-learn 1.2, where passing
# it raises a TypeError. Omitting it gives the same model on old and new versions.
ridge = RidgeClassifier(
    alpha=1.0,
    class_weight=None,
    copy_X=True,
    fit_intercept=True,
    max_iter=None,
    random_state=1221,
    solver='auto',
    tol=0.001,
)

In [None]:
# Features are every column except the last one; the target is obs_scheme.
X_train, y_train = train.iloc[:, :-1], train['obs_scheme']
X_test, y_test = test.iloc[:, :-1], test['obs_scheme']

# Fit on the training split and report accuracy on the test frame.
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print("Accuracy ridge:", accuracy_score(y_test, y_pred))

In [None]:
# Display the ridge estimator (its repr) as the cell output.
ridge

In [None]:
# Serialise the fitted ridge classifier to disk.
filename = 'ridge_labse_12000_ranCombination_sklearn.sav'
# The context manager closes (and flushes) the file even if pickling fails;
# a bare open() inside the call would leave the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(ridge, model_file)

##### LDA

In [None]:
# Linear discriminant analysis with every hyperparameter spelled out explicitly.
lda = LinearDiscriminantAnalysis(
    # solver configuration
    solver='svd',
    shrinkage=None,
    tol=0.0001,
    # model options
    priors=None,
    n_components=None,
    store_covariance=False,
)

In [None]:
# Features are every column except the last one; the target is obs_scheme.
X_train, y_train = train.iloc[:, :-1], train['obs_scheme']
X_test, y_test = test.iloc[:, :-1], test['obs_scheme']

# Fit on the training split and report accuracy on the test frame.
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)
print("Accuracy LDA:", accuracy_score(y_test, y_pred))

In [None]:
# Display the LDA estimator (its repr) as the cell output.
lda

In [None]:
# Serialise the fitted LDA model to disk.
filename = 'lda_labse_12000_ranCombination_sklearn.sav'
# The context manager closes (and flushes) the file even if pickling fails;
# a bare open() inside the call would leave the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(lda, model_file)

##### SVM (linear, trained with SGD and hinge loss)

In [None]:
# Linear classifier trained with SGD on the hinge loss, with every
# hyperparameter spelled out explicitly.
svm = SGDClassifier(
    # loss and regularisation
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    l1_ratio=0.15,
    epsilon=0.1,
    # model form
    fit_intercept=True,
    class_weight=None,
    average=False,
    # learning-rate schedule
    learning_rate='optimal',
    eta0=0.001,
    power_t=0.5,
    # stopping criteria
    max_iter=1000,
    tol=0.001,
    early_stopping=False,
    validation_fraction=0.1,
    n_iter_no_change=5,
    # runtime
    shuffle=True,
    random_state=1221,
    n_jobs=-1,
    verbose=0,
    warm_start=False,
)

In [None]:
# Features are every column except the last one; the target is obs_scheme.
X_train, y_train = train.iloc[:, :-1], train['obs_scheme']
X_test, y_test = test.iloc[:, :-1], test['obs_scheme']

# Fit on the training split and report accuracy on the test frame.
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print("Accuracy SVM:", accuracy_score(y_test, y_pred))

In [None]:
# Display the SGD-based estimator (its repr) as the cell output.
svm

In [None]:
# Serialise the fitted SGD classifier to disk.
filename = 'svm_labse_12000_ranCombination_sklearn.sav'
# The context manager closes (and flushes) the file even if pickling fails;
# a bare open() inside the call would leave the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(svm, model_file)