# PyCaret on Superuser Data

In [None]:
import pandas as pd
import numpy as np
import re,string
from matplotlib import pyplot as plt
from sklearn import metrics
import seaborn as sns
from pycaret.classification import *
from sklearn.metrics import accuracy_score
from os import listdir
from os.path import isfile, join

### Load data

In [None]:
# Directory scanned below for the embedding matrices and their label arrays.
data_dir = "/NewData_ranCombination/All_samples/Embeddings/SuperUsers"

In [None]:
# Discover the arrays stored in `data_dir` and split them into label files
# and embedding (data) files, both as extension-less stems.
#
# Only regular files ending in '.npy' are considered. The previous version
# sliced the last four characters off *every* file name, so a stray
# non-.npy file (e.g. '.DS_Store') produced a bogus stem that a later
# np.load(f'{data_dir}/{stem}.npy') would fail on.
npy_suffix = '.npy'
all_files = [
    f[:-len(npy_suffix)]
    for f in listdir(data_dir)
    if isfile(join(data_dir, f)) and f.endswith(npy_suffix)
]

# Any stem containing 'label' is a target array; everything else is features.
# Both lists are sorted alphabetically because later cells pair them by
# position (filenames[i] with label_files[i]).
label_files = sorted(f for f in all_files if 'label' in f)
filenames = sorted(f for f in all_files if f not in label_files)

In [None]:
# Show the discovered stems so the positional pairing of data files and label
# files used by the loading cells can be checked by eye. The doubled newline
# reproduces the blank separator line between the two listings.
print('Data:', filenames, end='\n\n')
print('Labels:', label_files)

In [None]:
# LaBSE
# Features come from the first (alphabetically) data file and targets from the
# first label file.
# NOTE(review): assumes index 0 of both sorted lists is the LaBSE pair —
# confirm against the printed file listing.
labse_features = np.load(f'{data_dir}/{filenames[0]}.npy')
labse_df = pd.DataFrame(labse_features)

labse_targets = np.load(f'{data_dir}/{label_files[0]}.npy')
labse_df['obs_scheme'] = pd.DataFrame(labse_targets)

# The final 2000 rows are held out as the test split; all earlier rows train.
labse_train = labse_df.head(-2000)
labse_test = labse_df.tail(2000)

print('Train:', labse_train.shape)
print('Test:', labse_test.shape)

In [None]:
# Roberta
# Features come from the second (alphabetically) data file and targets from
# the second label file.
# NOTE(review): assumes index 1 of both sorted lists is the Roberta pair —
# confirm against the printed file listing.
rob_features = np.load(f'{data_dir}/{filenames[1]}.npy')
rob_df = pd.DataFrame(rob_features)

rob_targets = np.load(f'{data_dir}/{label_files[1]}.npy')
rob_df['obs_scheme'] = pd.DataFrame(rob_targets)

# The final 2000 rows are held out as the test split; all earlier rows train.
rob_train = rob_df.head(-2000)
rob_test = rob_df.tail(2000)

print('Train:', rob_train.shape)
print('Test:', rob_test.shape)

In [None]:
# TF-IDF
# Features come from the third (alphabetically) data file and targets from the
# third label file.
# NOTE(review): assumes index 2 of both sorted lists is the TF-IDF pair —
# confirm against the printed file listing.
tfidf_features = np.load(f'{data_dir}/{filenames[2]}.npy')
tfidf_df = pd.DataFrame(tfidf_features)

tfidf_targets = np.load(f'{data_dir}/{label_files[2]}.npy')
tfidf_df['obs_scheme'] = pd.DataFrame(tfidf_targets)

# The final 2000 rows are held out as the test split; all earlier rows train.
tfidf_train = tfidf_df.head(-2000)
tfidf_test = tfidf_df.tail(2000)

print('Train:', tfidf_train.shape)
print('Test:', tfidf_test.shape)

In [None]:
# Report how many distinct target classes ('obs_scheme' values) occur in each
# training split.
for tag, frame in (
    ('LaBSE', labse_train),
    ('Roberta', rob_train),
    ('TF-IDF', tfidf_train),
):
    print(f'No. unique schemes, {tag}: ', frame['obs_scheme'].nunique())

## PyCaret - LaBSE

In [None]:
# Initialise the PyCaret classification experiment on the LaBSE embeddings.
# The explicit test_data hands PyCaret the held-out LaBSE rows; session_id is
# pinned to a fixed value.
# NOTE(review): `silent` appears to be a PyCaret 2.x-only argument — confirm
# the pinned PyCaret version before upgrading.
clf = setup(
    data=labse_train,
    test_data=labse_test,
    target='obs_scheme',
    silent=True,
    session_id=1221,
    use_gpu=True,
)

In [None]:
# Train the 'lr' (logistic regression) estimator on the active experiment
# (LaBSE when cells run top to bottom), with cross-validation switched off.
lr = create_model('lr', cross_validation=False)

In [None]:
# Train the 'ridge' (ridge classifier) estimator on the active experiment
# (LaBSE when cells run top to bottom), with cross-validation switched off.
ridge = create_model('ridge', cross_validation=False)

In [None]:
# Train the 'lda' (linear discriminant analysis) estimator on the active
# experiment (LaBSE when cells run top to bottom), with cross-validation
# switched off.
lda = create_model('lda', cross_validation=False)

In [None]:
# Train the 'svm' estimator on the active experiment (LaBSE when cells run
# top to bottom), with cross-validation switched off.
svm = create_model('svm', cross_validation=False)

#### Apply on test set

In [None]:
# Score each LaBSE-trained model on the held-out LaBSE rows. The returned
# frames are read below through their true-target column ('obs_scheme') and
# their prediction column ('Label').
ypred_ridge = predict_model(ridge, data=labse_test)
ypred_lr = predict_model(lr, data=labse_test)
ypred_lda = predict_model(lda, data=labse_test)
ypred_svm = predict_model(svm, data=labse_test)

# Test-set accuracy per model, printed in the order Ridge, LR, LDA, SVM.
for tag, scored in (
    ('Ridge', ypred_ridge),
    ('LR', ypred_lr),
    ('LDA', ypred_lda),
    ('SVM', ypred_svm),
):
    print(f"{tag} - LaBSE:", accuracy_score(scored['obs_scheme'], scored['Label']))

In [None]:
# Persist the four LaBSE models under their '<model>_labse_superuser_test2000su'
# names. The variables ridge/lr/lda/svm are reassigned in later sections, so
# this cell must run before the Roberta models are created.
save_model(ridge, model_name='ridge_labse_superuser_test2000su')
save_model(lr, model_name='lr_labse_superuser_test2000su')
save_model(lda, model_name='lda_labse_superuser_test2000su')
save_model(svm, model_name='svm_labse_superuser_test2000su')

## PyCaret - Roberta

In [None]:
# Initialise the PyCaret classification experiment on the Roberta embeddings.
# The explicit test_data hands PyCaret the held-out Roberta rows; session_id
# is pinned to a fixed value.
# NOTE(review): `silent` appears to be a PyCaret 2.x-only argument — confirm
# the pinned PyCaret version before upgrading.
clf = setup(
    data=rob_train,
    test_data=rob_test,
    target='obs_scheme',
    silent=True,
    session_id=1221,
    use_gpu=True,
)

In [None]:
# Train the 'lr' (logistic regression) estimator on the active experiment
# (Roberta when cells run top to bottom), with cross-validation switched off.
# This rebinds `lr`, replacing the model from the previous section.
lr = create_model('lr', cross_validation=False)

In [None]:
# Train the 'ridge' (ridge classifier) estimator on the active experiment
# (Roberta when cells run top to bottom), with cross-validation switched off.
# This rebinds `ridge`, replacing the model from the previous section.
ridge = create_model('ridge', cross_validation=False)

In [None]:
# Train the 'lda' (linear discriminant analysis) estimator on the active
# experiment (Roberta when cells run top to bottom), with cross-validation
# switched off. This rebinds `lda`, replacing the previous section's model.
lda = create_model('lda', cross_validation=False)

In [None]:
# Train the 'svm' estimator on the active experiment (Roberta when cells run
# top to bottom), with cross-validation switched off. This rebinds `svm`,
# replacing the model from the previous section.
svm = create_model('svm', cross_validation=False)

#### Apply on test set

In [None]:
# Score each Roberta-trained model on the held-out Roberta rows. The returned
# frames are read below through their true-target column ('obs_scheme') and
# their prediction column ('Label').
ypred_ridge = predict_model(ridge, data=rob_test)
ypred_lr = predict_model(lr, data=rob_test)
ypred_lda = predict_model(lda, data=rob_test)
ypred_svm = predict_model(svm, data=rob_test)

# Test-set accuracy per model, printed in the order Ridge, LR, LDA, SVM.
for tag, scored in (
    ('Ridge', ypred_ridge),
    ('LR', ypred_lr),
    ('LDA', ypred_lda),
    ('SVM', ypred_svm),
):
    print(f"{tag} - Roberta:", accuracy_score(scored['obs_scheme'], scored['Label']))

In [None]:
# Persist the four Roberta models under their '<model>_roberta_superuser'
# names. The variables ridge/lr/lda/svm are reassigned in the next section, so
# this cell must run before the TF-IDF models are created.
save_model(ridge, model_name='ridge_roberta_superuser')
save_model(lr, model_name='lr_roberta_superuser')
save_model(lda, model_name='lda_roberta_superuser')
save_model(svm, model_name='svm_roberta_superuser')

## PyCaret - TF-IDF

In [None]:
# Initialise the PyCaret classification experiment on the TF-IDF features.
# The explicit test_data hands PyCaret the held-out TF-IDF rows; session_id is
# pinned to a fixed value.
# NOTE(review): `silent` appears to be a PyCaret 2.x-only argument — confirm
# the pinned PyCaret version before upgrading.
clf = setup(
    data=tfidf_train,
    test_data=tfidf_test,
    target='obs_scheme',
    silent=True,
    session_id=1221,
    use_gpu=True,
)

In [None]:
# Train the 'lr' (logistic regression) estimator on the active experiment
# (TF-IDF when cells run top to bottom), with cross-validation switched off.
# This rebinds `lr`, replacing the model from the previous section.
lr = create_model('lr', cross_validation=False)

In [None]:
# Train the 'ridge' (ridge classifier) estimator on the active experiment
# (TF-IDF when cells run top to bottom), with cross-validation switched off.
# This rebinds `ridge`, replacing the model from the previous section.
ridge = create_model('ridge', cross_validation=False)

In [None]:
# Train the 'lda' (linear discriminant analysis) estimator on the active
# experiment (TF-IDF when cells run top to bottom), with cross-validation
# switched off. This rebinds `lda`, replacing the previous section's model.
lda = create_model('lda', cross_validation=False)

In [None]:
# Train the 'svm' estimator on the active experiment (TF-IDF when cells run
# top to bottom), with cross-validation switched off. This rebinds `svm`,
# replacing the model from the previous section.
svm = create_model('svm', cross_validation=False)

#### Apply on test set

In [None]:
# Score each TF-IDF-trained model on the held-out TF-IDF rows. The returned
# frames are read below through their true-target column ('obs_scheme') and
# their prediction column ('Label').
ypred_ridge = predict_model(ridge, tfidf_test)
ypred_lr = predict_model(lr, tfidf_test)
ypred_lda = predict_model(lda, tfidf_test)
ypred_svm = predict_model(svm, tfidf_test)

# Test-set accuracy per model.
print("Ridge - TFIDF:", accuracy_score(ypred_ridge.obs_scheme, ypred_ridge.Label))
print("LR - TFIDF:", accuracy_score(ypred_lr.obs_scheme, ypred_lr.Label))
# Fixed copy-paste slip: this TF-IDF result used to be printed as "LDA - LaBSE:",
# which made it easy to attribute the score to the wrong embedding.
print("LDA - TFIDF:", accuracy_score(ypred_lda.obs_scheme, ypred_lda.Label))
print("SVM - TFIDF:", accuracy_score(ypred_svm.obs_scheme, ypred_svm.Label))

In [None]:
# Persist the four TF-IDF models.
# NOTE(review): the LDA artifact is named 'lda_tfidf_superuser', without the
# '_test2000su' suffix the other three carry. It is left unchanged here because
# other code may load it by this name — confirm whether that is intentional.
save_model(ridge, model_name='ridge_tfidf_superuser_test2000su')
save_model(lr, model_name='lr_tfidf_superuser_test2000su')
save_model(lda, model_name='lda_tfidf_superuser')
save_model(svm, model_name='svm_tfidf_superuser_test2000su')