# Classification Models for ObservationScheme Using PyCaret
## Subsample: 100.000 of random concatenated text

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics
import seaborn as sns
from pycaret.classification import *

from os import listdir
from os.path import isfile, join

In [None]:
# Directory that is scanned for the `.npy` arrays (embeddings and labels)
# loaded further down in this notebook.
data_dir = '/NewData_ranCombination/Subsamples/Embeddings/LaBSE'

## Load Data

In [None]:
# Collect the base names (without the ".npy" suffix) of the array files in
# `data_dir`. Only ".npy" files are kept: later cells rebuild each path as
# f'{data_dir}/{name}.npy', so blindly chopping the last four characters off
# every entry (hidden files, saved models, ...) would produce names that
# cannot be loaded.
npy_suffix = '.npy'
all_files = [
    f[:-len(npy_suffix)]
    for f in listdir(data_dir)
    if f.endswith(npy_suffix) and isfile(join(data_dir, f))
]

# Names containing 'label' are label arrays; everything else is treated as
# an embedding array. Both lists are sorted alphabetically.
label_files = sorted(f for f in all_files if 'label' in f)
filenames = sorted(f for f in all_files if 'label' not in f)

# Fail here with a clear message instead of an IndexError in a later cell.
if not filenames or not label_files:
    raise FileNotFoundError(
        f'Expected embedding and label .npy files in {data_dir!r}, '
        f'found {len(filenames)} embedding file(s) and '
        f'{len(label_files)} label file(s).'
    )

In [None]:
# Show the first (alphabetically smallest) embedding file name.
filenames[0]

In [None]:
# Show the first (alphabetically smallest) label file name.
label_files[0]

In [None]:
# Load the first embedding array into a DataFrame.
# NOTE(review): presumably one row per sample and one column per embedding
# dimension - confirm against the file that produced the .npy arrays.
data = pd.DataFrame(np.load(f'{data_dir}/{filenames[0]}.npy'))
print('Shape of data:', data.shape)

# Hold out the last `n_test` rows as the test set and train on the rest.
# `.copy()` makes both splits independent DataFrames; without it they are
# slices of `data`, and adding the target column to them later raises
# pandas' SettingWithCopyWarning.
n_test = 2000
train = data.iloc[:-n_test, :].copy()
test = data.iloc[-n_test:, :].copy()

print('Train:', train.shape)
print('Test:', test.shape)

In [None]:
# Labels: load the first label array and attach it to both splits as the
# target column 'obs_scheme', using the same row split as the features
# (last 2000 rows -> test, everything before -> train).
labels = pd.DataFrame(np.load(f'{data_dir}/{label_files[0]}.npy'))
train_labels = labels.iloc[:-2000, :]
test_labels = labels.iloc[-2000:, :]
train['obs_scheme'] = train_labels
test['obs_scheme'] = test_labels

In [None]:
# Report how many distinct target classes occur in each split.
for split_name, split_frame in (("Train", train), ("Test: ", test)):
    print(split_name, split_frame['obs_scheme'].nunique())

In [None]:
# Preview the first three test rows (features plus the 'obs_scheme' target).
test.head(3)

## PyCaret Setup

The `setup()` function of PyCaret initializes the environment and prepares the data for modeling and deployment. It has two required parameters: the dataset and the name of the target variable. When the function runs, the type of each feature is inferred and several pre-processing steps are applied to the data.

In [None]:
# Initialise the PyCaret classification experiment: `train` is the modelling
# data, `test` is supplied explicitly as the hold-out set, and 'obs_scheme'
# is the target column. `session_id` fixes the random seed.
# NOTE(review): `silent=True` is a PyCaret 2.x argument - confirm the
# installed PyCaret version still accepts it.
clf = setup(
    data=train,
    test_data=test,
    target='obs_scheme',
    silent=True,
    session_id=1221,
)

## Models

In [None]:
# Train PyCaret's 'ridge' model once, with cross-validation switched off.
ridge = create_model('ridge', cross_validation=False)

In [None]:
# Train PyCaret's 'lda' model once, with cross-validation switched off.
lda = create_model('lda', cross_validation=False)

In [None]:
# Train PyCaret's 'lr' model once, with cross-validation switched off.
lr = create_model('lr', cross_validation=False)

In [None]:
# Show PyCaret's evaluation output for the `lr` model trained above.
evaluate_model(lr)

### sklearn models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

##### Logistic Regression

In [None]:
# Stand-alone scikit-learn logistic regression with every hyper-parameter
# spelled out explicitly; it uses the same seed (1221) as the PyCaret
# session above.
LR = LogisticRegression(
    # Regularisation
    penalty='l2',
    C=1.0,
    l1_ratio=None,
    dual=False,
    # Model structure
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    multi_class='auto',
    # Optimisation
    solver='lbfgs',
    max_iter=1000,
    tol=0.0001,
    warm_start=False,
    # Runtime
    n_jobs=None,
    random_state=1221,
    verbose=0,
)

In [None]:
# Fit on the training split and score on the held-out rows. The target was
# appended as the last column, so `iloc[:, :-1]` selects the feature columns.
train_features, train_target = train.iloc[:, :-1], train.obs_scheme
test_features, test_target = test.iloc[:, :-1], test.obs_scheme

LR.fit(train_features, train_target)
y_pred = LR.predict(test_features)
print("Accuracy LR:", accuracy_score(test_target, y_pred))

In [None]:
# Persist the fitted scikit-learn model. The `with` block guarantees the
# file handle is flushed and closed even if pickling raises.
filename = 'LR_labse_102000_ranCombination_sklearn.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(LR, model_file)

## Apply on unseen test set

In [None]:
# Predict on the test split with the PyCaret logistic regression model.
# The original call referenced `t_lr`, which is never defined in this
# notebook (NameError on a clean run); `lr` is the model created above.
# NOTE(review): if a tuned model was intended, re-add the cell that created it.
ypred_lr = predict_model(lr, test)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the PyCaret predictions: true target vs. predicted 'Label'.
lr_accuracy = accuracy_score(ypred_lr['obs_scheme'], ypred_lr['Label'])
print("LR:", lr_accuracy)

In [None]:
cm = metrics.confusion_matrix(test['obs_scheme'], ypred_lr['Label'])
cm

%matplotlib inline
plt.figure(figsize = (20,10))
sns.heatmap(pd.DataFrame(cm), annot=True)

In [None]:
# Save model
# The original call referenced `t_lr`, which is never defined in this
# notebook (NameError on a clean run); persist the trained `lr` model instead.
# NOTE(review): if a tuned model was intended, re-add the cell that created it.
save_model(lr, 'LR_labse_100000_ranCombination')