Required packages:\
pandas==1.4.0\
numpy==1.21.5\
scikit-learn==1.0.2\
tensorflow==2.7.0\
torch==1.10.2\
transformers==4.17.0.dev0\
datasets==1.18.3\
textstat==0.7.2 (if running the ML part)\
xgboost==1.5.2 (if running the ML part)

## Read Data

In [1]:
RUN_DL = True

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("data/sample_full.csv")

In [4]:
data.fillna({'Remember': 0, 'Understand': 0, 'Apply': 0, 'Analyze': 0, 'Evaluate': 0, 'Create':0}, inplace=True)

In [5]:
data.dtypes

Learning_outcome     object
Remember            float64
Understand          float64
Apply               float64
Analyze             float64
Evaluate            float64
Create              float64
dtype: object

In [6]:
list(data.columns[1:7])

['Remember', 'Understand', 'Apply', 'Analyze', 'Evaluate', 'Create']

In [7]:
data['one_hot_encoded'] = list(data[data.columns[1:]].values)

In [8]:
data.head()

Unnamed: 0,Learning_outcome,Remember,Understand,Apply,Analyze,Evaluate,Create,one_hot_encoded
0,Analyze the health economic implications of e...,0.0,0.0,0.0,1.0,0.0,0.0,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
1,Apply research skills to operate effectively ...,0.0,0.0,1.0,0.0,0.0,0.0,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
2,Assess and synthesise diverse information abo...,0.0,0.0,0.0,0.0,1.0,1.0,"[0.0, 0.0, 0.0, 0.0, 1.0, 1.0]"
3,Describe the general characteristics of the m...,0.0,1.0,0.0,0.0,0.0,0.0,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
4,Evaluate the different models of perioperativ...,0.0,0.0,0.0,0.0,1.0,0.0,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"


In [9]:
data['Learning_outcome'] = data['Learning_outcome'].str.lower()

In [10]:
textual_data = data['Learning_outcome'].tolist()

### Some Basic Analysis

In [14]:
from nltk import word_tokenize, download
download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
lengths = []
for text in textual_data:
    lengths.append(len(word_tokenize(text)))

In [16]:
min(lengths), max(lengths), np.mean(lengths), np.percentile(lengths, 99.5)

(2, 266, 17.808372310570626, 54.0)

In [17]:
np.unique(data['Remember'], return_counts=True)

(array([0., 1.]), array([20195,  1185]))

In [18]:
np.unique(data['Understand'], return_counts=True)

(array([0., 1.]), array([15555,  5825]))

In [19]:
np.unique(data['Apply'], return_counts=True)

(array([0., 1.]), array([15299,  6081]))

In [20]:
np.unique(data['Analyze'], return_counts=True)

(array([0., 1.]), array([17921,  3459]))

In [21]:
np.unique(data['Evaluate'], return_counts=True)

(array([0., 1.]), array([17546,  3834]))

In [22]:
np.unique(data['Create'], return_counts=True)

(array([0., 1.]), array([17493,  3887]))

In [23]:
LIWC_data = pd.read_csv("data/LIWC2015 Results (Learning_outcome.csv).csv")

In [24]:
data = data.join(LIWC_data).drop(['A'], axis=1)

In [25]:
data.head()

Unnamed: 0,Learning_outcome,Remember,Understand,Apply,Analyze,Evaluate,Create,one_hot_encoded,WC,Analytic,...,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
0,analyze the health economic implications of e...,0.0,0.0,0.0,1.0,0.0,0.0,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",9,99.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,apply research skills to operate effectively ...,0.0,0.0,1.0,0.0,0.0,0.0,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",14,99.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,assess and synthesise diverse information abo...,0.0,0.0,0.0,0.0,1.0,1.0,"[0.0, 0.0, 0.0, 0.0, 1.0, 1.0]",26,43.96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,describe the general characteristics of the m...,0.0,1.0,0.0,0.0,0.0,0.0,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",23,99.0,...,8.7,0.0,0.0,0.0,0.0,4.35,0.0,0.0,0.0,0.0
4,evaluate the different models of perioperativ...,0.0,0.0,0.0,0.0,1.0,0.0,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]",10,98.58,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
data.columns[8:]

Index(['WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic',
       'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they',
       'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate',
       'verb', 'adj', 'compare', 'interrog', 'number', 'quant', 'affect',
       'posemo', 'negemo', 'anx', 'anger', 'sad', 'social', 'family', 'friend',
       'female', 'male', 'cogproc', 'insight', 'cause', 'discrep', 'tentat',
       'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio', 'body',
       'health', 'sexual', 'ingest', 'drives', 'affiliation', 'achieve',
       'power', 'reward', 'risk', 'focuspast', 'focuspresent', 'focusfuture',
       'relativ', 'motion', 'space', 'time', 'work', 'leisure', 'home',
       'money', 'relig', 'death', 'informal', 'swear', 'netspeak', 'assent',
       'nonflu', 'filler', 'AllPunc', 'Period', 'Comma', 'Colon', 'SemiC',
       'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP'],
      dtype='o

## Undersample - not used

In [28]:
from imblearn.under_sampling import RandomUnderSampler

In [29]:
def rus(X, Y):
    Y = np.where(Y==1.0, 1, 0)
    r = RandomUnderSampler(random_state=0)
    X = X.to_numpy()
    #X = np.reshape(X, (-1, 1))
    X_resampled, y_resampled = r.fit_resample(X, Y)
    #X_resampled = X_resampled.flatten()
    return X_resampled, y_resampled

## BERT

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, cohen_kappa_score, f1_score
import torch
import tensorflow as tf
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from datasets import load_metric, list_metrics

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
# evaluation metric: F1 Score
metric = load_metric("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="macro")

Downloading: 5.27kB [00:00, 5.79MB/s]                   


In [32]:
class EncodeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)



def createBERT(dir_name, X, Y, test_X, test_Y, batch_size=64, nepochs=3, patience=10):
    # function to fine-tune BERT with given data and print out performance on the testing set
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_cache=False)
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, use_cache=False)
    training_args = TrainingArguments(
        output_dir=dir_name,          # output directory
        num_train_epochs=nepochs,              # total number of training epochs
        per_device_train_batch_size=batch_size,  # batch size per device during training
        per_device_eval_batch_size=batch_size,   # batch size for evaluation
        warmup_steps=5,                # number of warmup steps for learning rate scheduler
        weight_decay=0.05,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
        metric_for_best_model="f1",
        evaluation_strategy="steps",
        save_strategy="steps",
        save_steps=10,
        load_best_model_at_end=True,
        save_total_limit=3
    )
    train_x, val_x, train_y, val_y = train_test_split(X, Y, test_size=0.2, random_state=666, stratify=Y)

    train_encoded = tokenizer(train_x, truncation=True, padding=True, max_length=55)
    val_encoded = tokenizer(val_x, truncation=True, padding=True, max_length=55)
    test_encoded = tokenizer(test_X, truncation=True, padding=True, max_length=55)

    train_set = EncodeDataset(train_encoded, train_y)
    val_set = EncodeDataset(val_encoded, val_y)
    test_set = EncodeDataset(test_encoded, test_Y)
    trainer = Trainer(model=model, args=training_args, train_dataset=train_set, eval_dataset=val_set, compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)])
    print("Started training model for column", dir_name)
    trainer.train()
    trainer.save_model()
    print("Training Completed. Started testing...")
    predicted = trainer.predict(test_set)
    predicted_result = np.argmax(predicted.predictions, axis=-1)
    print("Accuracy Score -> ", accuracy_score(test_Y, predicted_result))
    print("Kappa Score -> ", cohen_kappa_score(test_Y, predicted_result))
    print("ROC AUC Score -> ", roc_auc_score(test_Y, predicted_result))
    print("F1 Score -> ", f1_score(test_Y, predicted_result))
    print("Classification report -> \n", classification_report(test_Y, predicted_result))
    return trainer

In [33]:
remember_bert = None
understand_bert = None
apply_bert = None
analyze_bert = None
evaluate_bert = None
create_bert = None

## ML Libraries

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [35]:
import textstat
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

  from pandas import MultiIndex, Int64Index


In [36]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn import metrics

In [38]:
import matplotlib.pyplot as plt

In [39]:
def generateX(data_x, test_x, textual_column_index, start_index_LIWC, end_index_LIWC):
    # generating ML features based on previous literature
    column_names = []
    print("Getting Unigram...")
    uni_cv = CountVectorizer(stop_words='english', ngram_range=(1, 1), max_features=1000)
    unigram = uni_cv.fit_transform(data_x[:, textual_column_index])
    unigram = unigram.toarray()
    unigram_test = uni_cv.transform(test_x[:,textual_column_index]).toarray()
    temp = uni_cv.get_feature_names_out().tolist()
    column_names += ["uni_"+name for name in temp]
    print("Getting Bigram...")
    bi_cv = CountVectorizer(stop_words='english', ngram_range=(2, 2), max_features=1000)
    bigram = bi_cv.fit_transform(data_x[:, textual_column_index])
    bigram = bigram.toarray()
    bigram_test = bi_cv.transform(test_x[:, textual_column_index]).toarray()
    temp = bi_cv.get_feature_names_out().tolist()
    column_names += ["bi_"+name for name in temp]
    print("Getting Tfidf...")
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), max_features=1000)
    t = tfidf.fit_transform(data_x[:, textual_column_index])
    t = t.toarray()
    t_test = tfidf.transform(test_x[:, textual_column_index]).toarray()
    temp = tfidf.get_feature_names_out().tolist()
    column_names += ["tfidf_"+name for name in temp]
    print("Getting ARI...")
    ari = [textstat.automated_readability_index(text) for text in data_x[:, textual_column_index]]
    ari_test = [textstat.automated_readability_index(text) for text in test_x[:, textual_column_index]]
    column_names.append("ari")
    combined_data_x = []
    combined_test_x = []
    print("Combining...")
    for i in range(len(data_x)):
        combined_data_x.append(unigram[i].tolist()
                              + bigram[i].tolist()
                              + t[i].tolist()
                              + [ari[i]]
                              + data_x[i, start_index_LIWC:end_index_LIWC].tolist())
    for i in range(len(test_x)):
        combined_test_x.append(unigram_test[i].tolist()
                              + bigram_test[i].tolist()
                              + t_test[i].tolist()
                              + [ari_test[i]]
                              + test_x[i, start_index_LIWC:end_index_LIWC].tolist())
    print("Generated feature shape is", np.array(combined_data_x).shape)
    print("Generated test feature is", np.array(combined_test_x).shape)
    return combined_data_x, column_names, combined_test_x

In [40]:
def performancePrinter(test_y, pred_y):
    # performance printer
    print("Accuracy Score -> ", accuracy_score(test_y, pred_y))
    print("Kappa Score -> ", cohen_kappa_score(test_y, pred_y))
    print("ROC AUC Score -> ", roc_auc_score(test_y, pred_y))
    print("F1 Score -> ", f1_score(test_y, pred_y))
    print("Classification report -> \n", classification_report(test_y, pred_y))

#### Grid Search Parameters

In [41]:
params_nb = {'var_smoothing': [1e-8, 1e-9, 1e-10]}

In [42]:
params_svm = {'C': [0.1, 1, 10, 100],
              'gamma': ['scale', 'auto'],
              'kernel': ['linear', 'poly', 'rbf']}

In [43]:
params_lr = {'penalty': ['l1', 'l2', 'none'],
             'C': [0.1, 1, 10],
             'solver': ['saga'],
             'tol': [0.01, 0.001, 0.0001],
             'max_iter': [200, 500]}

In [44]:
params_rf = {'n_estimators': [50, 100, 250],
             'max_depth': [None, 5, 10],
             'max_features':['auto', 'sqrt'],
             'min_samples_split': [2, 5, 10],
             'min_samples_leaf': [1, 2, 4],
             'bootstrap': [True, False]}

In [45]:
params_xgb = {'gamma':[0.1, 0.5],
              'learning_rate': [0.1, 0.5],
              'max_depth': [5, 7, 10],
              'n_estimators': [50, 100]}

## Experiment

In [46]:
split_train_x, split_test_x, split_train_y, split_test_y = train_test_split(data.drop(columns=list(data.columns[1:8])), data[data.columns[1:7]], test_size=0.2, random_state=666)

### Remember

#### Data Preparation

In [47]:
remember_x, remember_y = split_train_x.to_numpy(), split_train_y['Remember'].astype('long').to_numpy()#rus(split_train_x, split_train_y['Remember'].to_numpy())

In [48]:
remember_x.shape, remember_y.shape

((17104, 94), (17104,))

#### BERT Experiment

In [49]:
remember_x_bert = remember_x[:, 0].tolist()

In [52]:
if RUN_DL:
    remember_bert = createBERT('remember', remember_x_bert, remember_y, split_test_x['Learning_outcome'].tolist(), split_test_y['Remember'].astype('long').to_numpy(), 64)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": false,
  "vocab_size": 30522
}



loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a

Started training model for column remember


Step,Training Loss,Validation Loss,F1
10,0.5122,0.213149,0.485719
20,0.152,0.205192,0.485719
30,0.1583,0.103703,0.815644
40,0.1081,0.115944,0.846829
50,0.1235,0.083672,0.85438
60,0.067,0.093513,0.821066
70,0.0632,0.097263,0.856249
80,0.0631,0.072625,0.895086
90,0.0724,0.071031,0.880947
100,0.0806,0.070725,0.87208


***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to remember/checkpoint-10
Configuration saved in remember/checkpoint-10/config.json
Model weights saved in remember/checkpoint-10/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to remember/checkpoint-20
Configuration saved in remember/checkpoint-20/config.json
Model weights saved in remember/checkpoint-20/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to remember/checkpoint-30
Configuration saved in remember/checkpoint-30/config.json
Model weights saved in remember/checkpoint-30/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to remember/checkpoint-40
Configuration saved in remember/checkpoint-40/config.json
Model weights saved in remember/checkpoint-40/pytorch_model.bin
Deleting older checkpoint [r

Training Completed. Started testing...


Accuracy Score ->  0.990645463049579
Kappa Score ->  0.9106600936230403
ROC AUC Score ->  0.9553300468115202
F1 Score ->  0.9156118143459916
Classification report -> 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4039
           1       0.92      0.92      0.92       237

    accuracy                           0.99      4276
   macro avg       0.96      0.96      0.96      4276
weighted avg       0.99      0.99      0.99      4276



#### Traditional ML Algorithm

In [53]:
combined_remember_x, column_names_remember, test_remember_x = generateX(remember_x, split_test_x.to_numpy(), 0, 1, 94)
train_remember_x = combined_remember_x
train_remember_y = remember_y
test_remember_y = split_test_y['Remember'].astype('long').to_numpy()

Getting Unigram...


Getting Bigram...
Getting Tfidf...
Getting ARI...
Combining...
Generated feature shape is (17104, 3094)
Generated test feature is (4276, 3094)


In [54]:
column_names_remember += data.columns[8:].tolist()

##### Naive Bayes

In [55]:
gnb_remember = GaussianNB()
gnb_remember_gs = GridSearchCV(gnb_remember, params_nb, scoring="f1", n_jobs=-1, cv=3, verbose=3)
gnb_remember_gs.fit(train_remember_x, train_remember_y)
pred_remember_y_gnb = gnb_remember_gs.predict(test_remember_x)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fitting 3 folds for each of 3 candidates, totalling 9 fits


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [56]:
gnb_remember_gs.best_params_

{'var_smoothing': 1e-08}

In [57]:
performancePrinter(test_remember_y, pred_remember_y_gnb)

Accuracy Score ->  0.6396164639850327
Kappa Score ->  0.11147063503407495
ROC AUC Score ->  0.7158971128543117
F1 Score ->  0.1978136387298282
Classification report -> 
               precision    recall  f1-score   support

           0       0.98      0.63      0.77      4039
           1       0.11      0.80      0.20       237

    accuracy                           0.64      4276
   macro avg       0.55      0.72      0.48      4276
weighted avg       0.93      0.64      0.74      4276



##### Support Vector Machine

In [58]:
svm_remember = SVC()
svm_remember_gs = GridSearchCV(svm_remember, params_svm, scoring="f1", n_jobs=-1, cv=3, verbose=3)
svm_remember_gs.fit(train_remember_x, train_remember_y)
pred_remember_y_svm = svm_remember_gs.predict(test_remember_x)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 2/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.000 total time= 9.3min
[CV 1/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.000 total time= 9.4min
[CV 3/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.000 total time= 9.8min
[CV 1/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.799 total time= 9.9min
[CV 3/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.796 total time=10.0min
[CV 2/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.777 total time=10.0min
[CV 1/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.799 total time=10.0min
[CV 2/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.777 total time=10.3min
[CV 3/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.796 total time=10.6min
[CV 1/3] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.000 total time=10.8min
[CV 2/3] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.000 total time=11.0min
[CV 1/3] END .....C=1, gamma=scale, kernel=poly;

In [59]:
svm_remember_gs.best_params_

{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}

In [60]:
performancePrinter(test_remember_y, pred_remember_y_svm)

Accuracy Score ->  0.9815247895229187
Kappa Score ->  0.8273257585206419
ROC AUC Score ->  0.9226993563807726
F1 Score ->  0.8371134020618556
Classification report -> 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4039
           1       0.82      0.86      0.84       237

    accuracy                           0.98      4276
   macro avg       0.91      0.92      0.91      4276
weighted avg       0.98      0.98      0.98      4276



##### Logistic Regression

In [61]:
lr_remember = LogisticRegression()
lr_remember_gs = GridSearchCV(lr_remember, params_lr, scoring="f1", n_jobs=-1, cv=3, verbose=3)
lr_remember_gs.fit(train_remember_x, train_remember_y)
pred_remember_y_lr = lr_remember_gs.predict(test_remember_x)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alread

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.030 total time=  54.6s
[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.019 total time=  58.9s
[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.036 total time=  59.7s
[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.019 total time=  53.2s
[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.030 total time=  54.8s
[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.048 total time=  55.1s
[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.030 total time= 1.0min
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.019 total time= 1.0min
[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.048 total time= 1.0min
[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.019 tot



[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.054 total time= 2.1min




[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.084 total time= 2.0min




[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.072 total time= 2.1min




[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.099 total time= 2.0min




[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.072 total time= 2.0min
[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.093 total time= 2.2min




[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.093 total time= 2.2min
[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.084 total time= 2.1min
[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.054 total time= 2.3min
[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.066 total time= 2.2min




[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.099 total time= 2.1min
[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.066 total time= 2.3min




[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.078 total time= 2.0min




[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.048 total time= 1.0min
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.095 total time= 2.0min
[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.099 total time= 2.0min




[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.078 total time= 2.0min




[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.095 total time= 2.1min
[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.099 total time= 2.1min
[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.019 total time=  56.9s
[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.030 total time= 1.0min
[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.048 total time= 1.0min
[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.030 total time=  58.9s
[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.019 total time= 1.0min
[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.030 total time= 1.4min
[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.048 total time= 1.4min
[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.019 total



[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.095 total time= 1.9min




[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.078 total time= 2.1min
[CV 2/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.019 total time= 1.0min




[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.099 total time= 2.9min
[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.090 total time= 2.9min




[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.072 total time= 3.1min




[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.099 total time= 3.0min
[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.072 total time= 3.1min




[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.078 total time= 1.9min
[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.090 total time= 3.1min




[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.099 total time= 2.1min




[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.095 total time= 2.0min




[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.099 total time= 2.1min
[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.048 total time= 1.0min
[CV 2/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.285 total time= 4.7min
[CV 1/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.317 total time= 5.0min




[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.281 total time= 5.8min




[CV 3/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.258 total time= 5.7min




[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.257 total time= 6.0min




[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.303 total time= 5.1min




[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.257 total time= 5.7min
[CV 1/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.317 total time= 5.1min




[CV 2/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.306 total time= 5.1min




[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.281 total time= 5.9min




[CV 3/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.258 total time= 6.0min




[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.350 total time= 4.9min
[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.303 total time= 5.1min
[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.019 total time= 1.4min




[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.342 total time= 5.0min




[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.030 total time= 1.6min
[CV 2/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.095 total time= 2.0min
[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.345 total time= 5.0min
[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.099 total time= 2.0min
[CV 1/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.078 total time= 2.1min
[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.048 total time= 1.6min
[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.030 total time=  58.2s
[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.322 total time= 5.1min




[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.019 total time= 1.0min
[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.342 total time= 5.1min




[CV 1/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.078 total time= 2.1min
[CV 2/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.095 total time= 2.0min
[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.326 total time= 5.2min
[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.048 total time=  58.1s




[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.099 total time= 2.1min
[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.030 total time=  59.3s
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.019 total time=  54.7s
[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.048 total time=  58.0s
[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.030 total time=  53.8s
[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.019 total time=  54.0s
[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.030 total time= 1.4min
[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.019 total time= 1.6min
[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.048 total time= 1.6min
[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.048 total time= 1



[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.078 total time= 1.9min




[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.019 total time=  54.9s




[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.030 total time= 1.0min




[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.095 total time= 2.1min




[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.078 total time= 3.0min




[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.078 total time= 2.0min




[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.099 total time= 2.1min




[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.095 total time= 3.3min
[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.095 total time= 1.9min
[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.095 total time= 3.2min




[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.099 total time= 3.3min
[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.078 total time= 3.4min




[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.099 total time= 3.3min
[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.048 total time=  59.6s




[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.099 total time= 2.1min
[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.322 total time= 4.8min




[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.345 total time= 4.9min




[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.345 total time= 5.1min




[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.335 total time= 5.0min
[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.335 total time= 5.1min




[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.322 total time= 5.1min
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.322 total time= 4.8min




[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.342 total time= 4.8min




[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.345 total time= 4.9min
[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.078 total time= 2.1min
[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.019 total time= 1.4min
[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.030 total time= 1.4min




[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.342 total time= 5.0min
[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.095 total time= 2.1min
[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.099 total time= 2.0min
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.322 total time= 5.0min




[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.345 total time= 5.2min




[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.048 total time= 1.6min




[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.078 total time= 2.0min
[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.030 total time= 1.0min




[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.019 total time= 1.0min
[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.048 total time=  56.4s




[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.095 total time= 2.1min




[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.099 total time= 2.1min
[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.030 total time=  53.4s
[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.019 total time=  56.0s
[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.308 total time= 7.6min




[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.048 total time=  59.7s




[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.323 total time= 7.6min
[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.345 total time= 7.5min




[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.345 total time= 7.8min
[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.308 total time= 7.5min




[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.323 total time= 7.5min
[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.322 total time= 4.1min




[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.342 total time= 4.1min




[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.350 total time= 4.4min




[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.342 total time= 4.1min




[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.326 total time= 4.2min
[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.345 total time= 4.2min




[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.322 total time= 4.1min
[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.350 total time= 4.1min




[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.326 total time= 4.2min
[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.342 total time= 4.3min




[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.345 total time= 4.4min




[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.342 total time= 4.4min
[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.322 total time= 6.4min




[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.345 total time= 6.6min




[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.334 total time= 6.5min




[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.322 total time= 6.5min




[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.334 total time= 6.5min




[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.345 total time= 7.0min




In [62]:
lr_remember_gs.best_params_

{'C': 10, 'max_iter': 500, 'penalty': 'none', 'solver': 'saga', 'tol': 0.0001}

In [63]:
performancePrinter(test_remember_y, pred_remember_y_lr)

Accuracy Score ->  0.9597754911131899
Kappa Score ->  0.4849022642751588
ROC AUC Score ->  0.6808208573998452
F1 Score ->  0.5028901734104045
Classification report -> 
               precision    recall  f1-score   support

           0       0.96      0.99      0.98      4039
           1       0.80      0.37      0.50       237

    accuracy                           0.96      4276
   macro avg       0.88      0.68      0.74      4276
weighted avg       0.95      0.96      0.95      4276



##### Random Forest

In [64]:
rf_remember = RandomForestClassifier()
rf_remember_gs = GridSearchCV(rf_remember, params_rf, scoring="f1", n_jobs=-1, cv=3, verbose=3)
rf_remember_gs.fit(train_remember_x, train_remember_y)
pred_remember_y_rf = rf_remember_gs.predict(test_remember_x)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alrea

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [65]:
rf_remember_gs.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 250}

In [66]:
performancePrinter(test_remember_y, pred_remember_y_rf)

Accuracy Score ->  0.9831618334892422
Kappa Score ->  0.8297196396077499
ROC AUC Score ->  0.8917913215348663
F1 Score ->  0.8385650224215246
Classification report -> 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4039
           1       0.89      0.79      0.84       237

    accuracy                           0.98      4276
   macro avg       0.94      0.89      0.91      4276
weighted avg       0.98      0.98      0.98      4276



##### XGBoost

In [67]:
xgb_remember = XGBClassifier()
xgb_remember_gs = GridSearchCV(xgb_remember, params_xgb, scoring="f1", n_jobs=-1, cv=3)
xgb_remember_gs.fit(train_remember_x, train_remember_y)
pred_remember_y_xgb = xgb_remember_gs.predict(test_remember_x)

  from pandas import MultiIndex, Int64Index


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
































































































In [68]:
xgb_remember_gs.best_params_

{'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

In [69]:
performancePrinter(test_remember_y, pred_remember_y_xgb)

Accuracy Score ->  0.9808231992516371
Kappa Score ->  0.8197173945011111
ROC AUC Score ->  0.9163702424567219
F1 Score ->  0.8298755186721992
Classification report -> 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4039
           1       0.82      0.84      0.83       237

    accuracy                           0.98      4276
   macro avg       0.90      0.92      0.91      4276
weighted avg       0.98      0.98      0.98      4276



### Understand

#### Data Preparation

In [70]:
understand_x, understand_y = split_train_x.to_numpy(), split_train_y['Understand'].astype('long').to_numpy() #rus(split_train_x, split_train_y['Understand'].to_numpy())

In [71]:
understand_x.shape

(17104, 94)

#### BERT Experiment

In [72]:
understand_x_bert = understand_x[:, 0].tolist()

In [73]:
if RUN_DL:
    understand_bert = createBERT('understand', understand_x_bert, understand_y, split_test_x['Learning_outcome'].tolist(), split_test_y['Understand'].astype('long').to_numpy(), 64)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": false,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/m

Started training model for column understand


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64


Saving model checkpoint to understand/checkpoint-10
Configuration saved in understand/checkpoint-10/config.json
Model weights saved in understand/checkpoint-10/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to understand/checkpoint-20
Configuration saved in understand/checkpoint-20/config.json
Model weights saved in understand/checkpoint-20/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to understand/checkpoint-30
Configuration saved in understand/checkpoint-30/config.json
Model weights saved in understand/checkpoint-30/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to understand/checkpoint-40
Configuration saved in understand/checkpoint-40/config.json
Model weights saved in understand/checkpoint-40/pytorch_model.bin
Deleting older checkpoint [understand/checkpoint-10] due to args.save_total

Training Completed. Started testing...


Accuracy Score ->  0.9635173058933583
Kappa Score ->  0.908479543396537
ROC AUC Score ->  0.9484682488079758
F1 Score ->  0.9336170212765957
Classification report -> 
               precision    recall  f1-score   support

           0       0.97      0.98      0.97      3076
           1       0.95      0.91      0.93      1200

    accuracy                           0.96      4276
   macro avg       0.96      0.95      0.95      4276
weighted avg       0.96      0.96      0.96      4276



#### Traditional ML Algorithm

In [74]:
combined_understand_x, column_names_understand, test_understand_x = generateX(understand_x, split_test_x.to_numpy(), 0, 1, 94)
train_understand_x = combined_understand_x
train_understand_y = understand_y
test_understand_y = split_test_y['Understand'].astype('long').to_numpy()

Getting Unigram...
Getting Bigram...
Getting Tfidf...
Getting ARI...
Combining...
Generated feature shape is (17104, 3094)
Generated test feature is (4276, 3094)


In [75]:
column_names_understand += data.columns[8:].tolist()

##### Naive Bayes

In [76]:
gnb_understand = GaussianNB()
gnb_understand_gs = GridSearchCV(gnb_understand, params_nb, scoring="f1", n_jobs=-1, cv=3, verbose=3)
gnb_understand_gs.fit(train_understand_x, train_understand_y)
pred_understand_y_gnb = gnb_understand_gs.predict(test_understand_x)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [77]:
gnb_understand_gs.best_params_

{'var_smoothing': 1e-08}

In [78]:
performancePrinter(test_understand_y, pred_understand_y_gnb)

Accuracy Score ->  0.641955098222638
Kappa Score ->  0.3273875303759528
ROC AUC Score ->  0.7158154529692242
F1 Score ->  0.5808924171913495
Classification report -> 
               precision    recall  f1-score   support

           0       0.92      0.55      0.69      3076
           1       0.43      0.88      0.58      1200

    accuracy                           0.64      4276
   macro avg       0.68      0.72      0.63      4276
weighted avg       0.79      0.64      0.66      4276



##### Support Vector Machine

In [79]:
svm_understand = SVC()
svm_understand_gs = GridSearchCV(svm_understand, params_svm, scoring="f1", n_jobs=-1, cv=3, verbose=3)
svm_understand_gs.fit(train_understand_x, train_understand_y)
pred_understand_y_svm = svm_understand_gs.predict(test_understand_x)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[CV 2/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.008 total time=29.4min
[CV 2/3] END .....C=1, gamma=scale, kernel=poly;, score=0.415 total time=28.4min
[CV 3/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.012 total time=29.7min
[CV 3/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.854 total time=29.8min
[CV 3/3] END ....C=0.1, gamma=auto, kernel=poly;, score=0.785 total time=29.8min
[CV 1/3] END .....C=1, gamma=scale, kernel=poly;, score=0.424 total time=28.9min
[CV 1/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.006 total time=30.3min
[CV 3/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.854 total time=31.0min
[CV 3/3] END .....C=1, gamma=scale, kernel=poly;, score=0.424 total time=29.6min
[CV 1/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.852 total time=31.0min
[CV 1/3] END ....C=0.1, gamma=auto, kernel=poly;, score=0.798 total time=31.0min
[CV 2/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.857 total time=31.5min
[CV 2/3] END ....C=0.1, gamm

In [80]:
svm_understand_gs.best_params_

{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}

In [81]:
performancePrinter(test_understand_y, pred_understand_y_svm)

Accuracy Score ->  0.9216557530402245
Kappa Score ->  0.8012661606685524
ROC AUC Score ->  0.8911649328131773
F1 Score ->  0.8547897702644125
Classification report -> 
               precision    recall  f1-score   support

           0       0.93      0.96      0.95      3076
           1       0.89      0.82      0.85      1200

    accuracy                           0.92      4276
   macro avg       0.91      0.89      0.90      4276
weighted avg       0.92      0.92      0.92      4276



In [82]:
svm_understand = SVC(C=0.1, gamma='scale', kernel='linear')
svm_understand.fit(train_understand_x, train_understand_y)
pred_understand_y = svm_understand.predict(test_understand_x)
performancePrinter(test_understand_y, pred_understand_y)

Accuracy Score ->  0.9216557530402245
Kappa Score ->  0.8012661606685524
ROC AUC Score ->  0.8911649328131773
F1 Score ->  0.8547897702644125
Classification report -> 
               precision    recall  f1-score   support

           0       0.93      0.96      0.95      3076
           1       0.89      0.82      0.85      1200

    accuracy                           0.92      4276
   macro avg       0.91      0.89      0.90      4276
weighted avg       0.92      0.92      0.92      4276



##### Logistic Regression

In [83]:
lr_understand = LogisticRegression()
lr_understand_gs = GridSearchCV(lr_understand, params_lr, scoring="f1", n_jobs=-1, cv=3, verbose=3)
lr_understand_gs.fit(train_understand_x, train_understand_y)
pred_understand_y_lr = lr_understand_gs.predict(test_understand_x)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alread



[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.575 total time=  53.9s
[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.588 total time=  49.4s
[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.617 total time=  49.2s
[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.608 total time=  51.1s
[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.598 total time= 1.0min
[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.594 total time= 1.0min
[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.589 total time=  50.9s
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.608 total time=  49.8s
[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.620 total time=  56.7s
[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.598 tot



[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.690 total time= 1.9min
[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.617 total time=  55.4s




[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.679 total time= 1.9min
[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.694 total time= 2.0min
[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.679 total time= 2.0min




[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.694 total time= 1.9min




[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.662 total time= 2.3min




[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.690 total time= 2.1min




[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.680 total time= 2.0min
[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.673 total time= 2.3min




[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.673 total time= 2.4min
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.694 total time= 2.1min




[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.662 total time= 2.4min
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.693 total time= 2.0min




[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.680 total time= 2.1min
[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.671 total time= 2.5min
[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.670 total time= 2.5min




[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.695 total time= 2.2min




[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.695 total time= 2.1min
[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.589 total time=  55.6s
[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.620 total time=  54.0s
[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.609 total time=  56.2s
[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.589 total time=  49.4s
[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.609 total time=  55.6s
[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.587 total time= 1.4min
[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.616 total time= 1.4min
[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.609 total time= 1.5min
[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.620 total time=



[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.680 total time= 2.0min




[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.693 total time= 2.1min




[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.695 total time= 2.1min




[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.680 total time= 2.1min
[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.620 total time=  53.5s




[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.693 total time= 2.0min
[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.695 total time= 2.0min




[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.678 total time= 3.5min




[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.692 total time= 3.5min
[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.692 total time= 3.6min
[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.688 total time= 3.5min
[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.677 total time= 3.6min
[CV 2/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.763 total time= 4.8min




[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.689 total time= 3.7min
[CV 1/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.748 total time= 5.1min




[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.737 total time= 6.0min




[CV 1/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.749 total time= 5.0min
[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.748 total time= 6.0min




[CV 2/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.764 total time= 4.9min




[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.759 total time= 5.2min




[CV 1/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.679 total time= 1.9min




[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.759 total time= 5.0min




[CV 3/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.744 total time= 6.2min
[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.737 total time= 6.0min
[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.748 total time= 6.0min
[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.768 total time= 4.8min




[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.768 total time= 4.8min




[CV 3/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.744 total time= 6.1min
[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.751 total time= 4.9min




[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.765 total time= 4.8min
[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.589 total time=  52.3s
[CV 2/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.694 total time= 2.0min




[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.765 total time= 5.0min
[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.751 total time= 5.1min




[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.695 total time= 2.0min
[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.609 total time= 1.5min
[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.587 total time= 1.6min
[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.608 total time=  54.3s
[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.620 total time=  53.4s
[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.616 total time= 1.5min




[CV 1/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.680 total time= 2.0min




[CV 2/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.693 total time= 2.0min




[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.695 total time= 2.1min
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.608 total time=  51.4s
[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.589 total time=  54.5s
[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.620 total time=  55.2s
[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.589 total time= 1.3min
[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.609 total time=  49.5s
[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.589 total time=  55.9s
[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.610 total time= 1.5min
[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.620 total time= 1.5min
[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.620 total time=  



[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.610 total time=  50.0s
[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.589 total time=  54.9s




[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.694 total time= 2.0min




[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.679 total time= 2.1min




[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.695 total time= 1.9min




[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.620 total time=  50.2s




[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.693 total time= 3.2min




[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.680 total time= 2.0min




[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.695 total time= 2.0min
[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.680 total time= 3.3min




[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.694 total time= 2.1min




[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.680 total time= 3.5min
[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.693 total time= 3.5min
[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.695 total time= 3.4min




[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.695 total time= 3.5min




[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.764 total time= 4.8min




[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.751 total time= 5.0min
[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.768 total time= 5.0min




[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.751 total time= 4.7min




[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.679 total time= 2.0min
[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.751 total time= 5.0min




[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.768 total time= 5.0min




[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.764 total time= 5.0min




[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.768 total time= 4.8min
[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.751 total time= 4.9min
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.768 total time= 4.9min




[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.696 total time= 2.0min




[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.694 total time= 2.1min
[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.589 total time= 1.3min




[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.765 total time= 5.1min




[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.765 total time= 5.1min
[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.620 total time= 1.4min
[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.609 total time= 1.4min




[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.589 total time=  50.4s




[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.679 total time= 1.9min




[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.610 total time=  54.9s




[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.693 total time= 2.0min
[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.619 total time=  54.4s




[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.695 total time= 2.1min
[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.589 total time=  49.7s
[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.610 total time=  49.9s
[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.620 total time=  55.7s




[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.765 total time= 8.4min
[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.749 total time= 8.4min
[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.762 total time= 8.4min




[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.765 total time= 8.4min
[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.763 total time= 8.4min




[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.749 total time= 8.9min




[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.765 total time= 4.1min




[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.765 total time= 4.3min




[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.768 total time= 4.3min
[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.751 total time= 4.1min




[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.751 total time= 4.5min




[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.768 total time= 4.5min




[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.765 total time= 4.2min




[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.751 total time= 4.2min




[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.765 total time= 4.1min




[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.751 total time= 4.8min




[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.768 total time= 4.8min




[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.768 total time= 4.7min




[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.768 total time= 6.7min




[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.764 total time= 6.8min




[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.751 total time= 6.8min




[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.768 total time= 6.9min




[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.751 total time= 7.0min




[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.764 total time= 6.9min




In [84]:
lr_understand_gs.best_params_

{'C': 1, 'max_iter': 500, 'penalty': 'none', 'solver': 'saga', 'tol': 0.0001}

In [85]:
performancePrinter(test_understand_y, pred_understand_y_lr)

Accuracy Score ->  0.8905519176800748
Kappa Score ->  0.7141064411940176
ROC AUC Score ->  0.8385435630689206
F1 Score ->  0.7868852459016394
Classification report -> 
               precision    recall  f1-score   support

           0       0.90      0.96      0.93      3076
           1       0.87      0.72      0.79      1200

    accuracy                           0.89      4276
   macro avg       0.88      0.84      0.86      4276
weighted avg       0.89      0.89      0.89      4276



##### Random Forest

In [86]:
rf_understand = RandomForestClassifier()
rf_understand_gs = GridSearchCV(rf_understand, params_rf, scoring="f1", n_jobs=-1, cv=3, verbose=3)
rf_understand_gs.fit(train_understand_x, train_understand_y)
pred_understand_y_rf = rf_understand_gs.predict(test_understand_x)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alrea

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [87]:
rf_understand_gs.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 250}

In [88]:
performancePrinter(test_understand_y, pred_understand_y_rf)

Accuracy Score ->  0.9197848456501403
Kappa Score ->  0.7926343798689639
ROC AUC Score ->  0.8796998266146511
F1 Score ->  0.8465324384787473
Classification report -> 
               precision    recall  f1-score   support

           0       0.92      0.97      0.95      3076
           1       0.91      0.79      0.85      1200

    accuracy                           0.92      4276
   macro avg       0.92      0.88      0.90      4276
weighted avg       0.92      0.92      0.92      4276



##### XGBoost

In [89]:
xgb_understand = XGBClassifier()
xgb_understand_gs = GridSearchCV(xgb_understand, params_xgb, scoring="f1", n_jobs=-1, cv=3)
xgb_understand_gs.fit(train_understand_x, train_understand_y)
pred_understand_y_xgb = xgb_understand_gs.predict(test_understand_x)

  from pandas import MultiIndex, Int64Index


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
















































































































In [90]:
xgb_understand_gs.best_params_

{'gamma': 0.1, 'learning_rate': 0.5, 'max_depth': 10, 'n_estimators': 100}

In [91]:
performancePrinter(test_understand_y, pred_understand_y_xgb)

Accuracy Score ->  0.9282039289055192
Kappa Score ->  0.8182551703323023
ROC AUC Score ->  0.9002904204594712
F1 Score ->  0.8673866090712743
Classification report -> 
               precision    recall  f1-score   support

           0       0.94      0.96      0.95      3076
           1       0.90      0.84      0.87      1200

    accuracy                           0.93      4276
   macro avg       0.92      0.90      0.91      4276
weighted avg       0.93      0.93      0.93      4276



### Apply

#### Data Preparation

In [92]:
apply_x, apply_y = split_train_x.to_numpy(), split_train_y['Apply'].astype('long').to_numpy()#rus(split_train_x, split_train_y['Apply'].to_numpy())

In [93]:
apply_x.shape

(17104, 94)

#### BERT Experiment

In [94]:
apply_x_bert = apply_x[:, 0].tolist()

In [95]:
if RUN_DL:    
    apply_bert = createBERT('apply', apply_x_bert, apply_y, split_test_x['Learning_outcome'].tolist(), split_test_y['Apply'].astype('long').to_numpy(), 64)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": false,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/m

Started training model for column apply


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to apply/checkpoint-10
Configuration saved in apply/checkpoint-10/config.json
Model weights saved in apply/checkpoint-10/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to apply/checkpoint-20
Configuration saved in apply/checkpoint-20/config.json
Model weights saved in apply/checkpoint-20/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to apply/checkpoint-30
Configuration saved in apply/checkpoint-30/config.json
Model weights saved in apply/checkpoint-30/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to apply/checkpoint-40
Configuration saved in apply/checkpoint-40/config.json
Model weights saved in apply/checkpoint-40/pytorch_model.bin
Deleting older checkpoint [apply/checkpoint-10] due to args.save

Training Completed. Started testing...


Accuracy Score ->  0.9630495790458372
Kappa Score ->  0.9095297423476137
ROC AUC Score ->  0.956342449260406
F1 Score ->  0.9354047424366312
Classification report -> 
               precision    recall  f1-score   support

           0       0.98      0.97      0.97      3060
           1       0.93      0.94      0.94      1216

    accuracy                           0.96      4276
   macro avg       0.95      0.96      0.95      4276
weighted avg       0.96      0.96      0.96      4276



#### Traditional ML Algorithm

In [96]:
combined_apply_x, column_names_apply, test_apply_x = generateX(apply_x, split_test_x.to_numpy(), 0, 1, 94)
train_apply_x = combined_apply_x
train_apply_y = apply_y
test_apply_y = split_test_y['Apply'].astype('long').to_numpy()

Getting Unigram...


Getting Bigram...
Getting Tfidf...
Getting ARI...
Combining...
Generated feature shape is (17104, 3094)
Generated test feature is (4276, 3094)


In [97]:
column_names_apply += data.columns[8:].tolist()

##### Naive Bayes

In [98]:
gnb_apply = GaussianNB()
gnb_apply_gs = GridSearchCV(gnb_apply, params_nb, scoring="f1", n_jobs=-1, cv=3, verbose=3)
gnb_apply_gs.fit(train_apply_x, train_apply_y)
pred_apply_y_gnb = gnb_apply_gs.predict(test_apply_x)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible


	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after par

In [99]:
gnb_apply_gs.best_params_

{'var_smoothing': 1e-08}

In [100]:
performancePrinter(test_apply_y, pred_apply_y_gnb)

Accuracy Score ->  0.7778297474275023
Kappa Score ->  0.5071471070401455
ROC AUC Score ->  0.780842578259374
F1 Score ->  0.6685275645498953
Classification report -> 
               precision    recall  f1-score   support

           0       0.90      0.77      0.83      3060
           1       0.58      0.79      0.67      1216

    accuracy                           0.78      4276
   macro avg       0.74      0.78      0.75      4276
weighted avg       0.81      0.78      0.79      4276



##### Support Vector Machine

In [101]:
svm_apply = SVC()
svm_apply_gs = GridSearchCV(svm_apply, params_svm, scoring="f1", n_jobs=-1, cv=3, verbose=3)
svm_apply_gs.fit(train_apply_x, train_apply_y)
pred_apply_y_svm = svm_apply_gs.predict(test_apply_x)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[CV 1/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.860 total time=31.6min
[CV 2/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.000 total time=32.2min
[CV 3/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.850 total time=32.1min
[CV 3/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.000 total time=32.3min
[CV 1/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.000 total time=32.5min
[CV 1/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.860 total time=32.7min
[CV 3/3] END ....C=0.1, gamma=auto, kernel=poly;, score=0.784 total time=32.4min
[CV 3/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.850 total time=32.8min
[CV 1/3] END ....C=0.1, gamma=auto, kernel=poly;, score=0.788 total time=32.7min
[CV 2/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.858 total time=33.0min
[CV 2/3] END ....C=0.1, gamma=auto, kernel=poly;, score=0.803 total time=33.0min
[CV 1/3] END .....C=1, gamma=scale, kernel=poly;, score=0.012 total time=32.6min
[CV 2/3] END .C=0.1, gamma=s

In [102]:
svm_apply_gs.best_params_

{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}

In [103]:
performancePrinter(test_apply_y, pred_apply_y_svm)

Accuracy Score ->  0.9230589335827877
Kappa Score ->  0.8050196892409649
ROC AUC Score ->  0.8902423030615756
F1 Score ->  0.8575140753572976
Classification report -> 
               precision    recall  f1-score   support

           0       0.93      0.97      0.95      3060
           1       0.91      0.81      0.86      1216

    accuracy                           0.92      4276
   macro avg       0.92      0.89      0.90      4276
weighted avg       0.92      0.92      0.92      4276



##### Logistic Regression

In [104]:
lr_apply = LogisticRegression()
lr_apply_gs = GridSearchCV(lr_apply, params_lr, scoring="f1", n_jobs=-1, cv=3, verbose=3)
lr_apply_gs.fit(train_apply_x, train_apply_y)
pred_apply_y_lr = lr_apply_gs.predict(test_apply_x)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alread

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.587 total time=  53.8s
[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.620 total time=  45.9s
[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.613 total time=  45.3s
[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.597 total time=  48.9s
[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.608 total time=  58.1s
[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.602 total time= 1.0min
[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.598 total time=  52.1s
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.621 total time=  51.2s
[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.616 total time=  53.6s
[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.585 tot



[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.614 total time=  50.6s




[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.725 total time= 1.9min




[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.735 total time= 1.9min




[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.746 total time= 2.0min




[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.734 total time= 1.9min
[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.725 total time= 2.0min




[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.745 total time= 2.1min




[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.739 total time= 2.4min




[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.720 total time= 2.4min




[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.717 total time= 2.4min
[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.738 total time= 2.0min
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.748 total time= 2.0min
[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.726 total time= 2.0min




[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.717 total time= 2.5min




[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.726 total time= 2.2min
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.748 total time= 2.1min
[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.720 total time= 2.4min




[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.738 total time= 2.1min




[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.740 total time= 2.6min
[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.599 total time=  50.1s
[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.622 total time=  48.8s
[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.616 total time=  51.8s
[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.598 total time=  50.6s
[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.620 total time=  48.8s
[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.616 total time=  48.4s
[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.622 total time= 1.3min
[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.597 total time= 1.4min
[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.613 total time= 1



[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.726 total time= 2.1min




[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.748 total time= 2.1min
[CV 2/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.789 total time= 4.1min
[CV 1/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.766 total time= 4.3min




[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.738 total time= 1.9min




[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.726 total time= 2.1min
[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.738 total time= 1.9min
[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.762 total time= 5.2min
[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.772 total time= 4.2min
[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.616 total time=  54.6s




[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.748 total time= 2.1min
[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.725 total time= 3.4min
[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.737 total time= 3.3min
[CV 3/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.768 total time= 5.1min
[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.782 total time= 5.4min




[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.747 total time= 3.6min




[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.725 total time= 3.6min




[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.738 total time= 3.6min
[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.747 total time= 3.6min
[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.776 total time= 4.2min




[CV 1/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.775 total time= 4.9min
[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.795 total time= 4.4min




[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.780 total time= 4.8min
[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.773 total time= 4.5min




[CV 2/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.796 total time= 5.1min




[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.597 total time=  53.8s
[CV 1/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.726 total time= 2.0min
[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.622 total time=  50.5s




[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.767 total time= 6.1min




[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.597 total time= 1.3min
[CV 3/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.772 total time= 6.1min




[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.615 total time= 1.3min
[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.614 total time=  52.7s




[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.786 total time= 6.2min
[CV 2/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.748 total time= 2.1min
[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.619 total time= 1.5min




[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.780 total time= 5.0min




[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.738 total time= 2.1min




[CV 1/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.726 total time= 2.0min
[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.781 total time= 5.0min




[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.801 total time= 5.1min
[CV 2/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.748 total time= 2.0min
[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.599 total time=  48.8s




[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.738 total time= 2.0min
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.622 total time=  52.4s
[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.616 total time=  49.8s
[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.598 total time=  47.7s
[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.622 total time=  47.4s
[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.616 total time=  47.6s
[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.621 total time= 1.3min
[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.616 total time= 1.4min
[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.598 total time= 1.5min




[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.622 total time=  51.5s
[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.597 total time=  53.9s




[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.726 total time= 2.0min




[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.748 total time= 2.1min




[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.726 total time= 3.1min
[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.726 total time= 1.9min




[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.615 total time=  47.2s




[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.739 total time= 3.2min
[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.738 total time= 2.1min




[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.726 total time= 3.1min
[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.747 total time= 3.2min
[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.748 total time= 2.1min




[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.748 total time= 3.2min
[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.738 total time= 2.1min
[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.794 total time= 4.4min
[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.772 total time= 4.5min




[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.738 total time= 3.3min
[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.776 total time= 4.4min
[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.776 total time= 4.1min
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.795 total time= 4.3min




[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.773 total time= 4.6min




[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.801 total time= 5.0min
[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.780 total time= 5.0min




[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.599 total time=  47.5s
[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.621 total time=  47.2s
[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.726 total time= 2.1min




[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.801 total time= 4.7min
[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.781 total time= 5.2min




[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.781 total time= 4.7min




[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.748 total time= 2.2min




[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.616 total time= 1.3min




[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.598 total time= 1.4min
[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.622 total time= 1.4min




[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.739 total time= 2.0min
[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.616 total time=  52.0s




[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.780 total time= 5.2min




[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.726 total time= 2.0min




[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.738 total time= 2.0min




[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.748 total time= 2.1min
[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.598 total time=  50.3s
[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.615 total time=  48.4s
[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.623 total time=  52.1s
[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.776 total time= 7.4min
[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.794 total time= 7.6min
[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.771 total time= 7.7min




[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.780 total time= 8.3min




[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.779 total time= 8.7min




[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.800 total time= 8.7min
[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.772 total time= 4.0min
[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.794 total time= 3.9min
[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.776 total time= 3.6min
[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.776 total time= 4.0min
[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.795 total time= 3.8min
[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.773 total time= 3.9min




[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.780 total time= 4.2min
[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.801 total time= 4.1min




[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.781 total time= 4.3min




[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.781 total time= 4.1min




[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.780 total time= 4.2min
[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.801 total time= 4.2min
[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.794 total time= 5.8min
[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.776 total time= 5.9min
[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.773 total time= 6.7min




[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.801 total time= 6.6min




[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.780 total time= 6.9min
[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.781 total time= 6.8min




In [105]:
lr_apply_gs.best_params_

{'C': 0.1, 'max_iter': 500, 'penalty': 'none', 'solver': 'saga', 'tol': 0.0001}

In [106]:
performancePrinter(test_apply_y, pred_apply_y_lr)

Accuracy Score ->  0.8961646398503275
Kappa Score ->  0.7255734997883849
ROC AUC Score ->  0.8372570519435845
F1 Score ->  0.793296089385475
Classification report -> 
               precision    recall  f1-score   support

           0       0.89      0.97      0.93      3060
           1       0.91      0.70      0.79      1216

    accuracy                           0.90      4276
   macro avg       0.90      0.84      0.86      4276
weighted avg       0.90      0.90      0.89      4276



##### Random Forest

In [107]:
rf_apply = RandomForestClassifier()
rf_apply_gs = GridSearchCV(rf_apply, params_rf, scoring="f1", n_jobs=-1, cv=3, verbose=3)
rf_apply_gs.fit(train_apply_x, train_apply_y)
pred_apply_y_rf = rf_apply_gs.predict(test_apply_x)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alrea

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [108]:
rf_apply_gs.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 250}

In [109]:
performancePrinter(test_apply_y, pred_apply_y_rf)

Accuracy Score ->  0.9359214218896165
Kappa Score ->  0.837073864268455
ROC AUC Score ->  0.9044327270381837
F1 Score ->  0.8806620209059234
Classification report -> 
               precision    recall  f1-score   support

           0       0.94      0.98      0.96      3060
           1       0.94      0.83      0.88      1216

    accuracy                           0.94      4276
   macro avg       0.94      0.90      0.92      4276
weighted avg       0.94      0.94      0.93      4276



##### XGBoost

In [110]:
xgb_apply = XGBClassifier()
xgb_apply_gs = GridSearchCV(xgb_apply, params_xgb, scoring="f1", n_jobs=-1, cv=3)
xgb_apply_gs.fit(train_apply_x, train_apply_y)
pred_apply_y_xgb = xgb_apply_gs.predict(test_apply_x)

  from pandas import MultiIndex, Int64Index


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




















































































































In [111]:
xgb_apply_gs.best_params_

{'gamma': 0.5, 'learning_rate': 0.5, 'max_depth': 10, 'n_estimators': 100}

In [112]:
performancePrinter(test_apply_y, pred_apply_y_xgb)

Accuracy Score ->  0.9375584658559402
Kappa Score ->  0.8436813525347704
ROC AUC Score ->  0.914001225490196
F1 Score ->  0.886720407297412
Classification report -> 
               precision    recall  f1-score   support

           0       0.95      0.97      0.96      3060
           1       0.92      0.86      0.89      1216

    accuracy                           0.94      4276
   macro avg       0.93      0.91      0.92      4276
weighted avg       0.94      0.94      0.94      4276



### Analyze

#### Data Preparation

In [113]:
analyze_x, analyze_y = split_train_x.to_numpy(), split_train_y['Analyze'].astype('long').to_numpy()#rus(split_train_x, split_train_y['Analyze'].to_numpy())

In [114]:
analyze_x.shape

(17104, 94)

#### BERT Experiment

In [115]:
analyze_x_bert = analyze_x[:, 0].tolist()

In [116]:
if RUN_DL:
    analyze_bert = createBERT('analyze', analyze_x_bert, analyze_y, split_test_x['Learning_outcome'].tolist(), split_test_y['Analyze'].astype('long').to_numpy(), 64)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": false,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/m

***** Running training *****
  Num examples = 13683
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 642


Started training model for column analyze


Step,Training Loss,Validation Loss,F1
10,0.4449,0.394028,0.456121
20,0.3191,0.277553,0.456121
30,0.2354,0.198349,0.88358
40,0.1947,0.169187,0.902331
50,0.1372,0.183021,0.899473
60,0.2147,0.157006,0.906942
70,0.1896,0.151012,0.910036
80,0.1349,0.140924,0.918155
90,0.1315,0.143762,0.915158
100,0.1291,0.142527,0.908937


***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to analyze/checkpoint-10
Configuration saved in analyze/checkpoint-10/config.json
Model weights saved in analyze/checkpoint-10/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to analyze/checkpoint-20
Configuration saved in analyze/checkpoint-20/config.json
Model weights saved in analyze/checkpoint-20/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to analyze/checkpoint-30
Configuration saved in analyze/checkpoint-30/config.json
Model weights saved in analyze/checkpoint-30/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to analyze/checkpoint-40
Configuration saved in analyze/checkpoint-40/config.json
Model weights saved in analyze/checkpoint-40/pytorch_model.bin
Deleting older checkpoint [analyze/check

Training Completed. Started testing...


Accuracy Score ->  0.9742750233863424
Kappa Score ->  0.9052870967261264
ROC AUC Score ->  0.9484907674351325
F1 Score ->  0.9206349206349206
Classification report -> 
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      3575
           1       0.93      0.91      0.92       701

    accuracy                           0.97      4276
   macro avg       0.96      0.95      0.95      4276
weighted avg       0.97      0.97      0.97      4276



#### Traditional ML Algorithm

In [117]:
combined_analyze_x, column_names_analyze, test_analyze_x = generateX(analyze_x, split_test_x.to_numpy(), 0, 1, 94)
train_analyze_x = combined_analyze_x
train_analyze_y = analyze_y
test_analyze_y = split_test_y['Analyze'].astype('long').to_numpy()

Getting Unigram...
Getting Bigram...


Getting Tfidf...
Getting ARI...
Combining...
Generated feature shape is (17104, 3094)
Generated test feature is (4276, 3094)


In [118]:
column_names_analyze += data.columns[8:].tolist()

##### Naive Bayes

In [119]:
gnb_analyze = GaussianNB()
gnb_analyze_gs = GridSearchCV(gnb_analyze, params_nb, scoring="f1", n_jobs=-1, cv=3, verbose=3)
gnb_analyze_gs.fit(train_analyze_x, train_analyze_y)
pred_analyze_y_gnb = gnb_analyze_gs.predict(test_analyze_x)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [120]:
gnb_analyze_gs.best_params_

{'var_smoothing': 1e-08}

In [121]:
performancePrinter(test_analyze_y, pred_analyze_y_gnb)

Accuracy Score ->  0.5493451824134705
Kappa Score ->  0.1829401988743411
ROC AUC Score ->  0.6840435741149008
F1 Score ->  0.3915377328702242
Classification report -> 
               precision    recall  f1-score   support

           0       0.96      0.48      0.64      3575
           1       0.25      0.88      0.39       701

    accuracy                           0.55      4276
   macro avg       0.60      0.68      0.52      4276
weighted avg       0.84      0.55      0.60      4276



##### Support Vector Machine

In [122]:
svm_analyze = SVC()
svm_analyze_gs = GridSearchCV(svm_analyze, params_svm, scoring="f1", n_jobs=-1, cv=3, verbose=3)
svm_analyze_gs.fit(train_analyze_x, train_analyze_y)
pred_analyze_y_svm = svm_analyze_gs.predict(test_analyze_x)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[CV 1/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.000 total time=18.7min
[CV 1/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.844 total time=19.3min
[CV 3/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.829 total time=19.4min
[CV 2/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.000 total time=19.4min
[CV 2/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.856 total time=19.5min
[CV 3/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.829 total time=19.3min
[CV 2/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.856 total time=19.5min
[CV 3/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.000 total time=19.8min
[CV 2/3] END ....C=0.1, gamma=auto, kernel=poly;, score=0.766 total time=19.8min
[CV 3/3] END ....C=0.1, gamma=auto, kernel=poly;, score=0.758 total time=20.1min
[CV 1/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.844 total time=20.4min
[CV 1/3] END ....C=0.1, gamma=auto, kernel=poly;, score=0.778 total time=20.5min
[CV 3/3] END .....C=1, gamma

In [123]:
svm_analyze_gs.best_params_

{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}

In [124]:
performancePrinter(test_analyze_y, pred_analyze_y_svm)

Accuracy Score ->  0.9560336763330215
Kappa Score ->  0.8321028208567374
ROC AUC Score ->  0.8974432129924284
F1 Score ->  0.8580060422960725
Classification report -> 
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      3575
           1       0.91      0.81      0.86       701

    accuracy                           0.96      4276
   macro avg       0.94      0.90      0.92      4276
weighted avg       0.96      0.96      0.95      4276



##### Logistic Regression

In [125]:
lr_analyze = LogisticRegression()
lr_analyze_gs = GridSearchCV(lr_analyze, params_lr, scoring="f1", n_jobs=-1, cv=3, verbose=3)
lr_analyze_gs.fit(train_analyze_x, train_analyze_y)
pred_analyze_y_lr = lr_analyze_gs.predict(test_analyze_x)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alread

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.423 total time=  55.7s
[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.471 total time=  55.3s
[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.439 total time=  47.5s
[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.471 total time=  48.6s
[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.460 total time=  57.8s
[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.484 total time=  50.9s
[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.445 total time=  49.4s
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.489 total time=  51.5s
[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.473 total time=  53.0s
[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.423 tot



[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.650 total time= 2.0min




[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.666 total time= 2.0min
[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.650 total time= 2.0min
[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.625 total time= 2.0min




[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.648 total time= 2.2min




[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.666 total time= 2.0min




[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.625 total time= 2.1min




[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.654 total time= 2.0min




[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.615 total time= 2.3min
[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.615 total time= 2.4min
[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.654 total time= 2.0min
[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.660 total time= 2.4min
[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.630 total time= 2.0min




[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.659 total time= 2.4min
[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.648 total time= 2.4min




[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.669 total time= 2.1min
[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.630 total time= 2.0min




[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.669 total time= 2.1min
[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.445 total time=  52.7s
[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.490 total time=  52.9s
[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.471 total time=  54.4s
[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.488 total time=  48.2s
[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.445 total time=  50.9s
[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.471 total time=  52.1s
[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.446 total time= 1.4min
[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.469 total time= 1.5min
[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.487 total time=



[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.669 total time= 2.0min
[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.654 total time= 2.1min
[CV 1/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.720 total time= 4.1min
[CV 2/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.724 total time= 4.2min




[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.629 total time= 2.0min




[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.654 total time= 2.1min
[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.472 total time=  48.8s
[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.691 total time= 4.1min




[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.711 total time= 5.2min
[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.629 total time= 1.9min




[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.654 total time= 3.4min
[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.714 total time= 5.3min
[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.626 total time= 3.4min




[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.669 total time= 2.1min




[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.626 total time= 3.5min




[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.667 total time= 3.5min
[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.668 total time= 3.6min
[CV 3/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.696 total time= 5.3min




[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.654 total time= 3.6min
[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.704 total time= 4.4min




[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.722 total time= 4.5min




[CV 3/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.698 total time= 5.8min
[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.731 total time= 4.5min




[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.718 total time= 5.8min




[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.719 total time= 5.8min
[CV 1/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.728 total time= 5.1min
[CV 2/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.737 total time= 5.1min




[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.708 total time= 5.0min
[CV 1/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.654 total time= 2.0min
[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.445 total time=  54.3s
[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.488 total time=  53.9s
[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.473 total time=  54.0s




[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.487 total time= 1.4min
[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.446 total time= 1.4min
[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.630 total time= 2.0min




[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.743 total time= 5.0min




[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.469 total time= 1.5min
[CV 2/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.669 total time= 2.1min
[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.736 total time= 5.0min




[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.713 total time= 5.1min
[CV 1/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.654 total time= 2.2min




[CV 2/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.669 total time= 2.1min
[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.630 total time= 2.1min
[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.445 total time=  52.2s
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.488 total time=  50.4s
[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.471 total time=  50.0s
[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.489 total time=  47.9s
[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.445 total time=  55.2s
[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.471 total time=  53.8s
[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.488 total time= 1.4min
[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.445 total time



[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.445 total time=  53.3s
[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.654 total time= 1.9min
[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.488 total time=  54.7s




[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.669 total time= 2.0min




[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.732 total time= 4.2min




[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.630 total time= 3.1min
[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.721 total time= 4.4min




[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.630 total time= 2.1min
[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.473 total time=  53.9s
[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.654 total time= 2.0min




[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.655 total time= 3.3min
[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.654 total time= 3.4min




[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.670 total time= 3.3min




[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.669 total time= 2.1min
[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.670 total time= 3.5min
[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.630 total time= 2.1min
[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.629 total time= 3.3min
[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.703 total time= 4.3min
[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.722 total time= 4.2min
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.732 total time= 4.3min
[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.704 total time= 4.4min




[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.735 total time= 4.9min




[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.712 total time= 4.9min
[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.742 total time= 4.9min




[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.742 total time= 4.7min
[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.630 total time= 1.9min




[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.669 total time= 2.1min
[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.654 total time= 2.1min
[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.445 total time=  54.0s




[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.489 total time=  55.0s




[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.471 total time=  48.5s




[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.488 total time= 1.4min




[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.445 total time= 1.4min
[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.471 total time= 1.4min




[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.736 total time= 5.2min




[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.713 total time= 5.2min




[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.655 total time= 1.9min




[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.669 total time= 2.1min




[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.630 total time= 2.1min
[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.445 total time=  48.6s
[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.489 total time=  49.8s
[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.473 total time=  54.0s
[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.730 total time= 7.2min
[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.722 total time= 7.4min
[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.702 total time= 7.5min




[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.733 total time= 8.2min




[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.739 total time= 8.2min




[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.709 total time= 8.5min
[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.731 total time= 3.7min
[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.722 total time= 4.1min
[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.722 total time= 3.7min
[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.703 total time= 4.0min




[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.732 total time= 3.8min
[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.736 total time= 4.1min




[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.742 total time= 4.2min
[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.705 total time= 4.0min




[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.713 total time= 4.1min
[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.713 total time= 4.4min




[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.742 total time= 4.2min
[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.736 total time= 4.2min
[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.703 total time= 5.9min
[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.722 total time= 6.2min
[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.731 total time= 6.4min




[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.736 total time= 6.6min




[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.712 total time= 6.6min




[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.743 total time= 6.8min




In [126]:
lr_analyze_gs.best_params_

{'C': 0.1, 'max_iter': 500, 'penalty': 'none', 'solver': 'saga', 'tol': 0.0001}

In [127]:
performancePrinter(test_analyze_y, pred_analyze_y_lr)

Accuracy Score ->  0.936155285313377
Kappa Score ->  0.7318909215517486
ROC AUC Score ->  0.8178931197190826
F1 Score ->  0.7672634271099744
Classification report -> 
               precision    recall  f1-score   support

           0       0.93      0.99      0.96      3575
           1       0.95      0.64      0.77       701

    accuracy                           0.94      4276
   macro avg       0.94      0.82      0.87      4276
weighted avg       0.94      0.94      0.93      4276



##### Random Forest

In [128]:
rf_analyze = RandomForestClassifier()
rf_analyze_gs = GridSearchCV(rf_analyze, params_rf, scoring="f1", n_jobs=-1, cv=3, verbose=3)
rf_analyze_gs.fit(train_analyze_x, train_analyze_y)
pred_analyze_y_rf = rf_analyze_gs.predict(test_analyze_x)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alrea

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [129]:
rf_analyze_gs.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [130]:
performancePrinter(test_analyze_y, pred_analyze_y_rf)

Accuracy Score ->  0.9593077642656689
Kappa Score ->  0.8425262712312821
ROC AUC Score ->  0.8971076284628353
F1 Score ->  0.8663594470046082
Classification report -> 
               precision    recall  f1-score   support

           0       0.96      0.99      0.98      3575
           1       0.94      0.80      0.87       701

    accuracy                           0.96      4276
   macro avg       0.95      0.90      0.92      4276
weighted avg       0.96      0.96      0.96      4276



##### XGBoost

In [131]:
xgb_analyze = XGBClassifier()
xgb_analyze_gs = GridSearchCV(xgb_analyze, params_xgb, scoring="f1", n_jobs=-1, cv=3)
xgb_analyze_gs.fit(train_analyze_x, train_analyze_y)
pred_analyze_y_xgb = xgb_analyze_gs.predict(test_analyze_x)

  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




















































































































In [132]:
xgb_analyze_gs.best_params_

{'gamma': 0.5, 'learning_rate': 0.5, 'max_depth': 7, 'n_estimators': 100}

In [133]:
performancePrinter(test_analyze_y, pred_analyze_y_xgb)

Accuracy Score ->  0.9590739008419084
Kappa Score ->  0.843618863874223
ROC AUC Score ->  0.9027018345420628
F1 Score ->  0.8677248677248677
Classification report -> 
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      3575
           1       0.92      0.82      0.87       701

    accuracy                           0.96      4276
   macro avg       0.94      0.90      0.92      4276
weighted avg       0.96      0.96      0.96      4276



### Evaluate

#### Data Preparation

In [134]:
evaluate_x, evaluate_y = split_train_x.to_numpy(), split_train_y['Evaluate'].astype('long').to_numpy()#rus(split_train_x, split_train_y['Evaluate'].to_numpy())

In [135]:
evaluate_x.shape

(17104, 94)

#### BERT Experiment

In [136]:
evaluate_x_bert = evaluate_x[:, 0].tolist()

In [137]:
if RUN_DL:
    evaluate_bert = createBERT('evaluate', evaluate_x_bert, evaluate_y, split_test_x['Learning_outcome'].tolist(), split_test_y['Evaluate'].astype('long').to_numpy(), 64, 3, 15)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": false,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/m

Started training model for column evaluate


Step,Training Loss,Validation Loss,F1
10,0.5275,0.440007,0.451323


***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to evaluate/checkpoint-10
Configuration saved in evaluate/checkpoint-10/config.json


Model weights saved in evaluate/checkpoint-10/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to evaluate/checkpoint-20
Configuration saved in evaluate/checkpoint-20/config.json
Model weights saved in evaluate/checkpoint-20/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to evaluate/checkpoint-30
Configuration saved in evaluate/checkpoint-30/config.json
Model weights saved in evaluate/checkpoint-30/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to evaluate/checkpoint-40
Configuration saved in evaluate/checkpoint-40/config.json
Model weights saved in evaluate/checkpoint-40/pytorch_model.bin
Deleting older checkpoint [evaluate/checkpoint-10] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to evaluate/checkpoint-50
Config

Training Completed. Started testing...


Accuracy Score ->  0.9667913938260057
Kappa Score ->  0.8890071382219338
ROC AUC Score ->  0.937647829127796
F1 Score ->  0.909323116219668
Classification report -> 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      3477
           1       0.93      0.89      0.91       799

    accuracy                           0.97      4276
   macro avg       0.95      0.94      0.94      4276
weighted avg       0.97      0.97      0.97      4276



#### Traditional ML Algorithm

In [138]:
combined_evaluate_x, column_names_evaluate, test_evaluate_x = generateX(evaluate_x, split_test_x.to_numpy(), 0, 1, 94)
train_evaluate_x = combined_evaluate_x
train_evaluate_y = evaluate_y
test_evaluate_y = split_test_y['Evaluate'].astype('long').to_numpy()

Getting Unigram...
Getting Bigram...
Getting Tfidf...
Getting ARI...
Combining...
Generated feature shape is (17104, 3094)
Generated test feature is (4276, 3094)


In [139]:
column_names_evaluate += data.columns[8:].tolist()

##### Naive Bayes

In [140]:
gnb_evaluate = GaussianNB()
gnb_evaluate_gs = GridSearchCV(gnb_evaluate, params_nb, scoring="f1", n_jobs=-1, cv=3, verbose=3)
gnb_evaluate_gs.fit(train_evaluate_x, train_evaluate_y)
pred_evaluate_y_gnb = gnb_evaluate_gs.predict(test_evaluate_x)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [141]:
gnb_evaluate_gs.best_params_

{'var_smoothing': 1e-08}

In [142]:
performancePrinter(test_evaluate_y, pred_evaluate_y_gnb)

Accuracy Score ->  0.5956501403180543
Kappa Score ->  0.23351157092927388
ROC AUC Score ->  0.7026861301677427
F1 Score ->  0.44672
Classification report -> 
               precision    recall  f1-score   support

           0       0.95      0.53      0.68      3477
           1       0.30      0.87      0.45       799

    accuracy                           0.60      4276
   macro avg       0.62      0.70      0.56      4276
weighted avg       0.83      0.60      0.64      4276



##### Support Vector Machine

In [143]:
svm_evaluate = SVC()
svm_evaluate_gs = GridSearchCV(svm_evaluate, params_svm, scoring="f1", n_jobs=-1, cv=3, verbose=3)
svm_evaluate_gs.fit(train_evaluate_x, train_evaluate_y)
pred_evaluate_y_svm = svm_evaluate_gs.predict(test_evaluate_x)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[CV 3/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.872 total time=19.3min
[CV 1/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.883 total time=19.9min
[CV 2/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.875 total time=20.5min
[CV 3/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.872 total time=20.8min
[CV 2/3] END .C=0.1, gamma=scale, kernel=linear;, score=0.875 total time=22.1min
[CV 1/3] END ..C=0.1, gamma=auto, kernel=linear;, score=0.883 total time=22.1min
[CV 1/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.000 total time=22.4min
[CV 2/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.000 total time=22.7min
[CV 3/3] END ...C=0.1, gamma=scale, kernel=poly;, score=0.000 total time=23.1min
[CV 2/3] END ....C=0.1, gamma=auto, kernel=poly;, score=0.757 total time=24.7min
[CV 1/3] END ....C=0.1, gamma=auto, kernel=poly;, score=0.717 total time=25.0min
[CV 3/3] END ....C=0.1, gamma=auto, kernel=poly;, score=0.746 total time=25.0min
[CV 1/3] END .....C=1, gamma



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[CV 1/3] END ...C=1, gamma=scale, kernel=linear;, score=0.867 total time=28.8min
[CV 1/3] END ......C=1, gamma=auto, kernel=poly;, score=0.838 total time=27.0min
[CV 2/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.000 total time=28.3min
[CV 2/3] END ......C=1, gamma=auto, kernel=poly;, score=0.834 total time=27.2min
[CV 3/3] END ...C=1, gamma=scale, kernel=linear;, score=0.849 total time=30.0min
[CV 3/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.000 total time=30.2min
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environ

In [144]:
svm_evaluate_gs.best_params_

{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}

In [145]:
performancePrinter(test_evaluate_y, pred_evaluate_y_svm)

Accuracy Score ->  0.9586061739943873
Kappa Score ->  0.8610375055448982
ROC AUC Score ->  0.9220111924490024
F1 Score ->  0.8863198458574182
Classification report -> 
               precision    recall  f1-score   support

           0       0.97      0.98      0.97      3477
           1       0.91      0.86      0.89       799

    accuracy                           0.96      4276
   macro avg       0.94      0.92      0.93      4276
weighted avg       0.96      0.96      0.96      4276



##### Logistic Regression

In [146]:
lr_evaluate = LogisticRegression()
lr_evaluate_gs = GridSearchCV(lr_evaluate, params_lr, scoring="f1", n_jobs=-1, cv=3, verbose=3)
lr_evaluate_gs.fit(train_evaluate_x, train_evaluate_y)
pred_evaluate_y_lr = lr_evaluate_gs.predict(test_evaluate_x)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alread

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.367 total time=  52.3s
[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.382 total time=  45.1s
[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.375 total time=  55.7s
[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.401 total time=  54.9s
[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.392 total time=  48.5s
[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.414 total time=  50.5s
[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.387 total time=  49.8s
[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.420 total time=  49.2s
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.397 total time=  55.0s
[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.367 tot



[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.595 total time= 2.0min




[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.596 total time= 2.0min
[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.619 total time= 2.0min




[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.619 total time= 2.0min




[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.594 total time= 2.0min




[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.588 total time= 2.3min
[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.622 total time= 2.3min
[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.595 total time= 2.1min




[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.592 total time= 2.3min
[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.592 total time= 2.4min




[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.623 total time= 2.0min
[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.621 total time= 2.3min
[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.596 total time= 2.1min




[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.601 total time= 2.0min




[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.588 total time= 2.5min
[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.601 total time= 2.0min
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.622 total time= 2.1min




[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.596 total time= 2.1min
[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.385 total time=  48.8s
[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.396 total time=  52.1s
[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.420 total time=  54.6s
[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.386 total time=  48.3s
[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.398 total time=  54.4s
[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.419 total time=  48.7s
[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.379 total time= 1.5min
[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.394 total time= 1.5min
[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.416 total time=



[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.596 total time= 2.1min




[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.622 total time= 2.0min
[CV 2/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.710 total time= 4.2min




[CV 1/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.658 total time= 4.3min
[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.601 total time= 1.9min
[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.697 total time= 5.0min




[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.622 total time= 1.9min
[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.596 total time= 2.0min
[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.686 total time= 4.2min
[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.420 total time=  48.7s




[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.623 total time= 3.4min
[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.640 total time= 5.4min




[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.623 total time= 3.4min




[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.601 total time= 2.1min




[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.599 total time= 3.5min
[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.595 total time= 3.6min




[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.595 total time= 3.7min




[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.599 total time= 3.6min
[CV 3/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.678 total time= 5.6min
[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.724 total time= 4.3min
[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.669 total time= 4.3min




[CV 1/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.672 total time= 5.0min
[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.698 total time= 4.3min
[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.649 total time= 5.7min




[CV 2/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.727 total time= 5.1min




[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.710 total time= 6.0min
[CV 3/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.687 total time= 6.0min




[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.708 total time= 5.0min
[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.397 total time=  51.0s
[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.395 total time= 1.3min
[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.386 total time=  55.8s




[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.419 total time=  53.3s
[CV 1/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.596 total time= 2.1min
[CV 2/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.622 total time= 2.0min
[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.385 total time= 1.5min
[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.416 total time= 1.5min




[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.738 total time= 4.9min
[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.687 total time= 5.0min




[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.601 total time= 2.1min




[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.601 total time= 2.0min




[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.714 total time= 5.2min
[CV 1/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.596 total time= 2.1min
[CV 2/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.623 total time= 2.1min
[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.386 total time=  54.6s
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.397 total time=  51.4s
[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.420 total time=  53.9s
[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.386 total time=  48.4s
[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.386 total time= 1.3min
[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.418 total time=  48.9s
[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.398 total



[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.387 total time=  54.8s
[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.397 total time=  54.7s




[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.596 total time= 2.0min




[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.622 total time= 1.9min




[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.422 total time=  54.8s
[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.596 total time= 3.1min
[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.601 total time= 1.9min
[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.725 total time= 4.2min
[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.601 total time= 2.1min




[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.622 total time= 2.0min
[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.596 total time= 2.0min




[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.669 total time= 4.4min




[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.601 total time= 3.3min




[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.624 total time= 3.4min
[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.596 total time= 3.5min




[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.624 total time= 3.4min




[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.601 total time= 3.4min
[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.697 total time= 4.5min
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.725 total time= 4.2min
[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.697 total time= 4.3min
[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.669 total time= 4.5min




[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.687 total time= 5.0min




[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.714 total time= 4.9min




[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.596 total time= 2.0min




[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.622 total time= 1.9min
[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.738 total time= 5.1min




[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.601 total time= 1.9min




[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.386 total time=  51.9s
[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.396 total time=  51.7s
[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.385 total time= 1.4min




[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.422 total time=  52.1s
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.739 total time= 5.0min




[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.396 total time= 1.5min
[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.422 total time= 1.5min




[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.714 total time= 5.0min




[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.687 total time= 5.1min




[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.601 total time= 2.0min
[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.622 total time= 2.0min
[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.596 total time= 2.0min
[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.386 total time=  52.7s
[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.398 total time=  51.4s
[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.420 total time=  49.3s
[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.666 total time= 7.3min
[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.717 total time= 7.3min
[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.694 total time= 7.2min




[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.737 total time= 7.9min




[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.713 total time= 8.3min




[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.684 total time= 8.5min
[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.725 total time= 3.8min
[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.669 total time= 4.2min
[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.697 total time= 4.1min




[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.738 total time= 4.1min
[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.698 total time= 3.6min




[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.714 total time= 4.2min
[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.669 total time= 3.8min
[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.724 total time= 3.8min




[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.687 total time= 4.4min




[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.739 total time= 4.0min
[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.714 total time= 4.0min




[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.687 total time= 4.3min
[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.697 total time= 6.0min
[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.669 total time= 6.3min




[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.687 total time= 6.6min
[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.738 total time= 6.6min
[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.726 total time= 6.7min




[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.715 total time= 6.6min




In [147]:
lr_evaluate_gs.best_params_

{'C': 1, 'max_iter': 500, 'penalty': 'none', 'solver': 'saga', 'tol': 0.0001}

In [148]:
performancePrinter(test_evaluate_y, pred_evaluate_y_lr)

Accuracy Score ->  0.9197848456501403
Kappa Score ->  0.6939989351172623
ROC AUC Score ->  0.799334118755721
F1 Score ->  0.7387661843107387
Classification report -> 
               precision    recall  f1-score   support

           0       0.92      0.99      0.95      3477
           1       0.94      0.61      0.74       799

    accuracy                           0.92      4276
   macro avg       0.93      0.80      0.85      4276
weighted avg       0.92      0.92      0.91      4276



##### Random Forest

In [149]:
rf_evaluate = RandomForestClassifier()
rf_evaluate_gs = GridSearchCV(rf_evaluate, params_rf, scoring="f1", n_jobs=-1, cv=3, verbose=3)
rf_evaluate_gs.fit(train_evaluate_x, train_evaluate_y)
pred_evaluate_y_rf = rf_evaluate_gs.predict(test_evaluate_x)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alrea

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [150]:
rf_evaluate_gs.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [151]:
performancePrinter(test_evaluate_y, pred_evaluate_y_rf)

Accuracy Score ->  0.9667913938260057
Kappa Score ->  0.8876876105890226
ROC AUC Score ->  0.9318640679336372
F1 Score ->  0.9079118028534371
Classification report -> 
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      3477
           1       0.94      0.88      0.91       799

    accuracy                           0.97      4276
   macro avg       0.96      0.93      0.94      4276
weighted avg       0.97      0.97      0.97      4276



##### XGBoost

In [152]:
xgb_evaluate = XGBClassifier()
xgb_evaluate_gs = GridSearchCV(xgb_evaluate, params_xgb, scoring="f1", n_jobs=-1, cv=3)
xgb_evaluate_gs.fit(train_evaluate_x, train_evaluate_y)
pred_evaluate_y_xgb = xgb_evaluate_gs.predict(test_evaluate_x)

  from pandas import MultiIndex, Int64Index


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index












































































































In [153]:
xgb_evaluate_gs.best_params_

{'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100}

In [154]:
performancePrinter(test_evaluate_y, pred_evaluate_y_xgb)

Accuracy Score ->  0.9642188961646398
Kappa Score ->  0.8780814698474202
ROC AUC Score ->  0.9240165032289787
F1 Score ->  0.899803536345776
Classification report -> 
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      3477
           1       0.94      0.86      0.90       799

    accuracy                           0.96      4276
   macro avg       0.96      0.92      0.94      4276
weighted avg       0.96      0.96      0.96      4276



### Create

#### Data Preparation

In [155]:
create_x, create_y = split_train_x.to_numpy(), split_train_y['Create'].astype('long').to_numpy()#rus(split_train_x, split_train_y['Create'].to_numpy())

In [156]:
create_x.shape

(17104, 94)

#### BERT Experiment

In [157]:
create_x_bert = create_x[:, 0].tolist()

In [158]:
if RUN_DL:
    create_bert = createBERT('create', create_x_bert, create_y, split_test_x['Learning_outcome'].tolist(), split_test_y['Create'].astype('long').to_numpy(), 64)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": false,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/m

***** Running training *****
  Num examples = 13683
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 642


Started training model for column create


Step,Training Loss,Validation Loss,F1
10,0.533,0.481171,0.449292
20,0.4435,0.376486,0.449292
30,0.3173,0.22861,0.850344
40,0.2819,0.17908,0.882979
50,0.2271,0.216736,0.864922
60,0.2235,0.148736,0.913478
70,0.2063,0.155844,0.892321
80,0.1698,0.135276,0.913262
90,0.1843,0.1122,0.934444
100,0.142,0.112677,0.932403


***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to create/checkpoint-10
Configuration saved in create/checkpoint-10/config.json
Model weights saved in create/checkpoint-10/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to create/checkpoint-20
Configuration saved in create/checkpoint-20/config.json
Model weights saved in create/checkpoint-20/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to create/checkpoint-30
Configuration saved in create/checkpoint-30/config.json
Model weights saved in create/checkpoint-30/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to create/checkpoint-40
Configuration saved in create/checkpoint-40/config.json
Model weights saved in create/checkpoint-40/pytorch_model.bin
Deleting older checkpoint [create/checkpoint-10] due

Training Completed. Started testing...


Accuracy Score ->  0.9572029934518241
Kappa Score ->  0.8452626533317724
ROC AUC Score ->  0.908832894707142
F1 Score ->  0.8708539167254763
Classification report -> 
               precision    recall  f1-score   support

           0       0.97      0.98      0.97      3537
           1       0.91      0.83      0.87       739

    accuracy                           0.96      4276
   macro avg       0.94      0.91      0.92      4276
weighted avg       0.96      0.96      0.96      4276



#### Traditional ML Algorithm

In [159]:
combined_create_x, column_names_create, test_create_x = generateX(create_x, split_test_x.to_numpy(), 0, 1, 94)
train_create_x = combined_create_x
train_create_y = create_y
test_create_y = split_test_y['Create'].astype('long').to_numpy()

Getting Unigram...


Getting Bigram...
Getting Tfidf...
Getting ARI...
Combining...
Generated feature shape is (17104, 3094)
Generated test feature is (4276, 3094)


In [160]:
column_names_create += data.columns[8:].tolist()

##### Naive Bayes

In [161]:
gnb_create = GaussianNB()
gnb_create_gs = GridSearchCV(gnb_create, params_nb, scoring="f1", n_jobs=-1, cv=3)
gnb_create_gs.fit(train_create_x, train_create_y)
pred_create_y_gnb = gnb_create_gs.predict(test_create_x)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [162]:
gnb_create_gs.best_params_

{'var_smoothing': 1e-08}

In [163]:
performancePrinter(test_create_y, pred_create_y_gnb)

Accuracy Score ->  0.6760991580916744
Kappa Score ->  0.29991762985827797
ROC AUC Score ->  0.7426614758422752
F1 Score ->  0.47398404861374865
Classification report -> 
               precision    recall  f1-score   support

           0       0.95      0.64      0.77      3537
           1       0.33      0.84      0.47       739

    accuracy                           0.68      4276
   macro avg       0.64      0.74      0.62      4276
weighted avg       0.84      0.68      0.72      4276



##### Support Vector Machine

In [164]:
svm_create = SVC()
svm_create_gs = GridSearchCV(svm_create, params_svm, scoring="f1", n_jobs=-1, cv=3)
svm_create_gs.fit(train_create_x, train_create_y)
pred_create_y_svm = svm_create_gs.predict(test_create_x)

In [165]:
svm_create_gs.best_params_

{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}

In [166]:
performancePrinter(test_create_y, pred_create_y_svm)

Accuracy Score ->  0.9424695977549111
Kappa Score ->  0.7907186897535529
ROC AUC Score ->  0.8801236340514713
F1 Score ->  0.825035561877667
Classification report -> 
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      3537
           1       0.87      0.78      0.83       739

    accuracy                           0.94      4276
   macro avg       0.91      0.88      0.90      4276
weighted avg       0.94      0.94      0.94      4276



##### Logistic Regression

In [167]:
lr_create = LogisticRegression()
lr_create_gs = GridSearchCV(lr_create, params_lr, scoring="f1", n_jobs=-1, cv=3, verbose=3)
lr_create_gs.fit(train_create_x, train_create_y)
pred_create_y_lr = lr_create_gs.predict(test_create_x)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.337 total time=  55.0s
[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.389 total time=  48.7s
[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.361 total time=  51.0s
[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.367 total time=  59.8s
[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.406 total time=  51.8s
[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.347 total time= 1.0min
[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.364 total time=  52.9s
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.410 total time=  53.2s
[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.393 total time=  54.4s
[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.337 tot



[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.496 total time= 2.0min




[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.534 total time= 2.0min
[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.389 total time=  57.2s




[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.498 total time= 2.1min
[CV 1/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.496 total time= 2.0min




[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.499 total time= 2.2min
[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.460 total time= 2.3min




[CV 2/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.534 total time= 2.1min




[CV 3/3] END C=0.1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.498 total time= 2.1min
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.538 total time= 2.0min




[CV 1/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.460 total time= 2.3min




[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.503 total time= 2.1min




[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.499 total time= 2.1min
[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.461 total time= 2.4min




[CV 3/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.462 total time= 2.5min
[CV 1/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.504 total time= 2.1min




[CV 2/3] END C=0.1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.499 total time= 2.5min
[CV 3/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.499 total time= 2.1min
[CV 2/3] END C=0.1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.539 total time= 2.1min
[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.364 total time=  58.9s
[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.392 total time=  53.0s
[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.411 total time=  58.1s
[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.411 total time=  54.3s
[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.363 total time=  58.0s
[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.408 total time= 1.5min
[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.360 t



[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.538 total time= 1.9min




[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.503 total time= 2.0min




[CV 1/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.503 total time= 2.0min




[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.499 total time= 2.0min




[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.392 total time=  51.1s
[CV 2/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.538 total time= 1.9min




[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.494 total time= 3.5min
[CV 1/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.493 total time= 3.4min




[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.533 total time= 3.5min
[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.498 total time= 3.5min
[CV 3/3] END C=1, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.499 total time= 2.1min




[CV 1/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.619 total time= 5.0min




[CV 2/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.534 total time= 3.6min




[CV 3/3] END C=1, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.498 total time= 3.6min
[CV 2/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.651 total time= 5.0min




[CV 2/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.651 total time= 4.8min




[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.590 total time= 6.1min




[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.617 total time= 6.1min




[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.632 total time= 5.1min
[CV 3/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.592 total time= 6.0min




[CV 1/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.619 total time= 5.1min




[CV 1/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.590 total time= 6.0min




[CV 3/3] END C=0.1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.632 total time= 5.1min




[CV 1/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.505 total time= 2.1min




[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.641 total time= 4.9min




[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.631 total time= 4.9min
[CV 2/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.540 total time= 2.1min




[CV 3/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.592 total time= 6.2min
[CV 2/3] END C=0.1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.617 total time= 6.2min




[CV 1/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.631 total time= 5.0min
[CV 3/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.641 total time= 4.9min
[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.364 total time=  54.7s
[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.656 total time= 4.9min




[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.499 total time= 2.1min




[CV 2/3] END C=0.1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.656 total time= 5.1min
[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.360 total time= 1.5min
[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.412 total time=  51.2s
[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.406 total time= 1.5min
[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.391 total time=  54.1s
[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.391 total time= 1.6min




[CV 2/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.539 total time= 1.9min




[CV 1/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.503 total time= 2.1min




[CV 3/3] END C=1, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.499 total time= 2.0min
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.410 total time=  51.7s
[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.364 total time=  58.6s
[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.392 total time=  56.2s
[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.410 total time= 1.3min
[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.362 total time=  56.6s
[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.364 total time= 1.4min
[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.411 total time=  56.9s
[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.01;, score=0.391 total time= 1.4min




[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.01;, score=0.392 total time=  55.4s




[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.410 total time=  50.7s
[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.364 total time=  58.0s




[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.504 total time= 2.1min




[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.540 total time= 2.1min




[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.503 total time= 3.1min
[CV 1/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.503 total time= 3.2min
[CV 1/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.503 total time= 1.9min




[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.538 total time= 3.2min




[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.498 total time= 3.3min
[CV 2/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.001;, score=0.538 total time= 3.4min




[CV 3/3] END C=10, max_iter=200, penalty=l1, solver=saga, tol=0.0001;, score=0.498 total time= 3.3min




[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.001;, score=0.499 total time= 2.1min
[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.01;, score=0.393 total time=  56.9s
[CV 2/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.539 total time= 2.1min




[CV 3/3] END C=10, max_iter=200, penalty=l2, solver=saga, tol=0.0001;, score=0.499 total time= 2.1min




[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.639 total time= 4.7min




[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.629 total time= 5.0min




[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.655 total time= 4.9min
[CV 3/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.639 total time= 4.8min




[CV 2/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.655 total time= 5.2min
[CV 1/3] END C=1, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.629 total time= 5.1min




[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.503 total time= 2.1min
[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.640 total time= 4.8min
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.656 total time= 4.8min




[CV 3/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.641 total time= 4.9min
[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.364 total time= 1.4min




[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.540 total time= 1.9min
[CV 2/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.656 total time= 5.0min
[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.631 total time= 5.0min
[CV 1/3] END C=1, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.631 total time= 5.0min




[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.411 total time= 1.4min
[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.001;, score=0.499 total time= 2.0min




[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.01;, score=0.392 total time= 1.5min




[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.363 total time=  55.9s




[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.411 total time=  50.9s
[CV 1/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.504 total time= 2.0min




[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.01;, score=0.392 total time=  53.2s
[CV 2/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.540 total time= 2.1min




[CV 3/3] END C=10, max_iter=200, penalty=none, solver=saga, tol=0.0001;, score=0.499 total time= 2.1min
[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.364 total time=  52.2s
[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.409 total time=  51.0s
[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.01;, score=0.392 total time=  54.2s




[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.631 total time= 8.4min
[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.657 total time= 8.2min




[CV 2/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.657 total time= 8.5min




[CV 1/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.631 total time= 8.5min




[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.632 total time= 8.8min




[CV 3/3] END C=1, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.632 total time= 8.8min




[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.631 total time= 4.2min




[CV 1/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.631 total time= 4.2min




[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.640 total time= 4.3min




[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.631 total time= 4.1min




[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.001;, score=0.656 total time= 4.6min




[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.656 total time= 4.2min




[CV 2/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.656 total time= 4.1min




[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.641 total time= 4.1min




[CV 3/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.641 total time= 4.5min




[CV 2/3] END C=10, max_iter=500, penalty=l2, solver=saga, tol=0.0001;, score=0.656 total time= 4.7min




[CV 3/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.001;, score=0.640 total time= 4.5min




[CV 1/3] END C=10, max_iter=500, penalty=none, solver=saga, tol=0.0001;, score=0.631 total time= 4.5min




[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.656 total time= 6.6min




[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.641 total time= 6.8min




[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.631 total time= 6.9min




[CV 1/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.631 total time= 6.9min




[CV 2/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.001;, score=0.656 total time= 7.0min




[CV 3/3] END C=10, max_iter=500, penalty=l1, solver=saga, tol=0.0001;, score=0.641 total time= 6.7min




In [168]:
lr_create_gs.best_params_

{'C': 0.1, 'max_iter': 500, 'penalty': 'none', 'solver': 'saga', 'tol': 0.001}

In [169]:
performancePrinter(test_create_y, pred_create_y_lr)

Accuracy Score ->  0.9022450888681011
Kappa Score ->  0.6047910599899615
ROC AUC Score ->  0.7616092473801983
F1 Score ->  0.6590538336052203
Classification report -> 
               precision    recall  f1-score   support

           0       0.91      0.98      0.94      3537
           1       0.83      0.55      0.66       739

    accuracy                           0.90      4276
   macro avg       0.87      0.76      0.80      4276
weighted avg       0.90      0.90      0.89      4276



##### Random Forest

In [170]:
rf_create = RandomForestClassifier()
rf_create_gs = GridSearchCV(rf_create, params_rf, scoring="f1", n_jobs=-1, cv=3)
rf_create_gs.fit(train_create_x, train_create_y)
pred_create_y_rf = rf_create_gs.predict(test_create_x)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [171]:
rf_create_gs.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 250}

In [172]:
performancePrinter(test_create_y, pred_create_y_rf)

Accuracy Score ->  0.9443405051449953
Kappa Score ->  0.7938523230323775
ROC AUC Score ->  0.8748318089495046
F1 Score ->  0.826783114992722
Classification report -> 
               precision    recall  f1-score   support

           0       0.95      0.98      0.97      3537
           1       0.89      0.77      0.83       739

    accuracy                           0.94      4276
   macro avg       0.92      0.87      0.90      4276
weighted avg       0.94      0.94      0.94      4276



##### XGBoost

In [173]:
xgb_create = XGBClassifier()
xgb_create_gs = GridSearchCV(xgb_create, params_xgb, scoring="f1", n_jobs=-1, cv=3)
xgb_create_gs.fit(train_create_x, train_create_y)
pred_create_y_xgb = xgb_create_gs.predict(test_create_x)

  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index








  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




  from pandas import MultiIndex, Int64Index




































































































In [174]:
xgb_create_gs.best_params_

{'gamma': 0.5, 'learning_rate': 0.5, 'max_depth': 10, 'n_estimators': 50}

In [175]:
performancePrinter(test_create_y, pred_create_y_xgb)

Accuracy Score ->  0.9441066417212348
Kappa Score ->  0.7961063316999984
ROC AUC Score ->  0.881648400458635
F1 Score ->  0.8294075660242684
Classification report -> 
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      3537
           1       0.88      0.79      0.83       739

    accuracy                           0.94      4276
   macro avg       0.92      0.88      0.90      4276
weighted avg       0.94      0.94      0.94      4276

