AES dataset: https://www.kaggle.com/competitions/asap-aes


AI detection dataset: https://www.kaggle.com/datasets/thedrcat/daigt-v2-train-dataset

# Mounting Drive




In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

# Installing gensim, pyspellchecker and optuna with pip

In [None]:
!pip install gensim pyspellchecker optuna

# Imports

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    AutoModel
)
from sklearn.metrics import r2_score
import numpy as np
import tensorflow as tf
from datasets import DatasetDict, Dataset
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import cohen_kappa_score
from torch import nn
import re
import optuna
import nltk

nltk.download('punkt_tab')



# Reading in data and performing EDA for the AES system

### Reading in the data

In [None]:
# Load the ASAP training spreadsheet from the mounted Drive folder and show it.
full_data = pd.read_excel(r'/content/drive/MyDrive/ASAP/training_set_rel3.xlsx')
display(full_data)


### Essay distribution counts for prompt 8

In [None]:
# Distribution of prompt-8 essays across ten-point bands of domain1_score.
prompt_8_isolated = full_data[full_data['essay_set'] == 8].copy()

bins = [0, 10, 20, 30, 40, 50, 60]
score_rng = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60']

# right=True together with include_lowest=True produces the intervals
# [0, 10], (10, 20], ..., (50, 60], which is what the labels describe.
# With right=False a score of 10 was counted under '11-20' and a score of 60
# fell outside every bin and was silently dropped from the counts.
prompt_8_isolated['score_rng'] = pd.cut(
    prompt_8_isolated['domain1_score'],
    bins=bins,
    labels=score_rng,
    right=True,
    include_lowest=True,
)

# observed=False keeps empty bands in the table (with a count of 0) so the
# bar chart always shows all six ranges.
counts = (
    prompt_8_isolated
    .groupby('score_rng', observed=False)
    .size()
    .reset_index(name='counts')
)
display(counts)

plt.figure(figsize=(10, 6))
plt.bar(counts['score_rng'], counts['counts'])
plt.xlabel('Score Range')
plt.ylabel('Number of Essays')
plt.title('Number of Essays in Each Score Range for Prompt 8')
plt.show()

### Finding the number of essays in each prompt

In [None]:
# Count the essays in each prompt (essay_set) and plot the totals.
grouped_data = full_data.groupby('essay_set').count().reset_index()

# rename() without inplace returns a fresh frame; renaming a column slice of
# grouped_data in place triggers pandas' SettingWithCopyWarning.
grouped_data_filtered = grouped_data[['essay_set', 'essay_id']].rename(
    columns={'essay_id': 'essay_count'}
)
display(grouped_data_filtered)

plt.figure(figsize=(10, 6))
plt.bar(grouped_data_filtered['essay_set'], grouped_data_filtered['essay_count'])
plt.xlabel('Prompt Number')
plt.ylabel('Number of Essays')
plt.title('Number of Essays in Each Set')
plt.show()


# Selecting the prompt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

# Keep only the scored essays of the chosen prompt.
prompt_data = full_data[full_data['essay_set'] == 4]
prompt_data = prompt_data[~prompt_data['domain1_score'].isna()]

# Essay text is the model input; the integer domain-1 score is the target.
prompt_data_train = prompt_data['essay'].astype(str)
prompt_data_label = prompt_data['domain1_score'].astype(int)

# Quantile-bin the scores so the 80/20 split can be stratified on them.
binned_labels = pd.qcut(prompt_data_label, q=5, duplicates='drop', labels = False)

X_train_all, X_test_all, y_train_all, y_test_all = map(
    pd.Series,
    train_test_split(
        prompt_data_train,
        prompt_data_label,
        test_size=0.2,
        random_state=42,
        stratify=binned_labels,
    ),
)

# Investigating base-learner individual performances for the regression models

### Creating the BERT-based model

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold


# Prompt 1 Best Params:  {'lr': 5e-05, 'batch_size': 8}
# Prompt 7 Best Params: {'lr': 3e-05, 'batch_size': 8}
# Prompt 8 Best Params: {'lr': 5e-05, 'batch_size': 8}
def generate_stacking_preds(model_name, X_train, X_test, y_train, y_test, lr = 0.00001, bs = 16, epochs = 30):
    """Fine-tune a transformer regression head on essay scores and print the test QWK.

    The training data is split 80/20 (stratified on quantile-binned scores)
    into a fit set and a validation set. The validation set drives early
    stopping and best-checkpoint selection through quadratic weighted kappa
    (QWK); the supplied test set is only used for the final printed score.

    Args:
        model_name: Checkpoint name passed to the Hugging Face ``from_pretrained`` loaders.
        X_train: pandas Series of training essay texts.
        X_test: pandas Series of test essay texts.
        y_train: pandas Series of training scores.
        y_test: pandas Series of test scores.
        lr: Learning rate handed to ``TrainingArguments``.
        bs: Per-device batch size used for both training and evaluation.
        epochs: Maximum number of epochs (early stopping may end training sooner).

    Returns:
        None. The test-set QWK is printed.
    """
    # Parameters whose name contains one of these substrings stay trainable;
    # every other parameter is frozen.
    trainable_name_parts = (
        'encoder.layer.8', 'encoder.layer.9', 'encoder.layer.10',
        'encoder.layer.11', 'pooler', 'classifier',
    )

    class EssayDatasetRegression(Dataset):
        """Pairs pre-computed tokenizer encodings with float score labels."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
            return item

        def __len__(self):
            return len(self.labels)

    # Loaded once; the original loaded the same tokenizer a second time before use.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Valid score range, used to clip rounded predictions before computing QWK.
    min_score = min(min(y_train), min(y_test))
    max_score = max(max(y_train), max(y_test))

    def round_to_scores(raw_preds):
        """Flatten raw regression outputs, round them and clip to the score range."""
        flat = np.asarray(raw_preds).reshape(-1)
        return np.clip(np.rint(flat), min_score, max_score).astype(int)

    def make_compute_metrics(min_score, max_score):
        def compute_metrics(eval_pred):
            preds, labels = eval_pred
            # reshape(-1) because a one-output head yields shape (N, 1).
            preds_rounded = np.clip(
                np.rint(np.asarray(preds).reshape(-1)), min_score, max_score
            ).astype(int)
            labels_rounded = np.rint(np.asarray(labels).reshape(-1)).astype(int)
            return {'eval_qwk': cohen_kappa_score(preds_rounded, labels_rounded, weights='quadratic')}
        return compute_metrics

    def tokenize_texts(text):
        """Normalise whitespace in each essay, then tokenize to fixed-length tensors."""
        cleaned_texts = []
        for t in text:
            t = t.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
            t = re.sub(r'\s+', ' ', t)
            cleaned_texts.append(t.strip())
        return tokenizer(cleaned_texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

    # Stratify the train/validation split on quantile bins of the score.
    binned_y = pd.qcut(y_train, q=5, duplicates='drop', labels = False)
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=binned_y
    )

    X_train_split = X_train_split.tolist()
    X_val_split = X_val_split.tolist()
    y_train_split = y_train_split.tolist()
    y_val_split = y_val_split.tolist()
    X_test = X_test.tolist()
    y_test = y_test.tolist()

    X_train_split_dataset = EssayDatasetRegression(tokenize_texts(X_train_split), y_train_split)
    X_val_split_dataset = EssayDatasetRegression(tokenize_texts(X_val_split), y_val_split)
    X_test_dataset = EssayDatasetRegression(tokenize_texts(X_test), y_test)

    model2 = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type = 'regression', num_labels=1)

    for name, param in model2.named_parameters():
        param.requires_grad = any(part in name for part in trainable_name_parts)

    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        num_train_epochs=epochs,
        logging_strategy='epoch',
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='eval_qwk',
        greater_is_better=True,
        report_to='none',
    )

    trainer2 = Trainer(
        model=model2,
        args=training_args,
        train_dataset=X_train_split_dataset,
        eval_dataset=X_val_split_dataset,
        compute_metrics=make_compute_metrics(min_score, max_score),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    )

    trainer2.train()

    results = trainer2.predict(X_test_dataset)
    # Clip as well as round, exactly like the validation metric, so an
    # out-of-range regression output cannot introduce a score label that does
    # not exist in the rubric when QWK is computed.
    preds_test = round_to_scores(results.predictions)

    qwk = cohen_kappa_score(preds_test, y_test, weights='quadratic')
    print(f'QWK: {qwk}')



  ### This code below is for hyper-parameter tuning


def generate_stacking_preds_hp_tune(model_name, X_train, X_test, y_train, y_test, lr = 0.00001, bs = 16, epochs = 30):
    """Fine-tune the transformer regressor once and return its validation QWK.

    Intended as the scoring routine for hyper-parameter search: the training
    data is split 80/20 (stratified on quantile-binned scores), the model is
    trained on the larger part and scored on the held-out part. The test data
    is only consulted to establish the valid score range.

    Args:
        model_name: Checkpoint name passed to the Hugging Face ``from_pretrained`` loaders.
        X_train: pandas Series of training essay texts.
        X_test: pandas Series of test essay texts (its contents are not tokenized here).
        y_train: pandas Series of training scores.
        y_test: pandas Series of test scores (used only for the min/max score).
        lr: Learning rate handed to ``TrainingArguments``.
        bs: Per-device batch size used for both training and evaluation.
        epochs: Maximum number of epochs (early stopping may end training sooner).

    Returns:
        Quadratic weighted kappa of the rounded, clipped predictions on the
        validation split.
    """
    # Parameters whose name contains one of these substrings stay trainable;
    # every other parameter is frozen.
    trainable_name_parts = (
        'encoder.layer.8', 'encoder.layer.9', 'encoder.layer.10',
        'encoder.layer.11', 'pooler', 'classifier',
    )

    class EssayDatasetRegression(Dataset):
        """Pairs pre-computed tokenizer encodings with float score labels."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
            return item

        def __len__(self):
            return len(self.labels)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Valid score range, used to clip rounded predictions before computing QWK.
    min_score = min(min(y_train), min(y_test))
    max_score = max(max(y_train), max(y_test))

    def make_compute_metrics(min_score, max_score):
        def compute_metrics(eval_pred):
            preds, labels = eval_pred
            # reshape(-1) because a one-output head yields shape (N, 1).
            preds_rounded = np.clip(
                np.rint(np.asarray(preds).reshape(-1)), min_score, max_score
            ).astype(int)
            labels_rounded = np.rint(np.asarray(labels).reshape(-1)).astype(int)
            return {'eval_qwk': cohen_kappa_score(preds_rounded, labels_rounded, weights='quadratic')}
        return compute_metrics

    def tokenize_texts(text):
        """Normalise whitespace in each essay, then tokenize to fixed-length tensors."""
        cleaned_texts = []
        for t in text:
            t = t.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
            t = re.sub(r'\s+', ' ', t)
            cleaned_texts.append(t.strip())
        return tokenizer(cleaned_texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

    model = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type = 'regression', num_labels=1)

    # Stratify the train/validation split on quantile bins of the score.
    binned_y = pd.qcut(y_train, q=5, duplicates='drop', labels = False)
    X_train_ft, X_val, y_train_ft, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=binned_y
    )

    train_labels = y_train_ft.tolist()
    val_labels = y_val.tolist()

    train_dataset = EssayDatasetRegression(tokenize_texts(X_train_ft.tolist()), train_labels)
    val_dataset = EssayDatasetRegression(tokenize_texts(X_val.tolist()), val_labels)

    for name, param in model.named_parameters():
        param.requires_grad = any(part in name for part in trainable_name_parts)

    # Checkpoint selection here is by validation loss; QWK is still reported
    # every epoch through compute_metrics.
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        num_train_epochs=epochs,
        logging_strategy='epoch',
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        report_to='none',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=make_compute_metrics(min_score, max_score),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    trainer.train()

    results = trainer.predict(val_dataset)
    # Clip as well as round so the returned QWK is computed on valid score
    # labels only, matching the per-epoch metric above.
    preds_val = np.clip(
        np.rint(np.asarray(results.predictions).reshape(-1)), min_score, max_score
    ).astype(int)

    qwk_val = cohen_kappa_score(preds_val, val_labels, weights='quadratic')
    print(f'QWK: {qwk_val}')

    return qwk_val

def tune_hyperparameters(trial):
    """Optuna objective: sample a learning rate and batch size, return validation QWK.

    The sampled values are forwarded to ``generate_stacking_preds_hp_tune``
    together with the notebook-level train/test splits.
    """
    learning_rate = trial.suggest_float('lr', 1e-5, 5e-5, step=1e-5)
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32])

    print(f'Trial {trial.number}: lr={learning_rate}, batch_size={batch_size}')

    return generate_stacking_preds_hp_tune(
        'bert-base-uncased',
        X_train_all,
        X_test_all,
        y_train_all,
        y_test_all,
        lr=learning_rate,
        bs=batch_size,
    )


# study = optuna.create_study(direction='maximize')
# study.optimize(tune_hyperparameters, n_trials=10)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

### Checking the run time and performance for the BERT-based model

In [None]:
import time

# Wall-clock time of one full fine-tuning run plus test evaluation.
start_time = time.time()
generate_stacking_preds(
    'bert-base-uncased',
    X_train_all,
    X_test_all,
    y_train_all,
    y_test_all,
    lr=0.00005,
    bs=8,
    epochs=80,
)
end_time = time.time()

execution_time = end_time - start_time
print(f'Execution time: {execution_time} seconds')

### Downloading the NLTK package

In [None]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize


### Creating the LSTM-based model



*   Although in my dissertation I said I used a similar structure to Taghipour and Ng's model, this was based on their research paper rather than their code (which I could not find). No part of their actual code was used here.




In [None]:
import gensim.downloader as api
from gensim.models import KeyedVectors
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold, StratifiedKFold
import optuna
import time
import re
# Prompt 1 optim params: {'hidden_dim': 256, 'lr': 0.00040147129315448076, 'weight_decay': 4.194966656572804e-05, 'dropout1': 0.48972751622541333, 'dropout2': 0.5637289114635329, 'dropout3': 0.4006279296851355, 'batch_size': 64}
# Prompt 2 Best Params: {'hidden_dim': 128, 'lr': 0.0008329626514872093, 'weight_decay': 1.10904162552094e-06, 'dropout1': 0.4113129372112616, 'dropout2': 0.5339519738927687, 'dropout3': 0.6474745827693765, 'batch_size': 32}
# Prompt 3 Best Params: {'hidden_dim': 64, 'lr': 0.0009176211868710996, 'weight_decay': 8.47468900298905e-05, 'dropout1': 0.4402196829382994, 'dropout2': 0.6543812752462479, 'dropout3': 0.30572393075878385, 'batch_size': 128}
# Prompt 4 Best Params: Best trial: {'hidden_dim': 512, 'lr': 0.0006392739979334983, 'weight_decay': 1.0246979767749631e-06, 'dropout1': 0.6331948007619779, 'dropout2': 0.3024115352348601, 'dropout3': 0.4826430525213473, 'batch_size': 32}
# Prompt 5 Best Params: {'hidden_dim': 256, 'lr': 0.0007888666643834901, 'weight_decay': 1.9973208515938436e-05, 'dropout1': 0.5150492631843749, 'dropout2': 0.6836007071470228, 'dropout3': 0.44931934750457436, 'batch_size': 16}
# Prompt 6 Best Params: {'hidden_dim': 256, 'lr': 0.0004221715369392723, 'weight_decay': 1.6693933987910344e-06, 'dropout1': 0.642989336447883, 'dropout2': 0.6149595736926003, 'dropout3': 0.4561023529421714, 'batch_size': 16}
# Prompt 7 Best Params: {'hidden_dim': 128, 'lr': 0.0007031295542079031, 'weight_decay': 1.1167323499868158e-05, 'dropout1': 0.43014203679075136, 'dropout2': 0.6072345773651249, 'dropout3': 0.5301384805467484, 'batch_size': 64}
# Prompt 8 Best Params: {'hidden_dim': 256, 'lr': 0.00044054250769868446, 'weight_decay': 1.8209940215624457e-05, 'dropout1': 0.6820871115494689, 'dropout2': 0.5530971947983455, 'dropout3': 0.6417406365573428, 'batch_size': 64}
def lstm_model(X_train, X_test, y_train, y_test):
    """Train the BiLSTM + CNN essay scorer and print its QWK on the test essays.

    Essays are lower-cased and whitespace-normalised, tokenized with NLTK and
    mapped to ids from a vocabulary of the most frequent tokens. The embedding
    layer is initialised from the pretrained word2vec vectors (random vectors
    for tokens word2vec does not contain). Scores are min-max scaled for
    training and mapped back, rounded and clipped before QWK is computed.

    Args:
        X_train: pandas Series of training essay texts.
        X_test: pandas Series of test essay texts.
        y_train: pandas Series of training scores.
        y_test: pandas Series of test scores.

    Returns:
        None. The per-epoch training loss and the test-set QWK are printed.
    """
    max_len = 512            # tokens kept per essay (padded or truncated)
    max_vocab_tokens = 4000  # most frequent tokens kept in the vocabulary
    batch_size = 64
    hidden_dim = 256
    num_epochs = 10

    w2v_model = api.load('word2vec-google-news-300')
    embedding_size = w2v_model.vector_size

    class Vocab:
        """Token <-> id mapping with the special tokens occupying the first ids."""

        def __init__(self, token_freqs, min_freq=1, specials=('<pad>', '<unk>')):
            self.itos = list(specials)
            self.stoi = {tok: i for i, tok in enumerate(self.itos)}
            for token, freq in token_freqs.items():
                if freq >= min_freq and token not in self.stoi:
                    self.stoi[token] = len(self.itos)
                    self.itos.append(token)

        def __len__(self):
            return len(self.itos)

    class EssayDataset(Dataset):
        """Yields (token-id tensor, label) pairs, encoding each essay on access."""

        def __init__(self, essays, labels, max_len, vocab):
            self.essays = essays
            self.labels = labels
            self.max_length = max_len
            self.vocab = vocab

        def create_encodings(self, text):
            word_token = word_tokenize(text.lower())
            unk_id = self.vocab.stoi['<unk>']
            input_ids = [self.vocab.stoi.get(word, unk_id) for word in word_token]
            if len(input_ids) < self.max_length:
                input_ids += [self.vocab.stoi['<pad>']] * (self.max_length - len(input_ids))
            else:
                input_ids = input_ids[:self.max_length]
            return torch.tensor(input_ids)

        def __getitem__(self, index):
            return self.create_encodings(self.essays[index]), self.labels[index]

        def __len__(self):
            return len(self.essays)

    class BiLSTM_CNN(nn.Module):
        """Embedding -> BiLSTM -> two Conv1d layers -> average pool -> linear output."""

        def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
            super().__init__()
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
            self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
            self.conv1 = nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=3, padding=1)
            self.dropout1 = nn.Dropout( 0.6820871115494689)
            self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
            self.dropout2 = nn.Dropout(0.5530971947983455)
            self.pool = nn.AdaptiveAvgPool1d(1)
            self.dropout3 = nn.Dropout(0.6417406365573428)
            self.fc = nn.Linear(hidden_dim, 1)

        def forward(self, x):
            embedded = self.embedding(x)
            lstm_out, _ = self.lstm(embedded)
            # Conv1d expects (batch, channels, length), hence the permute.
            # The three dropout layers were declared (and tuned) but never
            # called, so their rates had no effect. They are applied here in
            # the order they are declared in __init__.
            # NOTE(review): this changes regularisation during training;
            # re-run the tuning if the reported scores shift.
            conv1_out = self.dropout1(torch.relu(self.conv1(lstm_out.permute(0, 2, 1))))
            conv2_out = self.dropout2(torch.relu(self.conv2(conv1_out)))
            pool_out = self.dropout3(self.pool(conv2_out).squeeze(2))
            output = self.fc(pool_out)
            # squeeze(-1) keeps the batch dimension even for a batch of one;
            # a bare squeeze() would collapse that case to a 0-d tensor.
            return output.squeeze(-1)

    def cleaning_data(text):
        """Lower-case the text and collapse every whitespace run to one space."""
        text = text.lower()
        text = text.replace('\n', ' ')
        text = text.replace('\t', ' ')
        text = text.replace('\r', ' ')
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    n_train = len(X_train)
    essays = [cleaning_data(essay) for essay in X_train.tolist() + X_test.tolist()]
    X_train = pd.Series(essays[:n_train])
    X_test = pd.Series(essays[n_train:])

    # NOTE(review): token frequencies are counted over train AND test essays,
    # so the vocabulary sees test-set text. Kept as in the original; consider
    # building it from the training essays only.
    counter = Counter()
    for essay in essays:
        counter.update(word_tokenize(essay))
    vocab = Vocab(dict(counter.most_common(max_vocab_tokens)), min_freq=1)

    # Pretrained vector where word2vec has the token, random vector otherwise.
    embedding_matrix = np.zeros((len(vocab), embedding_size))
    for i, word in enumerate(vocab.itos):
        embedding_matrix[i] = w2v_model[word] if word in w2v_model else np.random.normal(scale=0.6, size=(embedding_size,))
    embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

    min_score = int(min(y_train.min(), y_test.min()))
    max_score = int(max(y_train.max(), y_test.max()))

    # The scaler is fitted on the training scores only.
    scaler = MinMaxScaler()
    scaler.fit(y_train.values.reshape(-1, 1))

    # Uncomment the block below to run hyper-parameter tuning
    # def tuning_parameeters(trial):
    #   split = int(0.8 * len(X_train))
    #   X_tr, X_val = X_train[:split], X_train[split:]
    #   y_tr, y_val = y_train[:split], y_train[split:]

    #   hidden_dim = trial.suggest_categorical('hidden_dim', [64, 128, 256, 512])
    #   lr = trial.suggest_float('lr', 1e-4, 1e-3, step=1e-4)
    #   weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-4, step = 1e-5)
    #   dropout1 = trial.suggest_float('dropout1', 0.2, 0.8, step=0.1)
    #   dropout2 = trial.suggest_float('dropout2', 0.2, 0.8, step=0.1)
    #   dropout3 = trial.suggest_float('dropout3', 0.2, 0.8, step=0.1)
    #   batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])


    #   y_train_scaled_full = scaler.transform(y_tr.values.reshape(-1, 1)).flatten()
    #   y_val_scaled = scaler.transform(y_val.values.reshape(-1, 1)).flatten()

    #   full_train_dataset = EssayDataset(X_tr.tolist(), y_train_scaled_full, 512, vocab)
    #   full_train_loader = DataLoader(full_train_dataset, batch_size=batch_size, shuffle=True)

    #   full_val_dataset = EssayDataset(X_val.tolist(), y_val_scaled, 512, vocab)
    #   full_val_loader = DataLoader(full_val_dataset, batch_size=batch_size, shuffle=False)

    #   model = BiLSTM_CNN(len(vocab), embedding_size, hidden_dim, embedding_matrix)
    #   model.dropout1.p = dropout1
    #   model.dropout2.p = dropout2
    #   model.dropout3.p = dropout3
    #   optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    #   criterion = nn.MSELoss()

    #   for epoch in range(5):
    #       model.train()
    #       total_loss = 0
    #       for essays, labels in full_train_loader:
    #           optimizer.zero_grad()
    #           output = model(essays)
    #           loss = criterion(output, labels.float())
    #           total_loss += loss.item()
    #           loss.backward()
    #           optimizer.step()

    #   model.eval()
    #   val_preds = []
    #   with torch.no_grad():
    #       for essays, _ in full_val_loader:
    #           output = model(essays).cpu().numpy()
    #           val_preds.extend(output)
    #   val_preds = scaler.inverse_transform(np.array(val_preds).reshape(-1, 1)).flatten()
    #   val_preds_rounded = np.clip(np.rint(val_preds), min_score, max_score).astype(int)
    #   qwk = cohen_kappa_score(val_preds_rounded, y_val.values, weights='quadratic')
    #   return qwk

    # study = optuna.create_study(direction='maximize')
    # study.optimize(tuning_parameeters, n_trials=10)
    # print('Number of finished trials:', len(study.trials))
    # print('Best trial:', study.best_trial.params)

    y_train_scaled_full = scaler.transform(y_train.values.reshape(-1, 1)).flatten()

    full_train_dataset = EssayDataset(X_train.tolist(), y_train_scaled_full, max_len, vocab)
    full_train_loader = DataLoader(full_train_dataset, batch_size=batch_size, shuffle=True)

    model2 = BiLSTM_CNN(len(vocab), embedding_size, hidden_dim, embedding_matrix)
    optimizer = torch.optim.Adam(model2.parameters(), lr = 0.00044054250769868446, weight_decay = 1.8209940215624457e-05)
    criterion = nn.MSELoss()

    for epoch in range(num_epochs):
        model2.train()
        total_loss = 0
        for batch_essays, labels in full_train_loader:
            optimizer.zero_grad()
            output = model2(batch_essays)
            loss = criterion(output, labels.float())
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(full_train_loader)}')

    model2.eval()
    # The labels are placeholders; only the encoded essays are read at inference.
    test_dataset = EssayDataset(X_test.tolist(), np.zeros(len(X_test)), max_len, vocab)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    final_test_preds = []
    with torch.no_grad():
        for batch_essays, _ in test_loader:
            output = model2(batch_essays).cpu().numpy().flatten()
            final_test_preds.extend(output)

    # Map predictions back to the original score scale, then round and clip.
    final_test_preds = scaler.inverse_transform(np.array(final_test_preds).reshape(-1, 1)).flatten()
    final_preds_rounded = np.clip(np.rint(final_test_preds), min_score, max_score).astype(int)

    qwk = cohen_kappa_score(y_test.astype(int), final_preds_rounded, weights='quadratic')
    print(f'QWK: {qwk}')


    # return oof_preds, final_preds_rounded
    # return oof_preds, final_preds_rounded, avg_test_preds, y_train_values, y_test



# start_time = time.time()
# _ = bert_lstm_model(X_train_all, X_test_all, y_train_all, y_test_all)
# end_time = time.time()
# execution_time = end_time - start_time
# print(f'Execution time: {execution_time} seconds')

### Checking the run time and performance for the LSTM-based model

In [None]:
import time

# Wall-clock time of one full LSTM training run plus test evaluation.
start_time = time.time()
lstm_model(
    X_train_all,
    X_test_all,
    y_train_all,
    y_test_all,
)
end_time = time.time()

execution_time = end_time - start_time
print(f'Execution time: {execution_time} seconds')

### Creating the Random Forest model

In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from sklearn.base import clone
import spacy
from spellchecker import SpellChecker

# Prompt 1 Best Params: n_estimators=500, max_depth=10, random_state=42, n_jobs=-1, max_features = 'sqrt', min_samples_split = 2
# Prompt 7 Best: Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 500}
# Prompt 8 Best Params:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 100}


def random_forest_model_with_gridsearch(X_train, X_test, y_train, y_test):
    """Grid-search a Random Forest on hand-crafted essay features and print test QWK.

    Each essay is whitespace-normalised and turned into 19 numeric features
    (length counts, part-of-speech counts, entity and spelling-error counts,
    average word and sentence length). The best parameter combination found
    by a 3-fold grid search is used to predict the test essays.

    Args:
        X_train: pandas Series of training essay texts.
        X_test: pandas Series of test essay texts.
        y_train: pandas Series of training scores.
        y_test: pandas Series of test scores.

    Returns:
        None. The best parameters and the test-set QWK are printed.
    """
    from collections import Counter

    nlp = spacy.load('en_core_web_sm')
    spell = SpellChecker()

    # spaCy coarse POS tags counted as features, in feature-vector order.
    pos_tags = (
        'PROPN', 'VERB', 'ADJ', 'ADV', 'NOUN', 'ADP',
        'PRON', 'CCONJ', 'INTJ', 'PUNCT', 'NUM',
    )

    def cleaning_data(text):
        """Collapse every whitespace run to one space and trim the ends."""
        text = text.replace('\n', ' ')
        text = text.replace('\t', ' ')
        text = text.replace('\r', ' ')
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    X_train = X_train.apply(cleaning_data)
    X_test = X_test.apply(cleaning_data)

    def feature_creation(essay):
        """Return the 19-element feature list for one essay."""
        doc = nlp(essay)
        words = essay.split()
        sentences = list(doc.sents)

        # One pass over the tokens instead of one pass per POS tag.
        pos_counts = Counter(token.pos_ for token in doc)
        num_stop_words = sum(1 for token in doc if token.is_stop)

        # np.mean of an empty list is NaN (with a warning); an empty essay
        # gets 0.0 instead so the feature matrix never contains NaN.
        avg_word_length = np.mean([len(word) for word in words]) if words else 0.0
        avg_sentence_length = np.mean([len(sent) for sent in sentences]) if sentences else 0.0

        return [
            len(sentences),
            len(words),
            len(set(words)),
            num_stop_words,
            *(pos_counts[tag] for tag in pos_tags),
            len(doc.ents),
            len(spell.unknown(words)),
            avg_word_length,
            avg_sentence_length,
        ]

    features = np.array([feature_creation(essay) for essay in X_train])
    features_test = np.array([feature_creation(essay) for essay in X_test])

    # Valid score range, used to clip rounded predictions before computing QWK.
    min_score = min(y_train.min(), y_test.min())
    max_score = max(y_train.max(), y_test.max())

    param_grid = {
        'n_estimators': [100, 300, 500],
        'max_depth': [None, 10, 20, 50],
        'min_samples_split': [2, 5, 7],
        'max_features': ['sqrt', 'log2']
    }

    grid_search = GridSearchCV(
        RandomForestRegressor(random_state=42),
        param_grid,
        cv=3,
        n_jobs=-1,
        scoring='neg_mean_squared_error'
    )
    grid_search.fit(features, y_train)
    print('Best Parameters:', grid_search.best_params_)

    # GridSearchCV refits the best configuration on all training features by
    # default, so best_estimator_ is already the final fitted model; cloning
    # and fitting it again only repeated that work.
    model = grid_search.best_estimator_

    y_pred = model.predict(features_test)
    y_pred_rounded = np.clip(np.rint(y_pred), min_score, max_score).astype(int)
    qwk = cohen_kappa_score(y_test, y_pred_rounded, weights='quadratic')
    print(f'QWK: {qwk}')







### Checking the run time and performance for the Random Forest model








In [None]:
# Wall-clock time of the grid search plus test evaluation.
start_time = time.time()
random_forest_model_with_gridsearch(
    X_train_all,
    X_test_all,
    y_train_all,
    y_test_all,
)
end_time = time.time()

execution_time = end_time - start_time
print(f'Execution time: {execution_time} seconds')

# Investigating base-learner individual performance for classification models

### Creating the BERT-based model

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import cohen_kappa_score, accuracy_score
import optuna
from torch.nn.functional import softmax
import re


# Prompt 1 Best Params:: {'lr': 4e-05, 'batch_size': 32}
# Prompt 2 Best Params: {'lr': 4e-05, 'batch_size': 8}
# Prompt 3 Best Params: {'lr': 3e-05, 'batch_size': 8}
# Prompt 4 Best Params: {'lr': 4e-05, 'batch_size': 8}
# Prompt 5 Best Params: {'lr': 4e-05, 'batch_size': 8}
# Prompt 6 Best Params: {'lr': 5e-05, 'batch_size': 8}
# Prompt 7 Best Params: {'lr': 3e-05, 'batch_size': 8}
# Prompt 8 Best Params: {'lr': 5e-05, 'batch_size': 8}
def generate_stacking_preds_classification(model_name, X_train, X_test, y_train, y_test, lr = 0.00001, bs = 16, epochs = 30):
    """Fine-tune a sequence classifier on essay scores and report its test QWK.

    Raw scores are mapped to contiguous class ids (the mapping is built from
    the union of the train and test labels). 20% of the training data is held
    out, stratified by class, for per-epoch evaluation and early stopping; the
    checkpoint with the best validation QWK is then used to predict the test
    set.

    Args:
        model_name: Model id/path handed to ``from_pretrained``.
        X_train, X_test: pandas Series of essay texts.
        y_train, y_test: pandas Series of essay scores.
        lr: Learning rate.
        bs: Per-device batch size for both training and evaluation.
        epochs: Maximum number of training epochs (early-stopping patience is 3).

    Returns:
        Quadratic weighted kappa on the test set, computed on the original
        score scale. The value is also printed.
    """
    class EssayDatasetClassification(Dataset):
        """Torch dataset pairing pre-tokenized essays with integer class ids."""

        def __init__(self, encodings, labels, seq_len = 512):
            self.encodings = encodings
            self.labels = labels
            # Keep the caller's value (it used to be overwritten with 512).
            self.seq_len = seq_len

        def __getitem__(self, item):
            return {
                'input_ids': self.encodings['input_ids'][item].clone().detach(),
                'attention_mask': self.encodings['attention_mask'][item].clone().detach(),
                'labels': torch.tensor(self.labels[item], dtype=torch.long)
            }

        def __len__(self):
            return len(self.labels)

    # Loaded once; the original loaded the same tokenizer a second time later on.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Contiguous class ids for every score seen in either split, so that the
    # classifier head has one logit per distinct score.
    unique_scores = sorted(set(np.concatenate([np.array(y_train), np.array(y_test)])))
    score_to_class = {score: i for i, score in enumerate(unique_scores)}
    class_to_score = {i: score for score, i in score_to_class.items()}
    num_classes = len(score_to_class)

    y_train = pd.Series([score_to_class[s] for s in y_train])
    y_test = [score_to_class[s] for s in y_test]

    def compute_metrics(eval_pred):
        """Return QWK and accuracy on the original score scale."""
        logits, labels = eval_pred
        predicted = np.argmax(logits, axis=-1)
        true_scores = [class_to_score[i] for i in labels]
        pred_scores = [class_to_score[i] for i in predicted]
        return {
            'eval_qwk': cohen_kappa_score(true_scores, pred_scores, weights='quadratic'),
            'accuracy': accuracy_score(true_scores, pred_scores),
        }

    def tokenize_texts(text):
        """Collapse all whitespace runs to single spaces, then tokenize to 512 tokens."""
        cleaned_texts = [re.sub(r'\s+', ' ', t).strip() for t in text]
        return tokenizer(cleaned_texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

    # Stratified hold-out used for model selection / early stopping.
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    train_dataset = EssayDatasetClassification(tokenize_texts(X_train_split.tolist()), y_train_split.tolist())
    val_dataset = EssayDatasetClassification(tokenize_texts(X_val_split.tolist()), y_val_split.tolist())
    test_dataset = EssayDatasetClassification(tokenize_texts(X_test.tolist()), y_test)

    model2 = AutoModelForSequenceClassification.from_pretrained(
        model_name, problem_type='single_label_classification', num_labels=num_classes
    )

    # Train only parameters whose names contain one of these fragments;
    # everything else stays frozen.
    trainable_fragments = ['encoder.layer.8', 'encoder.layer.9', 'encoder.layer.10',
                           'encoder.layer.11', 'pooler', 'classifier']
    for name, param in model2.named_parameters():
        param.requires_grad = any(fragment in name for fragment in trainable_fragments)

    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        num_train_epochs=epochs,
        logging_strategy='epoch',
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='eval_qwk',
        greater_is_better=True,
        report_to='none',
    )

    trainer2 = Trainer(
        model=model2,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer2.train()

    results_test = trainer2.predict(test_dataset)
    test_preds_output = np.argmax(results_test.predictions, axis=1)
    test_pred_scores = [class_to_score[i] for i in test_preds_output.astype(int)]
    true_test_scores = [class_to_score[i] for i in results_test.label_ids]
    qwk_whole_test = cohen_kappa_score(test_pred_scores, true_test_scores, weights='quadratic')
    print(f'QWK: {qwk_whole_test}')

    # Returned as well as printed so callers can collect the score.
    return qwk_whole_test














#   ### This code below is for hyper-parameter tuning


def generate_stacking_preds_hp_tune(model_name, X_train, X_test, y_train, y_test, lr = 0.00001, bs = 16, epochs = 30):
    """Train one classifier configuration and return its validation QWK.

    Intended as the inner routine of a hyper-parameter search: the training
    data is split 80/20 (stratified by class), the model is fine-tuned on the
    80% and scored on the 20%. The test texts are never tokenized or
    predicted; ``y_test`` is only used to build the score-to-class mapping.

    Args:
        model_name: Model id/path handed to ``from_pretrained``.
        X_train: pandas Series of training essay texts.
        X_test: Unused apart from an index reset; kept for signature parity.
        y_train, y_test: pandas Series of essay scores.
        lr: Learning rate.
        bs: Per-device batch size for both training and evaluation.
        epochs: Maximum number of training epochs (early-stopping patience is 3).

    Returns:
        Quadratic weighted kappa on the validation split (original score
        scale). The value is also printed.
    """
    class EssayDatasetClassification(Dataset):
        """Torch dataset pairing pre-tokenized essays with integer class ids."""

        def __init__(self, encodings, labels, seq_len = 512):
            self.encodings = encodings
            self.labels = labels
            # Keep the caller's value (it used to be overwritten with 512).
            self.seq_len = seq_len

        def __getitem__(self, item):
            return {
                'input_ids': self.encodings['input_ids'][item].clone().detach(),
                'attention_mask': self.encodings['attention_mask'][item].clone().detach(),
                'labels': torch.tensor(self.labels[item], dtype=torch.long)
            }

        def __len__(self):
            return len(self.labels)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Contiguous class ids for every score seen in either split.
    unique_scores = sorted(set(np.concatenate([np.array(y_train), np.array(y_test)])))
    score_to_class = {score: i for i, score in enumerate(unique_scores)}
    class_to_score = {i: score for score, i in score_to_class.items()}
    num_classes = len(score_to_class)

    # Look scores up directly: the previous int(s) cast would miss the key
    # (or hit the wrong one) for any non-integer score.
    y_train = [score_to_class[s] for s in y_train]

    def compute_metrics(eval_pred):
        """Return QWK and accuracy on the original score scale."""
        logits, labels = eval_pred
        predicted = np.argmax(logits, axis=-1)
        true_scores = [class_to_score[i] for i in labels]
        pred_scores = [class_to_score[i] for i in predicted]
        return {
            'eval_qwk': cohen_kappa_score(true_scores, pred_scores, weights='quadratic'),
            'accuracy': accuracy_score(true_scores, pred_scores),
        }

    def tokenize_texts(text):
        """Collapse whitespace runs, then tokenize to 512 tokens.

        The cleaning mirrors what generate_stacking_preds_classification does,
        so hyper-parameters are tuned on the same preprocessing that the final
        model is trained with.
        """
        cleaned_texts = [re.sub(r'\s+', ' ', t).strip() for t in text]
        return tokenizer(cleaned_texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, problem_type='single_label_classification', num_labels=num_classes
    )

    # y_train is a plain list here, so the label halves come back as lists.
    X_train_ft, X_val, train_labels, val_labels = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    train_dataset = EssayDatasetClassification(tokenize_texts(X_train_ft.tolist()), train_labels)
    val_dataset = EssayDatasetClassification(tokenize_texts(X_val.tolist()), val_labels)

    # Train only parameters whose names contain one of these fragments;
    # everything else stays frozen.
    trainable_fragments = ['encoder.layer.8', 'encoder.layer.9', 'encoder.layer.10',
                           'encoder.layer.11', 'pooler', 'classifier']
    for name, param in model.named_parameters():
        param.requires_grad = any(fragment in name for fragment in trainable_fragments)

    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        num_train_epochs=epochs,
        logging_strategy='epoch',
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='eval_qwk',
        greater_is_better=True,
        report_to='none',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    results = trainer.predict(val_dataset)
    val_preds = np.argmax(results.predictions, axis=1)
    val_pred_scores = [class_to_score[i] for i in val_preds.astype(int)]
    true_val_scores = [class_to_score[i] for i in results.label_ids]

    qwk_val = cohen_kappa_score(val_pred_scores, true_val_scores, weights='quadratic')
    print(f'QWK: {qwk_val}')

    return qwk_val

def tune_hyperparameters(trial):
    """Optuna objective: sample lr / batch size and return the validation QWK.

    The learning rate is drawn from 1e-5..5e-5 in steps of 1e-5 and the batch
    size from {8, 16, 32}; the sampled pair is printed and then evaluated with
    generate_stacking_preds_hp_tune on the module-level *_all data splits.
    """
    lr = trial.suggest_float('lr', 1e-5, 5e-5, step=1e-5)
    bs = trial.suggest_categorical('batch_size', [8, 16, 32])
    print(f'Trial {trial.number}: lr={lr}, batch_size={bs}')

    return generate_stacking_preds_hp_tune(
        'bert-base-uncased',
        X_train_all,
        X_test_all,
        y_train_all,
        y_test_all,
        lr=lr,
        bs=bs,
    )


# study = optuna.create_study(direction='maximize')
# study.optimize(tune_hyperparameters, n_trials=10)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)



### Checking the run time and performance for the BERT-based model


In [None]:
# Time one end-to-end run of the BERT-based classifier (wall-clock seconds),
# using lr=5e-5 and a batch size of 8.
import time
start_time = time.time()
generate_stacking_preds_classification('bert-base-uncased', X_train_all, X_test_all, y_train_all, y_test_all, lr=0.00005, bs=8)
end_time = time.time()
print(f'Time taken: {end_time - start_time} seconds')

### Creating the LSTM-based model

In [None]:
from tkinter.constants import W
import gensim.downloader as api
from gensim.models import KeyedVectors
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold, StratifiedKFold
import optuna
import torch.nn.functional as F
import time
from nltk import word_tokenize
import re
# Prompt 1 optim params: {'hidden_dim': 256, 'lr': 0.00040147129315448076, 'weight_decay': 4.194966656572804e-05, 'dropout1': 0.48972751622541333, 'dropout2': 0.5637289114635329, 'dropout3': 0.4006279296851355, 'batch_size': 64}
# Prompt 2 Best Params: {'hidden_dim': 128, 'lr': 0.0008329626514872093, 'weight_decay': 1.10904162552094e-06, 'dropout1': 0.4113129372112616, 'dropout2': 0.5339519738927687, 'dropout3': 0.6474745827693765, 'batch_size': 32}
# Prompt 3 Best Params: {'hidden_dim': 64, 'lr': 0.0009176211868710996, 'weight_decay': 8.47468900298905e-05, 'dropout1': 0.4402196829382994, 'dropout2': 0.6543812752462479, 'dropout3': 0.30572393075878385, 'batch_size': 128}
# Prompt 4 Best Params: {'hidden_dim': 256, 'lr': 0.0006392739979334983, 'weight_decay': 1.0246979767749631e-06, 'dropout1': 0.6331948007619779, 'dropout2': 0.3024115352348601, 'dropout3': 0.4826430525213473, 'batch_size': 32}
# Prompt 5 Best Params: {'hidden_dim': 256, 'lr': 0.0007888666643834901, 'weight_decay': 1.9973208515938436e-05, 'dropout1': 0.5150492631843749, 'dropout2': 0.6836007071470228, 'dropout3': 0.44931934750457436, 'batch_size': 64}
# Prompt 6 Best Params: {'hidden_dim': 256, 'lr': 0.0004221715369392723, 'weight_decay': 1.6693933987910344e-06, 'dropout1': 0.642989336447883, 'dropout2': 0.6149595736926003, 'dropout3': 0.4561023529421714, 'batch_size': 16}
# Prompt 7 Best Params: {'hidden_dim': 128, 'lr': 0.0007031295542079031, 'weight_decay': 1.1167323499868158e-05, 'dropout1': 0.43014203679075136, 'dropout2': 0.6072345773651249, 'dropout3': 0.5301384805467484, 'batch_size': 64}
# Prompt 8 Best Params: {'hidden_dim': 256, 'lr': 0.00044054250769868446, 'weight_decay': 1.8209940215624457e-05, 'dropout1': 0.6820871115494689, 'dropout2': 0.5530971947983455, 'dropout3': 0.6417406365573428, 'batch_size': 64}
def lstm_model_classification(X_train, X_test, y_train, y_test, hidden_dim=64,
                              lr=0.0009176211868710996, weight_decay=8.47468900298905e-05,
                              dropouts=(0.4402196829382994, 0.6543812752462479, 0.30572393075878385),
                              batch_size=128, epochs=10):
    """Train a BiLSTM+CNN essay-score classifier and report its test QWK.

    Essays are lower-cased and whitespace-normalised, tokenized with NLTK's
    ``word_tokenize``, mapped onto a vocabulary of the 4000 most frequent
    tokens (built from train and test essays together), and padded/truncated
    to 512 ids. Embeddings start from word2vec-google-news-300 and are
    fine-tuned. Scores are mapped to contiguous class ids built from the union
    of the train and test labels.

    Args:
        X_train, X_test: pandas Series of essay texts.
        y_train, y_test: pandas Series of essay scores.
        hidden_dim: LSTM hidden size per direction and conv channel count.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        dropouts: Three dropout probabilities, applied after the first
            convolution, after the second convolution and after pooling.
        batch_size: Batch size for training and for test inference.
        epochs: Number of training epochs.

    The keyword defaults reproduce the values that used to be hard-coded in
    the body, so existing four-argument calls keep the same configuration.

    Returns:
        Quadratic weighted kappa on the test set (original score scale). The
        value is also printed.
    """
    w2v_model = api.load('word2vec-google-news-300')
    embedding_size = w2v_model.vector_size

    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Contiguous class ids for every score seen in either split.
    unique_scores = sorted(set(np.concatenate([np.array(y_train), np.array(y_test)])))
    score_to_class = {score: i for i, score in enumerate(unique_scores)}
    class_to_score = {i: score for score, i in score_to_class.items()}
    num_classes = len(score_to_class)

    y_train = pd.Series([score_to_class[s] for s in y_train])
    y_test = pd.Series([score_to_class[s] for s in y_test])

    class Vocab:
        """Token <-> id mapping; the special tokens take the first ids."""

        def __init__(self, token_freqs, min_freq=1, specials=('<pad>', '<unk>')):
            self.itos = list(specials)
            self.stoi = {tok: i for i, tok in enumerate(self.itos)}
            for token, freq in token_freqs.items():
                if freq >= min_freq and token not in self.stoi:
                    self.stoi[token] = len(self.itos)
                    self.itos.append(token)

        def __len__(self):
            return len(self.itos)

    class EssayDataset(Dataset):
        """Dataset that encodes an essay to a fixed-length id tensor on access."""

        def __init__(self, essays, labels, max_len, vocab):
            self.essays = essays
            self.labels = labels
            self.max_length = max_len
            self.vocab = vocab

        def create_encodings(self, text):
            unk_id = self.vocab.stoi['<unk>']
            input_ids = [self.vocab.stoi.get(word, unk_id) for word in word_tokenize(text.lower())]
            # Truncate to max_length, then right-pad with <pad> if too short.
            input_ids = input_ids[:self.max_length]
            input_ids += [self.vocab.stoi['<pad>']] * (self.max_length - len(input_ids))
            return torch.tensor(input_ids)

        def __getitem__(self, index):
            return self.create_encodings(self.essays[index]), self.labels[index]

        def __len__(self):
            return len(self.essays)

    class BiLSTM_CNN(nn.Module):
        """Embedding -> BiLSTM -> two Conv1d layers -> average pool -> linear head."""

        def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
            super().__init__()
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
            self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
            self.conv1 = nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=3, padding=1)
            self.dropout1 = nn.Dropout(dropouts[0])
            self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
            self.dropout2 = nn.Dropout(dropouts[1])
            self.pool = nn.AdaptiveAvgPool1d(1)
            self.dropout3 = nn.Dropout(dropouts[2])
            self.fc = nn.Linear(hidden_dim, num_classes)

        def forward(self, x):
            embedded = self.embedding(x)
            lstm_out, _ = self.lstm(embedded)
            # Conv1d expects (batch, channels, seq), hence the permute.
            conv1_out = torch.relu(self.conv1(lstm_out.permute(0, 2, 1)))
            # The dropout layers were previously constructed but never called,
            # so the tuned dropout rates had no effect. They are applied here
            # in the order in which they are declared in __init__.
            conv1_out = self.dropout1(conv1_out)
            conv2_out = torch.relu(self.conv2(conv1_out))
            conv2_out = self.dropout2(conv2_out)
            pool_out = self.pool(conv2_out).squeeze(2)
            pool_out = self.dropout3(pool_out)
            return self.fc(pool_out)

    def cleaning_data(text):
        """Lower-case and collapse every whitespace run to a single space."""
        return re.sub(r'\s+', ' ', text.lower()).strip()

    # Clean both splits, then build the vocabulary over all essays.
    n_train = len(X_train)
    essays = [cleaning_data(essay) for essay in X_train.tolist() + X_test.tolist()]
    X_train = pd.Series(essays[:n_train])
    X_test = pd.Series(essays[n_train:])

    counter = Counter()
    for essay in essays:
        counter.update(word_tokenize(essay.lower()))
    vocab = Vocab(dict(counter.most_common(4000)), min_freq=1)

    # Pretrained vector where word2vec has the token, random normal otherwise.
    embedding_matrix = np.zeros((len(vocab), embedding_size))
    for i, word in enumerate(vocab.itos):
        embedding_matrix[i] = w2v_model[word] if word in w2v_model else np.random.normal(scale=0.6, size=(embedding_size,))
    embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

    # scaler = MinMaxScaler()
    # scaler.fit(y_train.values.reshape(-1, 1))

    # Uncomment the block below to run Optuna hyper-parameter tuning
    # def tuning_parameters(trial):
    #   split = int(0.8 * len(X_train))
    #   X_tr, X_val = X_train[:split].reset_index(drop=True), X_train[split:].reset_index(drop=True)
    #   y_tr, y_val = y_train[:split].reset_index(drop=True), y_train[split:].reset_index(drop=True)

    #   hidden_dim = trial.suggest_categorical('hidden_dim', [64, 128, 256, 512])
    #   lr = trial.suggest_float('lr', 1e-5, 1e-3, log = True)
    #   weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-4, log=True)
    #   dropout1 = trial.suggest_float('dropout1', 0.2, 0.8)
    #   dropout2 = trial.suggest_float('dropout2', 0.2, 0.8)
    #   dropout3 = trial.suggest_float('dropout3', 0.2, 0.8)
    #   batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])




    #   full_train_dataset = EssayDataset(X_tr.tolist(), y_tr, 512, vocab)
    #   full_train_loader = DataLoader(full_train_dataset, batch_size=batch_size, shuffle=True)

    #   full_val_dataset = EssayDataset(X_val.tolist(), y_val, 512, vocab)
    #   full_val_loader = DataLoader(full_val_dataset, batch_size=batch_size, shuffle=False)

    #   model = BiLSTM_CNN(len(vocab), embedding_size, hidden_dim, embedding_matrix)
    #   model.dropout1.p = dropout1
    #   model.dropout2.p = dropout2
    #   model.dropout3.p = dropout3
    #   optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    #   criterion = nn.CrossEntropyLoss()

    #   for epoch in range(5):
    #       model.train()
    #       total_loss = 0
    #       for essays, labels in full_train_loader:
    #           optimizer.zero_grad()
    #           output = model(essays)
    #           loss = criterion(output, labels.long())
    #           total_loss += loss.item()
    #           loss.backward()
    #           optimizer.step()

    #   model.eval()
    #   val_preds = []
    #   with torch.no_grad():
    #       for essays, _ in full_val_loader:
    #           output = model(essays).cpu().numpy()
    #           val_preds.extend(torch.argmax(torch.tensor(output), dim=1).numpy())
    #   val_pred_scores = [class_to_score[i] for i in val_preds]
    #   val_true_scores = [class_to_score[i] for i in y_val.tolist()]
    #   qwk = cohen_kappa_score(val_pred_scores, val_true_scores, weights='quadratic')
    #   print(f'QWK: {qwk}')
    #   return qwk

    # study = optuna.create_study(direction='maximize')
    # study.optimize(tuning_parameters, n_trials=10)
    # print('Number of finished trials:', len(study.trials))
    # print('Best trial:', study.best_trial.params)

    full_train_dataset = EssayDataset(X_train.tolist(), y_train, 512, vocab)
    full_train_loader = DataLoader(full_train_dataset, batch_size=batch_size, shuffle=True)

    model2 = BiLSTM_CNN(len(vocab), embedding_size, hidden_dim, embedding_matrix)
    optimizer = torch.optim.Adam(model2.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model2.train()
        total_loss = 0
        for batch_essays, labels in full_train_loader:
            optimizer.zero_grad()
            output = model2(batch_essays)
            loss = criterion(output, labels.long())
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(full_train_loader)}')

    # eval() switches the dropout layers off for inference.
    model2.eval()
    # The labels are placeholders; only the encoded essays are consumed below.
    test_dataset = EssayDataset(X_test.tolist(), np.zeros(len(X_test)), 512, vocab)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    final_test_preds = []
    with torch.no_grad():
        for batch_essays, _ in test_loader:
            final_test_preds.extend(model2(batch_essays).argmax(dim=1).cpu().numpy())

    final_test_preds = [class_to_score[i] for i in final_test_preds]
    true_scores = [class_to_score[i] for i in y_test.tolist()]

    qwk = cohen_kappa_score(final_test_preds, true_scores, weights = 'quadratic')
    print(f'QWK : {qwk}')

    # Returned as well as printed so callers can collect the score.
    return qwk





### Checking the run time and performance for the LSTM-based model


In [None]:
# Time one end-to-end run of the LSTM-based classifier (wall-clock seconds).
start_time = time.time()
lstm_model_classification(X_train_all, X_test_all, y_train_all, y_test_all)
end_time = time.time()
print(f'Time taken: {end_time - start_time} seconds')

### Creating the Random Forest model

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score
from sklearn.base import clone
import pandas as pd
import numpy as np
import spacy
from spellchecker import SpellChecker

def random_forest_model_with_gridsearch(X_train, X_test, y_train, y_test):
    """Grid-search a Random Forest classifier on hand-crafted essay features.

    Each essay is whitespace-normalised and turned into 19 numeric features
    (counts of sentences, words, POS tags, entities, spelling errors, plus
    average word and sentence length). A 3-fold grid search (accuracy) picks
    the forest hyper-parameters; the refitted best estimator then predicts the
    test set.

    Args:
        X_train, X_test: pandas Series of essay texts.
        y_train, y_test: pandas Series of essay scores.

    Returns:
        Quadratic weighted kappa on the test set (original score scale). The
        value is also printed.
    """
    from collections import Counter

    nlp = spacy.load('en_core_web_sm')
    spell = SpellChecker()

    def cleaning_data(text):
        """Collapse every whitespace run to a single space and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    X_train = X_train.apply(cleaning_data)
    X_test = X_test.apply(cleaning_data)

    # POS tags counted as features, in output order.
    pos_tags = ['PROPN', 'VERB', 'ADJ', 'ADV', 'NOUN', 'ADP', 'PRON', 'CCONJ', 'INTJ', 'PUNCT', 'NUM']

    def feature_creation(essay):
        """Return the 19-element feature vector for one essay."""
        doc = nlp(essay)
        words = essay.split()
        sentences = list(doc.sents)
        # One pass over the doc instead of one list comprehension per tag.
        pos_counts = Counter(token.pos_ for token in doc)

        num_stop_words = sum(1 for token in doc if token.is_stop)
        num_spelling_errors = len(spell.unknown(words))
        # Guard the means: np.mean([]) yields NaN (with a warning) for an
        # empty essay, which would put NaN into the feature matrix.
        avg_word_length = float(np.mean([len(word) for word in words])) if words else 0.0
        avg_sentence_length = float(np.mean([len(sent) for sent in sentences])) if sentences else 0.0

        num_proper_nouns, *other_pos = (pos_counts[tag] for tag in pos_tags)
        return [
            len(sentences), len(words), len(set(words)), num_stop_words, num_proper_nouns,
            *other_pos,
            len(list(doc.ents)), num_spelling_errors, avg_word_length, avg_sentence_length
        ]

    features = np.array([feature_creation(essay.strip()) for essay in X_train])
    features_test = np.array([feature_creation(essay.strip()) for essay in X_test])

    # Contiguous class ids for every score seen in either split.
    unique_scores = sorted(set(np.concatenate([np.array(y_train), np.array(y_test)])))
    score_to_class = {score: i for i, score in enumerate(unique_scores)}
    class_to_score = {i: score for score, i in score_to_class.items()}

    y_train_class = y_train.map(score_to_class)
    y_test_class = y_test.map(score_to_class)

    param_grid = {
        'n_estimators': [100, 300, 500],
        'max_depth': [None, 10, 20, 50],
        'min_samples_split': [2, 5, 7],
        'max_features': ['sqrt', 'log2']
    }

    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=3,
        n_jobs=-1,
        scoring='accuracy'
    )
    grid_search.fit(features, y_train_class)
    print('Best Parameters:', grid_search.best_params_)

    # With the default refit=True, best_estimator_ has already been fitted on
    # the full training features; cloning and refitting it with the same
    # random_state only repeated that work.
    model2 = grid_search.best_estimator_

    final_test_preds = model2.predict(features_test)
    final_test_scores_preds = [class_to_score[i] for i in final_test_preds]
    true_test_scores = [class_to_score[i] for i in y_test_class]
    qwk = cohen_kappa_score(final_test_scores_preds, true_test_scores, weights='quadratic')
    print(f'QWK: {qwk}')

    # Returned as well as printed so callers can collect the score.
    return qwk








### Checking the run time and performance for the Random Forest model


In [None]:
# Time one end-to-end run of the Random Forest classifier (wall-clock seconds).
start_time = time.time()
random_forest_model_with_gridsearch(X_train_all, X_test_all, y_train_all, y_test_all)
end_time = time.time()
print(f'Time taken: {end_time - start_time} seconds')

# Creating the stacking set up for regression models

### Creating the BERT-based model

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
import optuna


# Prompt 1 Best Params:  {'lr': 5e-05, 'batch_size': 8}
# Prompt 7 Best Params: {'lr': 3e-05, 'batch_size': 8}
# Prompt 8 Best Params: {'lr': 5e-05, 'batch_size': 8}
def generate_stacking_preds_time(model_name, X_train, X_test, y_train, y_test, lr = 0.00001, bs = 16, epochs = 30):
    """Produce out-of-fold and test predictions from a regression transformer.

    Runs 5-fold cross-validation, stratified on quantile bins of the score.
    In each fold a fresh single-output regression model is fine-tuned on four
    folds, early-stopped on the fifth (lowest eval loss), and used to predict
    both the held-out fold and the full test set.

    Args:
        model_name: Model id/path handed to ``from_pretrained``.
        X_train, X_test: pandas Series of essay texts.
        y_train, y_test: pandas Series of numeric essay scores.
        lr: Learning rate.
        bs: Per-device batch size for both training and evaluation.
        epochs: Maximum number of training epochs (early-stopping patience is 5).

    Returns:
        Tuple ``(preds, final_test_preds_rounded, avg_test_preds, y_train, y_test)``:
        ``preds`` holds the rounded out-of-fold prediction for every training
        essay; ``avg_test_preds`` is the mean over folds of the rounded test
        predictions; ``final_test_preds_rounded`` is that mean rounded and
        clipped to the observed score range; ``y_train`` / ``y_test`` are the
        index-reset label Series.
    """
    class EssayDatasetRegression(Dataset):
        """Torch dataset pairing pre-tokenized essays with float targets."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
            return item

        def __len__(self):
            return len(self.labels)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    min_score = min(min(y_train), min(y_test))
    max_score = max(max(y_train), max(y_test))

    def compute_metrics(eval_pred):
        """QWK between rounded/clipped predictions and rounded labels."""
        raw_preds, labels = eval_pred
        # Flatten explicitly: a single-output head yields an (N, 1) array.
        flat_preds = np.asarray(raw_preds).reshape(-1)
        preds_rounded = np.clip(np.rint(flat_preds), min_score, max_score).astype(int)
        labels_rounded = np.rint(labels).astype(int)
        return {'eval_qwk': cohen_kappa_score(preds_rounded, labels_rounded, weights='quadratic')}

    def tokenize_texts(text):
        """Collapse whitespace runs, then tokenize to 512 tokens."""
        cleaned_texts = [re.sub(r'\s+', ' ', t).strip() for t in text]
        return tokenizer(cleaned_texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

    # Stratify the folds on score quantile bins so each fold covers the range.
    binned_y = pd.qcut(y_train, q=5, duplicates='drop', labels = False)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # The test set is identical for every fold, so tokenize it once up front
    # rather than once per fold.
    test_dataset = EssayDatasetRegression(tokenize_texts(X_test.tolist()), y_test)

    trainable_fragments = ['encoder.layer.8', 'encoder.layer.9', 'encoder.layer.10',
                           'encoder.layer.11', 'pooler', 'classifier']

    preds = np.zeros(len(X_train))
    test_preds = []

    for train_idx, val_idx in kf.split(X_train, binned_y):
        model = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type = 'regression', num_labels=1)

        train_dataset = EssayDatasetRegression(
            tokenize_texts(X_train.iloc[train_idx].tolist()), y_train.iloc[train_idx].tolist()
        )
        val_dataset = EssayDatasetRegression(
            tokenize_texts(X_train.iloc[val_idx].tolist()), y_train.iloc[val_idx].tolist()
        )

        # Train only parameters whose names contain one of the fragments;
        # everything else stays frozen.
        for name, param in model.named_parameters():
            param.requires_grad = any(fragment in name for fragment in trainable_fragments)

        training_args = TrainingArguments(
            output_dir='./results',
            learning_rate=lr,
            per_device_train_batch_size=bs,
            per_device_eval_batch_size=bs,
            num_train_epochs=epochs,
            logging_strategy='epoch',
            eval_strategy='epoch',
            save_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model='eval_loss',
            greater_is_better=False,
            report_to='none',
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
        )

        trainer.train()

        # Out-of-fold predictions for the held-out part of the training set.
        # NOTE(review): these are rounded but not clipped, unlike the final
        # test predictions below; confirm that is intended.
        results = trainer.predict(val_dataset)
        preds[val_idx] = np.rint(results.predictions.squeeze()).astype(int)

        results_test = trainer.predict(test_dataset)
        test_preds.append(np.rint(results_test.predictions.squeeze()).astype(int))

    avg_test_preds = np.mean(test_preds, axis=0)
    final_test_preds_rounded = np.clip(np.rint(avg_test_preds), min_score, max_score).astype(int)

    return preds, final_test_preds_rounded, avg_test_preds, y_train, y_test




### Creating the LSTM-based model

In [None]:
import gensim.downloader as api
from gensim.models import KeyedVectors
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold, StratifiedKFold
import optuna

# Prompt 1 optim params: {'hidden_dim': 256, 'lr': 0.00040147129315448076, 'weight_decay': 4.194966656572804e-05, 'dropout1': 0.48972751622541333, 'dropout2': 0.5637289114635329, 'dropout3': 0.4006279296851355, 'batch_size': 64}
# Prompt 2 Best Params: {'hidden_dim': 128, 'lr': 0.0008329626514872093, 'weight_decay': 1.10904162552094e-06, 'dropout1': 0.4113129372112616, 'dropout2': 0.5339519738927687, 'dropout3': 0.6474745827693765, 'batch_size': 32}
# Prompt 3 Best Params: {'hidden_dim': 64, 'lr': 0.0009176211868710996, 'weight_decay': 8.47468900298905e-05, 'dropout1': 0.4402196829382994, 'dropout2': 0.6543812752462479, 'dropout3': 0.30572393075878385, 'batch_size': 128}
# Prompt 4 Best Params: Best trial: {'hidden_dim': 512, 'lr': 0.0006392739979334983, 'weight_decay': 1.0246979767749631e-06, 'dropout1': 0.6331948007619779, 'dropout2': 0.3024115352348601, 'dropout3': 0.4826430525213473, 'batch_size': 32}
# Prompt 5 Best Params: {'hidden_dim': 256, 'lr': 0.0007888666643834901, 'weight_decay': 1.9973208515938436e-05, 'dropout1': 0.5150492631843749, 'dropout2': 0.6836007071470228, 'dropout3': 0.44931934750457436, 'batch_size': 16}
# Prompt 6 Best Params: {'hidden_dim': 256, 'lr': 0.0004221715369392723, 'weight_decay': 1.6693933987910344e-06, 'dropout1': 0.642989336447883, 'dropout2': 0.6149595736926003, 'dropout3': 0.4561023529421714, 'batch_size': 16}
# Prompt 7 Best Params: {'hidden_dim': 128, 'lr': 0.0007031295542079031, 'weight_decay': 1.1167323499868158e-05, 'dropout1': 0.43014203679075136, 'dropout2': 0.6072345773651249, 'dropout3': 0.5301384805467484, 'batch_size': 64}
# Prompt 8 Best Params: {'hidden_dim': 256, 'lr': 0.00044054250769868446, 'weight_decay': 1.8209940215624457e-05, 'dropout1': 0.6820871115494689, 'dropout2': 0.5530971947983455, 'dropout3': 0.6417406365573428, 'batch_size': 64}
def lstm_model_time(X_train, X_test, y_train, y_test):
    """Train a BiLSTM+CNN essay-score regressor with 5-fold CV for stacking.

    Essays are lower-cased and whitespace-normalised, tokenised with
    ``word_tokenize`` and mapped onto a vocabulary made of the 4000 most
    frequent tokens found in the train and test essays combined. The
    embedding layer is initialised from ``word2vec-google-news-300``
    (random normal vectors for tokens word2vec does not know). Targets are
    min-max scaled with a scaler fitted on ``y_train``; predictions are
    mapped back to the original score range before rounding/clipping.

    Args:
        X_train: Training essay texts (``.tolist()`` is called on it).
        X_test: Test essay texts (``.tolist()`` is called on it).
        y_train: Training scores; must support ``.iloc``, ``.values``,
            ``.min()`` and ``.max()`` (i.e. a pandas Series).
        y_test: Test scores; only used for the score range and returned
            unchanged.

    Returns:
        Tuple ``(oof_preds, final_preds_rounded, avg_test_preds,
        y_train_values, y_test)`` where ``oof_preds`` holds the rounded and
        clipped out-of-fold predictions (positional order of ``y_train``),
        ``avg_test_preds`` is the mean over folds of the un-rounded test
        predictions, ``final_preds_rounded`` is that mean rounded and
        clipped, and ``y_train_values`` holds the true training scores
        written at the same positions as ``oof_preds``.
    """
    w2v_model = api.load('word2vec-google-news-300')
    embedding_size = w2v_model.vector_size

    class Vocab:
        """Token <-> index mapping; special tokens take the first indices."""

        # Tuple default instead of a list: avoids a shared mutable default.
        def __init__(self, token_freqs, min_freq=1, specials=('<pad>', '<unk>')):
            self.itos = list(specials)
            self.stoi = {tok: i for i, tok in enumerate(self.itos)}
            for token, freq in token_freqs.items():
                if freq >= min_freq and token not in self.stoi:
                    self.stoi[token] = len(self.itos)
                    self.itos.append(token)

        def __len__(self):
            return len(self.itos)

    class EssayDataset(Dataset):
        """Yields ``(token_id_tensor, label)`` pairs of fixed length ``max_len``."""

        def __init__(self, essays, labels, max_len, vocab):
            self.essays = essays
            self.labels = labels
            self.max_length = max_len
            self.vocab = vocab

        def create_encodings(self, text):
            # Unknown tokens map to <unk>; sequences are right-padded with
            # <pad> or truncated so every item has exactly max_length ids.
            unk_id = self.vocab.stoi['<unk>']
            pad_id = self.vocab.stoi['<pad>']
            input_ids = [self.vocab.stoi.get(word, unk_id) for word in word_tokenize(text.lower())]
            if len(input_ids) < self.max_length:
                input_ids += [pad_id] * (self.max_length - len(input_ids))
            else:
                input_ids = input_ids[:self.max_length]
            return torch.tensor(input_ids)

        def __getitem__(self, index):
            return self.create_encodings(self.essays[index]), self.labels[index]

        def __len__(self):
            return len(self.essays)

    class BiLSTM_CNN(nn.Module):
        """Embedding -> BiLSTM -> two Conv1d layers -> average pool -> linear head."""

        def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
            super().__init__()
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
            self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
            self.conv1 = nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=3, padding=1)
            self.dropout1 = nn.Dropout(0.6820871115494689)
            self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
            self.dropout2 = nn.Dropout(0.5530971947983455)
            self.pool = nn.AdaptiveAvgPool1d(1)
            self.dropout3 = nn.Dropout(0.6417406365573428)
            self.fc = nn.Linear(hidden_dim, 1)

        def forward(self, x):
            # NOTE(review): dropout1/2/3 are constructed above but never
            # applied here, so the tuned dropout rates have no effect.
            # Left as-is to keep results reproducible - confirm intent.
            embedded = self.embedding(x)
            lstm_out, _ = self.lstm(embedded)
            # Conv1d expects (batch, channels, seq), hence the permute.
            conv1_out = torch.relu(self.conv1(lstm_out.permute(0, 2, 1)))
            conv2_out = torch.relu(self.conv2(conv1_out))
            pool_out = self.pool(conv2_out).squeeze(2)
            output = self.fc(pool_out)
            # squeeze(-1) rather than squeeze(): a bare squeeze() turns a
            # batch of one sample into a 0-d tensor, which breaks the
            # per-batch collection of predictions downstream.
            return output.squeeze(-1)

    def cleaning_data(text):
        # r'\s+' already covers \n, \t and \r, so one substitution suffices.
        return re.sub(r'\s+', ' ', text.lower()).strip()

    n_train = len(X_train)
    cleaned = [cleaning_data(essay) for essay in X_train.tolist() + X_test.tolist()]
    X_train = pd.Series(cleaned[:n_train])
    X_test = pd.Series(cleaned[n_train:])

    # Vocabulary: 4000 most frequent tokens over train + test essays.
    counter = Counter()
    for essay in cleaned:
        counter.update(word_tokenize(essay))
    vocab = Vocab(dict(counter.most_common(4000)), min_freq=1)

    embedding_matrix = np.zeros((len(vocab), embedding_size))
    for i, word in enumerate(vocab.itos):
        embedding_matrix[i] = w2v_model[word] if word in w2v_model else np.random.normal(scale=0.6, size=(embedding_size,))
    embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

    min_score = int(min(y_train.min(), y_test.min()))
    max_score = int(max(y_train.max(), y_test.max()))

    # Stratify folds on score quintiles so each fold sees the full range.
    bins = pd.qcut(y_train, q=5, duplicates='drop', labels=False)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(y_train))
    y_train_values = np.zeros(len(y_train))
    test_preds = []
    scaler = MinMaxScaler()
    scaler.fit(y_train.values.reshape(-1, 1))

    # The test loader does not depend on the fold, so build it once.
    test_dataset = EssayDataset(X_test.tolist(), y_test.tolist(), 512, vocab)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    def predict_scores(loader):
        """Run the current fold's model over ``loader``; return scores in the original scale."""
        model.eval()
        batches = []
        with torch.no_grad():
            for batch_ids, _ in loader:
                batches.append(model(batch_ids).cpu().numpy().flatten())
        return scaler.inverse_transform(np.concatenate(batches).reshape(-1, 1)).flatten()

    for train_idx, val_idx in kf.split(X_train, bins):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        y_tr_scaled = scaler.transform(y_tr.values.reshape(-1, 1)).flatten()
        y_val_scaled = scaler.transform(y_val.values.reshape(-1, 1)).flatten()

        train_dataset = EssayDataset(X_tr.tolist(), y_tr_scaled, 512, vocab)
        val_dataset = EssayDataset(X_val.tolist(), y_val_scaled, 512, vocab)

        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

        model = BiLSTM_CNN(len(vocab), embedding_size, 256, embedding_matrix)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.00044054250769868446, weight_decay=1.8209940215624457e-05)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',
            factor=0.5,
            patience=1
        )
        criterion = nn.MSELoss()

        for epoch in range(10):
            model.train()
            total_loss = 0
            for batch_ids, labels in train_loader:
                optimizer.zero_grad()
                output = model(batch_ids)
                loss = criterion(output, labels.float())
                total_loss += loss.item()
                loss.backward()
                optimizer.step()
            print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}')

        val_preds = predict_scores(val_loader)
        val_preds_rounded = np.clip(np.rint(val_preds), min_score, max_score).astype(int)
        qwk = cohen_kappa_score(val_preds_rounded, y_val.values, weights='quadratic')

        # NOTE(review): the scheduler is stepped once, after all epochs of
        # this fold, and a new optimizer is created for the next fold, so
        # this call cannot influence training. Kept for parity with the
        # original; stepping per epoch would change the results.
        scheduler.step(qwk)

        oof_preds[val_idx] = val_preds_rounded
        y_train_values[val_idx] = y_val.values

        test_preds.append(predict_scores(test_loader))

    avg_test_preds = np.mean(test_preds, axis=0)
    final_preds_rounded = np.clip(np.rint(avg_test_preds), min_score, max_score).astype(int)

    return oof_preds, final_preds_rounded, avg_test_preds, y_train_values, y_test

### Creating the Random Forest model

In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from sklearn.base import clone
import spacy
from spellchecker import SpellChecker

# Prompt 1 Best Params: n_estimators=500, max_depth=10, random_state=42, n_jobs=-1, max_features = 'sqrt', min_samples_split = 2
# Prompt 7 Best Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 500}
# Prompt 8 Best Params:  {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 100}


def random_forest_model_with_gridsearch_time(X_train, X_test, y_train, y_test):
    """Grid-search a RandomForestRegressor on hand-crafted essay features.

    Each essay is turned into 19 numeric features (sentence/word counts,
    per-POS token counts from spaCy, entity count, unknown-word count from
    ``SpellChecker``, mean word length and mean sentence length in tokens).
    Hyper-parameters are chosen with a 3-fold ``GridSearchCV`` on the whole
    training set; the best configuration is then re-fitted inside a 5-fold
    stratified CV to produce out-of-fold and test predictions.

    Args:
        X_train: pandas Series of training essay texts (``.apply`` is used).
        X_test: pandas Series of test essay texts.
        y_train: pandas Series of training scores (``.iloc`` is used).
        y_test: Test scores; used for the score range and returned as-is.

    Returns:
        Tuple ``(oof_preds, final_preds_rounded, avg_test_preds,
        y_train_values, y_test)``. ``oof_preds`` are rounded/clipped
        out-of-fold predictions; ``avg_test_preds`` is the mean over folds
        of the per-fold test predictions, which are rounded and clipped
        *before* averaging; ``final_preds_rounded`` rounds that mean again.
    """
    # Local import keeps the function independent of other notebook cells.
    from collections import Counter

    nlp = spacy.load('en_core_web_sm')
    spell = SpellChecker()

    def cleaning_data(text):
        # r'\s+' already matches \n, \t and \r, so one substitution suffices.
        return re.sub(r'\s+', ' ', text).strip()

    X_train = X_train.apply(cleaning_data)
    X_test = X_test.apply(cleaning_data)

    def feature_creation(essay):
        """Return the 19-element feature vector for one essay."""
        doc = nlp(essay)
        words = essay.split()
        sentences = list(doc.sents)
        # One pass over the doc instead of one list comprehension per tag;
        # Counter returns 0 for tags that do not occur.
        pos_counts = Counter(token.pos_ for token in doc)
        num_stop_words = sum(1 for token in doc if token.is_stop)
        # np.mean([]) is NaN (with a RuntimeWarning); use 0.0 for empty input.
        avg_word_length = np.mean([len(word) for word in words]) if words else 0.0
        avg_sentence_length = np.mean([len(sent) for sent in sentences]) if sentences else 0.0
        return [
            len(sentences),
            len(words),
            len(set(words)),
            num_stop_words,
            pos_counts['PROPN'],
            pos_counts['VERB'],
            pos_counts['ADJ'],
            pos_counts['ADV'],
            pos_counts['NOUN'],
            pos_counts['ADP'],
            pos_counts['PRON'],
            pos_counts['CCONJ'],
            pos_counts['INTJ'],
            pos_counts['PUNCT'],
            pos_counts['NUM'],
            len(doc.ents),
            len(spell.unknown(words)),
            avg_word_length,
            avg_sentence_length,
        ]

    features = np.array([feature_creation(essay) for essay in X_train])
    features_test = np.array([feature_creation(essay) for essay in X_test])

    min_score = min(y_train.min(), y_test.min())
    max_score = max(y_train.max(), y_test.max())

    # Stratify folds on score quintiles so each fold sees the full range.
    binned_y = pd.qcut(y_train, q=5, duplicates='drop', labels=False)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(y_train))
    y_train_values = np.zeros(len(y_train))
    y_preds = []

    param_grid = {
        'n_estimators': [100, 300, 500],
        'max_depth': [None, 10, 20, 50],
        'min_samples_split': [2, 5, 7],
        'max_features': ['sqrt', 'log2']
    }

    grid_search = GridSearchCV(
        RandomForestRegressor(random_state=42),
        param_grid,
        cv=3,
        n_jobs=-1,
        scoring='neg_mean_squared_error'
    )
    grid_search.fit(features, y_train)
    print('Best Parameters:', grid_search.best_params_)

    best_model = grid_search.best_estimator_

    for train_idx, val_idx in kf.split(X_train, binned_y):
        X_tr, X_val = features[train_idx], features[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # clone() gives an unfitted estimator with the best hyper-parameters.
        model = clone(best_model)
        model.fit(X_tr, y_tr)

        oof_preds[val_idx] = np.clip(np.rint(model.predict(X_val)), min_score, max_score).astype(int)
        y_train_values[val_idx] = y_val

        y_pred_test = model.predict(features_test)
        y_preds.append(np.clip(np.rint(y_pred_test), min_score, max_score).astype(int))

    avg_test_preds = np.mean(y_preds, axis=0)
    final_preds_rounded = np.clip(np.rint(avg_test_preds), min_score, max_score).astype(int)

    return oof_preds, final_preds_rounded, avg_test_preds, y_train_values, y_test

### Creating the stacking model itself for the regression problem, with the Ridge Regression meta-learner

In [None]:
import time
from sklearn.linear_model import RidgeCV

# Regression stacking driver: run the three base models, stack their
# predictions with a RidgeCV meta-learner, and report QWK plus wall-clock
# timings for each stage.
start_time = time.time()

# Each base-model function returns
# (OOF predictions, rounded test predictions, fold-averaged test predictions, y_train, y_test).
oof_preds_transformer, preds_test_transformer, preds_test_k_fold_avg_bert, bert_y_train, bert_y_test = generate_stacking_preds_time('bert-base-uncased', X_train_all, X_test_all, y_train_all, y_test_all, lr = 0.00005, bs=8, epochs=80)
bert_completion_time = time.time()
print(f'BERT took {bert_completion_time - start_time} seconds')


oof_preds_lstm, preds_test_lstm, preds_test_k_fold_avg_lstm, lstm_y_train, lstm_y_test = lstm_model_time(X_train_all, X_test_all, y_train_all, y_test_all)
lstm_completion_time = time.time()
print(f'LSTM took {lstm_completion_time - bert_completion_time} seconds')

oof_preds_rf, y_pred_rf, preds_test_k_fold_avg_rf, y_train_rf, y_test_rf = random_forest_model_with_gridsearch_time(X_train_all, X_test_all, y_train_all, y_test_all)
rf_completion_time = time.time()
print(f'RF took {rf_completion_time - lstm_completion_time} seconds')


# Meta-features: one column per base model. Training rows use the
# out-of-fold predictions; test rows use the fold-averaged test predictions.
X_meta_train = np.vstack((oof_preds_transformer, oof_preds_lstm, oof_preds_rf)).T
X_meta_test = np.vstack((preds_test_k_fold_avg_bert,preds_test_k_fold_avg_lstm, preds_test_k_fold_avg_rf)).T

y_meta_train = np.array(y_train_all)
y_meta_test = np.array(y_test_all)


# Ridge meta-learner; alpha is selected from the listed values by 5-fold CV.
ridge_cv = RidgeCV(alphas=[0.1, 1.0, 10.0, 50.0, 100.0], cv=5)
ridge_cv.fit(X_meta_train, y_meta_train)


meta_preds = ridge_cv.predict(X_meta_test)

# Score range is taken over both train and test targets.
min_score = min(y_meta_train.min(), y_meta_test.min())
max_score = max(y_meta_train.max(), y_meta_test.max())

# Round to the nearest integer score and clip into the observed range.
meta_preds_rounded = np.clip(np.rint(meta_preds), min_score, max_score).astype(int)


# Quadratic weighted kappa between true and stacked test predictions.
qwk_meta = cohen_kappa_score(y_meta_test, meta_preds_rounded, weights='quadratic')
print(qwk_meta)
print(f'QWK score: {qwk_meta}')
print(f'Alpha: {ridge_cv.alpha_}')
print(f'Coefficients: {ridge_cv.coef_}')





# Total elapsed time since start_time, i.e. all base models plus stacking.
stacking_completion_time = time.time()
print(f'Stacking took {stacking_completion_time - start_time} seconds')







# Creating the stacking setup for classification models

### Creating the BERT-based model

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import cohen_kappa_score, accuracy_score
import optuna
from torch.nn.functional import softmax

# Prompt 1 Best Params: {'lr': 4e-05, 'batch_size': 32}
# Prompt 2 Best Params: {'lr': 4e-05, 'batch_size': 8}
# Prompt 3 Best Params: {'lr': 4e-05, 'batch_size': 8}
# Prompt 4 Best Params: {'lr': 4e-05, 'batch_size': 8}
# Prompt 5 Best Params: {'lr': 5e-05, 'batch_size': 8}
# Prompt 6 Best Params: {'lr': 5e-05, 'batch_size': 8}
# Prompt 7 Best Params: {'lr': 3e-05, 'batch_size': 8}
# Prompt 8 Best Params: {'lr': 5e-05, 'batch_size': 8}
def generate_stacking_preds_classification_time(model_name, X_train, X_test, y_train, y_test, lr = 0.00001, bs = 16, epochs = 30):
    """Fine-tune a transformer classifier with 5-fold CV and return stacking inputs.

    Scores are mapped to contiguous class indices (built from the union of
    train and test scores). For every fold a fresh
    ``AutoModelForSequenceClassification`` is loaded, everything except
    encoder layers 8-11, the pooler and the classifier head is frozen, and
    the model is trained with the Hugging Face ``Trainer`` (best checkpoint
    selected by validation QWK, early stopping with patience 3).

    Args:
        model_name: Hugging Face model identifier passed to ``from_pretrained``.
        X_train: pandas Series of training essay texts.
        X_test: pandas Series of test essay texts.
        y_train: pandas Series of training scores.
        y_test: pandas Series of test scores.
        lr: Learning rate.
        bs: Per-device train and eval batch size.
        epochs: Maximum number of training epochs per fold.

    Returns:
        Tuple ``(preds, final_test_preds_rounded, avg_test_preds, y_train,
        y_test)``: ``preds`` is the ``(len(y_train), num_classes)`` matrix
        of out-of-fold class probabilities; ``avg_test_preds`` is the
        fold-averaged test probability matrix; ``final_test_preds_rounded``
        is its argmax mapped back to original scores; ``y_train`` and
        ``y_test`` are returned as class indices, not original scores.
    """
    class EssayDatasetClassification(Dataset):
        """Wraps tokenizer output and integer class labels for the Trainer."""

        def __init__(self, encodings, labels, seq_len=512):
            self.encodings = encodings
            self.labels = labels
            # Previously hard-coded to 512, silently ignoring the argument.
            self.seq_len = seq_len

        def __getitem__(self, item):
            return {
                'input_ids': self.encodings['input_ids'][item].clone().detach(),
                'attention_mask': self.encodings['attention_mask'][item].clone().detach(),
                'labels': torch.tensor(self.labels[item], dtype=torch.long)
            }

        def __len__(self):
            return len(self.labels)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Positional indexing below relies on 0..n-1 indices.
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Map every score seen in train or test onto a contiguous class index.
    unique_scores = sorted(set(np.concatenate([np.array(y_train), np.array(y_test)])))
    score_to_class = {score: i for i, score in enumerate(unique_scores)}
    class_to_score = {i: score for score, i in score_to_class.items()}
    num_classes = len(score_to_class)

    y_train = pd.Series([score_to_class[s] for s in y_train])
    y_test = pd.Series([score_to_class[s] for s in y_test])

    def compute_metrics(eval_pred):
        """QWK and accuracy on original scores; 'eval_qwk' drives model selection."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        true_scores = [class_to_score[i] for i in labels]
        pred_scores = [class_to_score[i] for i in preds]
        qwk = cohen_kappa_score(true_scores, pred_scores, weights='quadratic')
        acc = accuracy_score(true_scores, pred_scores)
        return {'eval_qwk': qwk, 'accuracy': acc}

    def tokenize_texts(text):
        """Normalise whitespace and tokenize to fixed-length (512) tensors."""
        cleaned_texts = []
        for t in text:
            t = t.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
            t = re.sub(r'\s+', ' ', t)
            cleaned_texts.append(t.strip())
        return tokenizer(cleaned_texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

    # Stratify folds on quantile bins of the class index.
    binned_y = pd.qcut(y_train, q=5, duplicates='drop', labels=False)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    preds = np.zeros((len(y_train), num_classes))
    test_preds = []

    # The test set is identical for every fold: tokenize it once.
    test_dataset = EssayDatasetClassification(tokenize_texts(X_test.tolist()), y_test)

    # Only parameters whose name contains one of these markers are trained.
    trainable_markers = ('encoder.layer.8', 'encoder.layer.9', 'encoder.layer.10',
                         'encoder.layer.11', 'pooler', 'classifier')

    for train_idx, val_idx in kf.split(X_train, binned_y):
        model = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type='single_label_classification', num_labels=num_classes)

        train_encoding = tokenize_texts(X_train.iloc[train_idx].tolist())
        val_encoding = tokenize_texts(X_train.iloc[val_idx].tolist())
        train_labels = y_train.iloc[train_idx].tolist()
        val_labels = y_train.iloc[val_idx].tolist()

        train_dataset = EssayDatasetClassification(train_encoding, train_labels)
        val_dataset = EssayDatasetClassification(val_encoding, val_labels)

        # Freeze everything except the selected layers (single pass).
        for name, param in model.named_parameters():
            param.requires_grad = any(marker in name for marker in trainable_markers)

        training_args = TrainingArguments(
            output_dir='./results',
            learning_rate=lr,
            per_device_train_batch_size=bs,
            per_device_eval_batch_size=bs,
            num_train_epochs=epochs,
            logging_strategy='epoch',
            eval_strategy='epoch',
            save_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model='eval_qwk',
            greater_is_better=True,
            report_to='none',
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        trainer.train()

        # Out-of-fold class probabilities for the validation rows.
        val_results = trainer.predict(val_dataset)
        preds[val_idx] = softmax(torch.tensor(val_results.predictions), dim=1).numpy()

        # Test-set class probabilities from this fold's model.
        test_results = trainer.predict(test_dataset)
        test_preds.append(softmax(torch.tensor(test_results.predictions), dim=1).numpy())

    avg_test_preds = np.mean(test_preds, axis=0)
    final_test_preds_rounded = [class_to_score[i] for i in np.argmax(avg_test_preds, axis=1)]

    return preds, final_test_preds_rounded, avg_test_preds, y_train, y_test






### Creating the LSTM-based model

In [None]:
import gensim.downloader as api
from gensim.models import KeyedVectors
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold, StratifiedKFold
import optuna
import torch.nn.functional as F
# Prompt 1 optim params: {'hidden_dim': 256, 'lr': 0.00040147129315448076, 'weight_decay': 4.194966656572804e-05, 'dropout1': 0.48972751622541333, 'dropout2': 0.5637289114635329, 'dropout3': 0.4006279296851355, 'batch_size': 64}
# Prompt 2 Best Params: {'hidden_dim': 128, 'lr': 0.0008329626514872093, 'weight_decay': 1.10904162552094e-06, 'dropout1': 0.4113129372112616, 'dropout2': 0.5339519738927687, 'dropout3': 0.6474745827693765, 'batch_size': 32}
# Prompt 3 Best Params: {'hidden_dim': 64, 'lr': 0.0009176211868710996, 'weight_decay': 8.47468900298905e-05, 'dropout1': 0.4402196829382994, 'dropout2': 0.6543812752462479, 'dropout3': 0.30572393075878385, 'batch_size': 128}
# Prompt 4 Best Params: Best trial: {'hidden_dim': 256, 'lr': 0.0006392739979334983, 'weight_decay': 1.0246979767749631e-06, 'dropout1': 0.6331948007619779, 'dropout2': 0.3024115352348601, 'dropout3': 0.4826430525213473, 'batch_size': 64}
# Prompt 5 Best Params: {'hidden_dim': 256, 'lr': 0.0007888666643834901, 'weight_decay': 1.9973208515938436e-05, 'dropout1': 0.5150492631843749, 'dropout2': 0.6836007071470228, 'dropout3': 0.44931934750457436, 'batch_size': 64}
# Prompt 6 Best Params: {'hidden_dim': 256, 'lr': 0.0004221715369392723, 'weight_decay': 1.6693933987910344e-06, 'dropout1': 0.642989336447883, 'dropout2': 0.6149595736926003, 'dropout3': 0.4561023529421714, 'batch_size': 16}
# Prompt 7 Best Params: {'hidden_dim': 128, 'lr': 0.0007031295542079031, 'weight_decay': 1.1167323499868158e-05, 'dropout1': 0.43014203679075136, 'dropout2': 0.6072345773651249, 'dropout3': 0.5301384805467484, 'batch_size': 64}
# Prompt 8 Best Params: {'hidden_dim': 256, 'lr': 0.00044054250769868446, 'weight_decay': 1.8209940215624457e-05, 'dropout1': 0.6820871115494689, 'dropout2': 0.5530971947983455, 'dropout3': 0.6417406365573428, 'batch_size': 64}
def lstm_model_classification_time(X_train, X_test, y_train, y_test):
    """Train a BiLSTM+CNN essay-score classifier with 5-fold CV for stacking.

    Scores are mapped to contiguous class indices (built from the union of
    train and test scores). Essays are lower-cased and
    whitespace-normalised, tokenised with ``word_tokenize`` and mapped onto
    a vocabulary of the 4000 most frequent tokens found in train and test
    essays combined. The embedding layer is initialised from
    ``word2vec-google-news-300`` (random normal vectors for unknown tokens).

    Args:
        X_train: pandas Series of training essay texts.
        X_test: pandas Series of test essay texts.
        y_train: pandas Series of training scores.
        y_test: pandas Series of test scores.

    Returns:
        Tuple ``(oof_preds, final_test_preds, avg_test_probs,
        y_train_values, y_test)``: ``oof_preds`` is the
        ``(len(y_train), num_classes)`` matrix of out-of-fold class
        probabilities; ``avg_test_probs`` is the fold-averaged test
        probability matrix; ``final_test_preds`` is its argmax mapped back
        to original scores; ``y_train_values`` holds the true training
        scores (original scale); ``y_test`` is returned as class indices.
    """
    w2v_model = api.load('word2vec-google-news-300')
    embedding_size = w2v_model.vector_size

    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Map every score seen in train or test onto a contiguous class index.
    unique_scores = sorted(set(np.concatenate([np.array(y_train), np.array(y_test)])))
    score_to_class = {score: i for i, score in enumerate(unique_scores)}
    class_to_score = {i: score for score, i in score_to_class.items()}
    num_classes = len(score_to_class)

    y_train = pd.Series([score_to_class[s] for s in y_train])
    y_test = pd.Series([score_to_class[s] for s in y_test])

    class Vocab:
        """Token <-> index mapping; special tokens take the first indices."""

        # Tuple default instead of a list: avoids a shared mutable default.
        def __init__(self, token_freqs, min_freq=1, specials=('<pad>', '<unk>')):
            self.itos = list(specials)
            self.stoi = {tok: i for i, tok in enumerate(self.itos)}
            for token, freq in token_freqs.items():
                if freq >= min_freq and token not in self.stoi:
                    self.stoi[token] = len(self.itos)
                    self.itos.append(token)

        def __len__(self):
            return len(self.itos)

    class EssayDataset(Dataset):
        """Yields ``(token_id_tensor, label)`` pairs of fixed length ``max_len``."""

        def __init__(self, essays, labels, max_len, vocab):
            self.essays = essays
            self.labels = labels
            self.max_length = max_len
            self.vocab = vocab

        def create_encodings(self, text):
            # Unknown tokens map to <unk>; sequences are right-padded with
            # <pad> or truncated so every item has exactly max_length ids.
            unk_id = self.vocab.stoi['<unk>']
            pad_id = self.vocab.stoi['<pad>']
            input_ids = [self.vocab.stoi.get(word, unk_id) for word in word_tokenize(text.lower())]
            if len(input_ids) < self.max_length:
                input_ids += [pad_id] * (self.max_length - len(input_ids))
            else:
                input_ids = input_ids[:self.max_length]
            return torch.tensor(input_ids)

        def __getitem__(self, index):
            return self.create_encodings(self.essays[index]), self.labels[index]

        def __len__(self):
            return len(self.essays)

    class BiLSTM_CNN(nn.Module):
        """Embedding -> BiLSTM -> two Conv1d layers -> average pool -> class logits."""

        def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
            super().__init__()
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
            self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
            self.conv1 = nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=3, padding=1)
            self.dropout1 = nn.Dropout(0.642989336447883)
            self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
            self.dropout2 = nn.Dropout(0.6149595736926003)
            self.pool = nn.AdaptiveAvgPool1d(1)
            self.dropout3 = nn.Dropout(0.4561023529421714)
            self.fc = nn.Linear(hidden_dim, num_classes)

        def forward(self, x):
            # NOTE(review): dropout1/2/3 are constructed above but never
            # applied here, so the tuned dropout rates have no effect.
            # Left as-is to keep results reproducible - confirm intent.
            embedded = self.embedding(x)
            lstm_out, _ = self.lstm(embedded)
            # Conv1d expects (batch, channels, seq), hence the permute.
            conv1_out = torch.relu(self.conv1(lstm_out.permute(0, 2, 1)))
            conv2_out = torch.relu(self.conv2(conv1_out))
            pool_out = self.pool(conv2_out).squeeze(2)
            return self.fc(pool_out)

    def cleaning_data(text):
        # r'\s+' already covers \n, \t and \r, so one substitution suffices.
        return re.sub(r'\s+', ' ', text.lower()).strip()

    n_train = len(X_train)
    cleaned = [cleaning_data(essay) for essay in X_train.tolist() + X_test.tolist()]
    X_train = pd.Series(cleaned[:n_train])
    X_test = pd.Series(cleaned[n_train:])

    # Vocabulary: 4000 most frequent tokens over train + test essays.
    counter = Counter()
    for essay in cleaned:
        counter.update(word_tokenize(essay))
    vocab = Vocab(dict(counter.most_common(4000)), min_freq=1)

    embedding_matrix = np.zeros((len(vocab), embedding_size))
    for i, word in enumerate(vocab.itos):
        embedding_matrix[i] = w2v_model[word] if word in w2v_model else np.random.normal(scale=0.6, size=(embedding_size,))
    embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

    # Stratify folds on quantile bins of the class index.
    bins = pd.qcut(y_train, q=5, duplicates='drop', labels=False)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_preds = np.zeros((len(y_train), num_classes))
    y_train_values = np.zeros(len(y_train))
    test_preds = []

    # The test loader does not depend on the fold, so build it once.
    # Labels are dummies: only the inputs are consumed at prediction time.
    test_dataset = EssayDataset(X_test.tolist(), np.zeros(len(X_test)), 512, vocab)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    def predict_probs(loader):
        """Run the current fold's model over ``loader``; return class probabilities."""
        model.eval()
        batches = []
        with torch.no_grad():
            for batch_ids, _ in loader:
                batches.append(F.softmax(model(batch_ids), dim=1).cpu().numpy())
        return np.concatenate(batches, axis=0)

    for train_idx, val_idx in kf.split(X_train, bins):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        train_dataset = EssayDataset(X_tr.tolist(), y_tr.tolist(), 512, vocab)
        val_dataset = EssayDataset(X_val.tolist(), y_val.tolist(), 512, vocab)

        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

        model = BiLSTM_CNN(len(vocab), embedding_size, 256, embedding_matrix)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.0004221715369392723, weight_decay=1.6693933987910344e-06)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',
            factor=0.5,
            patience=1
        )
        criterion = nn.CrossEntropyLoss()

        for epoch in range(10):
            model.train()
            total_loss = 0
            for batch_ids, labels in train_loader:
                optimizer.zero_grad()
                output = model(batch_ids)
                loss = criterion(output, labels.long())
                total_loss += loss.item()
                loss.backward()
                optimizer.step()
            print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}')

        val_probs = predict_probs(val_loader)
        val_pred_scores = [class_to_score[i] for i in np.argmax(val_probs, axis=1)]
        val_true_scores = [class_to_score[i] for i in y_val.tolist()]
        oof_preds[val_idx] = val_probs
        y_train_values[val_idx] = val_true_scores
        qwk = cohen_kappa_score(val_pred_scores, val_true_scores, weights='quadratic')

        # NOTE(review): the scheduler is stepped once, after all epochs of
        # this fold, and a new optimizer is created for the next fold, so
        # this call cannot influence training. Kept for parity with the
        # original; stepping per epoch would change the results.
        scheduler.step(qwk)

        test_preds.append(predict_probs(test_loader))

    avg_test_probs = np.mean(test_preds, axis=0)
    final_test_preds = [class_to_score[i] for i in np.argmax(avg_test_probs, axis=1)]

    return oof_preds, final_test_preds, avg_test_probs, y_train_values, y_test


### Creating the Random Forest model

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score
from sklearn.base import clone
import pandas as pd
import numpy as np
import spacy
from spellchecker import SpellChecker

def random_forest_model_with_gridsearch_classification_time(X_train, X_test, y_train, y_test):
    """Fit a grid-searched random forest on handcrafted essay features for stacking.

    Every essay is turned into 19 surface/linguistic features (sentence and
    word counts, part-of-speech counts, entity and spelling-error counts,
    average word and sentence length). Scores are mapped to contiguous class
    indices built from the union of train and test scores. Hyper-parameters
    are selected once with a 3-fold grid search on the full training set; the
    best configuration is then refit inside a 5-fold stratified split to
    produce out-of-fold (OOF) probabilities and fold-averaged test
    probabilities.

    Args:
        X_train: pandas Series of training essay texts.
        X_test: pandas Series of test essay texts.
        y_train: pandas Series of training scores.
        y_test: pandas Series of test scores.

    Returns:
        A 5-tuple ``(oof_preds, final_test_scores, avg_test_probs,
        y_train_values, y_test_list)`` where ``oof_preds`` has shape
        ``(len(y_train), n_classes)``, ``final_test_scores`` holds the
        predicted test scores, ``avg_test_probs`` is the fold-averaged test
        probability matrix, ``y_train_values`` are the true training scores in
        row order and ``y_test_list`` is ``y_test.tolist()``.
    """
    nlp = spacy.load('en_core_web_sm')
    spell = SpellChecker()

    def cleaning_data(text):
        """Collapse newlines, tabs and repeated whitespace into single spaces."""
        text = text.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
        return re.sub(r'\s+', ' ', text).strip()

    X_train = X_train.apply(cleaning_data)
    X_test = X_test.apply(cleaning_data)

    def feature_creation(essay):
        """Return the 19-element feature vector for one essay."""
        doc = nlp(essay)
        words = essay.split()
        sentences = list(doc.sents)

        def count_pos(tag):
            return sum(1 for token in doc if token.pos_ == tag)

        return [
            len(sentences),                                  # num_sentences
            len(words),                                      # num_words
            len(set(words)),                                 # num_unique_words
            sum(1 for token in doc if token.is_stop),        # num_stop_words
            count_pos('PROPN'),                              # num_proper_nouns
            count_pos('VERB'),                               # num_verbs
            count_pos('ADJ'),                                # num_adjectives
            count_pos('ADV'),                                # num_adverbs
            count_pos('NOUN'),                               # num_nouns
            count_pos('ADP'),                                # num_prepositions
            count_pos('PRON'),                               # num_pronouns
            count_pos('CCONJ'),                              # num_conjunctions
            count_pos('INTJ'),                               # num_interjections
            count_pos('PUNCT'),                              # num_punctuation
            count_pos('NUM'),                                # num_digits
            len(list(doc.ents)),                             # num_entities
            len(spell.unknown(words)),                       # num_spelling_errors
            # Guard the means so an empty essay yields 0.0 instead of NaN.
            np.mean([len(word) for word in words]) if words else 0.0,
            np.mean([len(sent) for sent in sentences]) if sentences else 0.0,
        ]

    features = np.array([feature_creation(essay) for essay in X_train])
    features_test = np.array([feature_creation(essay) for essay in X_test])

    # Class indices cover every score seen in train OR test so that all
    # stacked base models share the same column layout.
    unique_scores = sorted(set(np.concatenate([np.array(y_train), np.array(y_test)])))
    score_to_class = {score: i for i, score in enumerate(unique_scores)}
    class_to_score = {i: score for score, i in score_to_class.items()}
    n_classes = len(score_to_class)

    y_train_class = y_train.map(score_to_class)

    # Stratify on quantile bins of the class index rather than on raw classes.
    binned_y = pd.qcut(y_train_class, q=5, duplicates='drop', labels=False)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    oof_preds = np.zeros((len(y_train), n_classes))
    y_train_values = np.zeros(len(y_train))
    test_probs_folds = []

    def full_class_probs(fitted_model, X):
        """Spread predict_proba output over all global class columns.

        predict_proba only returns columns for the classes present in the
        data the model was fitted on (``classes_``). A class that is missing
        from a fold (or that only occurs in the test scores) would otherwise
        shift the columns and break the fixed-width OOF matrix.
        """
        probs = fitted_model.predict_proba(X)
        aligned = np.zeros((probs.shape[0], n_classes))
        aligned[:, fitted_model.classes_.astype(int)] = probs
        return aligned

    param_grid = {
        'n_estimators': [100, 300, 500],
        'max_depth': [None, 10, 20, 50],
        'min_samples_split': [2, 5, 7],
        'max_features': ['sqrt', 'log2']
    }

    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=3,
        n_jobs=-1,
        scoring='accuracy'
    )
    grid_search.fit(features, y_train_class)
    print('Best Parameters:', grid_search.best_params_)

    best_model = grid_search.best_estimator_

    for train_idx, val_idx in kf.split(features, binned_y):
        # clone() gives an unfitted copy carrying the selected hyper-parameters.
        model = clone(best_model)
        model.fit(features[train_idx], y_train_class.iloc[train_idx])

        oof_preds[val_idx] = full_class_probs(model, features[val_idx])
        y_train_values[val_idx] = [class_to_score[i] for i in y_train_class.iloc[val_idx]]

        test_probs_folds.append(full_class_probs(model, features_test))

    avg_test_probs = np.mean(test_probs_folds, axis=0)
    final_test_class_preds = np.argmax(avg_test_probs, axis=1)
    final_test_scores = [class_to_score[i] for i in final_test_class_preds]

    return oof_preds, final_test_scores, avg_test_probs, y_train_values, y_test.tolist()




### Creating the stacking model itself for the classification problem, with the Logistic Regression meta-learner

In [None]:
# Stacking pipeline: run the three base models, concatenate their class
# probabilities into meta-features and fit a logistic-regression meta-learner.
import time
# NOTE(review): RidgeCV is imported here (and again below) but is not used in this cell.
from sklearn.linear_model import RidgeCV


start_time = time.time()
# The base-model functions call Series methods (e.g. .apply/.map), so wrap the inputs.
X_train_all = pd.Series(X_train_all)
X_test_all = pd.Series(X_test_all)
y_train_all = pd.Series(y_train_all)
y_test_all = pd.Series(y_test_all)


# Base model 1: transformer. Presumably returns the same 5-tuple layout as the
# other two base models (OOF probs, test predictions, fold-averaged test probs,
# train targets, test targets) — verify against its definition.
oof_preds_transformer, preds_test_transformer, preds_test_k_fold_avg_bert, bert_y_train, bert_y_test = generate_stacking_preds_classification_time('bert-base-uncased', X_train_all, X_test_all, y_train_all, y_test_all, lr = 0.00005, bs=8, epochs=30)
bert_completion_time = time.time()
print(f'BERT took {bert_completion_time - start_time} seconds')


# Base model 2: LSTM.
oof_preds_lstm, preds_test_lstm, preds_test_k_fold_avg_lstm, lstm_y_train, lstm_y_test = lstm_model_classification_time(X_train_all, X_test_all, y_train_all, y_test_all)
lstm_completion_time = time.time()
print(f'LSTM took {lstm_completion_time - bert_completion_time} seconds')

# Base model 3: random forest on handcrafted features.
oof_preds_rf, y_pred_rf, preds_test_k_fold_avg_rf, y_train_rf, y_test_rf = random_forest_model_with_gridsearch_classification_time(X_train_all, X_test_all, y_train_all, y_test_all)
rf_completion_time = time.time()
print(f'RF took {rf_completion_time - lstm_completion_time} seconds')

from sklearn.linear_model import RidgeCV
from sklearn.metrics import cohen_kappa_score
import numpy as np
from sklearn.linear_model import LogisticRegressionCV


# Meta-features: per-class probabilities of the three base models side by side
# (out-of-fold for training rows, fold-averaged for test rows).
X_meta_train = np.concatenate([oof_preds_transformer, oof_preds_lstm, oof_preds_rf], axis=1)
X_meta_test = np.concatenate([preds_test_k_fold_avg_bert, preds_test_k_fold_avg_lstm, preds_test_k_fold_avg_rf], axis=1)


# Map raw scores to contiguous class indices (and back) over train + test scores.
unique_scores = sorted(list(set(np.concatenate([np.array(y_train_all), np.array(y_test_all)]))))
score_to_class = {score: i for i, score in enumerate(unique_scores)}
class_to_score = {i: score for score, i in score_to_class.items()}

# From here on y_train_all / y_test_all are plain lists of class indices, not Series of scores.
y_train_all = [score_to_class[s] for s in y_train_all]
y_test_all = [score_to_class[s] for s in y_test_all]




# Meta-learner: logistic regression with 5-fold internal cross-validation.
meta_model = LogisticRegressionCV(max_iter=10000, cv=5)
meta_model.fit(X_meta_train, y_train_all)
meta_preds = meta_model.predict(X_meta_test)

min_class = min(score_to_class.values())
max_class = max(score_to_class.values())
# NOTE(review): a classifier's predict() should already return class indices,
# so the rounding/clipping below looks like a no-op kept from a regression
# meta-learner — confirm before removing.
meta_class_preds = np.clip(np.rint(meta_preds), min_class, max_class).astype(int)


# Convert class indices back to the original score scale before computing QWK.
meta_score_preds = [class_to_score[i] for i in meta_class_preds]
true_scores = [class_to_score[i] for i in y_test_all]


qwk_meta = cohen_kappa_score(true_scores, meta_score_preds, weights='quadratic')
print(f'QWK: {qwk_meta}')




# Measured from start_time, i.e. this includes all three base models plus the meta-learner.
stacking_completion_time = time.time()
print(f'Stacking took {stacking_completion_time - start_time} seconds')






# AI Detection

### Importing data and performing EDA

In [None]:
import pandas as pd
import numpy as np
# Load the AI-detection training data and keep only the text and its binary label.
data_2 = pd.read_csv('/content/drive/MyDrive/AI-Detection/train_v2_drcat_02.csv')
data_2 = data_2[['text', 'label']]
# Row counts per label; the plot below labels 0 as human text and 1 as AI-generated text.
zero_label_count = data_2[data_2['label'] == 0].shape[0]
one_label_count = data_2[data_2['label'] == 1].shape[0]

print(f'Number of rows with label 0: {zero_label_count}')
print(f'Number of rows with label 1: {one_label_count}')


import matplotlib.pyplot as plt

# Bar chart of the class distribution.
plt.bar(x=['Human text', 'AI-generated text'], height=[zero_label_count, one_label_count])
plt.ylabel('Number of Samples')
plt.title('Class Distribution')
plt.show()


from sklearn.utils.class_weight import compute_class_weight

# 'balanced' class weights for labels 0 and 1; only printed in this cell.
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=data_2['label'])
print(class_weights)


### Creating an even split of the data

In [None]:
# Maximum number of rows to keep for each label.
sample_size_per_class = 5000


# Draw up to sample_size_per_class rows per label (fewer if a label has fewer
# rows); random_state fixes the draw so the subset is reproducible.
sampled_df = (
    data_2
    .groupby('label', group_keys=False)
    .apply(lambda x: x.sample(min(len(x), sample_size_per_class), random_state=42))
    .reset_index(drop=True)
)


# Check the resulting per-label counts.
print(sampled_df['label'].value_counts())



### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split of the sampled data, stratified on the label with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(sampled_df['text'], sampled_df['label'], test_size=0.2, random_state=42, stratify=sampled_df['label'])


### Creating BERT-based model


In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import cohen_kappa_score, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
import optuna
from torch.nn.functional import softmax
import re


# Best Params: {'lr': 2e-05, 'batch_size': 32}
def generate_stacking_preds_classification_ai(model_name, X_train, X_test, y_train, y_test, lr = 0.00001, bs = 16, epochs = 30):
    """Fine-tune a pretrained sequence classifier on binary labels and score it on the test set.

    The training data is split 80/20 (stratified, seed 42) into a fitting part
    and a validation part. Only encoder layers 8-11, the pooler and the
    classifier head are left trainable. Training evaluates and checkpoints
    every epoch, stops early after 3 epochs without a validation-accuracy
    improvement, and reloads the best checkpoint before predicting on the
    test set.

    Args:
        model_name: Hugging Face model identifier passed to ``from_pretrained``.
        X_train: pandas Series of training texts.
        X_test: pandas Series of test texts.
        y_train: pandas Series of 0/1 training labels.
        y_test: pandas Series of 0/1 test labels.
        lr: learning rate.
        bs: per-device batch size for training and evaluation.
        epochs: maximum number of training epochs.

    Returns:
        dict with the test-set ``'accuracy'``, ``'recall'``, ``'precision'``
        and ``'f1'`` (the same values are also printed).
    """
    class EssayDatasetClassification(Dataset):
        """Torch dataset pairing pre-tokenized encodings with integer class labels."""

        def __init__(self, encodings, labels, seq_len = 512):
            self.encodings = encodings
            self.labels = labels
            # Informational only: the actual truncation length is set in tokenize_texts.
            self.seq_len = seq_len

        def __getitem__(self, item):
            return {
                'input_ids': self.encodings['input_ids'][item].clone().detach(),
                'attention_mask': self.encodings['attention_mask'][item].clone().detach(),
                'labels': torch.tensor(self.labels[item], dtype=torch.long)
            }

        def __len__(self):
            return len(self.labels)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def compute_metrics(eval_pred):
        """Binary classification metrics reported by the Trainer at each evaluation."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            'accuracy': accuracy_score(labels, preds),
            'recall': recall_score(labels, preds),
            'precision': precision_score(labels, preds),
            'F1 score': f1_score(labels, preds),
        }

    def tokenize_texts(text):
        """Normalize whitespace and tokenize to fixed-length (512) PyTorch tensors."""
        # r'\s+' already covers newlines, carriage returns and tabs.
        cleaned_texts = [re.sub(r'\s+', ' ', t).strip() for t in text]
        return tokenizer(cleaned_texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)

    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    # Plain lists so the dataset can index labels positionally.
    y_train_split = y_train_split.tolist()
    y_val_split = y_val_split.tolist()
    y_test = y_test.tolist()

    X_train_split_dataset = EssayDatasetClassification(tokenize_texts(X_train_split.tolist()), y_train_split)
    X_val_split_dataset = EssayDatasetClassification(tokenize_texts(X_val_split.tolist()), y_val_split)
    X_test_dataset = EssayDatasetClassification(tokenize_texts(X_test.tolist()), y_test)

    model2 = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type='single_label_classification', num_labels = 2)

    # Freeze everything except the last four encoder layers, the pooler and the head.
    trainable_parts = ['encoder.layer.8', 'encoder.layer.9', 'encoder.layer.10', 'encoder.layer.11', 'pooler', 'classifier']
    for name, param in model2.named_parameters():
        param.requires_grad = any(part in name for part in trainable_parts)

    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        num_train_epochs=epochs,
        logging_strategy='epoch',
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        greater_is_better=True,
        report_to='none',
    )

    trainer2 = Trainer(
        model=model2,
        args=training_args,
        train_dataset=X_train_split_dataset,
        eval_dataset=X_val_split_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer2.train()

    results_test = trainer2.predict(X_test_dataset)
    test_preds_output = np.argmax(results_test.predictions, axis=1)
    acc = accuracy_score(y_test, test_preds_output)
    recall = recall_score(y_test, test_preds_output)
    precision = precision_score(y_test, test_preds_output)
    f1 = f1_score(y_test, test_preds_output)
    print(f'Accuracy: {acc}')
    print(f'Recall: {recall}')
    print(f'Precision: {precision}')
    print(f'F1 Score: {f1}')

    return {
        'accuracy': acc,
        'recall': recall,
        'precision': precision,
        'f1': f1
    }


















# The code below is for hyper-parameter tuning.


def generate_stacking_preds_hp_tune_ai(model_name, X_train, X_test, y_train, y_test, lr = 0.00001, bs = 16, epochs = 30):
    """Fine-tune the binary classifier for one hyper-parameter setting and score it on validation data.

    The training data is split 80/20 (stratified, seed 42). Only encoder
    layers 8-11, the pooler and the classifier head are trainable. Training
    evaluates and checkpoints every epoch, stops early after 3 epochs without
    a validation-accuracy improvement, and reloads the best checkpoint before
    the final validation pass. The test set is deliberately left untouched.

    Args:
        model_name: Hugging Face model identifier passed to ``from_pretrained``.
        X_train: pandas Series of training texts.
        X_test: unused; kept so the signature matches the sibling training function.
        y_train: pandas Series of 0/1 training labels.
        y_test: unused; kept for the same reason as ``X_test``.
        lr: learning rate.
        bs: per-device batch size for training and evaluation.
        epochs: maximum number of training epochs.

    Returns:
        dict with the validation ``'accuracy'``, ``'recall'``, ``'precision'``
        and ``'f1'`` (the same values are also printed).
    """
    class EssayDatasetClassification(Dataset):
        """Torch dataset pairing pre-tokenized encodings with integer class labels."""

        def __init__(self, encodings, labels, seq_len = 512):
            self.encodings = encodings
            self.labels = labels
            # Informational only: the actual truncation length is set in tokenize_texts.
            self.seq_len = seq_len

        def __getitem__(self, item):
            return {
                'input_ids': self.encodings['input_ids'][item].clone().detach(),
                'attention_mask': self.encodings['attention_mask'][item].clone().detach(),
                'labels': torch.tensor(self.labels[item], dtype=torch.long)
            }

        def __len__(self):
            return len(self.labels)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def compute_metrics(eval_pred):
        """Binary classification metrics reported by the Trainer at each evaluation."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            'accuracy': accuracy_score(labels, preds),
            'recall': recall_score(labels, preds),
            'precision': precision_score(labels, preds),
            # Was misspelled 'fi_score'; now matches the key used by the
            # sibling training function so the logs line up.
            'F1 score': f1_score(labels, preds),
        }

    def tokenize_texts(text):
        """Normalize whitespace and tokenize to fixed-length (512) PyTorch tensors."""
        # r'\s+' already covers newlines, carriage returns and tabs.
        cleaned_texts = [re.sub(r'\s+', ' ', t).strip() for t in text]
        return tokenizer(cleaned_texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type = 'single_label_classification', num_labels = 2)

    X_train_ft, X_val, y_train_ft, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    # Plain lists so the dataset can index labels positionally.
    train_dataset = EssayDatasetClassification(tokenize_texts(X_train_ft.tolist()), y_train_ft.tolist())
    val_dataset = EssayDatasetClassification(tokenize_texts(X_val.tolist()), y_val.tolist())

    # Freeze everything except the last four encoder layers, the pooler and the head.
    trainable_parts = ['encoder.layer.8', 'encoder.layer.9', 'encoder.layer.10', 'encoder.layer.11', 'pooler', 'classifier']
    for name, param in model.named_parameters():
        param.requires_grad = any(part in name for part in trainable_parts)

    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=lr,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        num_train_epochs=epochs,
        logging_strategy='epoch',
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        greater_is_better=True,
        report_to='none',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    results = trainer.predict(val_dataset)
    val_preds = np.argmax(results.predictions, axis=1)
    acc = accuracy_score(results.label_ids, val_preds)
    recall = recall_score(results.label_ids, val_preds)
    precision = precision_score(results.label_ids, val_preds)
    f1 = f1_score(results.label_ids, val_preds)
    print(f'Accuracy: {acc}')
    print(f'Recall: {recall}')
    print(f'Precision: {precision}')
    print(f'F1 Score: {f1}')

    return {
        'accuracy': acc,
        'recall': recall,
        'precision': precision,
        'f1': f1
    }

def tune_hyperparameters(trial):
    """Optuna objective: fine-tune BERT with the sampled settings and return validation accuracy.

    Samples a learning rate from 1e-5 to 5e-5 (in steps of 1e-5) and a batch
    size from {8, 16, 32}, runs one tuning fit on the notebook-level
    X_train / X_test / y_train / y_test and reports the resulting metrics.
    """
    learning_rate = trial.suggest_float('lr', 1e-5, 5e-5, step=1e-5)
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32])

    print(f'Trial {trial.number}: lr={learning_rate}, batch_size={batch_size}')

    scores = generate_stacking_preds_hp_tune_ai(
        'bert-base-uncased',
        X_train,
        X_test,
        y_train,
        y_test,
        lr=learning_rate,
        bs=batch_size,
        epochs=30,
    )

    report = (
        f"[Trial {trial.number}] Accuracy: {scores['accuracy']}, "
        f"Precision: {scores['precision']}, "
        f"Recall: {scores['recall']}, "
        f"F1: {scores['f1']}"
    )
    print(report)

    # The study maximizes this value.
    return scores['accuracy']



# study = optuna.create_study(direction='maximize')
# study.optimize(tune_hyperparameters, n_trials=10)

# print('Best acc:', study.best_value)
# print('Best class weights:', study.best_trial.params)




### Checking the run time and performance for the BERT-based model

In [None]:
import time
# Wall-clock time of one full BERT fine-tuning run with lr=2e-5 and batch size 32.
start_time = time.time()
generate_stacking_preds_classification_ai('bert-base-uncased', X_train, X_test, y_train, y_test, lr = 0.00002, bs = 32, epochs = 30)
end_time = time.time()
print(f'Time taken: {end_time - start_time} seconds')



### Creating LSTM-based model

In [None]:
import gensim.downloader as api
from gensim.models import KeyedVectors
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold
import optuna
import torch.nn.functional as F
import time
import re
from nltk import word_tokenize
# Best Params: {'hidden_dim': 256, 'lr': 0.00014009792265935007, 'weight_decay': 4.032946922784683e-06, 'dropout1': 0.28928948977032815, 'dropout2': 0.3824601758543915, 'dropout3': 0.3210742837267551, 'batch_size': 32}
def lstm_model_ai(X_train, X_test, y_train, y_test):
    """Train a BiLSTM+CNN binary classifier on word2vec embeddings and score it on the test set.

    Texts are whitespace-normalized and tokenized with NLTK. The vocabulary is
    the 4000 most frequent training tokens plus ``<pad>``/``<unk>``;
    embeddings start from the pretrained ``word2vec-google-news-300`` vectors
    (random normal for tokens without a vector) and are fine-tuned. The model
    is trained for 10 epochs on the full training set with
    ``BCEWithLogitsLoss`` and evaluated on the test set with a 0.5 threshold.

    The fixed hyper-parameters (hidden size 256, learning rate, weight decay,
    three dropout rates, batch size 32) are the recorded result of a 10-trial
    Optuna search that maximized accuracy on the last 20% of the training
    data after 5 epochs, over: hidden_dim in {64, 128, 256, 512}, lr
    log-uniform in [1e-5, 1e-3], weight_decay log-uniform in [1e-6, 1e-4],
    dropout1..3 uniform in [0.2, 0.8], batch_size in {16, 32, 64, 128}.

    Args:
        X_train: pandas Series of training texts.
        X_test: pandas Series of test texts.
        y_train: pandas Series of 0/1 training labels.
        y_test: pandas Series of 0/1 test labels.

    Returns:
        dict with the test-set ``'accuracy'``, ``'recall'``, ``'precision'``
        and ``'f1'`` (also printed, together with a classification report).
    """
    w2v_model = api.load('word2vec-google-news-300')
    embedding_size = w2v_model.vector_size

    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    class Vocab:
        """Token <-> index mapping; special tokens take the first indices."""

        def __init__(self, token_freqs, min_freq=1, specials=('<pad>', '<unk>')):
            self.itos = list(specials)
            self.stoi = {tok: i for i, tok in enumerate(self.itos)}
            for token, freq in token_freqs.items():
                if freq >= min_freq and token not in self.stoi:
                    self.stoi[token] = len(self.itos)
                    self.itos.append(token)

        def __len__(self):
            return len(self.itos)

    class EssayDataset(Dataset):
        """Yields (token-id tensor padded/truncated to max_len, float label)."""

        def __init__(self, essays, labels, max_len, vocab):
            self.essays = essays
            self.labels = labels
            self.max_length = max_len
            self.vocab = vocab

        def create_encodings(self, text):
            unk_id = self.vocab.stoi['<unk>']
            input_ids = [self.vocab.stoi.get(word, unk_id) for word in word_tokenize(text)]
            if len(input_ids) < self.max_length:
                input_ids += [self.vocab.stoi['<pad>']] * (self.max_length - len(input_ids))
            else:
                input_ids = input_ids[:self.max_length]
            return torch.tensor(input_ids)

        def __getitem__(self, index):
            essay = self.create_encodings(self.essays[index])
            label = float(self.labels[index])
            return essay, torch.tensor(label)

        def __len__(self):
            return len(self.essays)

    class BiLSTM_CNN(nn.Module):
        """Embedding -> BiLSTM -> two Conv1d blocks -> average pool -> single logit."""

        def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
            super().__init__()
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
            self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
            self.conv1 = nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=3, padding=1)
            self.dropout1 = nn.Dropout(0.28928948977032815)
            self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
            self.dropout2 = nn.Dropout(0.3824601758543915)
            self.pool = nn.AdaptiveAvgPool1d(1)
            self.dropout3 = nn.Dropout(0.3210742837267551)
            self.fc = nn.Linear(hidden_dim, 1)

        def forward(self, x):
            embedded = self.embedding(x)
            lstm_out, _ = self.lstm(embedded)
            # Conv1d wants channels before sequence length, hence the permute.
            # The dropout layers were previously constructed but never called;
            # each is applied after the layer it is declared next to.
            conv1_out = self.dropout1(torch.relu(self.conv1(lstm_out.permute(0, 2, 1))))
            conv2_out = self.dropout2(torch.relu(self.conv2(conv1_out)))
            pool_out = self.dropout3(self.pool(conv2_out).squeeze(2))
            return self.fc(pool_out).squeeze(1)

    def cleaning_data(text):
        """Collapse newlines, tabs and repeated whitespace into single spaces."""
        text = text.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
        return re.sub(r'\s+', ' ', text).strip()

    train_texts = [cleaning_data(essay) for essay in X_train]
    test_texts = [cleaning_data(essay) for essay in X_test]

    # Build the vocabulary from training text only so that test data does not
    # influence which tokens are kept.
    counter = Counter()
    for essay in train_texts:
        counter.update(word_tokenize(essay))
    vocab = Vocab(dict(counter.most_common(4000)), min_freq=1)

    embedding_matrix = np.zeros((len(vocab), embedding_size))
    for i, word in enumerate(vocab.itos):
        embedding_matrix[i] = w2v_model[word] if word in w2v_model else np.random.normal(scale=0.6, size=(embedding_size,))
    embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

    full_train_dataset = EssayDataset(train_texts, y_train.tolist(), 512, vocab)
    full_train_loader = DataLoader(full_train_dataset, batch_size=32, shuffle=True)

    model2 = BiLSTM_CNN(len(vocab), embedding_size, 256, embedding_matrix)
    optimizer = torch.optim.Adam(model2.parameters(), lr = 0.00014009792265935007, weight_decay = 4.032946922784683e-06)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(10):
        model2.train()
        total_loss = 0
        for essays, labels in full_train_loader:
            optimizer.zero_grad()
            output = model2(essays)
            loss = criterion(output, labels.float())
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(full_train_loader)}')

    # eval() switches dropout off for inference.
    model2.eval()
    # Labels are placeholders here; predictions are compared against y_test below.
    test_dataset = EssayDataset(test_texts, np.zeros(len(test_texts)), 512, vocab)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    final_test_preds = []
    with torch.no_grad():
        for essays, _ in test_loader:
            probs = torch.sigmoid(model2(essays))
            final_test_preds.extend((probs > 0.5).long().cpu().tolist())

    acc = accuracy_score(y_test, final_test_preds)
    recall = recall_score(y_test, final_test_preds)
    precision = precision_score(y_test, final_test_preds)
    f1 = f1_score(y_test, final_test_preds)

    print(f'Accuracy: {acc}, Recall: {recall}, Precision: {precision}, F1: {f1}')
    print(classification_report(y_test, final_test_preds))

    return {
        'accuracy': acc,
        'recall': recall,
        'precision': precision,
        'f1': f1
    }


### Checking the run time and performance for the LSTM-based model

In [None]:
# Wall-clock time of one full LSTM training + evaluation run.
start_time = time.time()
lstm_model_ai(X_train, X_test, y_train, y_test)
end_time = time.time()
print(f'Time taken: {end_time - start_time} seconds')

### Creating the Random Forest model

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score
from sklearn.base import clone
import pandas as pd
import numpy as np
import spacy
from spellchecker import SpellChecker
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import re

def random_forest_model_with_gridsearch_ai(X_train, X_test, y_train, y_test):
    """Tune, fit and evaluate a Random Forest AI-text detector on hand-crafted features.

    Every essay is whitespace-normalised and turned into 19 numeric features
    (sentence/word counts, part-of-speech counts, entity count, unknown-word
    count, average word length and average sentence length). A grid search
    with 5-fold cross-validation picks the hyper-parameters by accuracy and
    the refitted best estimator is scored on the test essays.

    Args:
        X_train: iterable of training essay strings.
        X_test: iterable of test essay strings.
        y_train: training labels. The metrics below use scikit-learn's
            binary defaults, so two classes are expected.
        y_test: test labels.

    Returns:
        None. The best parameters and the test metrics are printed.
    """
    from collections import Counter

    nlp = spacy.load('en_core_web_sm')
    spell = SpellChecker()

    # POS tags that become one count feature each, in feature-column order.
    pos_tags = ('PROPN', 'VERB', 'ADJ', 'ADV', 'NOUN', 'ADP', 'PRON',
                'CCONJ', 'INTJ', 'PUNCT', 'NUM')

    def cleaning_data(text):
        """Collapse every run of whitespace into one space and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def feature_creation(essay, doc):
        """Return the 19-value feature row for one essay and its parsed doc."""
        words = essay.split()
        sentences = list(doc.sents)
        # One pass over the doc instead of one list comprehension per tag.
        pos_counts = Counter(token.pos_ for token in doc)
        # np.mean([]) is NaN (with a warning); use 0.0 for an empty essay so
        # the feature matrix stays finite.
        avg_word_length = np.mean([len(word) for word in words]) if words else 0.0
        avg_sentence_length = np.mean([len(sent) for sent in sentences]) if sentences else 0.0
        return [
            len(sentences),
            len(words),
            len(set(words)),
            sum(1 for token in doc if token.is_stop),
            *(pos_counts[tag] for tag in pos_tags),
            len(doc.ents),
            # Size of the collection spell.unknown() reports for the raw
            # whitespace-split tokens.
            len(spell.unknown(words)),
            avg_word_length,
            avg_sentence_length,
        ]

    def build_feature_matrix(essays):
        """Clean the essays and stack their feature rows into one array."""
        texts = [cleaning_data(essay) for essay in essays]
        # nlp.pipe parses the texts as a stream and yields docs in input order.
        return np.array([feature_creation(text, doc)
                         for text, doc in zip(texts, nlp.pipe(texts))])

    features = build_feature_matrix(X_train)
    features_test = build_feature_matrix(X_test)

    param_grid = {
        'n_estimators': [100, 300, 500],
        'max_depth': [None, 10, 20, 50],
        'min_samples_split': [2, 5, 7],
        'max_features': ['sqrt', 'log2']
    }

    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=5,
        n_jobs=-1,
        scoring='accuracy'
    )
    grid_search.fit(features, y_train)
    print('Best Parameters:', grid_search.best_params_)

    # GridSearchCV (refit=True by default) has already retrained the winning
    # configuration on all of `features`, so cloning and fitting it again
    # would only repeat the same seeded fit.
    best_model = grid_search.best_estimator_

    final_test_preds = best_model.predict(features_test)
    acc = accuracy_score(y_test, final_test_preds)
    recall = recall_score(y_test, final_test_preds)
    precision = precision_score(y_test, final_test_preds)
    f1 = f1_score(y_test, final_test_preds)
    print(f'Accuracy: {acc}, Recall: {recall}, Precision: {precision}, F1: {f1}')








### Checking the run time and performance for the Random Forest model

In [None]:
# Wall-clock timing of one complete tune/fit/evaluate run of the Random Forest detector.
start_time = time.time()
random_forest_model_with_gridsearch_ai(X_train, X_test, y_train, y_test)
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f'Time taken: {elapsed_seconds} seconds')

# Creating AI-detection stacking model

### Creating BERT-based model for stacking

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import cohen_kappa_score, accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import optuna
from torch.nn.functional import softmax
import re



# Best Params: {'lr': 2e-05, 'batch_size': 32}
def generate_stacking_preds_classification_ai_time(model_name, X_train, X_test, y_train, y_test, lr = 0.00001, bs = 16, epochs = 30):
    """Fine-tune a transformer classifier with 5-fold CV and return stacking inputs.

    For every fold a fresh `model_name` checkpoint is loaded with a 2-class
    head, only the parameters whose names contain encoder layers 8-11, the
    pooler or the classifier are left trainable, and the model is trained with
    the Hugging Face Trainer (early stopping, best checkpoint by accuracy).
    The fold model then predicts its held-out essays and the full test set.

    Args:
        model_name: Hugging Face model identifier, e.g. 'bert-base-uncased'.
        X_train: pandas Series of training essay strings.
        X_test: pandas Series of test essay strings.
        y_train: pandas Series of training labels (two classes).
        y_test: pandas Series of test labels.
        lr: learning rate passed to TrainingArguments.
        bs: per-device train and eval batch size.
        epochs: maximum number of training epochs per fold.

    Returns:
        Tuple `(oof_probs, avg_test_probs, y_train, y_test)` where
        `oof_probs` is an (n_train, 2) array of out-of-fold class
        probabilities, `avg_test_probs` is the (n_test, 2) mean of the fold
        models' test probabilities, and `y_train` / `y_test` are the inputs
        with their index reset.
    """
    class EssayDatasetClassification(Dataset):
        """Serves tokenised essays and integer labels in the dict form Trainer expects."""

        def __init__(self, encodings, labels, seq_len = 512):
            self.encodings = encodings
            self.labels = labels
            # Previously hard-coded to 512 regardless of the argument.
            self.seq_len = seq_len

        def __getitem__(self, item):
            return {
                'input_ids': self.encodings['input_ids'][item].clone().detach(),
                'attention_mask': self.encodings['attention_mask'][item].clone().detach(),
                'labels': torch.tensor(self.labels[item], dtype=torch.long)
            }

        def __len__(self):
            return len(self.labels)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Positional fold indices from the splitter must line up with the labels.
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)

    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    def compute_metrics(eval_pred):
        """Binary accuracy / recall / precision / F1 from the argmax of the logits."""
        logits, labels = eval_pred
        predicted = np.argmax(logits, axis=-1)
        return {'accuracy': accuracy_score(labels, predicted),
                'recall': recall_score(labels, predicted),
                'precision': precision_score(labels, predicted),
                'F1 score': f1_score(labels, predicted)}

    def tokenize_texts(text):
        """Whitespace-normalise the essays and tokenise them to fixed 512-token tensors."""
        cleaned_texts = [re.sub(r'\s+', ' ', t).strip() for t in text]
        return tokenizer(cleaned_texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

    # Stratify on the class labels themselves. The former
    # pd.qcut(y_train, q=5, duplicates='drop') collapses binary labels into a
    # single bin, which silently turned StratifiedKFold into a plain shuffle.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    oof_probs = np.zeros((len(y_train), 2))
    test_probs_per_fold = []

    # The test set is identical for every fold, so tokenise it only once.
    test_dataset = EssayDatasetClassification(tokenize_texts(X_test.tolist()), y_test.tolist())

    # Substrings of the parameter names that stay trainable; everything else is frozen.
    trainable_markers = ('encoder.layer.8', 'encoder.layer.9', 'encoder.layer.10',
                         'encoder.layer.11', 'pooler', 'classifier')

    for train_idx, val_idx in kf.split(X_train, y_train):
        model = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type='single_label_classification', num_labels = 2)

        train_dataset = EssayDatasetClassification(
            tokenize_texts(X_train.iloc[train_idx].tolist()),
            y_train.iloc[train_idx].tolist())
        val_dataset = EssayDatasetClassification(
            tokenize_texts(X_train.iloc[val_idx].tolist()),
            y_train.iloc[val_idx].tolist())

        for name, param in model.named_parameters():
            param.requires_grad = any(marker in name for marker in trainable_markers)

        training_args = TrainingArguments(
            output_dir='./results',
            learning_rate=lr,
            per_device_train_batch_size=bs,
            per_device_eval_batch_size=bs,
            num_train_epochs=epochs,
            logging_strategy='epoch',
            eval_strategy='epoch',
            save_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model='accuracy',
            greater_is_better=True,
            report_to='none',
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        trainer.train()

        # Out-of-fold probabilities: each training row is predicted by the one
        # model that never saw it.
        val_logits = trainer.predict(val_dataset).predictions
        oof_probs[val_idx] = softmax(torch.tensor(val_logits), dim=1).numpy()

        test_logits = trainer.predict(test_dataset).predictions
        test_probs_per_fold.append(softmax(torch.tensor(test_logits), dim=1).numpy())

        # Drop this fold's model before the next checkpoint is loaded so the
        # folds do not accumulate on the accelerator.
        del trainer, model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    avg_test_preds = np.mean(test_probs_per_fold, axis=0)

    return oof_probs, avg_test_preds, y_train, y_test






### Creating LSTM-based model for stacking

In [None]:
import gensim.downloader as api
from gensim.models import KeyedVectors
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold
import optuna
import torch.nn.functional as F
import time
import re
from nltk import word_tokenize
# Best Params: {'hidden_dim': 256, 'lr': 0.00014009792265935007, 'weight_decay': 4.032946922784683e-06, 'dropout1': 0.28928948977032815, 'dropout2': 0.3824601758543915, 'dropout3': 0.3210742837267551, 'batch_size': 32}
def lstm_model_ai_time(X_train, X_test, y_train, y_test):
    """Train a BiLSTM-CNN AI-text detector with 5-fold CV and return stacking inputs.

    Essays are whitespace-normalised, tokenised with NLTK and mapped onto the
    4000 most frequent tokens of the train and test essays combined (plus
    '<pad>' and '<unk>'). The embedding layer starts from word2vec vectors
    where available. One model is trained per fold for 10 epochs; it predicts
    its held-out essays and the full test set.

    Args:
        X_train: pandas Series of training essay strings.
        X_test: pandas Series of test essay strings.
        y_train: pandas Series of binary training labels.
        y_test: pandas Series of binary test labels.

    Returns:
        Tuple `(oof_preds, avg_test_probs, y_train_values, y_test)` where
        `oof_preds` is an (n_train, 2) array of out-of-fold
        [P(class 0), P(class 1)], `avg_test_probs` is the (n_test, 2) mean of
        the fold models' test probabilities, `y_train_values` holds the
        training labels in row order, and `y_test` is the input with its
        index reset.
    """
    w2v_model = api.load('word2vec-google-news-300')
    embedding_size = w2v_model.vector_size
    # The model was never moved off the CPU before; use a GPU when one exists.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_len = 512

    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)

    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    class Vocab:
        """Token <-> index mapping with the special tokens placed first."""

        def __init__(self, token_freqs, min_freq=1, specials=('<pad>', '<unk>')):
            self.itos = list(specials)
            self.stoi = {tok: i for i, tok in enumerate(self.itos)}
            for token, freq in token_freqs.items():
                if freq >= min_freq and token not in self.stoi:
                    self.stoi[token] = len(self.itos)
                    self.itos.append(token)

        def __len__(self):
            return len(self.itos)

    class EssayDataset(Dataset):
        """Serves already-encoded essays with float labels."""

        def __init__(self, encoded_essays, labels):
            self.encoded_essays = encoded_essays
            self.labels = torch.as_tensor(np.asarray(labels), dtype=torch.float32)

        def __getitem__(self, index):
            return self.encoded_essays[index], self.labels[index]

        def __len__(self):
            return len(self.encoded_essays)

    class BiLSTM_CNN(nn.Module):
        """Embedding -> BiLSTM -> two Conv1d blocks -> average pool -> one logit."""

        def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
            super().__init__()
            # from_pretrained wraps the tensor it is given as the layer weight.
            # With freeze=False the optimiser would then update the shared
            # embedding_matrix in place and carry fine-tuned embeddings from
            # one fold into the next, so every model gets its own copy.
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix.clone(), freeze=False)
            self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
            self.conv1 = nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=3, padding=1)
            self.dropout1 = nn.Dropout(0.28928948977032815)
            self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
            self.dropout2 = nn.Dropout(0.3824601758543915)
            self.pool = nn.AdaptiveAvgPool1d(1)
            self.dropout3 = nn.Dropout(0.3210742837267551)
            self.fc = nn.Linear(hidden_dim, 1)

        def forward(self, x):
            embedded = self.embedding(x)
            lstm_out, _ = self.lstm(embedded)
            # Conv1d expects (batch, channels, seq), hence the permute.
            # The three dropout layers were declared but never applied; they
            # are now used in the order in which they are defined.
            conv1_out = self.dropout1(torch.relu(self.conv1(lstm_out.permute(0, 2, 1))))
            conv2_out = self.dropout2(torch.relu(self.conv2(conv1_out)))
            pool_out = self.dropout3(self.pool(conv2_out).squeeze(2))
            return self.fc(pool_out).squeeze(1)

    def cleaning_data(text):
        """Collapse every run of whitespace into one space and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    train_texts = [cleaning_data(essay) for essay in X_train]
    test_texts = [cleaning_data(essay) for essay in X_test]
    X_train = pd.Series(train_texts)
    X_test = pd.Series(test_texts)

    counter = Counter()
    for text in train_texts + test_texts:
        counter.update(word_tokenize(text.lower()))
    vocab = Vocab(dict(counter.most_common(4000)), min_freq=1)
    pad_id = vocab.stoi['<pad>']
    unk_id = vocab.stoi['<unk>']

    def encode(text):
        """Map an essay to exactly max_len token ids (truncated or padded)."""
        ids = [vocab.stoi.get(tok, unk_id) for tok in word_tokenize(text.lower())][:max_len]
        return ids + [pad_id] * (max_len - len(ids))

    # Tokenise every essay once up front instead of on each __getitem__ call
    # (which previously repeated the work in every epoch of every fold).
    train_ids = torch.tensor([encode(text) for text in train_texts], dtype=torch.long)
    test_ids = torch.tensor([encode(text) for text in test_texts], dtype=torch.long)

    embedding_matrix = np.zeros((len(vocab), embedding_size))
    for i, word in enumerate(vocab.itos):
        embedding_matrix[i] = w2v_model[word] if word in w2v_model else np.random.normal(scale=0.6, size=(embedding_size,))
    embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

    def predict_proba(model, loader):
        """Return an (n, 2) array of [P(class 0), P(class 1)] for the loader's rows."""
        model.eval()
        chunks = []
        with torch.no_grad():
            for batch_ids, _ in loader:
                positive = torch.sigmoid(model(batch_ids.to(device))).cpu().numpy()
                chunks.append(np.stack([1 - positive, positive], axis=1))
        return np.concatenate(chunks, axis=0)

    # Stratify on the class labels themselves. The former
    # pd.qcut(y_train, q=5, duplicates='drop') collapses binary labels into a
    # single bin, which silently turned StratifiedKFold into a plain shuffle.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_preds = np.zeros((len(y_train), 2))
    y_train_values = np.zeros(len(y_train))
    test_preds = []

    # The test loader is the same for every fold; labels are placeholders.
    test_loader = DataLoader(EssayDataset(test_ids, np.zeros(len(test_ids))), batch_size=32, shuffle=False)

    for train_idx, val_idx in kf.split(X_train, y_train):
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        train_dataset = EssayDataset(train_ids[torch.as_tensor(train_idx)], y_tr.to_numpy())
        val_dataset = EssayDataset(train_ids[torch.as_tensor(val_idx)], y_val.to_numpy())

        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

        model = BiLSTM_CNN(len(vocab), embedding_size, 256, embedding_matrix).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.00014009792265935007, weight_decay=4.032946922784683e-06)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',
            factor=0.5,
            patience=1
        )
        criterion = nn.BCEWithLogitsLoss()

        val_probs = None
        for epoch in range(10):
            model.train()
            total_loss = 0
            for batch_ids, batch_labels in train_loader:
                optimizer.zero_grad()
                output = model(batch_ids.to(device))
                loss = criterion(output, batch_labels.to(device))
                total_loss += loss.item()
                loss.backward()
                optimizer.step()
            print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}')

            # Step the plateau scheduler once per epoch on validation
            # accuracy. It used to be stepped a single time after training
            # had finished, so it could never lower the learning rate.
            val_probs = predict_proba(model, val_loader)
            val_accuracy = accuracy_score(y_val, np.where(val_probs[:, 1] > 0.5, 1, 0))
            scheduler.step(val_accuracy)

        # Out-of-fold probabilities come from the model after its last epoch.
        oof_preds[val_idx] = val_probs
        y_train_values[val_idx] = y_val.tolist()

        test_preds.append(predict_proba(model, test_loader))

    avg_test_probs = np.mean(test_preds, axis=0)

    return oof_preds, avg_test_probs, y_train_values, y_test


### Creating Random Forest model for stacking

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score
from sklearn.base import clone
import pandas as pd
import numpy as np
import spacy
from spellchecker import SpellChecker
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import re

def random_forest_model_with_gridsearch_ai_time(X_train, X_test, y_train, y_test):
    """Tune a Random Forest on hand-crafted features and return stacking inputs.

    Every essay is whitespace-normalised and turned into 19 numeric features
    (sentence/word counts, part-of-speech counts, entity count, unknown-word
    count, average word length and average sentence length). A grid search
    with 5-fold cross-validation picks the hyper-parameters by accuracy; the
    winning configuration is then re-trained on each of 5 stratified folds to
    produce out-of-fold and test probabilities.

    Args:
        X_train: iterable of training essay strings.
        X_test: iterable of test essay strings.
        y_train: pandas Series of binary training labels.
        y_test: pandas Series of binary test labels.

    Returns:
        Tuple `(oof_preds, avg_test_probs, y_train_values, y_test_list)` where
        `oof_preds` is an (n_train, 2) array of out-of-fold class
        probabilities, `avg_test_probs` is the (n_test, 2) mean of the fold
        models' test probabilities, `y_train_values` holds the training
        labels in row order, and `y_test_list` is `y_test.tolist()`.
    """
    from collections import Counter

    nlp = spacy.load('en_core_web_sm')
    spell = SpellChecker()

    # POS tags that become one count feature each, in feature-column order.
    pos_tags = ('PROPN', 'VERB', 'ADJ', 'ADV', 'NOUN', 'ADP', 'PRON',
                'CCONJ', 'INTJ', 'PUNCT', 'NUM')

    def cleaning_data(text):
        """Collapse every run of whitespace into one space and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def feature_creation(essay, doc):
        """Return the 19-value feature row for one essay and its parsed doc."""
        words = essay.split()
        sentences = list(doc.sents)
        # One pass over the doc instead of one list comprehension per tag.
        pos_counts = Counter(token.pos_ for token in doc)
        # np.mean([]) is NaN (with a warning); use 0.0 for an empty essay so
        # the feature matrix stays finite.
        avg_word_length = np.mean([len(word) for word in words]) if words else 0.0
        avg_sentence_length = np.mean([len(sent) for sent in sentences]) if sentences else 0.0
        return [
            len(sentences),
            len(words),
            len(set(words)),
            sum(1 for token in doc if token.is_stop),
            *(pos_counts[tag] for tag in pos_tags),
            len(doc.ents),
            # Size of the collection spell.unknown() reports for the raw
            # whitespace-split tokens.
            len(spell.unknown(words)),
            avg_word_length,
            avg_sentence_length,
        ]

    def build_feature_matrix(essays):
        """Clean the essays and stack their feature rows into one array."""
        texts = [cleaning_data(essay) for essay in essays]
        # nlp.pipe parses the texts as a stream and yields docs in input order.
        return np.array([feature_creation(text, doc)
                         for text, doc in zip(texts, nlp.pipe(texts))])

    features = build_feature_matrix(X_train)
    features_test = build_feature_matrix(X_test)

    param_grid = {
        'n_estimators': [100, 300, 500],
        'max_depth': [None, 10, 20, 50],
        'min_samples_split': [2, 5, 7],
        'max_features': ['sqrt', 'log2']
    }

    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=5,
        n_jobs=-1,
        scoring='accuracy'
    )
    grid_search.fit(features, y_train)
    print('Best Parameters:', grid_search.best_params_)

    best_model = grid_search.best_estimator_

    # Stratify on the class labels themselves. The former
    # pd.qcut(y_train, q=5, duplicates='drop') collapses binary labels into a
    # single bin, which silently turned StratifiedKFold into a plain shuffle.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    oof_preds = np.zeros((len(y_train), 2))
    y_train_values = np.zeros(len(y_train))
    test_probs_folds = []

    for train_idx, val_idx in kf.split(features, y_train):
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Unfitted copy with the tuned hyper-parameters, trained on this fold only.
        fold_model = clone(best_model)
        fold_model.fit(features[train_idx], y_tr)

        oof_preds[val_idx] = fold_model.predict_proba(features[val_idx])
        y_train_values[val_idx] = y_val

        test_probs_folds.append(fold_model.predict_proba(features_test))

    avg_test_probs = np.mean(test_probs_folds, axis=0)

    return oof_preds, avg_test_probs, y_train_values, y_test.tolist()




### Creating Stacking model for AI detection

In [None]:
import time
from sklearn.linear_model import RidgeCV, LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix
import optuna


# Reference point for all timings printed by this cell.
start_time = time.time()

# Base learner 1: fine-tuned BERT (out-of-fold + averaged test probabilities).
oof_preds_transformer, preds_test_k_fold_avg_bert, bert_y_train, bert_y_test = generate_stacking_preds_classification_ai_time(
    'bert-base-uncased', X_train, X_test, y_train, y_test,
    lr = 0.00002, bs = 32, epochs = 30
)
bert_completion_time = time.time()
print(f'BERT took {bert_completion_time - start_time} seconds')

# Base learner 2: BiLSTM-CNN.
oof_preds_lstm, preds_test_k_fold_avg_lstm, lstm_y_train, lstm_y_test = lstm_model_ai_time(
    X_train, X_test, y_train, y_test
)
lstm_completion_time = time.time()
print(f'LSTM took {lstm_completion_time - bert_completion_time} seconds')

# Base learner 3: Random Forest on hand-crafted features.
oof_preds_rf, preds_test_k_fold_avg_rf, y_train_rf, y_test_rf = random_forest_model_with_gridsearch_ai_time(
    X_train, X_test, y_train, y_test
)
rf_completion_time = time.time()
print(f'RF took {rf_completion_time - lstm_completion_time} seconds')

from sklearn.linear_model import RidgeCV
from sklearn.metrics import cohen_kappa_score
import numpy as np
from sklearn.linear_model import LogisticRegressionCV

# Meta-features: the three models' class-probability columns side by side.
X_meta_train = np.hstack([oof_preds_transformer, oof_preds_lstm, oof_preds_rf])
X_meta_test = np.hstack([preds_test_k_fold_avg_bert, preds_test_k_fold_avg_lstm, preds_test_k_fold_avg_rf])

# Meta-learner trained on out-of-fold predictions, evaluated on the test set.
meta_model = LogisticRegressionCV(max_iter=10000, cv=5)
meta_model.fit(X_meta_train, y_train)
meta_preds = meta_model.predict(X_meta_test)

acc = accuracy_score(y_test, meta_preds)
recall = recall_score(y_test, meta_preds)
precision = precision_score(y_test, meta_preds)
f1 = f1_score(y_test, meta_preds)
# Labels are left-aligned to 11 characters so the values line up in one column.
for metric_label, metric_value in (('Accuracy:', acc), ('Recall:', recall),
                                   ('Precision:', precision), ('F1 Score:', f1)):
    print(f'{metric_label:<11}{metric_value}')

print(classification_report(y_test, meta_preds, digits=4))
print(confusion_matrix(y_test, meta_preds))

# Total covers the three base learners plus the meta-learner.
stacking_completion_time = time.time()
print(f'Stacking took {stacking_completion_time - start_time} seconds')







### Creating correlation matrix

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Compare the base models on their positive-class probability only.
# Each prediction array holds [P(class 0), P(class 1)] per essay; flattening
# both columns would pair every value with its complement and centre each
# series on 0.5, which distorts Pearson's r between the models.
preds_df = pd.DataFrame({
    'BERT': preds_test_k_fold_avg_bert[:, 1],
    'LSTM': preds_test_k_fold_avg_lstm[:, 1],
    'RF': preds_test_k_fold_avg_rf[:, 1],
})


correlation_matrix = preds_df.corr(method='pearson')
print('Correlation matrix:')
print(correlation_matrix)

plt.figure(figsize=(6, 4))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=0.0, vmax=1.0)
plt.title('Correlation of Base Model Predictions')
plt.show()

### Plotting model training times

In [None]:
# Recorded training times in seconds for the AES models, eight runs per model.
box_plot_data_aes = pd.DataFrame({
    'BERT': [171.82, 146.08, 122.56, 200.03, 189.84, 168, 183.2, 267.5],
    'LSTM' : [352.24, 201.99, 119.01, 319.89, 322.78, 319.29, 170.87, 175.63],
    'RF': [146.44, 151.03, 66.8, 65.22, 77.74, 85.88, 81.4, 95.51],
    'Stacking Model 1': [2069.53, 2086.37, 1046.4, 1822.57, 2003.22, 2075.3, 1947.63, 2100.08],
    'Stacking Model 2': [2145.5, 2124.99, 1125.88, 2007.73, 2138.44, 2058.07, 1866.04, 2110.49]
})

plt.figure(figsize=(10, 6))
# Horizontal boxes: model names on the y-axis, seconds on the x-axis.
box_plot_data_aes.boxplot(vert=False)
plt.xticks(rotation=45)
plt.ylabel('Model Type', size=15)
plt.xlabel('Time taken to train (seconds)', size=15, labelpad = 10)
plt.title('Time taken to train models for an AES system', size = 15)
plt.show()


# Recorded training time in seconds for each AI-detection model.
bar_graph_ai = pd.Series({
    'BERT' : 497.92,
    'LSTM' : 1751.66,
    'RF' : 844.09,
    'Stacking' : 11415.63
})

# `plt.figsize = (12, 6)` only set an attribute on the pyplot module and never
# created or resized a figure; open a new figure with the intended size.
plt.figure(figsize=(12, 6))
plt.bar(bar_graph_ai.index, bar_graph_ai.values)
plt.xticks(rotation=45)
plt.xlabel('Model Type', size=12)
plt.ylabel('Time taken to train (seconds)', size=12)
plt.title('Time Taken to train models for an AI-detection system', size = 12)
plt.show()


### Creating a confusion matrix from results

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion-matrix counts entered by hand; rows are true labels and columns
# are predicted labels, as the axis titles below state.
labels = ['Negative', 'Positive']
cm = np.array([
    [999, 1],
    [12, 988],
])
df_cm = pd.DataFrame(cm, index=labels, columns=labels)

fig, ax = plt.subplots(figsize=(10, 7))
# fmt='g' prints the counts as plain integers rather than in scientific notation.
sns.heatmap(df_cm, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix for AI-detection system', size=20)
ax.set_xlabel('Predicted Labels', labelpad=20, size=15)
ax.set_ylabel('True Labels', labelpad=20, size=15)
plt.show()
