In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import datetime as datetime
from datetime import datetime

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import mixed_precision
from tensorflow.keras.regularizers import l2

# Make 'mixed_float16' the global Keras dtype policy, so layers created after
# this point use it unless they are given an explicit dtype.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# Log the TensorFlow version this run uses.
print(tf.__version__)

from imblearn.over_sampling import SMOTE
import random
import os

# Shared constants.
SEED = 42          # seed used for every random number generator below
max_length = 3000  # number of token ids each essay is padded/truncated to for the neural network

# Seed each source of randomness the notebook has imported.
random.seed(SEED)
tf.random.set_seed(SEED)
np.random.seed(SEED)
# The variable Python reads is PYTHONHASHSEED (the previous key, 'PYTHONHASHED',
# was a typo that nothing consumes). Setting it at runtime cannot re-seed the
# interpreter that is already running; it only affects Python processes that
# are started afterwards and inherit this environment.
os.environ['PYTHONHASHSEED'] = str(SEED)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Walk the read-only Kaggle input directory and print the full path of every file in it.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

import os
# Select the asynchronous CUDA allocator through TensorFlow's environment switch.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'


# Discard any Keras state left over from a previous run of the notebook.
tf.keras.backend.clear_session()

# Turn on memory growth for every visible GPU. With no GPU present the list is
# empty and the loop body never runs. A RuntimeError is reported, not raised.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for physical_gpu in gpus:
        tf.config.experimental.set_memory_growth(physical_gpu, True)
except RuntimeError as err:
    print(err)

In [None]:
# Load the competition training file; later cells read its 'full_text' and 'score' columns.
df_train = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
df_train.head()

In [None]:
# Load the competition test file; later cells read its 'essay_id' and 'full_text' columns.
df_sub = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')
df_sub.head()

In [None]:
# Shift the labels down by one; every model prediction below is shifted back with "+ 1".
# This cell is not idempotent: running it a second time subtracts one again.
df_train['score'] = df_train['score'] - 1

In [None]:
# Start the augmented frame as a deep copy of the training data so every
# non-text column is kept; the text column is blanked here and is overwritten
# by the augmentation cell further down.
df_aug = df_train.copy(deep=True)
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
df_aug['full_text'] = np.nan
print(df_aug.head())

In [None]:
import gensim
from gensim.models import KeyedVectors

# Path of the 300-dimensional GloVe vectors (plain-text format).
glove_file = '/kaggle/input/glove6b300d/glove.6B.300d.txt'

# Print wall-clock timestamps around the load so its duration can be read from the output.
print("Loading word vectors\n")
load_start = datetime.now()
print(load_start)

# Read the text file as header-less word2vec data, ignore undecodable bytes,
# and stop after at most 400000 vectors.
word_vectors = KeyedVectors.load_word2vec_format(fname=glove_file, binary=False, unicode_errors='ignore', no_header=True, limit=400000)

load_end = datetime.now()
print(load_end)
print("Word vectors loaded\n")

In [None]:
print("Precomputing similar words\n")
comp_start = datetime.now()
print(comp_start)

# The augmentation step only ever looks up whitespace-separated tokens of the
# training essays. Collect exactly those tokens (same `str.split()` tokenisation
# as the augmentation function) and run the expensive nearest-neighbour query
# only for the ones that have an embedding, instead of once per embedding word.
corpus_vocab = set()
for essay_text in df_train['full_text']:
    corpus_vocab.update(essay_text.split())
known_words = corpus_vocab.intersection(word_vectors.index_to_key)

# word -> its single most similar word in the embedding space.
similar_words_dict = {
    word: word_vectors.most_similar(word, topn=1)[0][0]
    for word in known_words
}

comp_end = datetime.now()
print(comp_end)
print("Precompute complete\n")

In [None]:
from multiprocessing import Pool

def augment_text_with_glove(text):
    """Return ``text`` with every known word swapped for its stored neighbour.

    The text is split on whitespace. Each token that is a key of the
    module-level ``similar_words_dict`` is replaced by the mapped word; every
    other token is kept unchanged. Tokens are re-joined with single spaces.
    """
    return ' '.join(
        similar_words_dict.get(token, token) for token in text.split()
    )

def parallel_augment_texts(texts, num_workers=4):
    """Augment each entry of ``texts`` in a pool of ``num_workers`` processes.

    Returns a list with one augmented string per input, in input order.
    """
    with Pool(num_workers) as worker_pool:
        return list(worker_pool.imap(augment_text_with_glove, texts))

# Print wall-clock timestamps around the augmentation so its duration can be read from the output.
print("Starting data augmentation now\n")
aug_start = datetime.now()
print("Augmentation start time is: ", aug_start)

# Fill the blanked text column of df_aug with the augmented version of each training essay
# (same row order as df_train).
df_aug['full_text'] = parallel_augment_texts(df_train['full_text'].tolist())

aug_end = datetime.now()
print("\nAugmentation end time is: ", aug_end)
print("\nData augmentation complete")

In [None]:
# Stack the original and the augmented essays into one training frame.
# ignore_index=True gives the result a fresh 0..N-1 index; without it each
# index label occurs twice (once per source frame), so selecting rows by the
# row numbers produced by the train/validation splitter hits the wrong rows
# or raises a KeyError.
df_train_fin = pd.concat([df_train, df_aug], axis=0, ignore_index=True)
len(df_train_fin)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit as sss

# One stratified shuffle split: 10% of the rows are held out, stratified by score.
splits = sss(n_splits=1, test_size=0.1, random_state=SEED)

all_texts = df_train_fin['full_text']
all_scores = df_train_fin['score']

# split() yields *positional* row numbers, so rows must be selected with .iloc.
# Plain `series[indices]` selects by index label, which is only the same thing
# when the index happens to be a unique 0..N-1 range.
train_index, test_index = next(splits.split(all_texts, all_scores))
X_train, X_test = all_texts.iloc[train_index], all_texts.iloc[test_index]
y_train, y_test = all_scores.iloc[train_index], all_scores.iloc[test_index]

In [None]:
# Quick visual check of the four pieces produced by the split.
print("X_train:", X_train)
print("X_test:", X_test)
print("y_train:", y_train)
print("y_test:", y_test)

In [None]:
import spacy
from concurrent.futures import ProcessPoolExecutor

# spaCy pipeline shared by lemmatize_text; loaded once at module level.
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    """Lemmatize ``text`` and drop its stop words.

    The text is run through the module-level ``nlp`` pipeline; tokens flagged
    as stop words are skipped and the lemmas of the remaining tokens are
    joined with single spaces.
    """
    kept_lemmas = (token.lemma_ for token in nlp(text) if not token.is_stop)
    return " ".join(kept_lemmas)

def process_chunk(chunk):
    """Apply lemmatize_text to every element of ``chunk`` via its ``.apply`` method and return the result."""
    return chunk.apply(lemmatize_text)

def parallel_lemmatize(data, num_processes):
    """Lemmatize a pandas Series in ``num_processes`` worker processes.

    ``data`` is cut into ``num_processes`` contiguous positional chunks (the
    first ``len(data) % num_processes`` chunks are one element longer, the same
    sizes ``np.array_split`` produces). Each chunk is handed to
    ``process_chunk`` in a process pool and the results are concatenated in
    the original order.

    Returns a Series with a fresh 0..N-1 index: the index labels of ``data``
    are NOT preserved, only the element order.

    Raises ValueError if ``num_processes`` is smaller than 1.
    """
    if num_processes < 1:
        raise ValueError("num_processes must be at least 1")

    # Slice with .iloc instead of np.array_split: NumPy routes pandas objects
    # through their `swapaxes` method, which pandas has deprecated (2.1+).
    base_size, remainder = divmod(len(data), num_processes)
    chunks = []
    start = 0
    for chunk_no in range(num_processes):
        stop = start + base_size + (1 if chunk_no < remainder else 0)
        chunks.append(data.iloc[start:stop])
        start = stop

    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        results = list(executor.map(process_chunk, chunks))
    return pd.concat(results, ignore_index=True)

# Print wall-clock timestamps around the lemmatization so its duration can be read from the output.
print("\nApplying lemmatization now")
lem_start = datetime.now()
print("\nLemmatization start time is: ", lem_start)

# Replace the training texts with their lemmatized, stop-word-free form.
# The returned Series has a fresh 0..N-1 index; element order is unchanged.
# NOTE(review): only X_train is lemmatized. X_test and the submission texts are
# vectorized/tokenized raw further down, so training and evaluation inputs are
# preprocessed differently -- confirm this is intended.
num_processes = 8
X_train = parallel_lemmatize(X_train, num_processes)

lem_end = datetime.now()
print("\nLemmatization end time is: ", lem_end)
print("\nData lemmatization complete")

In [None]:
# Display the entry labelled 0 of the lemmatized training texts as a spot check.
X_train[0]

In [None]:
# Convert the pandas objects to plain Python lists (order is preserved, index labels are dropped).
X_train = X_train.tolist()
y_train = y_train.tolist()
X_test = X_test.tolist()
y_test = y_test.tolist()

# Keep references to the raw-text lists for the neural network; the X_train / X_test
# names are rebound to TF-IDF matrices just below.
X_train_nn = X_train
y_train_nn = y_train
X_test_nn = X_test
y_test_nn = y_test

# Word-level TF-IDF with English stop words removed and at most 100000 features.
# The vocabulary is fitted on the training texts only and reused for the validation texts.
# .toarray() materialises dense matrices of shape (documents, features).
tfidf_vectorizer = TfidfVectorizer(max_features=100000, analyzer='word', stop_words='english')
X_train = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test = tfidf_vectorizer.transform(X_test).toarray()

In [None]:
# Vectorize the submission essays with the TF-IDF vocabulary fitted on the training texts.
sub_text = df_sub['full_text'].tolist()
sub_X = tfidf_vectorizer.transform(sub_text).toarray()

In [None]:
# Both matrices come from the same vectorizer, so the two feature counts should match.
print(f"Expected feature shape: {X_train.shape[1]}")
print(f"Sub_X feature shape: {sub_X.shape[1]}")

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Gradient-boosted tree classifier on the TF-IDF features, trained on the shifted (score - 1) labels.
xgb_model = xgb.XGBClassifier(n_jobs=-1, device='cuda', grow_policy='lossguide', colsample_bytree=0.7, booster='gbtree', n_estimators=500, subsample=0.7,  
                              random_state=SEED)
# The held-out split is passed as eval_set; the trailing comment lists options that are currently switched off.
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose = False) # , early_stopping_rounds=100, max_depth=7, scale_pos_weight=1, learning_rate=0.01, reg_alpha=0, min_child_weight=1, gamma=0, reg_lambda=1, objective='multi:softprob',

In [None]:
# The model predicts the shifted labels (score - 1); "+ 1" maps them back to essay scores.
xgb_preds = xgb_model.predict(X_test) + 1
# y_test still holds the shifted labels, so shift it the same way before
# comparing. Comparing it directly with xgb_preds would score every correct
# prediction as a miss (off by one).
xgb_accuracy = accuracy_score(np.asarray(y_test) + 1, xgb_preds)
print(f"XGBoost Accuracy: {xgb_accuracy}")

In [None]:
# XGBoost predictions for the submission essays, shifted back by "+ 1" to undo the earlier label shift.
sub_xgb_preds = xgb_model.predict(sub_X) + 1
print(sub_xgb_preds)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the same TF-IDF features and shifted (score - 1) labels:
# 1000 bootstrapped trees, sqrt(features) candidates per split, out-of-bag score enabled.
rf_model = RandomForestClassifier(n_estimators=1000, bootstrap=True, oob_score=True, max_features='sqrt', n_jobs=-1, verbose=1, random_state=SEED)
rf_model.fit(X_train, y_train)

# "+ 1" maps the predicted shifted labels back to essay scores.
rf_preds = rf_model.predict(X_test) + 1
# y_test still holds the shifted labels, so shift it the same way before
# comparing. Comparing it directly with rf_preds would score every correct
# prediction as a miss (off by one).
rf_accuracy = accuracy_score(np.asarray(y_test) + 1, rf_preds)
print(f"Random Forest Accuracy: {rf_accuracy}")

In [None]:
# Random-forest predictions for the submission essays, shifted back by "+ 1" to undo the earlier label shift.
sub_rf_preds = rf_model.predict(sub_X) + 1
print(sub_rf_preds)

In [None]:
# Label aliases used by the neural-network cells below.
# The two bare names (X_train_nn, X_test_nn) are expressions with no effect:
# they are not the last line of the cell, so nothing is displayed.
X_train_nn
labels = y_train_nn
X_test_nn
val_labels = y_test_nn

In [None]:
!pip install tokenizers

In [None]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, normalizers
# Initialize and train a BPE tokenizer
tokenizer = Tokenizer(models.BPE())
# Normalize text with Unicode NFKC and lowercasing, then pre-split it with the Whitespace pre-tokenizer.
tokenizer.normalizer = normalizers.Sequence([normalizers.NFKC(), normalizers.Lowercase()])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
# vocab_size is the same number as the Embedding layer's input_dim in the model cell; keep the two in sync.
trainer = trainers.BpeTrainer(vocab_size=400000, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
# The vocabulary is learned from the training texts only.
tokenizer.train_from_iterator(X_train_nn, trainer)
tokenizer.save("tokenizer.json")

In [None]:
# Reload the tokenizer from the file saved above and encode every training text as a list of token ids.
tokenizer = Tokenizer.from_file("tokenizer.json")
tokenized_texts = [tokenizer.encode(text).ids for text in X_train_nn]

In [None]:
# Encode the validation texts with the same tokenizer.
val_tokenized_texts = [tokenizer.encode(val_text).ids for val_text in X_test_nn]

In [None]:
# Pad the tokenized texts
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every id sequence to exactly max_length entries; shorter sequences are padded at the end.
# NOTE(review): no padding value is passed, so the Keras default is used. Presumably that is 0,
# which may coincide with a real token id of the tokenizer -- confirm.
max_length = 3000
padded_texts = pad_sequences(tokenized_texts, maxlen=max_length, padding='post')

val_padded_texts = pad_sequences(val_tokenized_texts, maxlen=max_length, padding='post')

In [None]:
# Create a TensorFlow dataset
# Training pipeline: (padded ids, label) pairs, shuffled with a buffer as large as the
# training set, then grouped into batches of 128.
dataset = tf.data.Dataset.from_tensor_slices((padded_texts, labels))
dataset = dataset.shuffle(len(X_train_nn)).batch(128)

# Validation pipeline: same batching, no shuffling.
val_dataset = tf.data.Dataset.from_tensor_slices((val_padded_texts, val_labels))
val_dataset = val_dataset.batch(128)

In [None]:
from tensorflow.keras.regularizers import l2

# Drop any previously built Keras graph/state before constructing the model.
tf.keras.backend.clear_session()

# Architecture: token ids -> embedding -> Conv1D -> batch norm -> two per-timestep
# dense layers (each followed by dropout) -> bidirectional LSTM -> LSTM ->
# max-pool over time -> 6-way softmax (one class per shifted score 0..5).
# NOTE(review): l2(1) is a very large regularization factor (the commented-out
# variants of this model further down use 0.01) -- confirm it is intended.
nn_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(max_length,)),
    # input_dim equals the vocab_size the BPE tokenizer was trained with.
    tf.keras.layers.Embedding(input_dim=400000, output_dim=128),
    tf.keras.layers.Conv1D(128, 5, activation='relu', kernel_regularizer=l2(1)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=l2(1)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=l2(1)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True, kernel_regularizer=l2(1))),
    tf.keras.layers.LSTM(32, return_sequences=True, kernel_regularizer=l2(1)),
    tf.keras.layers.GlobalMaxPooling1D(),
    # The notebook sets the global 'mixed_float16' policy, under which this layer
    # would compute the softmax in float16. Pin the output layer to float32 so the
    # probabilities and the loss are computed in full precision.
    tf.keras.layers.Dense(6, activation='softmax', dtype='float32')
])

# Integer class labels -> sparse categorical cross-entropy.
nn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# checkpoint_filepath = 'best_model.keras'
# model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath=checkpoint_filepath,
#     save_weights_only=False,
#     monitor='val_accuracy',
#     mode='max',
#     save_best_only=True
# )

nn_model.summary()

In [None]:
# Train for 10 epochs, evaluating on the validation dataset after each one; `history` feeds the plots below.
history = nn_model.fit(dataset, epochs=10, validation_data=val_dataset) # , callbacks=[model_checkpoint_callback]

In [None]:
import matplotlib.pyplot as plt

# Plot utility
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against the epoch.

    ``history.history`` must contain the keys ``string`` and ``'val_' + string``.
    Both series are drawn on the current pyplot figure, which is then shown.
    """
    val_key = 'val_' + string
    for series_key in (string, val_key):
        plt.plot(history.history[series_key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

# Draw the training/validation curves for accuracy first, then for loss.
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)

In [None]:
# best_model = tf.keras.models.load_model('best_model.keras')

In [None]:
# Tokenize and pad the submission essays the same way as the training inputs.
# NOTE(review): these texts are not lemmatized, whereas the texts the network was trained on are -- confirm this is intended.
pred_texts = df_sub['full_text'].tolist()
tokenized_pred_texts = [tokenizer.encode(pred_text).ids for pred_text in pred_texts]
padded_pred_texts = pad_sequences(tokenized_pred_texts, maxlen=max_length, padding='post')
numpy_pred_texts = np.array(padded_pred_texts)

In [None]:
# Model output for the submission essays (the final layer is a 6-unit softmax).
preds = nn_model.predict(numpy_pred_texts)
preds

In [None]:
# Pick the highest-scoring class per essay, then add one to undo the earlier label shift.
max_preds = np.argmax(preds, axis=1)
res_lst = max_preds + 1
res_lst

In [None]:
from scipy.stats import mode

# Majority vote across the three models: one row per model, one column per
# submission essay, and the most frequent value in each column is the result.
combined_preds = np.array([sub_xgb_preds, sub_rf_preds, res_lst])
vote_result = mode(combined_preds, axis=0)
final_preds = vote_result[0].flatten()

final_preds

In [None]:
# Start the submission frame with the essay ids from the test file.
df_sub_fin = pd.DataFrame()
df_sub_fin['essay_id'] = df_sub['essay_id']
df_sub_fin

In [None]:
# Attach the ensemble predictions as the 'score' column and cast it to integers.
df_sub_fin['score'] = final_preds.transpose()
df_sub_fin['score'] = df_sub_fin['score'].astype('int')
df_sub_fin

In [None]:
# Write the submission file (with header row, without the index column) to the working directory.
df_sub_fin.to_csv('submission.csv', header=True, index=False)

In [None]:
import gc

# Force a garbage-collection pass; the cell displays the value returned by collect().
gc.collect()

In [None]:
# texts = df_train['full_text'].tolist()
# labels = df_train['score'].tolist()

# tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=35000)
# X = tfidf_vectorizer.fit_transform(texts).toarray()
# y = labels

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# text_sub = df_sub['full_text'].tolist()
# X_sub = tfidf_vectorizer.transform(text_sub).toarray()

In [None]:
# print(f"Expected feature shape: {X_train.shape[1]}")
# print(f"X_sub feature shape: {X_sub.shape[1]}")

In [None]:
# import xgboost as xgb
# from sklearn.metrics import accuracy_score

# xgb_model = xgb.XGBClassifier(n_jobs=-1, device='cuda', grow_policy='lossguide', colsample_bytree=0.8)
# xgb_model.fit(X_train, y_train)

In [None]:
# xgb_preds = xgb_model.predict(X_test) + 1
# xgb_accuracy = accuracy_score(y_test, xgb_preds)
# print(f"XGBoost Accuracy: {xgb_accuracy}")

In [None]:
# sub_xgb_preds = xgb_model.predict(X_sub) + 1
# print(sub_xgb_preds)

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# rf_model = RandomForestClassifier()
# rf_model.fit(X_train, y_train)

# rf_preds = rf_model.predict(X_test) + 1
# rf_accuracy = accuracy_score(y_test, rf_preds)
# print(f"Random Forest Accuracy: {rf_accuracy}")

In [None]:
# sub_rf_preds = rf_model.predict(X_sub) + 1
# print(sub_rf_preds)

In [None]:
# import numpy as np

# X_train_reshaped = np.expand_dims(X_train, axis=-1)
# X_test_reshaped = np.expand_dims(X_test, axis=-1)

# print(X_train_reshaped.shape)
# print(X_test_reshaped.shape)

In [None]:
# y_train = np.array(y_train)
# y_test = np.array(y_test)

In [None]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, Bidirectional, LSTM, GlobalMaxPooling1D
# from tensorflow.keras.regularizers import l2

# tf.keras.backend.clear_session()

# nn_model = Sequential([
#     Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train_reshaped.shape[1], 1)),
#     Dropout(0.4),
#     LSTM(32, return_sequences=True, kernel_regularizer=l2(0.01)),
#     Bidirectional(LSTM(32, return_sequences=True, kernel_regularizer=l2(0.01))),
#     GlobalMaxPooling1D(),    
#     Flatten(),
#     Dense(64, activation='relu'),
#     Dropout(0.4),        
#     Dense(6, activation='softmax')
# ])

# nn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# nn_model.summary()

In [None]:
# nn_model.fit(X_train_reshaped, y_train, epochs=15, batch_size=64, validation_data=(X_test_reshaped, y_test))

# nn_preds = nn_model.predict(X_test_reshaped)
# nn_preds_classes = nn_preds.argmax(axis=1) + 1
# nn_accuracy = accuracy_score(y_test, nn_preds_classes)
# print(f"Neural Network Accuracy: {nn_accuracy}")

In [None]:
# sub_X_reshaped = np.expand_dims(X_sub, axis=-1)

In [None]:
# sub_nn_preds = nn_model.predict(sub_X_reshaped)
# sub_nn_pred_classes = sub_nn_preds.argmax(axis=1) + 1
# print(sub_nn_pred_classes)

In [None]:
# from scipy.stats import mode

# combined_preds = np.array([sub_xgb_preds, sub_rf_preds, sub_nn_pred_classes])
# final_preds = mode(combined_preds, axis=0)[0].flatten()

# final_preds

In [None]:
# df_sub_fin = pd.DataFrame()
# df_sub_fin['essay_id'] = df_sub['essay_id']
# df_sub_fin

In [None]:
# df_sub_fin['score'] = final_preds.transpose()
# df_sub_fin['score'] = df_sub_fin['score'].astype('int')
# df_sub_fin

In [None]:
# df_sub_fin.to_csv('submission.csv', header=True, index=False)

In [None]:
import gc

# Second forced garbage-collection pass (same statement as the earlier gc cell).
gc.collect()

In [None]:
# len(df_train)

In [None]:
# df_aug = df_train.copy(deep=True)
# df_aug['full_text'] = np.NaN

In [None]:
# print(df_aug)

In [None]:
# import pickle

# with open('/kaggle/input/similarwordsdict/similar_words_dict.pkl', 'rb') as fp:
#     similar_words_dict = pickle.load(fp)
#     print('similar_words_dict loaded')    

In [None]:
# from multiprocessing import Pool

# def augment_text_with_glove(text):
#     augmented_text = []
#     for word in text.split():
#         if word in similar_words_dict:
#             augmented_text.append(similar_words_dict[word])
#         else:
#             augmented_text.append(word)
#     return ' '.join(augmented_text)

# def parallel_augment_texts(texts, num_workers=4):
#     with Pool(num_workers) as pool:
#         augmented_texts = list(pool.imap(augment_text_with_glove, texts))
#     return augmented_texts

# print("Starting data augmentation now\n")
# aug_start = datetime.now()
# print("Augmentation start time is: ", aug_start)

# df_aug['full_text'] = parallel_augment_texts(df_train['full_text'].tolist())

# aug_end = datetime.now()
# print("\nAugmentation end time is: ", aug_end)
# print("\nData augmentation complete")

In [None]:
# df_train_fin = pd.concat([df_train, df_aug], axis=0)
# len(df_train_fin)

In [None]:
# df_train_fin['score'] = df_train_fin['score'] - 1

In [None]:
# df_train_fin['full_text'].map(len).mean()

In [None]:
# from sklearn.model_selection import StratifiedShuffleSplit as sss
# SEED = 42
# splits = sss(n_splits=1, test_size=0.1, random_state=SEED)

# for train_index, test_index in splits.split(df_train_fin, df_train_fin['score']):
#     train_set = df_train_fin.iloc[train_index]
#     test_set = df_train_fin.iloc[test_index]

# print("Train set indices:", train_set.index)
# print("Test set indices:", test_set.index)

In [None]:
# import spacy
# # Load the English tokenizer, tagger, parser, NER, and word vectors
# nlp = spacy.load("en_core_web_sm")

# # Function to apply lemmatization
# def lemmatize_text(text):
#     doc = nlp(text)
#     return " ".join([token.lemma_ for token in doc])

# from concurrent.futures import ProcessPoolExecutor

# def process_chunk(chunk):
#     return chunk.apply(lemmatize_text)

# def parallel_lemmatize(data, num_processes):
#     chunks = np.array_split(data, num_processes)
#     with ProcessPoolExecutor(max_workers=num_processes) as executor:
#         results = list(executor.map(process_chunk, chunks))
#     return pd.concat(results, ignore_index=True)

# print("\nApplying lemmatization now")
# lem_start = datetime.now()
# print("\nLemmatization start time is: ", lem_start)

# num_processes = 8
# train_set['full_text'] = parallel_lemmatize(train_set['full_text'], num_processes)

# lem_end = datetime.now()
# print("\nLemmatization end time is: ", lem_end)
# print("\nData lemmatization complete")

In [None]:
# texts = train_set['full_text'].tolist()
# labels = train_set['score'].tolist()

In [None]:
# val_texts = test_set['full_text'].tolist()
# val_labels = test_set['score'].tolist()

In [None]:
# from tokenizers import Tokenizer, models, pre_tokenizers, trainers, normalizers
# tokenizer = Tokenizer(models.BPE())
# tokenizer.normalizer = normalizers.Sequence([normalizers.NFKC(), normalizers.Lowercase()])
# tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
# trainer = trainers.BpeTrainer(vocab_size=400000, special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
# tokenizer.train_from_iterator(texts, trainer)
# tokenizer.save("tokenizer.json")

In [None]:
# tokenizer = Tokenizer.from_file("tokenizer.json")
# tokenized_texts = [tokenizer.encode(text).ids for text in texts]

In [None]:
# val_tokenized_texts = [tokenizer.encode(val_text).ids for val_text in val_texts]

In [None]:
# max_length = 3000
# padded_texts = pad_sequences(tokenized_texts, maxlen=max_length, padding='post')

# val_padded_texts = pad_sequences(val_tokenized_texts, maxlen=max_length, padding='post')

In [None]:
# from imblearn.over_sampling import SMOTE

# smote = SMOTE(random_state=SEED)
# padded_sequences_res, labels_res = smote.fit_resample(padded_texts, labels)

In [None]:
# print("Original class distribution:\n", pd.Series(labels).value_counts())
# print("Resampled class distribution:\n", pd.Series(labels_res).value_counts())

In [None]:
# dataset = tf.data.Dataset.from_tensor_slices((padded_sequences_res, labels_res))
# dataset = dataset.shuffle(len(texts), seed=SEED).batch(128)

# val_dataset = tf.data.Dataset.from_tensor_slices((val_padded_texts, val_labels))
# val_dataset = val_dataset.batch(128)

In [None]:
# tf.keras.backend.clear_session()

# model = tf.keras.Sequential([
#     tf.keras.layers.Input(shape=(max_length,)),
#     tf.keras.layers.Embedding(input_dim=400000, output_dim=128),
#     tf.keras.layers.Conv1D(128, 5, activation='relu'),
#     tf.keras.layers.Dropout(0.4),
#     tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
#     tf.keras.layers.Dropout(0.4),
#     tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
#     tf.keras.layers.Dropout(0.4),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True, kernel_regularizer=l2(0.01))),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True, kernel_regularizer=l2(0.01))),
#     tf.keras.layers.GlobalMaxPooling1D(),
# #     Attention(),
#     tf.keras.layers.Dense(6, activation='softmax')                      
# ])

# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# model.summary()

In [None]:
# tf.keras.backend.clear_session()

# history = model.fit(dataset, epochs=40, validation_data=val_dataset)

In [None]:
# import matplotlib.pyplot as plt

# # Plot utility
# def plot_graphs(history, string):
#   plt.plot(history.history[string])
#   plt.plot(history.history['val_'+string])
#   plt.xlabel("Epochs")
#   plt.ylabel(string)
#   plt.legend([string, 'val_'+string])
#   plt.show()

# # Plot the accuracy and results 
# plot_graphs(history, "accuracy")
# plot_graphs(history, "loss")

In [None]:
# df_test = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')
# df_test.head()

In [None]:
# pred_texts = df_test['full_text'].tolist()
# tokenized_pred_texts = [tokenizer.encode(pred_text).ids for pred_text in pred_texts]
# padded_pred_texts = pad_sequences(tokenized_pred_texts, maxlen=max_length, padding='post')
# numpy_pred_texts = np.array(padded_pred_texts)

In [None]:
# preds = model.predict(numpy_pred_texts)
# preds

In [None]:
# max_preds = np.argmax(preds, axis=1)
# res_lst = max_preds + 1
# res_lst

In [None]:
# df_sub = pd.DataFrame()
# df_sub['essay_id'] = df_test['essay_id']
# df_sub

In [None]:
# df_sub['score'] = res_lst.transpose()
# df_sub['score'] = df_sub['score'].astype('int')
# df_sub

In [None]:
# df_sub.to_csv('submission.csv', header=True, index=False)