# LSTM Model

##Install the necessary libraries

In [1]:
pip install tensorflow keras-tuner pandas scikit-learn matplotlib

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


## Import all the libraries

In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras_tuner import RandomSearch
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Loading Data

In [4]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Combine for consistent preprocessing
full_texts = pd.concat([train_data['full_text'], test_data['full_text']], axis=0)

# Tokenize text
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(full_texts)
train_sequences = tokenizer.texts_to_sequences(train_data['full_text'])
test_sequences = tokenizer.texts_to_sequences(test_data['full_text'])

# Pad sequences
max_length = 500
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

## Building Model

In [5]:
def build_model(hp):
    model = Sequential([
        Embedding(input_dim=5000, output_dim=64, input_length=max_length),
        LSTM(hp.Int('units', min_value=64, max_value=256, step=32), return_sequences=True),
        Dropout(0.5),
        LSTM(hp.Int('units', min_value=64, max_value=256, step=32)),
        Dense(1, activation='linear')
    ])
    model.compile(
        loss='mean_squared_error',
        optimizer=Adam(hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')),
        metrics=['mean_squared_error']
    )
    return model

## Loading training and testing data

In [6]:
X_train, X_val, y_train, y_val = train_test_split(train_padded, train_data['score'], test_size=0.2, random_state=42)

## Settng up the hyperparameter tuner

In [7]:
tuner = RandomSearch(
    build_model,
    objective='val_mean_squared_error',
    max_trials=5,
    executions_per_trial=2,
    directory='model_tuning',
    project_name='EssayScoring'
)

## Callbacks

In [8]:
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, save_weights_only=True)

## Training

In [9]:
tuner.search(X_train, y_train, epochs=30, validation_data=(X_val, y_val), callbacks=[early_stopping])

Trial 5 Complete [00h 15m 28s]
val_mean_squared_error: 0.405088409781456

Best val_mean_squared_error So Far: 0.3908444941043854
Total elapsed time: 01h 21m 40s


## Getting the best model

In [10]:
best_model = tuner.get_best_models(num_models=1)[0]

## Evaluate the best model

In [11]:
best_model.load_weights('best_model.h5')
loss, mse = best_model.evaluate(X_val, y_val)
print(f'Validation MSE: {mse}')

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = 'best_model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

## Predict Test Score

In [None]:
predicted_scores = best_model.predict(test_padded).flatten()
predicted_scores = np.round(predicted_scores).astype(int)

# Save predictions
submission = pd.DataFrame({'essay_id': test_data['essay_id'], 'score': predicted_scores})
submission.to_csv('submission.csv', index=False)

# Plot training history (from the last executed run)
history = tuner.oracle.get_best_trials(num_trials=1)[0].score
plt.figure(figsize=(10, 5))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()