In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from transformers import DistilBertTokenizer, DistilBertModel
import json
import pickle

# Fetch the NLTK resources this script relies on: the tokenizer data used by
# word_tokenize and the stop-word corpus read just below.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)
del _nltk_resource  # keep the module namespace free of the loop variable

# English stop words, held in a set so membership tests are constant-time.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise raw text into a space-separated string of content tokens.

    The text is lower-cased, every character listed in ``string.punctuation``
    is deleted, the remainder is split with ``word_tokenize``, and tokens that
    appear in the module-level ``stop_words`` set are dropped.

    Args:
        text: The raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)
    return ' '.join(
        token for token in word_tokenize(cleaned) if token not in stop_words
    )

class LSTMModel(nn.Module):
    """Embedding -> single-layer LSTM -> linear head on the last time step."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        """Create the three layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each token embedding (LSTM input size).
            hidden_dim: Size of the LSTM hidden state.
            output_dim: Size of the vector produced by the linear head.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map a batch of token-id sequences to one output vector each.

        Args:
            x: Integer tensor of token ids, batch dimension first.

        Returns:
            Tensor with one ``output_dim``-sized row per sequence.
        """
        hidden_states, _ = self.lstm(self.embedding(x))
        # Only the hidden state at the final sequence position feeds the head.
        # NOTE(review): for right-padded inputs this is the state reached
        # after the padding tokens have been consumed.
        final_step = hidden_states[:, -1]
        return self.fc(final_step)

def tokenize_and_pad(text, word_to_idx, max_length=512):
    """Encode ``text`` as a fixed-length tensor of vocabulary indices.

    Args:
        text: Raw string; it is run through ``preprocess_text`` first.
        word_to_idx: Mapping from token to integer index. Tokens missing from
            it are encoded as ``word_to_idx['<UNK>']``; padding uses
            ``word_to_idx['<PAD>']``.
        max_length: Exact length of the returned sequence.

    Returns:
        A 1-D ``torch.long`` tensor with ``max_length`` entries: the token ids,
        right-padded when the text is short and truncated when it is long.
    """
    ids = []
    for token in preprocess_text(text).split():
        ids.append(word_to_idx.get(token, word_to_idx['<UNK>']))

    shortfall = max_length - len(ids)
    if shortfall > 0:
        ids.extend([word_to_idx['<PAD>']] * shortfall)
    else:
        ids = ids[:max_length]
    return torch.tensor(ids, dtype=torch.long)

def train_model(lstm_model, distilbert_model, texts, word_to_idx, epochs=5, lr=0.001, targets=None):
    """Train ``lstm_model`` one text at a time with Adam and an MSE loss.

    Args:
        lstm_model: Module to optimise. It is called with a ``(1, max_length)``
            tensor of token ids produced by ``tokenize_and_pad``.
        distilbert_model: Accepted for interface compatibility; this function
            does not use it.
        texts: Raw training strings.
        word_to_idx: Token-to-index mapping forwarded to ``tokenize_and_pad``.
        epochs: Number of passes over ``texts``.
        lr: Learning rate handed to Adam.
        targets: Optional sequence aligned with ``texts``. ``targets[i]`` must
            contain exactly as many elements as the model output for one text;
            it is reshaped to the output's shape and used as the regression
            target. When omitted, the previous behaviour is kept: the output is
            compared with itself, which yields a loss of exactly 0 and a zero
            gradient, so the model does not learn. A ``RuntimeWarning`` is
            emitted in that case.

    Raises:
        ValueError: If ``targets`` is given and its length differs from the
            number of texts.
    """
    import warnings  # local import so the file's import block need not change

    texts = list(texts)  # allow any iterable and make repeated epochs safe
    if targets is not None and len(targets) != len(texts):
        raise ValueError(
            f'targets has {len(targets)} entries but texts has {len(texts)}'
        )
    if targets is None:
        warnings.warn(
            'train_model was called without targets: the MSE loss compares the '
            'model output with itself, so it is always 0 and the gradient is zero.',
            RuntimeWarning,
            stacklevel=2,
        )

    criterion = nn.MSELoss()
    optimizer = optim.Adam(lstm_model.parameters(), lr=lr)
    lstm_model.train()
    if not texts:
        # Nothing to iterate over; previously this path reached the print
        # below with ``loss`` unbound and raised UnboundLocalError.
        return

    for epoch in range(epochs):
        for index, text in enumerate(texts):
            sequence = tokenize_and_pad(text, word_to_idx).unsqueeze(0)
            optimizer.zero_grad()
            outputs = lstm_model(sequence)
            if targets is None:
                # NOTE(review): degenerate objective preserved for callers
                # that do not supply targets; see the warning above.
                loss = criterion(outputs, outputs)
            else:
                target = torch.as_tensor(
                    targets[index], dtype=outputs.dtype, device=outputs.device
                ).reshape(outputs.shape)
                loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
        # Reports the loss of the last text processed in this epoch.
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}')

def save_models(lstm_model, distilbert_model, lstm_path, distilbert_path):
    """Pickle the ``state_dict()`` of both models to their own files.

    Args:
        lstm_model: Object exposing ``state_dict()``; written to ``lstm_path``.
        distilbert_model: Object exposing ``state_dict()``; written to
            ``distilbert_path``.
        lstm_path: Destination file for the LSTM weights.
        distilbert_path: Destination file for the DistilBERT weights.
    """
    # LSTM first, then DistilBERT; each file is opened in binary write mode.
    jobs = (
        (lstm_model, lstm_path),
        (distilbert_model, distilbert_path),
    )
    for model, destination in jobs:
        with open(destination, 'wb') as sink:
            pickle.dump(model.state_dict(), sink)

def save_word_to_idx(word_to_idx, filepath):
    """Write the token-to-index mapping to ``filepath`` as JSON.

    Args:
        word_to_idx: JSON-serialisable mapping to store.
        filepath: Destination path; opened in text write mode.
    """
    with open(filepath, 'w') as sink:
        json.dump(word_to_idx, fp=sink)

def save_distilbert_tokenizer(tokenizer, filepath):
    """Pickle ``tokenizer`` to ``filepath``.

    Args:
        tokenizer: The object to serialise.
        filepath: Destination path; opened in binary write mode.
    """
    with open(filepath, 'wb') as sink:
        pickle.dump(tokenizer, sink)

# Prepare data and vocabulary
def _read_stripped_lines(path):
    """Return every line of ``path`` with surrounding whitespace removed."""
    # The context manager closes the file as soon as it has been read instead
    # of leaving the handle open until garbage collection.
    with open(path) as handle:
        return [line.strip() for line in handle]


all_texts = _read_stripped_lines('dataset.txt') + _read_stripped_lines('student_answer_low.txt')
all_tokens = [token for text in all_texts for token in preprocess_text(text).split()]
vocab = set(all_tokens)
# Indices 1..len(vocab) go to real tokens. Iterating in sorted order makes the
# assignment reproducible; the iteration order of a set of strings can differ
# from one interpreter run to the next.
word_to_idx = {word: idx for idx, word in enumerate(sorted(vocab), 1)}
word_to_idx['<PAD>'] = 0
# Evaluated after <PAD> was added, so <UNK> takes the next free index.
word_to_idx['<UNK>'] = len(word_to_idx)

# Save word_to_idx
save_word_to_idx(word_to_idx, 'word_to_idx.json')

# Prepare LSTM model
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
vocab_size = len(word_to_idx)
# HIDDEN_DIM is passed for both the hidden size and the output size.
lstm_model = LSTMModel(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, HIDDEN_DIM)

# Load DistilBERT model and tokenizer
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Save DistilBERT tokenizer
save_distilbert_tokenizer(distilbert_tokenizer, 'distilbert_tokenizer.pkl')

# Train and save models
# The training corpus is the same two files already loaded above, so reuse the
# in-memory lines rather than reading both files a second time.
train_texts = list(all_texts)
train_model(lstm_model, distilbert_model, train_texts, word_to_idx)
save_models(lstm_model, distilbert_model, 'lstm_model.pkl', 'distilbert_model.pkl')

print("Models, tokenizer, and word_to_idx saved successfully.")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to C:\Users\RICKY
[nltk_data]     DEY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\RICKY
[nltk_data]     DEY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\RICKY
[nltk_data]     DEY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/5, Loss: 0.0
Epoch 2/5, Loss: 0.0
Epoch 3/5, Loss: 0.0
Epoch 4/5, Loss: 0.0
Epoch 5/5, Loss: 0.0
Models, tokenizer, and word_to_idx saved successfully.
