In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torchtext.legacy import data
from torchtext.legacy import datasets

# Load the dataset
url = "https://zenodo.org/record/3609356/files/crowdsourced.csv?download=1"
df = pd.read_csv(url)

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# The TabularDataset built from these files assigns its fields to CSV columns
# by position ("text" first, "verdict" second). Writing every column of the
# raw dataset would therefore feed the wrong columns to the model, so export
# only those two, in that order. Names are matched case-insensitively.
_columns_by_name = {str(column).lower(): column for column in df.columns}
_missing_columns = [name for name in ("text", "verdict") if name not in _columns_by_name]
if _missing_columns:
    raise KeyError(
        f"dataset is missing expected column(s) {_missing_columns}; "
        f"available columns: {list(df.columns)}"
    )
export_columns = [_columns_by_name["text"], _columns_by_name["verdict"]]

# Save the splits as CSV files (header row included, no index column)
train_df.to_csv("train.csv", index=False, columns=export_columns)
test_df.to_csv("test.csv", index=False, columns=export_columns)

# Define Fields
# Name the spaCy model explicitly: the default "en" shortcut was removed in
# spaCy 3 and torchtext only recovers from it through a warning fallback.
# include_lengths=True makes each batch's text a (tensor, lengths) pair, which
# the model needs for packing padded sequences.
TEXT = data.Field(tokenize="spacy", tokenizer_language="en_core_web_sm", include_lengths=True)
LABEL = data.LabelField(dtype=torch.float)

# Define custom dataset using TabularDataset
# NOTE: when `fields` is a list, entries are matched to CSV columns by
# position, so train.csv/test.csv must hold the text column first and the
# verdict column second.
datafields = [("text", TEXT), ("verdict", LABEL)]
train_data, test_data = data.TabularDataset.splits(
    path="",
    train="train.csv",
    test="test.csv",
    format="csv",
    skip_header=True,  # the CSV files are written with a header row
    fields=datafields,
)

# Build vocabulary using pre-trained word embeddings
MAX_VOCAB_SIZE = 25_000

# Only the training split is used, so no test-set tokens leak into the vocab.
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",  # 100-dimensional GloVe vectors
    unk_init=torch.Tensor.normal_,  # initializer for tokens without a GloVe vector
)
LABEL.build_vocab(train_data)

# Create iterators
BATCH_SIZE = 64

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _text_length(example):
    """Return the number of tokens in the example's text field."""
    return len(example.text)


# Sorting inside each batch by text length is what allows the model to pack
# the padded sequences.
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=_text_length,
    device=device,
)

# Define the LSTM model
class LSTMClassifier(nn.Module):
    """LSTM text classifier: embedding -> (multi-layer) LSTM -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state per direction.
        output_dim: Number of output units of the final linear layer.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability applied to the embeddings, between LSTM
            layers, and to the final hidden state.
    """

    def __init__(
        self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout has no effect with a single layer and makes
            # PyTorch emit a warning, so disable it in that case.
            dropout=dropout if n_layers > 1 else 0.0,
        )
        # The classifier head sees one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Compute the output scores for a padded batch.

        Args:
            text: Token-index tensor laid out as (seq_len, batch), since the
                LSTM and the packing call both use ``batch_first=False``.
            text_lengths: Unpadded length of each sequence in the batch; must
                be in descending order because ``pack_padded_sequence`` is
                called with its default ``enforce_sorted=True``.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        embedded = self.dropout(self.embedding(text))
        # pack_padded_sequence requires the lengths to live on the CPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to("cpu"))
        _, (hidden, _) = self.rnn(packed_embedded)
        if self.rnn.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: only the top layer's final state is relevant.
            final_hidden = hidden[-1, :, :]
        return self.fc(self.dropout(final_hidden))

# Instantiate the LSTMClassifier model
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100  # matches the 100-d "glove.6B.100d" vectors
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model = LSTMClassifier(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

# Initialize the model's embedding layer with pre-trained embeddings
# TEXT.vocab.vectors holds one pre-trained vector per vocabulary entry (loaded
# by TEXT.build_vocab); copy them in place so the embedding layer starts from
# those vectors instead of its random initialization.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)


ModuleNotFoundError: No module named 'torch'