In [286]:
import pandas as pd
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

In [289]:

df = pd.read_csv('dataset.csv')

In [291]:
le = LabelEncoder()
df['y']=le.fit_transform(df['y'])

In [293]:

tokenizer = get_tokenizer('basic_english')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalpha() and token not in stop_words]
    return tokens[::-1]

df['x'] = df['x'].apply(preprocess_text)

Unnamed: 0,x,y
0,"[day, every, happened, but]",1
1,"[good, rat, water, better, fitted, would, cani...",1
2,"[thing, lot, fix, trying, least, right]",1
3,"[job, use, simply, could, wished, she]",1
4,"[said, government, told, go, patrick, insisted...",1
...,...,...
12939,"[stay, want, wife, baby, new, find, imagined, ...",2
12940,"[father, say, back, come, reason, give, good, ...",2
12941,"[i, sure, i, nowhere, out, consistently, writi...",2
12942,"[easy, live, place, new, find, decided, they]",2


In [294]:
def yield_tokens(data_iter):
    for text in data_iter:
        yield text

vocab = build_vocab_from_iterator(yield_tokens(df['x']), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [295]:
def encode_text(text):
    return [vocab[token] for token in text]
df['x'] = df['x'].apply(encode_text)
print(df['x'].head())

0                     [17, 182, 380, 169]
1    [10, 3076, 51, 95, 2007, 8, 1948, 1]
2            [14, 44, 3696, 181, 253, 74]
3               [96, 89, 255, 11, 505, 5]
4      [35, 239, 163, 30, 1434, 980, 169]
Name: x, dtype: object


In [296]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    text_list, label_list = [], []
    for (_text, _label) in batch:
        text_list.append(torch.tensor(_text, dtype=torch.int64))
        label_list.append(torch.tensor(_label, dtype=torch.int64))
    text_padded = pad_sequence(text_list, batch_first=True, padding_value=0)
    label_tensor = torch.tensor(label_list, dtype=torch.int64)
    return text_padded, label_tensor

In [297]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)

In [298]:
from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['x']
        label = self.dataframe.iloc[idx]['y']
        return text, label

train_dataset = TextDataset(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
test_dataset = TextDataset(test_data)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)

In [299]:
import torch
import torch.nn as nn

class TextClassifierLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim,num_layers=2,def_dropout=0.5):
        super(TextClassifierLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True,bidirectional=True,num_layers=num_layers, dropout=def_dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        out = self.fc(hidden[-1])
        return out

In [300]:
for i in train_data:
    print(i,type(i))

x <class 'str'>
y <class 'str'>


In [302]:
model = TextClassifierLSTM(vocab_size=len(vocab), embed_dim=32, hidden_dim=64, output_dim=5)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

correct_train = 0
total_train = 0

for epoch in range(100):
    model.train()
    for texts, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    train_accuracy = 100 * correct_train / total_train
    print(f'Epoch [{epoch+1}/5], Training Accuracy: {train_accuracy:.2f}%')

Epoch [1/5], Training Accuracy: 24.84%
Epoch [2/5], Training Accuracy: 26.85%
Epoch [3/5], Training Accuracy: 28.45%
Epoch [4/5], Training Accuracy: 30.12%
Epoch [5/5], Training Accuracy: 31.52%
Epoch [6/5], Training Accuracy: 33.10%
Epoch [7/5], Training Accuracy: 34.69%
Epoch [8/5], Training Accuracy: 36.25%
Epoch [9/5], Training Accuracy: 37.86%
Epoch [10/5], Training Accuracy: 39.41%
Epoch [11/5], Training Accuracy: 40.98%
Epoch [12/5], Training Accuracy: 42.48%
Epoch [13/5], Training Accuracy: 43.95%
Epoch [14/5], Training Accuracy: 45.39%
Epoch [15/5], Training Accuracy: 46.76%
Epoch [16/5], Training Accuracy: 48.06%
Epoch [17/5], Training Accuracy: 49.34%
Epoch [18/5], Training Accuracy: 50.60%
Epoch [19/5], Training Accuracy: 51.73%
Epoch [20/5], Training Accuracy: 52.86%
Epoch [21/5], Training Accuracy: 53.93%
Epoch [22/5], Training Accuracy: 54.97%
Epoch [23/5], Training Accuracy: 55.98%
Epoch [24/5], Training Accuracy: 56.96%
Epoch [25/5], Training Accuracy: 57.87%
Epoch [26

In [None]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for texts, labels in test_dataloader:
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(correct,total)
accuracy = 100 * correct / total
print(f'Validation Accuracy: {accuracy}%')

569 2589
Validation Accuracy: 21.97759752800309%
