In [28]:
import pandas as pd
import numpy as np

data = pd.read_csv('./winemag-data-130k-v2.csv')
data.drop(columns=['Unnamed: 0'], inplace=True)
y = data['points']
x = data['description']

y = np.array(y)



In [29]:
#import torch
import torch.nn as nn
import torch 
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, output_size, dropout=0.5):
        super(LSTMModel, self).__init__()
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
        # LSTM layer
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_size, output_size)
        
        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # x: [batch_size, sequence_length]
        
        # Embedding layer
        embed = self.embedding(x)  # [batch_size, sequence_length, embed_size]
        
        # LSTM layer
        lstm_out, (hidden, cell) = self.lstm(embed)  # lstm_out: [batch_size, sequence_length, hidden_size]
        
        # Use only the final hidden state for classification
        out = self.fc(self.dropout(hidden[-1]))  # hidden[-1]: [batch_size, hidden_size]
        
        return out


In [30]:
import torch
from torch.utils.data import DataLoader, Dataset
from collections import Counter
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt_tab')
# Example pandas Series (replace this with your Series)

from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
data = x_train

# 1. Tokenization
def tokenize_text(text):
    return word_tokenize(text.lower())  # Tokenize and lowercase

tokenized_data = data.apply(tokenize_text)

# 2. Build Vocabulary
def build_vocab(tokenized_data, max_vocab_size=5000):
    # Flatten the tokenized data and count word frequencies
    all_tokens = [token for doc in tokenized_data for token in doc]
    token_counts = Counter(all_tokens)
    most_common = token_counts.most_common(max_vocab_size - 2)  # Reserve spots for <PAD> and <UNK>
    
    # Create word-to-index mapping
    vocab = {word: idx + 2 for idx, (word, _) in enumerate(most_common)}
    vocab["<PAD>"] = 0
    vocab["<UNK>"] = 1
    return vocab

vocab = build_vocab(tokenized_data)
vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)

# 3. Encode Text and Pad
def encode_and_pad(text, vocab, max_len=300):
    # Map tokens to integers, use <UNK> (1) for unknown tokens
    encoded = [vocab.get(token, vocab["<UNK>"]) for token in text]
    # Pad or truncate to the desired length
    if len(encoded) < max_len:
        encoded += [vocab["<PAD>"]] * (max_len - len(encoded))
    else:
        encoded = encoded[:max_len]
    return encoded

encoded_data = tokenized_data.apply(lambda x: encode_and_pad(x, vocab))
encoded_data = np.array(encoded_data.tolist())  # Convert to NumPy array
print("Example encoded data:", encoded_data[0])

# 4. Create PyTorch Dataset and DataLoader
class TextDataset(Dataset):
    def __init__(self, encoded_data, labels=None):
        self.encoded_data = encoded_data
        self.labels = labels

    def __len__(self):
        return len(self.encoded_data)

    def __getitem__(self, idx):
        x = self.encoded_data[idx]
        if self.labels is not None:
            y = self.labels[idx]
            return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.float)
        return torch.tensor(x, dtype=torch.long)



[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/seanfuhrman/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Vocabulary size: 5000
Example encoded data: [ 149  140    4   31   75   36  796  583  113   28    4   79   13   14
    9  747   37  665    8 1477   35  537  565    3   67   23 2135    5
   18    3    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0   

In [31]:
labels = y_train
dataset = TextDataset(encoded_data, labels)
dataloader = DataLoader(dataset, batch_size=1024, shuffle=True)

embed_size = 128
hidden_size = 128
num_layers = 2
output_size = 1
dropout = 0.5
model = LSTMModel(vocab_size, embed_size, hidden_size, num_layers, output_size, dropout)


optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
epochs = 1
from tqdm import tqdm
for epoch in range(epochs):
    losses = []
    count = 0
    curr_loss = 0
    # Iterate through DataLoader
    for batch in tqdm(dataloader):
        inputs, targets = batch
        
        # Forward pass
        outputs = model(inputs)

        # Compute loss
        loss = criterion(outputs, targets)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Store loss
        losses.append(loss.item())
        curr_loss += loss.item()
        count += 1
    print(f"Epoch {epoch+1}, loss: {curr_loss/count}")

  0%|          | 0/102 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 102/102 [13:49<00:00,  8.13s/it]

Epoch 1, loss: 6011.7429294960175





In [32]:
# 5. Evaluate the Model
model.eval()

# Tokenize, encode, and pad the validation data

tokenized_val = x_val.apply(tokenize_text)
encoded_val = tokenized_val.apply(lambda x: encode_and_pad(x, vocab))
encoded_val = np.array(encoded_val.tolist())

# Create DataLoader
val_labels = y_val
val_dataset = TextDataset(encoded_val, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=1024)

# Evaluate the model
val_losses = []
count = 0
curr_loss = 0
val_mse = 0
val_abe = 0
for batch in val_dataloader:
    inputs, targets = batch
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    val_losses.append(loss.item())
    curr_loss += loss.item()
    count += 1
    val_mse += loss.item()
    val_abe += torch.abs(outputs-targets).mean().item()
print(f"Validation loss: {curr_loss/count}")
print(f"Validation MSE: {val_mse/count}")
print(f"Validation ABE: {val_abe/count}")

#Get Training MSE/ABE
training_MSE = 0
training_ABE = 0
count = 0
for batch in dataloader:
    inputs, targets = batch
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    val_losses.append(loss.item())
    curr_loss += loss.item()
    count += 1
    training_MSE += loss.item()
    training_ABE += torch.abs(outputs-targets).mean().item()

print(f"Training MSE: {training_MSE/count}")
print(f"Training ABE: {training_ABE/count}")

  return F.mse_loss(input, target, reduction=self.reduction)


Validation loss: 4868.346397986779
Validation MSE: 4868.346397986779
Validation ABE: 69.70632083599384
Training MSE: 4866.968386182598
Training ABE: 69.69766534543506
