In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from datetime import datetime

In [None]:
class QueryFreqDataset(Dataset):
    """Dataset of (encoded query, occurrence count) pairs.

    Each distinct value of ``df["query_orig"]`` (after casting to ``str``)
    becomes one sample; its target is the number of rows in ``df`` that carry
    that value.

    Args:
        df: DataFrame with a ``query_orig`` column.
        char_to_idx: Mapping from character to integer id. It must contain the
            ``"<unk>"`` key (used for characters missing from the mapping) and
            the ``"<pad>"`` key (used to right-pad short queries).
        max_len: Fixed length of every encoded sequence.
    """

    def __init__(self, df, char_to_idx, max_len=100):
        self.char_to_idx = char_to_idx
        self.max_len = max_len
        # Counter keeps first-seen order, so sample order follows the frame.
        query_counts = Counter(df["query_orig"].astype(str))
        self.items = [(text, n_rows) for text, n_rows in query_counts.items()]

    def __len__(self):
        """Number of distinct queries."""
        return len(self.items)

    def __getitem__(self, idx):
        """Return ``(ids, target)`` for the ``idx``-th distinct query.

        ``ids`` is a ``torch.long`` tensor of length ``max_len``; ``target`` is
        a one-element ``torch.float`` tensor holding the query's count.
        """
        text, n_rows = self.items[idx]

        # Map characters to ids, falling back to the <unk> id.
        encoded = [
            self.char_to_idx.get(symbol, self.char_to_idx["<unk>"])
            for symbol in text
        ]

        # Force the sequence to exactly max_len: cut long ones, right-pad short ones.
        encoded = encoded[:self.max_len]
        shortfall = self.max_len - len(encoded)
        if shortfall > 0:
            encoded = encoded + [self.char_to_idx["<pad>"]] * shortfall

        ids = torch.tensor(encoded, dtype=torch.long)
        target = torch.tensor([n_rows], dtype=torch.float)
        return ids, target

In [None]:
class CharLSTMRegressor(nn.Module):
    """Character-level LSTM that regresses one scalar per input sequence.

    Pipeline: embedding -> single-layer LSTM -> linear head applied to the
    LSTM's final hidden state.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Embedding dimension.
        hidden_size: LSTM hidden dimension.
        pad_idx: Id of the padding token. Its embedding is kept at zero and
            trailing padding is excluded from the recurrence. Pass ``None`` to
            disable padding handling entirely.
    """

    def __init__(self, vocab_size, emb_size=32, hidden_size=128, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=pad_idx)
        self.lstm = nn.LSTM(emb_size, hidden_size, batch_first=True)
        self.lin = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Predict one value per sequence.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Float tensor of shape ``(batch, 1)``.
        """
        emb = self.embedding(x)

        if self.pad_idx is None:
            # No padding token defined: consume the sequences as they are.
            _, (h_n, _) = self.lstm(emb)
            return self.lin(h_n[-1])

        # Without packing, the LSTM keeps stepping through the padding and the
        # "final" hidden state is whatever is left after all those pad steps,
        # so the prediction would depend on how much padding was appended.
        # Effective length = position of the last non-pad token; rows that are
        # entirely padding are clamped to length 1 because packing rejects 0.
        not_pad = x != self.pad_idx
        positions = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = (not_pad * positions).max(dim=1).values.clamp(min=1)

        packed = nn.utils.rnn.pack_padded_sequence(
            emb,
            lengths.cpu(),  # pack_padded_sequence expects lengths on the CPU
            batch_first=True,
            enforce_sorted=False,
        )
        _, (h_n, _) = self.lstm(packed)
        # h_n is (num_layers, batch, hidden); take the last (only) layer.
        return self.lin(h_n[-1])

In [None]:
# Read the train / validation / test splits from CSV files in the working directory.
train_df, val_df, test_df = map(pd.read_csv, ("train.csv", "val.csv", "test.csv"))

In [None]:
# Character vocabulary: every distinct character that occurs in the training
# or validation queries (each query is cast to str first).
chars = {
    ch
    for query in pd.concat([train_df["query_orig"], val_df["query_orig"]])
    for ch in str(query)
}

# Ids 0 and 1 are reserved for the padding and unknown-character tokens; real
# characters are numbered from 2 in sorted order.
char_to_idx = {"<pad>": 0, "<unk>": 1}
char_to_idx.update((ch, idx) for idx, ch in enumerate(sorted(chars), start=2))

In [None]:
# Wrap each split in a dataset that shares the same character mapping
# (max_len is left at the class default).
train_ds = QueryFreqDataset(df=train_df, char_to_idx=char_to_idx)
val_ds = QueryFreqDataset(df=val_df, char_to_idx=char_to_idx)

# Batches of 64; only the training loader shuffles its samples.
train_loader = DataLoader(dataset=train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=64)

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Embedding table is sized to the vocabulary; other hyper-parameters use the
# class defaults.
model = CharLSTMRegressor(vocab_size=len(char_to_idx))
model = model.to(device)

# Mean-squared-error objective optimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Per-epoch bookkeeping filled in by the training loop:
# two independent loss histories and a mapping of epoch -> model weights.
train_loss_, val_loss_ = [], []
model_weights_ = dict()

In [None]:
# Train for 50 epochs. After each epoch: evaluate on the validation loader,
# record both summed losses, snapshot the weights and print a summary line.
for epoch in range(50):
    model.train()
    total_loss = 0

    init_time = datetime.now()
    for seqs, counts in train_loader:
        seqs, counts = seqs.to(device), counts.to(device)
        preds = model(seqs)
        loss = criterion(preds, counts)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so the
        # running total is a sum over samples (the last batch may be smaller).
        total_loss += loss.item() * seqs.size(0)

    # validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for seqs, counts in val_loader:
            seqs, counts = seqs.to(device), counts.to(device)
            val_loss += criterion(model(seqs), counts).item() * seqs.size(0)

    final_time = datetime.now()

    # Stored values are per-epoch sums; divide by the dataset size to get MSE.
    train_loss_.append(total_loss)
    val_loss_.append(val_loss)

    # state_dict() hands back tensors that share storage with the live
    # parameters, so storing it directly would make every epoch's entry track
    # the latest weights. Keep an independent copy instead; moving it to the
    # CPU also keeps snapshots out of GPU memory.
    model_weights_[epoch] = {
        name: tensor.detach().cpu().clone()
        for name, tensor in model.state_dict().items()
    }

    print(f"Epoch {epoch}: Train MSE={total_loss / len(train_ds):.4f}, Val MSE={val_loss / len(val_ds):.4f}, Time={final_time - init_time}")

KeyboardInterrupt: 

In [None]:
import pickle

# Serialise the epoch -> weights mapping to disk in binary pickle format.
with open("model_weights.pkl", "wb") as weights_file:
    pickle.Pickler(weights_file).dump(model_weights_)

In [None]:
import matplotlib.pyplot as plt

# train_loss_ / val_loss_ hold per-epoch *summed* squared error. Divide by the
# dataset sizes so the curves show the same per-sample MSE that the training
# loop prints, and so the two splits are comparable on one axis.
train_mse = [total / len(train_ds) for total in train_loss_]
val_mse = [total / len(val_ds) for total in val_loss_]

fig, ax = plt.subplots()
ax.plot(train_mse, label="train")
ax.plot(val_mse, label="validation")
ax.set(title="Loss per epoch", xlabel="Epoch", ylabel="MSE")
ax.legend()
plt.show()