In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import json

# Set path (based on your screenshot)
base_path = '/kaggle/input/sequence-recognition-using-rnn-lstm-gru/'

# Read JSON file
# Note: If JSON is one object per line, add lines=True
try:
    # Attempt to read the JSON file using pandas
    train_df = pd.read_json(f'{base_path}train.json')
    print("Train Data Head:")
    print(train_df.head())
except:
    # If the above fails, it might be standard JSON format, so use json package directly
    with open(f'{base_path}train.json', 'r') as f:
        data = json.load(f)
    # Print keys of the first element if it's a list, otherwise print keys of the dictionary
    print("Data keys:", data[0].keys() if isinstance(data, list) else data.keys())

Train Data Head:
                                           sequences      labels  num_samples  \
0  [141, 35, 28, 17, 15, 3, 83, 71, 1, 83, 302, 1...  Category_D         4000   
1  [51, 35, 28, 17, 15, 3, 180, 28, 17, 15, 3, 18...  Category_A         4000   
2  [141, 35, 28, 17, 15, 3, 69, 85, 75, 1, 73, 85...  Category_E         4000   
3  [50, 69, 85, 75, 51, 35, 28, 17, 15, 3, 180, 2...  Category_E         4000   
4  [50, 200, 225, 158, 51, 35, 28, 17, 15, 3, 201...  Category_C         4000   

   vocab_size  
0       10000  
1       10000  
2       10000  
3       10000  
4       10000  


In [6]:
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
import torch

# 1. Label Encoding
le = LabelEncoder()
# Transform labels to numerical indices
train_df['label_idx'] = le.fit_transform(train_df['labels'])
num_classes = len(le.classes_)

# 2. Convert Sequences to Tensor and perform Padding
# When sequence lengths vary, pad with 0 to make every data entry the same length
sequences = [torch.tensor(s) for s in train_df['sequences']]
padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)

print(f"Padded shape: {padded_sequences.shape}") # (Number of samples, Max length)

Padded shape: torch.Size([4000, 10169])


In [7]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super(LSTMClassifier, self).__init__()
        # Embedding layer: Convert 10000 IDs to continuous vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True, bidirectional=True)
        
        # Classifier: Since it's bidirectional LSTM, hidden_size is multiplied by 2
        self.fc = nn.Linear(hidden_size * 2, num_classes)
        
    def forward(self, x):
        # x shape: (Batch, Seq_Len)
        x = self.embedding(x)  # Convert to (Batch, Seq_Len, Embedding_Dim)
        
        # out contains outputs for all time steps, (h_n, c_n) is the state at the last time step
        out, (h_n, c_n) = self.lstm(x)
        
        # Concatenate the last hidden state of both directions
        cat_hidden = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
        
        return self.fc(cat_hidden)

# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMClassifier(vocab_size=10000, embedding_dim=128, hidden_size=64, num_classes=num_classes).to(device)

In [8]:
import torch.optim as optim

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Simple training demonstration
def train_one_epoch(loader):
    model.train()
    for seqs, targets in loader:
        seqs, targets = seqs.to(device), targets.to(device)
        
        outputs = model(seqs)
        loss = criterion(outputs, targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

# --- Parameter Settings ---
MAX_LEN = 128      # Limit sequence length to avoid memory explosion
BATCH_SIZE = 64
EMBED_DIM = 128
HIDDEN_DIM = 256
EPOCHS = 10
LR = 1e-3
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- 1. Data Processing ---
# Read data
base_path = '/kaggle/input/sequence-recognition-using-rnn-lstm-gru/'
train_df = pd.read_json(f'{base_path}train.json')
test_df = pd.read_json(f'{base_path}test.json')

# Label Encoding (Category_A -> 0)
le = LabelEncoder()
train_df['label_idx'] = le.fit_transform(train_df['labels'])
num_classes = len(le.classes_)
vocab_size = 10000 + 1 # Considering 0 as Padding

class SeqDataset(Dataset):
    def __init__(self, sequences, labels=None):
        # Limit length and convert to Tensor
        self.sequences = [torch.tensor(s[:MAX_LEN]) for s in sequences]
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.sequences[idx], self.labels[idx]
        return self.sequences[idx]

# Custom collate_fn for dynamic Padding
def collate_fn(batch):
    if isinstance(batch[0], tuple): # Training mode
        seqs, labels = zip(*batch)
        seqs_padded = pad_sequence(seqs, batch_first=True, padding_value=0)
        return seqs_padded, torch.tensor(labels)
    else: # Test mode
        seqs_padded = pad_sequence(batch, batch_first=True, padding_value=0)
        return seqs_padded

train_loader = DataLoader(SeqDataset(train_df['sequences'], train_df['label_idx']), 
                          batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# --- 2. Model Definition (Using Bidirectional GRU, more efficient than LSTM) ---

class GRUClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(GRUClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        
    def forward(self, x):
        embedded = self.embedding(x)
        # _ is the hidden state of the last item
        _, hn = self.gru(embedded) 
        # Get the last hidden state of both directions and concatenate
        cat_hn = torch.cat((hn[-2,:,:], hn[-1,:,:]), dim=1)
        return self.fc(cat_hn)

model = GRUClassifier(vocab_size, EMBED_DIM, HIDDEN_DIM, num_classes).to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

# --- 3. Training Loop ---
print("Starting training...")
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)
        
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss/len(train_loader):.4f}")

# --- 4. Prediction and Submission Generation ---
model.eval()
test_loader = DataLoader(SeqDataset(test_df['sequences']), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
predictions = []

with torch.no_grad():
    for x_batch in test_loader:
        x_batch = x_batch.to(DEVICE)
        outputs = model(x_batch)
        _, preds = torch.max(outputs, 1)
        predictions.extend(preds.cpu().numpy())

# Convert back to original label strings
final_labels = le.inverse_transform(predictions)

# Correction: If test_df does not have 'id' column, use index as id
if 'id' in test_df.columns:
    test_ids = test_df['id']
else:
    # According to sample_submission.csv format, typically an index starting from 0 or 1
    test_ids = test_df.index 

submission = pd.DataFrame({
    'id': test_ids, 
    'labels': final_labels
})

submission.to_csv('submission.csv', index=False)
print("Submission saved! Go to Output area to download and upload!")

Starting training...
Epoch 1/10, Loss: 0.6233
Epoch 2/10, Loss: 0.0308
Epoch 3/10, Loss: 0.0061
Epoch 4/10, Loss: 0.0045
Epoch 5/10, Loss: 0.0034
Epoch 6/10, Loss: 0.0037
Epoch 7/10, Loss: 0.0034
Epoch 8/10, Loss: 0.0029
Epoch 9/10, Loss: 0.0025
Epoch 10/10, Loss: 0.0023
Submission saved! 趕快去 Output 區下載並上傳吧！
