<a href="https://colab.research.google.com/github/Taka-Lab-Python/Python/blob/main/IMDB_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab VM; the dataset paths used below
# live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
#!pip install japanize-matplotlib
#import japanize_matplotlib

In [None]:
import torch

# Environment check: report the installed PyTorch build and whether a CUDA
# device is visible to this runtime.
cuda_ok = torch.cuda.is_available()
print("torch:", torch.__version__)
print("cuda available:", cuda_ok)


torch: 2.9.0+cpu
cuda available: False


1. CSV 読み込み（教師ありデータ）
2. Tokenize（文字列 → 単語列）
3. Vocab 構築（単語 → ID）
4. 数値化 + Padding
5. LSTM による系列分類
6. Train.csv で学習し、Valid.csv と Test.csv で評価

In [None]:
# All three splits live in the same Drive folder; keep the folder in one place
# so only the file name differs per split.
DATA_DIR = "/content/drive/MyDrive/38_自然言語処理/映画レビュ/IMBD_Data"

path1 = DATA_DIR + "/Train.csv"  # training split
path2 = DATA_DIR + "/Test.csv"   # test split
path3 = DATA_DIR + "/Valid.csv"  # validation split

In [None]:
# Read each CSV split into a DataFrame and report its row count.

def _load_split(csv_path, count_label):
    """Read one CSV file with pandas, print `count_label` and its row count, return the frame."""
    frame = pd.read_csv(csv_path)
    print(count_label, len(frame))
    return frame


df1 = _load_split(path1, "Train-行数:")  # Train
df2 = _load_split(path2, "Test-行数:")   # Test
df3 = _load_split(path3, "Valid-行数:")  # Valid

# Row-wise concatenation of the three splits (original index labels are kept).
df_all = pd.concat([df1, df2, df3])
print("全体-行数:", len(df_all))

Train-行数: 40000
Test-行数: 5000
Valid-行数: 5000
全体-行数: 50000


In [None]:
# CSV（教師あり）→ 自前Vocab → LSTM

# tokenize(): raw text -> list of lower-cased word tokens
import re

# A token is a maximal run of ASCII letters and apostrophes.
TOKEN_RE = re.compile(r"[A-Za-z']+")


def tokenize(text: str):
    """Lower-case `text` and return every TOKEN_RE match, in order, as a list of strings.

    Digits, punctuation and whitespace act as separators and are dropped.
    Returns an empty list when nothing matches.
    """
    lowered = text.lower()
    return [match.group() for match in TOKEN_RE.finditer(lowered)]

# Sanity check: show the first 30 tokens of the first training review,
# then preview the top rows of the training frame.
first_review_tokens = tokenize(df1.iloc[0]["text"])
print(first_review_tokens[:30])
display(df1.head())

['i', 'grew', 'up', 'b', 'watching', 'and', 'loving', 'the', 'thunderbirds', 'all', 'my', 'mates', 'at', 'school', 'watched', 'we', 'played', 'thunderbirds', 'before', 'school', 'during', 'lunch', 'and', 'after', 'school', 'we', 'all', 'wanted', 'to', 'be']


Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [None]:
# 3) Vocab 構築

from collections import Counter

# NOTE: tokenize() / TOKEN_RE are defined once, in the tokenizer cell above.
# They are intentionally not redefined here: a second copy would silently
# shadow the first and the two could drift apart.

PAD = "<pad>"  # padding token, always placed at index 0 of the vocabulary
UNK = "<unk>"  # out-of-vocabulary token, always placed at index 1


def build_vocab(texts, max_vocab=30000, min_freq=2):
    """Build word<->id lookup tables from an iterable of raw texts.

    Every text is split with `tokenize` and token frequencies are counted.
    The `max_vocab` most frequent tokens are considered, and of those only
    tokens occurring at least `min_freq` times are kept.

    Args:
        texts: iterable of strings.
        max_vocab: maximum number of real words to keep (the two special
            tokens are added on top of this).
        min_freq: minimum occurrence count for a word to be kept.

    Returns:
        (stoi, itos): `itos` is the list [PAD, UNK, word, ...] ordered by
        descending frequency; `stoi` maps each entry of `itos` to its index.
    """
    counter = Counter()
    for t in texts:
        counter.update(tokenize(t))

    words = [w for w, c in counter.most_common(max_vocab) if c >= min_freq]
    itos = [PAD, UNK] + words
    stoi = {w: i for i, w in enumerate(itos)}
    return stoi, itos

# Build the vocabulary from the training split only, then look up the ids
# of the two special tokens.
stoi, itos = build_vocab(df1["text"])
pad_id, unk_id = stoi[PAD], stoi[UNK]

print("vocab size:", len(itos))
print(pad_id, unk_id, sep="\n")

vocab size: 30002
0
1


In [None]:
# 4) Numericalize: text -> list of vocabulary ids

def encode(text, max_len=200):
    """Convert a raw text into a list of vocabulary ids.

    The text is tokenized with `tokenize`, truncated to the first `max_len`
    tokens, and each token is looked up in `stoi`; tokens missing from the
    vocabulary map to `unk_id`.

    The result is never empty: if no token survives (empty text, or text
    made only of digits/punctuation) a single `unk_id` is returned. The model
    feeds sequence lengths to `pack_padded_sequence`, which rejects
    zero-length sequences, so an empty result would crash the whole batch.

    Args:
        text: raw string to encode.
        max_len: maximum number of tokens to keep.

    Returns:
        list[int] with at least one element.
    """
    tokens = tokenize(text)
    ids = [stoi.get(t, unk_id) for t in tokens[:max_len]]
    # Fall back to a lone <unk> so downstream lengths are always >= 1.
    return ids or [unk_id]


In [None]:
# 5) PyTorch Dataset DataFrameを変換する
# csv⇒DataFrame⇒　PyTorchに変換

import torch
from torch.utils.data import Dataset, DataLoader

class IMDBDataset(Dataset):
    """Map-style dataset over a DataFrame with "text" and "label" columns.

    Texts are kept as raw strings and encoded lazily, one sample at a time,
    in `__getitem__`.
    """

    def __init__(self, df, max_len=200):
        """Store the text/label columns of `df` and the truncation length.

        Args:
            df: DataFrame providing the "text" and "label" columns.
            max_len: maximum number of tokens kept per text (passed to `encode`).
        """
        self.texts = df["text"].values
        self.labels = df["label"].values
        self.max_len = max_len

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return `(ids, label)` for sample `idx` as tensors.

        `ids` is always an int64 tensor. The dtype is given explicitly because
        `torch.tensor([])` would otherwise be inferred as float32 for a text
        that encodes to no ids.
        """
        ids = encode(self.texts[idx], self.max_len)
        return torch.tensor(ids, dtype=torch.long), torch.tensor(self.labels[idx])

def collate_fn(batch):
    """Collate `(ids, label)` samples into padded batch tensors.

    Each sequence is right-padded with `pad_id` up to the longest sequence
    in the batch.

    Returns:
        (padded, lengths, labels), all int64:
        `padded` has one row per sample, `lengths` holds the unpadded length
        of each sample, `labels` holds the labels in batch order.
    """
    seqs, targets = zip(*batch)
    lengths = torch.tensor([len(seq) for seq in seqs])
    width = lengths.max().item()

    # Start from an all-<pad> matrix, then overwrite the leading slots of
    # each row (rows obtained by iterating a tensor are views into it).
    padded = torch.full((len(seqs), width), pad_id)
    for row, seq in zip(padded, seqs):
        row[:len(seq)] = seq

    return padded.long(), lengths.long(), torch.tensor(targets).long()


In [None]:
# Wrap each DataFrame split in an IMDBDataset and a DataLoader.
# train_loader / valid_loader / test_loader are what the model consumes.

batch_size_val = 64  # batch size shared by all three loaders


def _make_loader(frame, shuffle):
    """Build a DataLoader over `frame` using the shared batch size and collate_fn."""
    return DataLoader(IMDBDataset(frame),
                      batch_size=batch_size_val, shuffle=shuffle,
                      collate_fn=collate_fn)


train_loader = _make_loader(df1, shuffle=True)   # training split, reshuffled every epoch
valid_loader = _make_loader(df3, shuffle=False)  # validation split, fixed order
test_loader = _make_loader(df2, shuffle=False)   # test split, fixed order


In [None]:
# 6) LSTM 感情分類モデル
# x は shape (B, T)
# embedding は ID をベクトルに変換, emb は shape (B, T, E) のベクトル列
# h_n[-1] は 最終層の最終隠れ状態で、shape (B, H)
# B：バッチサイズ　T：タイムステップ　H：内部の次元数

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

# ------------- LSTM_Sentiment  --------
class LSTM_Sentiment(nn.Module):
    """Embedding -> LSTM -> linear head that emits 2 logits per sequence."""

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state.
        """
        super().__init__()
        # The row for `pad_id` is the padding embedding.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=pad_id
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim, hidden_size=hidden_dim, batch_first=True
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=2)

    def forward(self, x, lengths):
        """Classify a padded batch.

        Args:
            x: batch-first tensor of token ids.
            lengths: unpadded length of each row of `x`.

        Returns:
            Logits computed from the last layer's final hidden state.
        """
        embedded = self.embedding(x)
        # Packing lets the LSTM stop at each sequence's true length; the
        # lengths are moved to CPU before being handed to the packer.
        packed_input = pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (final_hidden, _) = self.lstm(packed_input)
        last_layer_state = final_hidden[-1]
        return self.fc(last_layer_state)

In [None]:
# 7) Training / evaluation setup (runs on the GPU when one is available)
import torch

device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# One embedding row per vocabulary entry.
model = LSTM_Sentiment(len(itos)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

#  --- run_epoch: one full pass over a DataLoader ----
def run_epoch(loader, train=True):
    """Run one pass over `loader` and return `(mean_loss, accuracy)`.

    Uses the module-level `model`, `criterion`, `optimizer` and `device`.

    Args:
        loader: iterable yielding `(x, lengths, y)` batches
            (token ids, sequence lengths, target labels).
        train: when True the model is put in training mode and the optimizer
            is stepped after every batch; when False the model is put in eval
            mode and the forward pass runs with autograd disabled, so no
            graph is built for validation/test batches.

    Returns:
        (total_loss / n_samples, n_correct / n_samples), where each batch
        loss is weighted by its batch size.

    Raises:
        ZeroDivisionError: if `loader` yields no samples.
    """
    model.train(train)  # train(False) is equivalent to eval()
    total_loss, correct, total = 0, 0, 0

    for step, (x, lengths, y) in enumerate(loader, start=1):
        x, lengths, y = x.to(device), lengths.to(device), y.to(device)

        if train:
            optimizer.zero_grad()

        # Track gradients only when they will actually be used.
        with torch.set_grad_enabled(train):
            logits = model(x, lengths)
            loss = criterion(logits, y)

        if train:
            loss.backward()
            optimizer.step()

        # Periodic progress report: batch number and that batch's loss.
        if step % 200 == 0:
            print(step, loss.item())

        batch_size = y.size(0)
        total_loss += loss.item() * batch_size
        correct += (logits.argmax(1) == y).sum().item()
        total += batch_size

    return total_loss / total, correct / total


NameError: name 'LSTM_Sentiment' is not defined

In [None]:
# Training
# train_loader: training split (Train.csv)
# valid_loader: validation split (Valid.csv)

NUM_EPOCHS = 3

for epoch in range(NUM_EPOCHS):
    # One optimisation pass over the training data, then a gradient-free
    # style evaluation pass (train=False) over the validation data.
    tr_loss, tr_acc = run_epoch(train_loader, train=True)
    va_loss, va_acc = run_epoch(valid_loader, train=False)
    print(f"Epoch {epoch+1} | Train acc {tr_acc:.3f} | Valid acc {va_acc:.3f}")

# 8) Test accuracy: a single evaluation pass over the held-out test split
te_loss, te_acc = run_epoch(test_loader, train=False)
print("Test accuracy:", te_acc)
