In [None]:
import requests
import re

def download_shakespeare_data(url, file_path, timeout=30):
    """Download ``url`` and store the raw response body at ``file_path``.

    Args:
        url: Address of the resource to fetch.
        file_path: Destination path; the body is written in binary mode.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        True if the server answered with HTTP 200 and the file was written,
        False otherwise (a message with the status code is printed).

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Keep the best-effort behaviour (no exception), but say why it failed.
        print(f"Failed to download data (HTTP status {response.status_code})")
        return False

    with open(file_path, 'wb') as file:
        file.write(response.content)
    print(f"Data downloaded and saved to {file_path}")
    return True

def preprocess_text(file_path, output_path):
    """Lower-case a UTF-8 text file and keep only ``a-z`` and whitespace.

    Args:
        file_path: Path of the UTF-8 text file to read.
        output_path: Path the cleaned text is written to (UTF-8).
    """
    # Anything that is neither a lower-case ASCII letter nor whitespace.
    unwanted = re.compile(r'[^a-z\s]')

    with open(file_path, 'r', encoding='utf-8') as source:
        lowered = source.read().lower()

    # Remove unwanted characters
    cleaned = unwanted.sub('', lowered)

    with open(output_path, 'w', encoding='utf-8') as sink:
        sink.write(cleaned)

    print(f"Preprocessed text saved to {output_path}")

# URL of the Shakespeare text on Project Gutenberg.
# HTTPS is used so the corpus is not fetched over a plaintext connection.
url = "https://www.gutenberg.org/files/100/100-0.txt"
file_path = "shakespeare.txt"
preprocessed_file_path = "preprocessed_shakespeare.txt"

# Download the data, then preprocess it
download_shakespeare_data(url, file_path)
preprocess_text(file_path, preprocessed_file_path)


Data downloaded and saved to shakespeare.txt
Preprocessed text saved to preprocessed_shakespeare.txt


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class ShakespeareDataset(Dataset):
    """Sliding-window next-token dataset over a sequence of tokens.

    ``text`` can be any sequence of hashable tokens: iterating a string
    yields characters, iterating a list of strings yields its items. Despite
    the ``char_*`` attribute names, the mappings are keyed by whatever tokens
    ``text`` yields.

    Sample ``i`` is the pair ``(data[i:i+seq_length], data[i+1:i+seq_length+1])``,
    i.e. the target is the input window shifted one token to the right.

    Attributes:
        text: The token sequence passed in.
        seq_length: Number of tokens in each input/target window.
        vocab: Sorted list of distinct tokens.
        vocab_size: ``len(vocab)``.
        char_to_idx: Token -> integer id.
        idx_to_char: Integer id -> token.
        data: ``text`` encoded as a list of integer ids.
    """

    def __init__(self, text, seq_length):
        self.text = text
        self.seq_length = seq_length
        self.vocab = sorted(set(text))
        self.vocab_size = len(self.vocab)
        self.char_to_idx = {char: idx for idx, char in enumerate(self.vocab)}
        self.idx_to_char = {idx: char for char, idx in self.char_to_idx.items()}
        self.data = [self.char_to_idx[char] for char in text]

    def __len__(self):
        # A corpus shorter than one window holds no complete sample; clamp to
        # zero because len() raises ValueError on a negative result.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return the ``(inputs, target)`` LongTensor pair for sample ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
                Raising here also lets plain ``for sample in dataset``
                iteration terminate.
        """
        size = len(self)
        if idx < 0:
            idx += size  # support Python-style negative indexing
        if not 0 <= idx < size:
            raise IndexError(f"index {idx} is out of range for dataset of size {size}")

        stop = idx + self.seq_length
        inputs = self.data[idx:stop]
        target = self.data[idx + 1:stop + 1]
        return (
            torch.tensor(inputs, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

# Number of tokens in every training window.
seq_length = 100

# Read the cleaned corpus and break it into whitespace-separated words, so
# the dataset below is built over words rather than characters.
with open(preprocessed_file_path, 'r', encoding='utf-8') as file:
    text = file.read().split()

dataset = ShakespeareDataset(text, seq_length)
dataloader = DataLoader(dataset, shuffle=True, batch_size=64)


In [None]:
import torch.nn as nn

class ShakespeareModel(nn.Module):
    """Embedding -> multi-layer LSTM -> linear head predicting the next token.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output logits).
        embed_size: Dimensionality of the token embeddings.
        hidden_size: Hidden state size of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        # Kept on the instance so the model never depends on module-level
        # globals that happen to share these names.
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return logits for the token following each input sequence.

        Args:
            x: Integer token ids indexed as (batch, sequence), since the LSTM
                is built with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, vocab_size) computed from the last time step.
        """
        x = self.embedding(x)
        # With no initial state given, nn.LSTM starts from zero hidden and
        # cell states on the input's device, matching explicit zero tensors.
        out, _ = self.lstm(x)
        # Only the final time step is used, so project just that slice.
        return self.fc(out[:, -1])

# Model hyper-parameters
embed_size = 64
hidden_size = 128
num_layers = 2

# Use the GPU when one is available; otherwise fall back to the CPU instead of
# failing at model construction on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ShakespeareModel(dataset.vocab_size, embed_size, hidden_size, num_layers).to(device)
# Lets cuDNN auto-tune its kernels.
torch.backends.cudnn.benchmark = True

In [None]:
from tqdm import tqdm
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 1  # adjust the number of epochs as needed
# Train on whatever device the model already lives on.
train_device = next(model.parameters()).device

# Make sure the model is in training mode even if this cell is re-run after
# text generation switched it to eval mode.
model.train()

for epoch in tqdm(range(num_epochs)):
    # Accumulate on-device so no host sync is forced on every step.
    running_loss = torch.zeros((), device=train_device)
    num_batches = 0

    for inputs, targets in tqdm(dataloader):
        inputs, targets = inputs.to(train_device), targets.to(train_device)

        optimizer.zero_grad()
        outputs = model(inputs)
        # The model emits logits for the last position only, so compare them
        # with the last target token of each window.
        loss = criterion(outputs, targets[:, -1])
        loss.backward()
        optimizer.step()

        running_loss += loss.detach()
        num_batches += 1

    if num_batches:
        # Report the mean loss over the epoch, with the real epoch count.
        epoch_loss = (running_loss / num_batches).item()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')
    else:
        print(f'Epoch [{epoch+1}/{num_epochs}], no batches to train on')

In [None]:
def generate_text(model, start_text, length=100):
    """Sample ``length`` tokens from ``model``, seeded with ``start_text``.

    The prompt is normalised like the training corpus (lower-cased, only
    ``a-z`` and whitespace kept) and split on whitespace, because the
    module-level ``dataset`` vocabulary is built from ``text.split()``, i.e.
    from words. Iterating the prompt character by character would map nearly
    every prompt token to index 0.

    Args:
        model: Trained model returning next-token logits of shape
            (batch, vocab_size) for a (batch, sequence) tensor of token ids.
        start_text: Prompt text; words missing from the vocabulary fall back
            to index 0.
        length: Number of tokens to generate.

    Returns:
        ``start_text`` followed by the generated tokens, each followed by a
        single space.

    Raises:
        ValueError: If ``start_text`` contains no usable word.
    """
    model.eval()
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    prompt_tokens = re.sub(r'[^a-z\s]', '', start_text.lower()).split()
    if not prompt_tokens:
        raise ValueError("start_text must contain at least one word")

    token_ids = [dataset.char_to_idx.get(token, 0) for token in prompt_tokens]
    chars = torch.tensor([token_ids], dtype=torch.long, device=device)

    pieces = [start_text]
    with torch.no_grad():
        for _ in range(length):
            output = model(chars)
            prob = torch.softmax(output, dim=1)
            char_idx = torch.multinomial(prob, 1).item()
            pieces.append(dataset.idx_to_char[char_idx])

            # Slide the context window: drop the oldest token, append the new one.
            next_token = torch.tensor([[char_idx]], dtype=torch.long, device=device)
            chars = torch.cat([chars[:, 1:], next_token], dim=1)

    # Same layout as before: every piece, including the last, ends with a space.
    return " ".join(pieces) + " "

# Prompt used to seed generation.
start_text = 'to be or not to be'

# Sample 200 tokens after the prompt and show the result.
print(generate_text(model, start_text, length=200))


to be or not to be company domestic seat well not duke receive mason that womanthis swung rock man blood quoth life i on bardolph if as if be polecats ho honour it they yours of spirit sternest far maids i francis proceeding well brawnbuttock by is knowst angry of off my realm assurd you north thunder orleans fervour my learn am duke came question my tardied brow my look prorogue long me they drum and friar in thy queen lucentio me calved have gone not even sooner dangerous that thou i down will lasting to taking shall of one cursy and therein enter sir in murder given my on to prefer ingenious fellow the matter keeper royal trick a win am constant to and gentlewoman the all anonsir forgoing prithee clocksetter undone as most instruct animal much of my your in hector he ulysses exit you truth unbashful one it before that car masked his put this still company am fool ambitiously most is of caesar progress henry jades proud scalding part me my sword never for exeunt convenient you then m