In [1]:
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch import Tensor
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


### Encoder-Decoder dataset

### Poetry dataset

In [2]:
df = pd.read_csv('/home/matta/repos/autumn_school/data/russianPoetryWithTheme.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/home/matta/repos/autumn_school/data/russianPoetryWithTheme.csv'

In [4]:
df = df[['author', 'text']]

In [5]:
df

Unnamed: 0,author,text
0,Михаил Лермонтов,"Забывши волнения жизни мятежной,\r\nОдин жил в..."
1,Сергей Есенин,"Нивы сжаты, рощи голы,\r\nОт воды туман и сыро..."
2,Игорь Северянин,Лючинь печальная читала вечером ручьисто-вкрад...
3,Анатолий Жигулин,"Глыбу кварца разбили молотом,\r\nИ, веселым ог..."
4,Николай Тихонов,"Хлынул дождь, когда девушки, встав в хоровод,\..."
...,...,...
16689,Леонид Мартынов,Седо\r\nКурчавятся облака\r\nНад чернотою поле...
16690,Гаврила Державин,"Белокурая Параша,\r\nСребророзова лицом,\r\nКо..."
16691,Федор Сологуб,"Сладкозвучная богиня,\r\nРифма золотая,\r\nСлу..."
16692,Илья Эренбург,Я так любил тебя — до грубых шуток\r\nИ до так...


In [6]:
alphabet = ''.join(chr(i) for i in range(ord('а'), ord('я') + 1))
alphabet += ' \n.,!?-«»()—…:;\"\''

In [7]:
df['text'] = df['text'].str.lower().apply(lambda x: '<start> ' + ''.join([ch for ch in x if ch in alphabet]) + ' <end>')

In [None]:
chars = sorted(set(list(''.join(df['text'].to_list()))))
vocab_size = len(chars)
stoi = {ch: i for i,ch in enumerate(chars)} # string to index
itos = {i: ch for i,ch in enumerate(chars)} # index to string
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

### Transformer

In [None]:
class Head(nn.Module):
	"""
	One head of self-attention
	"""
	def __init__(self, emb_size, head_size, dropout=0.0):
		super().__init__()
		self.head_size = head_size
		self.query = nn.Linear(emb_size, head_size, bias=False)
		self.key = nn.Linear(emb_size, head_size, bias=False)
		self.value = nn.Linear(emb_size, head_size, bias=False)
		self.dropout = nn.Dropout(dropout)
		self.softmax = nn.Softmax(dim=-1)

	def forward(self, x):
		Q = self.query(x)
		K = self.key(x)
		V = self.value(x)
		attention_score = Q @ K.transpose(-2, -1) / (self.head_size ** 0.5)

		# if self.training:
		# 	mask = torch.tril(torch.ones(seq_len, seq_len)).to(device)
		# 	attention_score = attention_score.masked_fill(mask == 0, float('-inf'))

		out = self.softmax(attention_score) @ V
		out = self.dropout(out)
		return out

In [None]:
class MultiHeadAttention(nn.Module):
	"""
	Multiple heads of self-attention in parallel
	"""
	def __init__(self, emb_size, num_heads, head_size, dropout=0.0):
		super().__init__()
		self.head_size = head_size
		self.query = nn.Linear(emb_size, num_heads * head_size, bias=False)
		self.key = nn.Linear(emb_size, num_heads * head_size, bias=False)
		self.value = nn.Linear(emb_size, num_heads * head_size, bias=False)
		self.dropout = nn.Dropout(dropout)
		self.softmax = nn.Softmax(dim=-1)
		self.linear = nn.Linear(num_heads * head_size, emb_size)
		
	def forward(self, x):
		batch_size, seq_len, emb_size = x.shape
		Q = self.query(x).view(batch_size, seq_len, -1, self.head_size).transpose(1, 2)
		K = self.key(x).view(batch_size, seq_len, -1, self.head_size).transpose(1, 2)
		V = self.value(x).view(batch_size, seq_len, -1, self.head_size).transpose(1, 2)

		attention_score = Q @ K.transpose(-2, -1) / (self.head_size ** 0.5)
		out = self.softmax(attention_score) @ V
		out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
		out = self.linear(out)
		out = self.dropout(out)

		return out

In [None]:
class FeedForwardNetwork(nn.Module):
	"""
	A simple linear layer followed by a non-linearity
	"""
	def __init__(self, emb_size, hidden_size, dropout=0.0):	
		super().__init__()
		self.linear1 = nn.Linear(emb_size, hidden_size)
		self.linear2 = nn.Linear(hidden_size, emb_size)
		self.relu = nn.ReLU()
		self.dropout = nn.Dropout(dropout)

	def forward(self, x):
		x = self.linear1(x)
		x = self.relu(x)
		x = self.dropout(x)
		x = self.linear2(x)
		return x

In [None]:
class TransformerBlock(nn.Module):
	"""
	Transformer block: communication followed by computation
	"""
	def __init__(self, emb_size, num_heads, head_size, hidden_size, dropout=0.0):
		super().__init__()
		self.attention = MultiHeadAttention(emb_size, num_heads, head_size, dropout)
		self.ffn = FeedForwardNetwork(emb_size, hidden_size, dropout)
		self.norm1 = nn.LayerNorm(emb_size)
		self.norm2 = nn.LayerNorm(emb_size)
		
	def forward(self, x):
		x = self.norm1(x + self.attention(x))
		x = self.norm2(x + self.ffn(x))
		return x

In [None]:
class GPT(nn.Module):
	def __init__(self, dict_size, emb_size, seq_len, num_heads=8, head_size=64, hidden_size=2048, dropout=0.3):
		super().__init__()
		self.token_embedding_table = nn.Embedding(num_embeddings=dict_size, embedding_dim=emb_size)
		self.blocks = nn.Sequential(
			*[
				TransformerBlock(
				emb_size=emb_size,
		   		num_heads=num_heads,
				head_size=head_size,
				hidden_size=hidden_size,
				dropout=dropout
				) for _ in range(6)
			]
		)
		self.flatten = nn.Flatten()
		self.linear = nn.Linear(emb_size * seq_len, dict_size)
		self.apply(self._init_weights)

	def _init_weights(self, module):
		if isinstance(module, nn.Linear):
			torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
			if module.bias is not None:
				torch.nn.init.zeros_(module.bias)
		elif isinstance(module, nn.Embedding):
			torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

	def forward(self, idx):
		# idx = torch.argmax(x, dim=-1)
		emb = self.token_embedding_table(idx)
		out = self.blocks(emb)
		out = self.flatten(out)
		logits = self.linear(out)
		return logits

	def generate_next_token(self, idx):
		logits = self.forward(idx)
		probs = F.softmax(logits, dim=-1)
		next_token = torch.multinomial(probs, num_samples=1)
		return next_token
		

### Names dataset

In [9]:
f = open('./data/names.txt', 'r')
names = f.read().splitlines()

In [None]:
class NamesDataset(Dataset):
	def __init__(self, names: list[str], seq_len: int=3):
		self._seq_len = seq_len
		self.__stoi = {
			"<start>": 0,
			"<end>": 1,
		}
		for i, ch in enumerate(sorted(set(''.join(names)))):
			self.__stoi[ch] = i + 2
		self.__itos = {i: ch for ch, i in self.__stoi.items()}
		self.names: list[tuple[str, str]] = []
		self.names_encoded: list[tuple[Tensor, Tensor]] = []
		for name in names:
			name_lst = list(name) + ['<end>']
			for i in range(len(name_lst)):
				target: str = name_lst[i]
				target_encoded = torch.tensor([self.__stoi[target]])
				part_name = name_lst[max(0, i - self.seq_len):i]
				entry = ["<start>"] * (self.seq_len - len(part_name)) + part_name
				entry_encoded = torch.tensor([self.__stoi["<start>"]] * (self.seq_len - len(part_name)) + [self.__stoi[ch] for ch in part_name])
				self.names.append((entry, target))
				self.names_encoded.append((entry_encoded, target_encoded))

	def __len__(self):
		return len(self.names)
	
	@property
	def seq_len(self):
		return self._seq_len
	
	@property
	def stoi(self):
		return self.__stoi
	
	@property	
	def itos(self):
		return self.__itos
	
	def get_sample(self, idx):
		return self.names[idx]
	
	def __getitem__(self, idx):
		return self.names_encoded[idx]

In [11]:
names_dataset = NamesDataset(names)
names_loader = DataLoader(names_dataset, batch_size=32, shuffle=True)

In [None]:
for i in range(20):
	print(names_dataset.get_sample(i))

(['<start>', '<start>', '<start>'], 'e')
(['<start>', '<start>', 'e'], 'm')
(['<start>', 'e', 'm'], 'm')
(['e', 'm', 'm'], 'a')
(['m', 'm', 'a'], '<end>')
(['<start>', '<start>', '<start>'], 'o')
(['<start>', '<start>', 'o'], 'l')
(['<start>', 'o', 'l'], 'i')
(['o', 'l', 'i'], 'v')
(['l', 'i', 'v'], 'i')
(['i', 'v', 'i'], 'a')
(['v', 'i', 'a'], '<end>')
(['<start>', '<start>', '<start>'], 'a')
(['<start>', '<start>', 'a'], 'v')
(['<start>', 'a', 'v'], 'a')
(['a', 'v', 'a'], '<end>')
(['<start>', '<start>', '<start>'], 'i')
(['<start>', '<start>', 'i'], 's')
(['<start>', 'i', 's'], 'a')
(['i', 's', 'a'], 'b')


In [None]:
model = GPT(
	dict_size=len(names_dataset.stoi),
	emb_size=8, seq_len=names_dataset.seq_len,
	hidden_size=64, num_heads=8,
	head_size=64,
	dropout=0.3).to(device)

In [46]:
optim = torch.optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

In [47]:
epochs = 20

In [None]:
loss_history = []
perplexity_history = []
for epoch in range(epochs):
	model.train()
	total_loss = 0
	for seq, target in tqdm(train_loader):
		seq, target = seq.to(device), target.to(device)
		target = F.one_hot(target, num_classes=len(names_dataset.stoi)).squeeze(1).float()
		optim.zero_grad()
		logits = model(seq)
		loss = criterion(logits, target)
		loss.backward()
		optim.step()
		total_loss += loss.item()
	avg_loss = total_loss / len(names_loader)
	perplexity = torch.exp(avg_loss)
	loss_history.append(avg_loss)
	perplexity_history.append(perplexity)
	print(f"Epoch {epoch + 1} / {epochs}, Loss: {avg_loss:.4f}, Perplexity: {perplexity.4f}")

100%|██████████| 7130/7130 [05:28<00:00, 21.69it/s]


Epoch 1 / 20, Loss: 2.3163


100%|██████████| 7130/7130 [05:28<00:00, 21.71it/s]


Epoch 2 / 20, Loss: 2.3057


100%|██████████| 7130/7130 [05:28<00:00, 21.73it/s]


Epoch 3 / 20, Loss: 2.2971


100%|██████████| 7130/7130 [05:30<00:00, 21.59it/s]


Epoch 4 / 20, Loss: 2.2917


100%|██████████| 7130/7130 [05:32<00:00, 21.43it/s]


Epoch 5 / 20, Loss: 2.2882


100%|██████████| 7130/7130 [05:27<00:00, 21.76it/s]


Epoch 6 / 20, Loss: 2.2853


100%|██████████| 7130/7130 [05:27<00:00, 21.75it/s]


Epoch 7 / 20, Loss: 2.2830


100%|██████████| 7130/7130 [05:27<00:00, 21.79it/s]


Epoch 10 / 20, Loss: 2.2761


100%|██████████| 7130/7130 [05:28<00:00, 21.74it/s]


Epoch 11 / 20, Loss: 2.2746


100%|██████████| 7130/7130 [05:27<00:00, 21.74it/s]


Epoch 12 / 20, Loss: 2.2732


100%|██████████| 7130/7130 [05:26<00:00, 21.85it/s]


Epoch 13 / 20, Loss: 2.2716


100%|██████████| 7130/7130 [05:25<00:00, 21.91it/s]


Epoch 14 / 20, Loss: 2.2718


100%|██████████| 7130/7130 [05:25<00:00, 21.92it/s]


Epoch 15 / 20, Loss: 2.2696


100%|██████████| 7130/7130 [05:26<00:00, 21.84it/s]


Epoch 16 / 20, Loss: 2.2696


 64%|██████▍   | 4577/7130 [03:28<01:55, 22.06it/s]

In [87]:
torch.save(model.state_dict(), '/kaggle/working/gpt-name-3')

In [16]:
state_dict = torch.load('./weights/gpt-name-3', weights_only=True, map_location=device)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
def generate_name(model):
	itos = names_dataset.itos
	seq = torch.zeros(names_dataset.seq_len).reshape(1, 3).to(device).int()
	end_token = names_dataset.stoi['<end>']
	name = []
	while True:
		next_token = model.generate_next_token(seq)
		if next_token.item() == end_token:
			break
		name.append(itos[next_token.item()])
		seq = torch.cat([seq, next_token], dim=1)[:, 1:]
	return ''.join(name)
		

In [None]:
for _ in range(10):
	print(generate_name(model))

keryab
ilder
samen
hirbelranna
aliea
kere
aster
jenn
jaan
pena
