In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
%matplotlib inline

In [2]:
random.seed(1)
np.random.seed(1)

Будем пробовать генерировать шутки. Для обучения будем использовать [датасет с постами reddit](https://kaggle.com/datasets/thedevastator/one-million-reddit-jokes).

In [3]:
path = '/content/drive/MyDrive/NLP_manual/Datasets/reddit_jokes.csv'
data = pd.read_csv(path)

In [4]:
data.head()

Unnamed: 0,type,id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,domain,url,selftext,title,score
0,post,ftbp1i,2qh72,jokes,False,1585785543,https://old.reddit.com/r/Jokes/comments/ftbp1i...,self.jokes,,My corona is covered with foreskin so it is no...,I am soooo glad I'm not circumcised!,2
1,post,ftboup,2qh72,jokes,False,1585785522,https://old.reddit.com/r/Jokes/comments/ftboup...,self.jokes,,It's called Google Sheets.,Did you know Google now has a platform for rec...,9
2,post,ftbopj,2qh72,jokes,False,1585785508,https://old.reddit.com/r/Jokes/comments/ftbopj...,self.jokes,,The vacuum doesn't snore after sex.\r\n\r\n&am...,What is the difference between my wife and my ...,15
3,post,ftbnxh,2qh72,jokes,False,1585785428,https://old.reddit.com/r/Jokes/comments/ftbnxh...,self.jokes,,[removed],My last joke for now.,9
4,post,ftbjpg,2qh72,jokes,False,1585785009,https://old.reddit.com/r/Jokes/comments/ftbjpg...,self.jokes,,[removed],The Nintendo 64 turns 18 this week...,134


Так как наша задача генерации требует только текста, оставим только некоторые столбцы.

In [5]:
columns = ['selftext']
data = data[columns]

## Обработка данных
###1. Чистка датасета

Для начала надо избавиться от пустых строк, или же нанов. Часто они обозначаются как nan, но иногда можно заметить иные способы.

In [6]:
data['selftext'].value_counts()[:10]

[removed]                          232919
[deleted]                          188442
\[removed\]                           272
To get to the other side.             125
Dr. Dre                               111
A stick.                               83
None.                                  81
A stick                                76
He worked it out with a pencil.        74
Then it hit me.                        72
Name: selftext, dtype: int64

Можно заметить, что наиболее частым классом являются _removed_ или _deleted_.

In [7]:
print('Размер данных до чистки', data.shape)
data = data[~data.isin(['[removed]', '[deleted]', '\[removed\]', 'removed', 'deleted'])]
data = data.dropna()
print('Размер данных после чистки', data.shape)

Размер данных до чистки (999998, 1)
Размер данных после чистки (573887, 1)


Надо тексты привести к нижнему регистру и убрать пунктуацию.
Кроме этого можно избавиться от совсем коротких шуток, так как скорее всего это просто ответы на фразы.

In [8]:
from string import punctuation
import re

In [9]:
def clean_text(text):
     text = text.lower()
     #text = re.sub(r'[^\w\s\.]', '', text)
     new_text = []
     for word in text.split():
        if word.endswith(tuple(punctuation)):
            new_text.append(word[:-1])
            new_text.append(word[-1])
        else:
            new_text.append(word)
     return new_text

In [10]:
data['words'] = data['selftext'].apply(clean_text)
#data['words'] = data['clean_text'].apply(str.split)
data['lens'] = data['words'].apply(len)
data = data[data.lens > 3]

### 2. Обработка текста

In [11]:
words = data['words'].tolist()

In [12]:
words[:2]

[['my',
  'corona',
  'is',
  'covered',
  'with',
  'foreskin',
  'so',
  'it',
  'is',
  'not',
  'exposed',
  'to',
  'viruses',
  '.'],
 ["it's", 'called', 'google', 'sheets', '.']]

## N-grams
Для начала попробуем создать самую простую модель, основанную на встречаемости н-граммы в корпусе.

In [13]:
from collections import defaultdict, Counter

In [14]:
# добавляем токены начала и конца
BOS, EOS, UNK = '[bos]', '[eos]', '[unk]'

def ngram_counts(lines, n):
    dictionary = defaultdict(Counter)
    for line in lines:
        new_line = [BOS] * (n-1) + line + [EOS]
        for i in range(n-1, len(new_line)):
            prefix = tuple(new_line[i-n+1:i])
            word = new_line[i]
            dictionary[prefix][word] += 1
    return dictionary

dummy_lines = sorted(words, key=len)[:100]
dummy_counts = ngram_counts(dummy_lines, n=3)
assert set(map(len, dummy_counts.keys())) == {2}, "please only count {n-1}-grams"
assert len(dummy_counts[(BOS, BOS)]) == 66
assert dummy_counts[BOS, 'a']['melon'] == 1

In [15]:
class NGramLanguageModel:
    def __init__(self, lines, n):
        assert n >= 1
        self.n = n

        counts = self.ngram_counts(lines, self.n)

        # compute token proabilities given counts
        self.probs = defaultdict(Counter)
        # probs[(word1, word2)][word3] = P(word3 | word1, word2)

        # populate self.probs with actual probabilities
        for key, value in counts.items():
            sum_of_prefix = sum(value.values())
            for word, cnts in value.items():
                self.probs[key][word] = cnts / sum_of_prefix

    def get_possible_next_tokens(self, prefix):
        """
        :param prefix: string with space-separated prefix tokens
        :returns: a dictionary {token : it's probability} for all tokens with positive probabilities
        """
        prefix = prefix.split()
        prefix = prefix[max(0, len(prefix) - self.n + 1):]
        prefix = [ BOS ] * (self.n - 1 - len(prefix)) + prefix
        return self.probs[tuple(prefix)]

    def get_next_token_prob(self, prefix, next_token):
        """
        :param prefix: string with space-separated prefix tokens
        :param next_token: the next token to predict probability for
        :returns: P(next_token|prefix) a single number, 0 <= P <= 1
        """
        return self.get_possible_next_tokens(prefix).get(next_token, 0)

    @staticmethod
    def ngram_counts(lines, n):
        dictionary = defaultdict(Counter)
        for line in lines:
            new_line = [BOS] * (n-1) + line + [EOS]
            for i in range(n-1, len(new_line)):
                prefix = tuple(new_line[i-n+1:i])
                word = new_line[i]
                dictionary[prefix][word] += 1
        return dictionary

In [16]:
dummy_lm = NGramLanguageModel(dummy_lines, n=3)
p_initial = dummy_lm.get_possible_next_tokens('')
assert p_initial.most_common(1)[0][0] == 'a'

1. Попробуем составить предложение используя жадный метод.

In [17]:
def get_next_word(lm, prefix):
    return lm.get_possible_next_tokens(prefix).most_common(1)[0][0]

In [18]:
lm = NGramLanguageModel(words, n=3)
prefix = 'get'
for i in range(100):
    word = get_next_word(lm, prefix)
    prefix += ' ' + word
    if prefix.endswith(EOS) or len(lm.get_possible_next_tokens(prefix)) == 0:
        break

print(prefix)

get off the roof . [eos]


In [19]:
sentence = ''
word = lm.get_possible_next_tokens('').most_common(1)[0][0]
while word != EOS:
    sentence += f' {word}'
    word = lm.get_possible_next_tokens(sentence).most_common(1)[0][0]

In [20]:
sentence

' i was a little bit of a sudden , the man says , "i don\'t know what to do with the same thing .'

2. Выбор наиболее вероятного слова не показал хороших результатов. Давайте попробуем семплировать методом top-k.

In [21]:
def get_next_word(lm, prefix, k):
    next_words = lm.get_possible_next_tokens(prefix).most_common(k)
    index = random.randint(0, min(k, len(next_words))-1)
    return next_words[index][0]

In [22]:
lm = NGramLanguageModel(words, n=3)

In [23]:
prefix = 'get'
for i in range(100):
    word = get_next_word(lm, prefix, 5)
    prefix += ' ' + word
    if prefix.endswith(EOS) or len(lm.get_possible_next_tokens(prefix)) == 0:
        break

print(prefix)

get a better look and says "i never knew you were a couple of weeks . after a long day at noon . so the man says "i want my left breast . but i was going fishing on his way . some years later . so the guy is sitting on top and said : "you are on me ! my favorite joke ) edit 4 : went bankrupt before i could see it . the second nun , who is in the back room to check it out. ' the woman says : i know you could get rid


In [24]:
prefix = ''
word = get_next_word(lm, '', 5)
while word != EOS:
    prefix += f'{word} '
    word = get_next_word(lm, prefix, 5)
print(prefix + f'{word} ')

because it would take me out of it ! i don't like to hear a zipper on his head . so i asked my mom was a very attractive , older men were in the world , an elderly man in line to get a drink . as he can see your license , that would have been the same question : what ? [eos] 


3. Для сравнения можно сделать beam search. Напоминаем, что он на каждом шаге выбирает k наилучших вариантов - те, с которыми наибольшая вероятность всего предложения.

In [25]:
prefix = 'he'
#lm = NGramLanguageModel(words, n=3)
#next_word = lm.get_possible_next_tokens(prefix).most_common(1)[0]
#prefix += next_word[0]
#prob = next_word[1]
prob=1

k = 5
best_k = 2
next_word = lm.get_possible_next_tokens(prefix).most_common(best_k)
prefixes = [prefix + f' {next_word[0][0]}', prefix + f' {next_word[1][0]}']
probs = [prob * next_word[0][1], prob * next_word[1][1]]

#for step in range(5):
step = 1
while (not prefixes[0].endswith(EOS)) and (not prefixes[0].endswith(EOS)) and (step != 20):
    print('step', step)
    step += 1
    print(prefixes, probs, sep='\n')
    possible_words1 = lm.get_possible_next_tokens(prefixes[0]).most_common(best_k)
    probs1 = []
    for word in possible_words1:
        probs1.append((word[0], probs[0]*word[1]))
    possible_words2 = lm.get_possible_next_tokens(prefixes[1]).most_common(best_k)
    probs2 = []
    for word in possible_words2:
        probs2.append((word[0], probs[1]*word[1]))
    choice = []
    probs1 = sorted(probs1, key=lambda x: x[1], reverse=True)
    probs2 = sorted(probs2, key=lambda x: x[1], reverse=True)
    probs_new = []
    while len(choice) != best_k and len(probs1) != 0 and len(probs2) != 0:
        if probs1[0][1] > probs2[0][1]:
            choice.append(prefixes[0] + f' {probs1[0][0]}')
            probs_new.append(probs[0] * probs1[0][1])
            probs1 = probs1[1:]
            possible_words1 = probs1[1:]
        else:
            choice.append(prefixes[1] + f' {probs2[0][0]}')
            probs_new.append(probs[1] * probs2[0][1])
            probs2 = probs2[1:]
            possible_words2 = probs2[1:]
    prefixes = choice
    probs = probs_new

step 1
['he was', 'he said']
[0.1389352775854344, 0.07018044071524582]
step 2
['he said ,', 'he was a']
[0.0016555081057632137, 0.001751799011952464]
step 3
['he said , "i', 'he was a little']
[2.0088509635331525e-07, 1.2258589536713203e-07]
step 4
['he said , "i don\'t', 'he said , "i have']
[4.097103808716215e-15, 2.373165769238488e-15]
step 5
['he said , "i don\'t know', 'he said , "i have a']
[6.217719161567378e-30, 1.4285188701674254e-30]
step 6
['he said , "i don\'t know what', 'he said , "i don\'t know ,']
[7.020224514695353e-60, 6.707925370390092e-60]
step 7
['he said , "i don\'t know , i', 'he said , "i don\'t know what to']
[8.326080119732219e-120, 6.431011625809181e-120]
step 8
['he said , "i don\'t know what to do', 'he said , "i don\'t know what to say']
[2.5991189639290983e-239, 3.488291241062737e-240]
step 9
['he said , "i don\'t know what to do with', 'he said , "i don\'t know what to do it']
[0.0, 0.0]
step 10
['he said , "i don\'t know what to do it .', 'he said , "i 

In [27]:
prefix = 'because'
#lm = NGramLanguageModel(words, n=3)
#next_word = lm.get_possible_next_tokens(prefix).most_common(1)[0]
#prefix += next_word[0]
#prob = next_word[1]
prob=1

k = 5
best_k = 2
next_word = lm.get_possible_next_tokens(prefix).most_common(best_k)
prefixes = [prefix + f' {next_word[0][0]}', prefix + f' {next_word[1][0]}']
probs = [prob * np.log(next_word[0][1]), prob * np.log(next_word[1][1])]

#for step in range(5):
step = 1
while (not prefixes[0].endswith(EOS)) and (not prefixes[1].endswith(EOS)) and (step != 20):
    print('step', step)
    step += 1
    print(prefixes, probs, sep='\n')
    possible_words1 = lm.get_possible_next_tokens(prefixes[0]).most_common(best_k)
    probs1 = []
    for word in possible_words1:
        probs1.append((word[0], 1/(step+1)*(probs[0]+np.log(word[1]))))
    possible_words2 = lm.get_possible_next_tokens(prefixes[1]).most_common(best_k)
    probs2 = []
    for word in possible_words2:
        probs2.append((word[0], 1/(step+1)*(probs[1]+np.log(word[1]))))
    choice = []
    probs1 = sorted(probs1, key=lambda x: x[1], reverse=True)
    probs2 = sorted(probs2, key=lambda x: x[1], reverse=True)
    probs_new = []
    while len(choice) != best_k and len(probs1) != 0 and len(probs2) != 0:
        if probs1[0][1] > probs2[0][1]:
            choice.append(prefixes[0] + f' {probs1[0][0]}')
            probs_new.append(probs[0] * probs1[0][1])
            probs1 = probs1[1:]
            possible_words1 = probs1[1:]
        else:
            choice.append(prefixes[1] + f' {probs2[0][0]}')
            probs_new.append(probs[1] * probs2[0][1])
            probs2 = probs2[1:]
            possible_words2 = probs2[1:]
    prefixes = choice
    probs = probs_new

step 1
['because they', 'because he']
[-1.810170175064868, -1.9319008203695496]
step 2
['because he was', 'because they are']
[2.2086360066108317, 2.275294391976661]
step 3
['because he was a', 'because they are both']
[-0.10545234172117024, -0.3833335623358987]
step 4
['because he was a little', 'because he was a bit']
[0.07014019480820664, 0.08140607177199918]
step 5
['because he was a bit of', 'because he was a bit ,']
[-0.02267668699102755, -0.030634148384618986]
step 6
['because he was a bit of a', 'because he was a bit , and']
[0.003795426211422038, 0.006442133000250419]
step 7
['because he was a bit , and the', 'because he was a bit of a sudden']
[-0.001882908284150799, -0.0011293985996205466]
step 8
['because he was a bit of a sudden ,', 'because he was a bit of a sudden the']
[0.00014219202611013022, 0.0002849389368838768]
step 9
['because he was a bit of a sudden , the', 'because he was a bit of a sudden , a']
[-2.0241599435691766e-05, -2.0241599435691766e-05]
step 10
['becau