In [1]:
import json
import spacy
import gensim
from nltk.tokenize import sent_tokenize, wordpunct_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from tqdm import tqdm
import pandas as pd
import os
import re
from string import punctuation
import gensim
import logging
import urllib.request

import warnings
warnings.filterwarnings('ignore')
# The model takes a text file as input, with each sentence on its own line.

In [2]:
# Speaker name: lazily captures everything up to the first colon (used with .match, so anchored at line start).
re_character = re.compile(r'(.+?):')
# Spoken text: captures the remainder of the line after the first ': ' (colon followed by a space).
re_text = re.compile(r': (.+)')
# Episode id: leading run of word characters in a file name that is immediately followed by a dot.
re_episode = re.compile(r'(\w+?)\.')
# Bracketed spans like '[...]'; define_character removes these from each line before parsing it.
re_del = re.compile(r'\[.+?\]')

In [3]:
def define_character(lines, file):
    """Extract ``speaker: text`` utterances from the lines of one transcript.

    Bracketed spans (``re_del``) are removed from every line first. A line is
    kept only when it both starts with a ``speaker:`` prefix (``re_character``)
    and contains text after ``': '`` (``re_text``).

    Parameters
    ----------
    lines : iterable of str
        Raw transcript lines.
    file : str
        Transcript file name; the episode id is derived from it.

    Returns
    -------
    list of dict
        One ``{'character', 'episode', 'line'}`` record per parsed line, in
        input order. Empty when no line matches.
    """
    episode_match = re_episode.match(file)
    if episode_match:
        episode = episode_match[1]
    else:
        # File names that re_episode cannot match (e.g. '.DS_Store' or names
        # containing spaces) used to raise TypeError on ``None[1]``; fall back
        # to the file name without its extension instead.
        episode = os.path.splitext(file)[0]

    character_texts = []
    for line in lines:
        line = re_del.sub('', line)
        # Run each pattern once and reuse the match objects.
        character_match = re_character.match(line)
        text_match = re_text.search(line)
        if character_match and text_match:
            character_texts.append(
                {'character': character_match[1],
                 'episode': episode,
                 'line': text_match[1]}
            )
    return character_texts

In [4]:
# Walk the transcript directory and collect one record per parsed utterance.
# Records are accumulated in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies it on every iteration.
character_records = []
for root, _dirs, files in os.walk('./SpongeBob_SquarePants_Transcripts'):
    for file in tqdm(files):
        path = os.path.join(root, file)
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        character_records.extend(define_character(lines, file))

# Passing the columns explicitly keeps the expected schema even if no
# utterances were found.
characters_table = pd.DataFrame(character_records, columns=['character', 'episode', 'line'])

In [5]:
# Keep only the utterances of the seven characters of interest.
# .copy() is applied to the filtered frame (not to the column used to build
# the mask) so that my_characters is an independent DataFrame and columns can
# be added to it later without writing into a slice of characters_table.
my_characters = characters_table[characters_table['character'].isin([
    'SpongeBob', 'Patrick', 'Mr. Krabs', 'Squidward', 'Plankton', 'Sandy', 'Karen'])].copy()

In [6]:
# English stop-word list from NLTK.
# NOTE(review): `sw` is not referenced anywhere else in the visible notebook.
sw = stopwords.words('english')
# WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()
# Maps universal POS tags to the single-letter POS codes passed to lemmatizer.lemmatize(..., pos=...).
nltk_to_pos = {'ADV': 'r', 'NOUN': 'n', 'VERB': 'v', 'ADJ': 'a'}

In [7]:
def _lemmatize_token(word):
    """Lower-case one token and lemmatize it, passing a POS hint when one is known."""
    # The token is tagged on its own (no sentence context), exactly as the
    # previously disabled implementation did.
    pos = pos_tag([word], tagset='universal')[0][1]
    lowered = word.lower()
    if pos in nltk_to_pos:
        return lemmatizer.lemmatize(lowered, pos=nltk_to_pos[pos])
    return lemmatizer.lemmatize(lowered)


def for_model(line, lemmatize=False):
    """Turn one utterance into training text: one sentence per line.

    The utterance is split into sentences and each sentence into tokens.
    Tokens whose last character is ASCII punctuation are dropped; the remaining
    tokens are joined with single spaces. Sentences left with no tokens are
    skipped.

    Parameters
    ----------
    line : str
        Raw utterance text.
    lemmatize : bool, default False
        When True, every kept token is lower-cased and lemmatized. The default
        keeps the tokens unchanged, which is what the notebook has been using.

    Returns
    -------
    str
        The processed sentences joined with ``'\\n'``; ``''`` when nothing is left.
    """
    new_sent = []
    for sent in sent_tokenize(line):
        words = [word for word in word_tokenize(sent) if word[-1] not in punctuation]
        if lemmatize:
            words = [_lemmatize_token(word) for word in words]
        if words:
            new_sent.append(' '.join(words))
    return '\n'.join(new_sent)

"\ndef for_model(line):\n    lemmas = []\n    new_sent = []\n    sentences = sent_tokenize(line)\n    for sent in sentences:\n        words = [word for word in word_tokenize(sent) if word[-1] not in punctuation]\n        for word in words:\n            pos = pos_tag([word], tagset='universal')[0][1]\n            if pos in ['VERB', 'NOUN', 'ADV', 'ADJ']:\n                word = lemmatizer.lemmatize(word.lower(), pos=nltk_to_pos[pos])\n            else:\n                word = lemmatizer.lemmatize(word.lower())\n            lemmas.append(word)\n        if lemmas != []:\n            new_sent.append(' '.join(lemmas))\n        lemmas = []\n    return '\n'.join(new_sent)\n"

In [8]:
# Add the model-ready text as a new column. assign() returns a new frame
# instead of writing into my_characters in place, so the cell is safe to
# re-run and never assigns into a filtered slice of characters_table.
my_characters = my_characters.assign(lemmatized_lines=my_characters['line'].apply(for_model))

In [9]:
# Preview the first rows to compare the raw 'line' with the processed 'lemmatized_lines'.
my_characters.head()

Unnamed: 0,character,episode,line,lemmatized_lines
1,SpongeBob,LighthouseLouie,"Hold on, I'll check the textbook. Ah, ""Your ...",Hold on I 'll check the textbook\nAh Your Mirr...
4,SpongeBob,LighthouseLouie,Hooray!,Hooray
5,SpongeBob,LighthouseLouie,"Mrs. Puff! Oh, there you are. I'll make it u...",Mrs\nPuff\nOh there you are\nI 'll make it up ...
7,SpongeBob,LighthouseLouie,I love cleaning!,I love cleaning
9,SpongeBob,LighthouseLouie,That takes care of that. Yuck. This place ne...,That takes care of that\nYuck\nThis place need...


In [10]:
# One row per character: 'lemmatized_lines' becomes the list of that character's processed utterances.
all_lines = my_characters.groupby('character')['lemmatized_lines'].agg(list).reset_index()

In [11]:
# Emit INFO-level log records as "<time> : <level> : <message>"; the training progress below is reported this way.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [12]:
# Write one training corpus per character to '<name>.txt'.
# Name and utterance list are taken from the same row by iterating the two
# columns together, instead of relying on a manual counter that happens to
# match the row labels.
for name, sentences in zip(all_lines['character'], all_lines['lemmatized_lines']):
    # Explicit UTF-8: the transcripts were read as UTF-8, so the output must
    # not depend on the platform's default encoding.
    with open(f'{name}.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(sentences))

In [94]:
# Train one Word2Vec model per character on that character's '<name>.txt'
# corpus and export its vectors to '<name>.bin' (binary word2vec format).
models = {}
for name in all_lines['character'].unique():
    corpus_path = f'{name}.txt'
    # LineSentence streams the file, treating every line as one sentence.
    sentences = gensim.models.word2vec.LineSentence(corpus_path)
    model_char = gensim.models.Word2Vec(sentences, vector_size=300, window=5, min_count=2)
    # init_sims(replace=True) is deprecated in gensim 4; unit_normalize_all()
    # is the destructive unit-length normalisation it delegated to, so the
    # exported vectors stay normalised. Training must not continue after this.
    model_char.wv.unit_normalize_all()
    models[name] = model_char
    model_path = f'{name}.bin'
    model_char.wv.save_word2vec_format(model_path, binary=True)

2022-03-24 22:38:56,047 : INFO : collecting all words and their counts
2022-03-24 22:38:56,049 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-03-24 22:38:56,055 : INFO : collected 1203 word types from a corpus of 3935 raw words and 638 sentences
2022-03-24 22:38:56,056 : INFO : Creating a fresh vocabulary
2022-03-24 22:38:56,060 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 408 unique words (33.915211970074814%% of original 1203, drops 795)', 'datetime': '2022-03-24T22:38:56.060619', 'gensim': '4.1.2', 'python': '3.9.6 (v3.9.6:db3ff76da1, Jun 28 2021, 11:49:53) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2022-03-24 22:38:56,061 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 3140 word corpus (79.79669631512071%% of original 3935, drops 795)', 'datetime': '2022-03-24T22:38:56.061866', 'gensim': '4.1.2', 'python': '3.9.6 (v3.9.6:db3ff76da1, Jun 2

2022-03-24 22:38:56,260 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-24 22:38:56,261 : INFO : EPOCH - 1 : training on 58656 raw words (40479 effective words) took 0.1s, 779670 effective words/s
2022-03-24 22:38:56,307 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-03-24 22:38:56,311 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-24 22:38:56,312 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-24 22:38:56,313 : INFO : EPOCH - 2 : training on 58656 raw words (40329 effective words) took 0.0s, 1113734 effective words/s
2022-03-24 22:38:56,358 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-03-24 22:38:56,362 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-24 22:38:56,362 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-24 22:38:56,363 : INFO : EPOCH - 3 : training on 58656 raw words (40399 effective words)

2022-03-24 22:38:56,934 : INFO : storing 2277x300 projection weights into Patrick.bin
2022-03-24 22:38:56,943 : INFO : collecting all words and their counts
2022-03-24 22:38:56,944 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-03-24 22:38:56,954 : INFO : collected 4015 word types from a corpus of 25429 raw words and 4397 sentences
2022-03-24 22:38:56,955 : INFO : Creating a fresh vocabulary
2022-03-24 22:38:56,959 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 1630 unique words (40.59775840597759%% of original 4015, drops 2385)', 'datetime': '2022-03-24T22:38:56.959854', 'gensim': '4.1.2', 'python': '3.9.6 (v3.9.6:db3ff76da1, Jun 28 2021, 11:49:53) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2022-03-24 22:38:56,960 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 23044 word corpus (90.62094459082151%% of original 25429, drops 2385)', 'datetime':

2022-03-24 22:38:57,160 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-03-24 22:38:57,169 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-24 22:38:57,170 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-24 22:38:57,170 : INFO : EPOCH - 1 : training on 17040 raw words (10702 effective words) took 0.0s, 610219 effective words/s
2022-03-24 22:38:57,179 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-03-24 22:38:57,188 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-24 22:38:57,189 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-24 22:38:57,189 : INFO : EPOCH - 2 : training on 17040 raw words (10690 effective words) took 0.0s, 605549 effective words/s
2022-03-24 22:38:57,198 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-03-24 22:38:57,206 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-0

2022-03-24 22:38:58,084 : INFO : storing 5011x300 projection weights into SpongeBob.bin
2022-03-24 22:38:58,103 : INFO : collecting all words and their counts
2022-03-24 22:38:58,104 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-03-24 22:38:58,122 : INFO : collected 5474 word types from a corpus of 46539 raw words and 8432 sentences
2022-03-24 22:38:58,123 : INFO : Creating a fresh vocabulary
2022-03-24 22:38:58,129 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 2385 unique words (43.569601753744976%% of original 5474, drops 3089)', 'datetime': '2022-03-24T22:38:58.129897', 'gensim': '4.1.2', 'python': '3.9.6 (v3.9.6:db3ff76da1, Jun 28 2021, 11:49:53) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2022-03-24 22:38:58,130 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 43450 word corpus (93.36255613571413%% of original 46539, drops 3089)', 'datetim

In [95]:
# Show the mapping of character name -> trained Word2Vec model.
models

{'Karen': <gensim.models.word2vec.Word2Vec at 0x7fb9f5c7bbe0>,
 'Mr. Krabs': <gensim.models.word2vec.Word2Vec at 0x7fb9f5c7b310>,
 'Patrick': <gensim.models.word2vec.Word2Vec at 0x7fba01b5d430>,
 'Plankton': <gensim.models.word2vec.Word2Vec at 0x7fb9f5969130>,
 'Sandy': <gensim.models.word2vec.Word2Vec at 0x7fba00c5cbb0>,
 'SpongeBob': <gensim.models.word2vec.Word2Vec at 0x7fb9f5c7b7c0>,
 'Squidward': <gensim.models.word2vec.Word2Vec at 0x7fba01879af0>}

In [97]:
# Tokens most similar to 'Karen' according to the model trained on SpongeBob's lines.
# NOTE(review): the recorded scores are all ~0.907, which suggests the vectors barely discriminate — confirm before interpreting.
models['SpongeBob'].wv.most_similar('Karen')

[("'em", 0.9076336622238159),
 ('inside', 0.9076310992240906),
 ('before', 0.9073677062988281),
 ('Nutty', 0.9069185256958008),
 ('ice', 0.9067336916923523),
 ('That', 0.9065715074539185),
 ('moon', 0.9065337181091309),
 ('Bottom', 0.9065083861351013),
 ('them', 0.9062583446502686),
 ("n't", 0.9061850309371948)]