In [159]:
from tqdm import tqdm
import pandas as pd
import os
import re
import json
import spacy
import gensim
from nltk.tokenize import sent_tokenize, wordpunct_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from tqdm import tqdm
from collections import Counter
import pandas as pd
import numpy as np
from PIL import Image
import os
import re
from string import punctuation

In [160]:
# Speaker prefix: shortest run of characters before the first colon
# (used with .match, so it is anchored at the start of the line).
re_character = re.compile(r'(.+?):')
# Spoken text: everything after the first ": " on the line.
re_text = re.compile(r': (.+)')
# Episode identifier: leading word characters of a file name up to the first dot.
re_episode = re.compile(r'(\w+?)\.')
# Square-bracketed spans (shortest match) that are removed from every line;
# presumably stage directions — confirm against the transcripts.
re_del = re.compile(r'\[.+?\]')

In [161]:
def define_character(lines, file):
    """Extract ``character: text`` utterances from one transcript.

    Parameters
    ----------
    lines : iterable of str
        Raw lines of the transcript.
    file : str
        File name of the transcript. The episode identifier is the part
        matched by ``re_episode``; if the name does not match, the name
        without its extension is used instead.

    Returns
    -------
    list of dict
        One ``{'character', 'episode', 'line'}`` record, in input order, for
        every line that has both a speaker prefix and spoken text.
    """
    episode_match = re_episode.match(file)
    # Fall back to the extension-less name rather than failing with
    # "'NoneType' object is not subscriptable" on names the pattern rejects.
    episode = episode_match[1] if episode_match else os.path.splitext(file)[0]

    character_texts = []
    for line in lines:
        line = re_del.sub('', line).replace('\xa0', ' ')
        character_match = re_character.match(line)
        text_match = re_text.search(line)
        if not (character_match and text_match):
            continue
        character_texts.append(
            {
                # Removing a leading "[...]" span leaves whitespace in front of
                # the name; strip it so the same speaker is not split into
                # several spellings.
                'character': character_match[1].strip(),
                'episode': episode,
                'line': text_match[1],
            }
        )
    return character_texts

In [162]:
# Collect the records of every transcript and build the frame once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all accumulated rows on every iteration.
character_records = []
for root, dirs, files in os.walk('./SpongeBob_SquarePants_Transcripts'):
    for file in tqdm(files):
        path = os.path.join(root, file)
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        character_records.extend(define_character(lines, file))
# Explicit columns keep the expected schema even when no records were found.
characters_table = pd.DataFrame(character_records, columns=['character', 'episode', 'line'])

100%|████████████████████████████████████████| 393/393 [00:00<00:00, 701.11it/s]


In [163]:
# NOTE(review): removes the single row labelled 32911. The label comes from the
# order in which the transcript files were read, so confirm it still points at
# the intended row. Re-running this cell raises KeyError because the label is
# already gone.
characters_table = characters_table.drop(index=32911)

In [164]:
# A title word is one of:
#   - a run of capitals that is not the start of a capitalised word ("III", "SB"),
#   - a capitalised word ("Sponge"),
#   - a run of digits ("129").
# The previous pattern dropped digits and split capital runs letter by letter,
# so sequels such as "...BoyII" and "...BoyIII" lost their numbering.
re_title = re.compile(r'[A-Z]+(?![a-z])|[A-Z][a-z]*|\d+')


def normalize_title(title):
    """Split a CamelCase episode identifier into space-separated words.

    Parameters
    ----------
    title : str
        Identifier such as ``'HelpWanted'``.

    Returns
    -------
    str
        The recognised words joined by single spaces; an empty string when
        ``title`` contains no capital letters or digits.
    """
    return ' '.join(re_title.findall(title))

In [165]:
# Replace every raw episode identifier with its normalized title.
characters_table['episode'] = characters_table['episode'].map(normalize_title)

In [166]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

Какие поинты можно отметить:
- длина реплики каждого персонажа из основных
- самое частое слово у каждого / во всём сериале
- серия с наибольшим количеством реплик
- облака слов каждого персонажа из основных
- можно для прикола просто факты накидать (спарсить с сайта какого-нибудь?:))

In [167]:
# Per-character statistics, filled in by later cells: one independent empty
# record per main character.
char_info = {
    name: {}
    for name in (
        'SpongeBob', 'Patrick', 'Mr. Krabs', 'Squidward',
        'Plankton', 'Sandy', 'Karen', 'Gary',
    )
}

#### Длина реплики

In [168]:
main_char = [
    'SpongeBob', 'Patrick', 'Mr. Krabs', 'Squidward',
    'Plankton', 'Sandy', 'Karen', 'Gary',
]
# Rows spoken by a main character, as a frame independent of characters_table.
is_main_character = characters_table['character'].isin(main_char)
main_char_table = characters_table[is_main_character].copy()

In [169]:
# Length of every line in whitespace-separated tokens.
main_char_table['len_line'] = main_char_table['line'].map(lambda text: len(text.split()))

In [170]:
# Median line length per character; the value column is named 'count'.
median_len_by_character = main_char_table.groupby('character')['len_line'].median()
cnt_len_table = median_len_by_character.reset_index(name='count')

In [171]:
# Store each character's median line length under the 'length' key.
for character, median_len in zip(cnt_len_table['character'], cnt_len_table['count']):
    char_info[character]['length'] = median_len

#### Вордклауды

In [172]:
# One row per character holding the list of all of that character's lines.
lines_by_character = main_char_table.groupby('character')['line']
all_lines = lines_by_character.agg(list).reset_index()

In [173]:
# Extra words excluded from the lemma lists on top of NLTK's English stop words.
extra_stopwords = [
    'go', 'get', 'oh', 'well', 'like', 'come', 'look',
    'know', 'see', 'hey', 'na', 'one', 'two',
]
sw = stopwords.words('english') + extra_stopwords
lemmatizer = WordNetLemmatizer()
# Universal POS tag -> POS code passed to the lemmatizer.
nltk_to_pos = dict(ADV='r', NOUN='n', VERB='v', ADJ='a')

In [174]:
def for_model(lines):
    """Turn a collection of lines into a flat list of lower-cased lemmas.

    Each line is split into sentences and tokens; purely alphabetic tokens
    are lemmatized (with their POS when it is one of the keys of
    ``nltk_to_pos``) and lemmas found in ``sw`` are discarded.

    Parameters
    ----------
    lines : iterable of str
        Lines of text to process.

    Returns
    -------
    list of str
        Lemmas in order of appearance, stop words removed.
    """
    stop_words = set(sw)  # constant-time membership tests
    lemmas = []
    for line in lines:
        for sent in sent_tokenize(line):
            # Tag the whole sentence in one call: the tagger uses the
            # neighbouring tokens as context, so tagging words one at a time
            # is both slower and less accurate.
            tagged_tokens = pos_tag(word_tokenize(sent), tagset='universal')
            for token, pos in tagged_tokens:
                if not token.isalpha():
                    continue
                wordnet_pos = nltk_to_pos.get(pos)
                if wordnet_pos is not None:
                    lemma = lemmatizer.lemmatize(token.lower(), pos=wordnet_pos)
                else:
                    # Tags without a mapping use the lemmatizer's default POS.
                    lemma = lemmatizer.lemmatize(token.lower())
                # Filter after lemmatization so inflected stop words are caught too.
                if lemma not in stop_words:
                    lemmas.append(lemma)
    return lemmas

In [175]:
# Lemmatize every character's collected lines.
all_lines['lemmas'] = all_lines['line'].map(for_model)

In [187]:
# Mask image used to shape each character's word cloud.
masks = {'SpongeBob': './masks/spongebob_mask.png',
         'Patrick': './masks/patrick_mask.png',
         'Mr. Krabs': './masks/mrkrabs_mask.png',
         'Sandy': './masks/sandy_mask.png',
         'Squidward': './masks/squidward_mask.png',
         'Karen': './masks/karen_mask.png',
         'Plankton': './masks/plankton_mask.png',
         'Gary': './masks/gary_mask.png'
        }
# Walk the rows directly instead of keeping a manual counter in step with the
# index labels, which silently breaks once the frame is filtered or re-sorted.
for name, lemmas in zip(all_lines['character'], all_lines['lemmas']):
    mask = np.array(Image.open(masks[name]))
    text = ' '.join(lemmas)
    wordcloud = WordCloud(
        background_color='white',
        width=800,
        height=800,
        mask=mask,
    ).generate(text)
    # The image is written next to its mask; the same path is recorded in
    # char_info so the exported JSON can point at it.
    cloud_path = f'{masks[name]}_cloud.png'
    wordcloud.to_file(cloud_path)
    char_info[name]['wordcloud'] = cloud_path

#### Наиболее частые слова персонажа

In [178]:
def frequent_word(words, top_n=5):
    """Return the most common items of ``words`` together with their counts.

    Parameters
    ----------
    words : iterable of hashable
        Items to count.
    top_n : int, optional
        How many of the most common items to return (default 5).

    Returns
    -------
    list of tuple
        ``(item, count)`` pairs, most common first; items with equal counts
        keep the order in which they were first seen.
    """
    return Counter(words).most_common(top_n)

In [179]:
# Most common lemmas of every character.
all_lines['frequent'] = all_lines['lemmas'].map(frequent_word)

In [180]:
# Store each character's most common lemmas under the 'most_words' key.
for character, top_words in zip(all_lines['character'], all_lines['frequent']):
    char_info[character]['most_words'] = top_words

#### Количество серий, в которых встречается персонаж

In [181]:
# Distinct episodes in which each main character has at least one line.
episodes_by_character = main_char_table.groupby('character')['episode']
all_episodes = episodes_by_character.unique().reset_index(name='all_episodes')

In [182]:
def count_episodes(episodes):
    """Return how many items the ``episodes`` collection holds."""
    n_episodes = len(episodes)
    return n_episodes

In [183]:
# Number of distinct episodes per character.
all_episodes['count_episodes'] = all_episodes['all_episodes'].map(count_episodes)

In [184]:
# Characters ordered from fewest to most episodes.
all_episodes.sort_values(by='count_episodes')

Unnamed: 0,character,all_episodes,count_episodes
1,Karen,"[Karen, A Cabininthe Kelp, Goo Goo Gas, Grandm...",55
5,Sandy,"[Oral Report, Stuckonthe Roof, A Cabininthe Ke...",96
4,Plankton,"[The Krusty Slammer, The Great Patty Caper, Ka...",97
0,Gary,"[Penny Foolish, Sentimental Sponge, Bummer Vac...",110
2,Mr. Krabs,"[Lighthouse Louie, Penny Foolish, Barnacle Fac...",262
3,Patrick,"[Oral Report, Sentimental Sponge, Fun Sized Fr...",270
7,Squidward,"[Penny Foolish, Oral Report, The Krusty Slamme...",295
6,SpongeBob,"[Lighthouse Louie, Penny Foolish, Barnacle Fac...",389


In [185]:
# Store each character's episode count under the 'count_episodes' key.
for character, n_episodes in zip(all_episodes['character'], all_episodes['count_episodes']):
    char_info[character]['count_episodes'] = n_episodes

#### Тональность реплик?

https://spongebob.fandom.com/wiki/List_of_transcripts#Season_1

либо транскрипты стащить, либо вики странички

In [188]:
# Export the collected per-character statistics as JSON.
with open('char_info.json', 'w') as out_file:
    json.dump(char_info, out_file)

#### Количество реплик в эпизоде

In [104]:
# Number of lines (non-null 'line' values) per episode.
line_groups = characters_table.groupby('episode')['line']
lines_per_episode = line_groups.count().reset_index(name='lines_cnt')

#### Количество персонажей в эпизоде

In [43]:
# Distinct speakers per episode.
speaker_groups = characters_table.groupby('episode')['character']
character_per_episode = speaker_groups.unique().reset_index(name='unique_char')

In [46]:
# Number of distinct speakers per episode.
character_per_episode['unique_char_cnt'] = character_per_episode['unique_char'].map(len)

In [49]:
# Five episodes with the most distinct speakers.
character_per_episode.sort_values(['unique_char_cnt'], ascending=False).head(5)

Unnamed: 0,episode,unique_char,unique_char_cnt
277,Sponge Bobs Big Birthday Blowout,"[Patchy, Potty, SpongeBob's alarm clock, Spong...",75
371,Truthor Square,"[Ricky Gervais, Patchy the Pirate, Security Gu...",64
116,High Sea Diving,"[Perch Perkins, Gary, SpongeBob, Monroe Timmy,...",37
358,The Sponge Who Could Fly,"[French narrator, Potty, Patchy, Children, Mrs...",36
86,Friendor Foe,"[French Narrator, Patchy, Potty, Mr. Pirateson...",34


In [105]:
# Attach the per-episode line counts to the per-episode speaker statistics.
lines_cnt_by_episode = lines_per_episode.set_index('episode')
episodes = character_per_episode.join(lines_cnt_by_episode, on='episode')

#### Наиболее частый и редкий персонажи

In [89]:
# Distinct episodes in which each speaker (main or not) has a line.
episode_groups = characters_table.groupby('character')['episode']
episodes_per_char = episode_groups.unique().reset_index(name='unique_ep')

In [90]:
# Number of distinct episodes per speaker.
episodes_per_char['unique_ep_cnt'] = episodes_per_char['unique_ep'].map(len)

In [101]:
# Spot check: episode statistics of a single speaker.
episodes_per_char.loc[episodes_per_char['character'].eq('Barnacle Boy')]

Unnamed: 0,character,unique_ep,unique_ep_cnt
137,Barnacle Boy,"[Mermaid Manand Barnacle Boy, Mermaid Manand B...",11


1448 персонажей встречаются один раз
146 персонажей встречаются 2 раза
49 персонажей встречаются 3 раза

In [155]:
# Per-episode extremes, filled in by the following cells.
# NOTE(review): 'chatacters_cnt' is misspelled; it is kept as-is because the
# following cells and the exported JSON use this exact key.
episode_stat = dict(lines_cnt={}, chatacters_cnt={})

In [156]:
# Three episodes with the most distinct speakers: only the count is stored.
most_speakers = episodes.sort_values(['unique_char_cnt'], ascending=False).head(3)
for episode, speaker_cnt in zip(most_speakers['episode'], most_speakers['unique_char_cnt']):
    episode_stat['chatacters_cnt'][episode] = [speaker_cnt]

# Three episodes with the fewest distinct speakers: the count and the speaker
# names are stored.
fewest_speakers = episodes.sort_values(['unique_char_cnt'], ascending=True).head(3)
for episode, speaker_cnt, speakers in zip(
    fewest_speakers['episode'],
    fewest_speakers['unique_char_cnt'],
    fewest_speakers['unique_char'],
):
    episode_stat['chatacters_cnt'][episode] = [speaker_cnt, speakers.tolist()]

In [157]:
# Line counts of the three longest episodes, then of the three shortest.
most_lines = episodes.sort_values(['lines_cnt'], ascending=False).head(3)
for episode, lines_cnt in zip(most_lines['episode'], most_lines['lines_cnt']):
    episode_stat['lines_cnt'][episode] = lines_cnt

fewest_lines = episodes.sort_values(['lines_cnt'], ascending=True).head(3)
for episode, lines_cnt in zip(fewest_lines['episode'], fewest_lines['lines_cnt']):
    episode_stat['lines_cnt'][episode] = lines_cnt

In [158]:
# Export the per-episode extremes as JSON.
with open('episodes.json', 'w') as out_file:
    json.dump(episode_stat, out_file)