#### Imports

In [None]:
import pandas as pd
import numpy as np

import re
import pymorphy3
import nltk
import spacy

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import requests
import folium

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from fuzzywuzzy import fuzz

from tqdm import tqdm

In [None]:
# Read the full text of the novel "The Idiot"; the file is decoded as cp1251.
with open('book.txt', 'r', encoding='cp1251') as file:
    book = file.read()

In [None]:
# Preview the first 50 characters of the loaded text.
book[:50]

#### Preprocessing

In [None]:
# nltk.download('popular')
# nltk.download('punkt')
# nltk.download('punkt_tab')
# nltk.download('stopwords')

In [None]:
# Extra stop words (lemmas) added on top of NLTK's Russian list.
stop_words_adds = [
    'это', 'мочь', 'все', 'весь', 'свой', 'твой', 'мой', 'еще',
    'знать', 'говорить', 'сказать', 'который', 'очень', 'стать',
    'хотеть', 'видеть', 'смотреть', 'чрез', 'спросить', 'сейчас',
    'тотчас', 'начать', 'хотя', 'именно', 'давеча', 'сделать', 'тут',
    'вдруг', 'пройти', 'кроме', 'впоследствии', 'здесь', 'пять', 'аль',
    'эвона', 'эк', 'фью', 'ай', 'самый', 'ваш', 'слишком', 'точно',
    'несколько',
]

# Start from NLTK's Russian stop words and extend them with the list above.
stop_words = set(stopwords.words('russian'))
stop_words.update(stop_words_adds)

In [None]:
morph = pymorphy3.MorphAnalyzer()


def preprocess(text):
    """Tokenize *text* and drop stop words and non-alphabetic tokens.

    A token is kept when it is longer than one character, consists only of
    letters, and its lemma (lower-cased, with 'ё' folded to 'е') is not in
    the module-level ``stop_words`` set. Kept tokens retain their original
    surface form.

    Args:
        text: Raw text, e.g. a single sentence.

    Returns:
        The kept tokens joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        # String checks go first so that the morphological parse only runs
        # for tokens that can actually be kept.
        if len(token) <= 1 or not token.isalpha():
            continue
        lemma = morph.parse(token)[0].normal_form.lower().replace('ё', 'е')
        if lemma not in stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

In [None]:
# Split the text into sentences.
# Line breaks are replaced with a space (not simply removed) so that the last
# word of one line is not glued to the first word of the next line.
book = book.replace('\n', ' ')
sentences = re.split(r'[.!?]+', book)

In [None]:
# Run the stop-word / punctuation filter over every sentence.
processed_sentences = [preprocess(sentence) for sentence in sentences]

In [None]:
# Tokens treated as punctuation and skipped during lemmatization.
punctuation_marks = {'!', ',', '(', ')', ':', '-', '?', '.', '..', '...'}

Нормализация слов

In [None]:
def get_lemma(text):
    """Lemmatize the whitespace-separated words of *text*.

    Words listed in the module-level ``punctuation_marks`` set are skipped.
    For every other word the normal form of its first pymorphy parse is
    taken, with 'ё' replaced by 'е'.

    Returns:
        A list of lemmas in the original word order.
    """
    return [
        morph.parse(word)[0].normal_form.replace('ё', 'е')
        for word in text.split()
        if word not in punctuation_marks
    ]

In [None]:
# Build the flat list of normalized tokens used by the dispersion and
# word-frequency analyses.
book_ = ' '.join(processed_sentences)
lemmatized_text = ' '.join(get_lemma(book_))
words = lemmatized_text.split()

In [None]:
# Preview the first ten normalized tokens.
words[:10]

In [None]:
# Lemmatize each preprocessed sentence; every item is a list of lemmas.
sentences_ = [get_lemma(sentence) for sentence in processed_sentences]

In [None]:
# Join each list of lemmas back into one space-separated string.
sentence_to_list = [' '.join(lemmas) for lemmas in sentences_]

# Data frame pairing every raw sentence of the novel with its cleaned form.
sentence_rows = zip(sentences, sentence_to_list)
df = pd.DataFrame(sentence_rows, columns=['sentences', 'sentences_clear'])
df = df.dropna()
df.to_csv('sentences.csv', index=False)

#### Frequency

In [None]:
# Headings of the novel's parts: the word "ЧАСТЬ" followed by one more word.
pattern_parts = r'\bЧАСТЬ\s+\w+\b'
chapters = re.findall(pattern_parts, book)
# Print one heading per line.
print(*chapters, sep = '\n')

Исторический период сюжета романа

In [None]:
numbers = re.findall(r'\d+', book) # find every run of digits in the text
print(numbers)
print()
# Conclusion printed for the reader (in Russian): the text apparently does
# not state the year of the events.
print("По всей видимости, в тексте не указывается год событий")

In [None]:
# Find two-word phrases whose second word is a form of "век" (century / age).
# The trailing letter is optional so that the base form "век" itself is
# matched in addition to one-letter endings such as "века" or "веке".
pattern_period = r'\w+\s+\bвек[а-я]?\b'

period = re.findall(pattern_period, book)
period = set(period)
# Sorting is only for a reproducible print order; `period` stays a set.
print(*sorted(period), sep = '\n')

В романе упоминаются "девятнадцатый век", "золотой век" и "наш век". Вероятнее всего, действие романа происходит при жизни его автора, Ф. М. Достоевского (1821–1881), то есть в девятнадцатом веке.

In [None]:
# Split the whole text at the part headings and report how many pieces result.
parts = re.split(pattern_parts, book)
print(len(parts))

In [None]:
# Index 0 (whatever precedes the first part heading) is skipped.
# NOTE(review): the original author's comment says that element is empty —
# confirm against the actual book.txt.
part_1 = parts[1]
part_2 = parts[2]
part_3 = parts[3]
part_4 = parts[4]

In [None]:
# Character length of each of the four parts, with matching axis labels.
book_parts = (part_1, part_2, part_3, part_4)
lengths = [len(part) for part in book_parts]
labels = [f'Часть {number}' for number in range(1, 5)]

In [None]:
# Bar chart comparing the length of the parts.
plt.bar(labels, lengths)
plt.xlabel(' ')
plt.ylabel('Длина')
plt.title('Размер частей')
plt.show()

In [None]:
# Keep every normalized word except those tagged as patronymics ("Patr").
parsed_words = []
for word in words:
    if 'Patr' in morph.parse(word)[0].tag:
        continue
    parsed_words.append(word)

# Frequency table of the remaining words and its 100 most common entries.
counter = Counter(parsed_words)
top_words = counter.most_common(100)

In [None]:
# The title promises the 100 most frequent words, so the cloud is capped
# explicitly with max_words (WordCloud's own default limit is larger).
wordcloud = WordCloud(
    width=1000,
    height=600,
    background_color='white',
    max_words=100,
).generate_from_frequencies(counter)
plt.figure(figsize=(15, 7.5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Список 100 наиболее частых слов романа "Идиот"\n')
plt.show()

#### Geographical objects

Упоминание города в тексте может свидетельствовать о местоположении, где происходят события романа, а также о путях передвижения персонажей

In [None]:
# Load the medium Russian spaCy pipeline; used below for location entities.
nlp = spacy.load("ru_core_news_md")

In [None]:
def search_entity(marks):
    """Collect named entities with the requested labels from the whole book.

    The module-level ``book`` text is cut into two halves; each half is
    lower-cased and run through the module-level spaCy pipeline ``nlp``.

    Args:
        marks: A single entity label (e.g. ``"LOC"``) or an iterable of
            labels.

    Returns:
        A list with the text of every matching entity, first half first;
        duplicates are kept.
    """
    # A bare string is treated as one label; otherwise ``in`` would perform
    # a substring test instead of comparing whole labels.
    wanted_labels = {marks} if isinstance(marks, str) else set(marks)

    # The text is processed in two halves — presumably to stay below spaCy's
    # per-document length limit; TODO confirm.
    mid = len(book) // 2

    entity = []
    for part in (book[:mid], book[mid:]):
        doc = nlp(part.lower())
        # Extend inside the loop so entities from *both* halves are kept,
        # not only those of the last processed half.
        entity.extend(
            ent.text for ent in doc.ents if ent.label_ in wanted_labels
        )
    return entity

In [None]:
# Entities that the pipeline labels as locations ("LOC").
locations_list = search_entity("LOC")

In [None]:
# Lemmatize each location and title-case it; entities whose first pymorphy
# parse carries the "Name" tag are dropped. Each entity is parsed only once.
location_parses = [morph.parse(location)[0] for location in locations_list]
locations_list = [
    location_parse.normal_form.title()
    for location_parse in location_parses
    if "Name" not in location_parse.tag
]

In [None]:
# Count how many times each normalized location occurs.
locations_count = pd.Series(locations_list).value_counts()

In [None]:
# Keep single-word locations that occur at least twice.
occurs_repeatedly = locations_count >= 2
is_single_word = locations_count.index.str.count(' ') == 0
locations_filtered = locations_count[occurs_repeatedly & is_single_word]

In [None]:
# Pie chart of how often each retained location occurs.
labels = locations_filtered.index
sizes = locations_filtered.values

plt.figure(figsize=(9, 9))
plt.pie(
    sizes,
    labels=labels,
    autopct='%d%%',
    rotatelabels=True,
    startangle=90,
    wedgeprops={'linewidth': 1, 'edgecolor': 'white'})

plt.title('Локации  и места в романе "Идиот"\n', fontsize=16)
plt.show()

In [None]:
# Turn the counts into a two-column frame: location name and its count.
locations_df = locations_filtered.reset_index().set_axis(
    ['location', 'count'], axis=1
)

In [None]:
def get_coordinates(city_name):
    """Look up the coordinates of a place through the Nominatim search API.

    Args:
        city_name: Free-form place name used as the search query.

    Returns:
        A ``(latitude, longitude)`` tuple of floats for the first search
        result, or ``None`` when the request fails, the status is not 200,
        the response cannot be parsed, or nothing was found.
    """
    url = "https://nominatim.openstreetmap.org/search"
    params = {
        'q': city_name,
        'format': 'json',
        'limit': 1
    }
    headers = {'User-Agent': 'MyApp/1.0'}

    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(url, params=params, headers=headers, timeout=10)
    except requests.RequestException:
        # Best-effort lookup: a network failure means "no coordinates".
        return None

    if response.status_code != 200:
        return None

    try:
        data = response.json()
        if not data:
            return None
        return float(data[0]['lat']), float(data[0]['lon'])
    except (ValueError, KeyError, IndexError, TypeError):
        # Malformed or unexpected payload.
        return None

In [None]:
def add_coordinates(df, pause=1.0):
    """Add ``latitude`` and ``longitude`` columns to a frame of locations.

    Every value of ``df['location']`` is geocoded with ``get_coordinates``;
    locations that cannot be resolved get ``None`` in both columns.

    Args:
        df: Data frame with a ``location`` column. It is modified in place.
        pause: Seconds to wait between consecutive lookups, to respect the
            geocoder's rate limit. Use 0 to disable.

    Returns:
        The same ``df`` object, with the two columns added.
    """
    import time  # local import keeps this cell self-contained

    lats = []
    lons = []

    # Work on the frame that was passed in rather than on a global variable.
    for position, location in enumerate(df['location']):
        if position and pause > 0:
            time.sleep(pause)
        coords = get_coordinates(location)
        lat, lon = coords if coords else (None, None)
        lats.append(lat)
        lons.append(lon)

    df['latitude'] = lats
    df['longitude'] = lons

    return df

In [None]:
# Geocode every location (one HTTP request per row).
locations_df = add_coordinates(locations_df)

In [None]:
# Preview the geocoded locations.
locations_df.head()

In [None]:
# Center the map on the mean of the geocoded coordinates.
center_lat = locations_df['latitude'].mean()
center_lon = locations_df['longitude'].mean()

# The zoom level is fixed at 4 (min and max are equal, zoom control is off).
# NOTE(review): the name `map` shadows Python's built-in `map`; it is kept
# because the following cell displays this variable by that name.
map = folium.Map(
    location=[center_lat, center_lon],
    zoom_start=4,
    min_zoom=4,
    max_zoom=4,
    control_scale=False,
    zoom_control=False
)

# Add a marker for every location that has both coordinates.
for _, row in locations_df.iterrows():
    if pd.notnull(row['latitude']) and pd.notnull(row['longitude']):
        folium.Marker(
            location=[row['latitude'], row['longitude']],
            popup=row['location']
).add_to(map)

In [None]:
# Display the folium map built in the previous cell.
map

#### Characters

In [None]:
# Rebind `nlp` to the large Russian pipeline; `search_entity` reads this
# global, so the character search below uses this model.
nlp = spacy.load("ru_core_news_lg")

In [None]:
def get_characters():
    """Extract character-name mentions from the book.

    Person entities ("PER") found by ``search_entity`` are filtered and
    simplified in several steps:

    1. keep entities in which every word is tagged by pymorphy as a first
       name, surname or patronymic;
    2. drop words of one or two characters, and drop entities that become
       empty as a result;
    3. drop entities that consist of a single patronymic;
    4. if an entity contains two or more words of the same kind (checked in
       the order first name, patronymic, surname), keep only the first word
       of that kind; otherwise keep the entity unchanged.

    Returns:
        A list of name strings, one per surviving mention (duplicates kept).
    """
    name_kinds = ('Name', 'Surn', 'Patr')
    # Order in which a repeated kind is looked for in step 4.
    repeated_kind_order = ('Name', 'Patr', 'Surn')

    def first_tag(word):
        """Return the tag of the first pymorphy parse of *word*."""
        return morph.parse(word)[0].tag

    def is_valid_name(name):
        """Check that every word is a first name, surname or patronymic."""
        return all(
            any(kind in first_tag(word) for kind in name_kinds)
            for word in name.split()
        )

    def remove_short_words(name):
        """Drop words that are at most two characters long."""
        return ' '.join(word for word in name.split() if len(word) > 2)

    def not_only_patronymic(name):
        """Return False for a name that is just a single patronymic."""
        parts = name.split()
        has_patronymic = any('Patr' in first_tag(part) for part in parts)
        return len(parts) > 1 or not has_patronymic

    names_list = search_entity("PER")  # raw person entities

    candidates = (
        remove_short_words(name) for name in names_list if is_valid_name(name)
    )
    # Names made up only of short words are empty now; an empty name would
    # later break code that takes the first word of each name.
    candidates = [
        name for name in candidates if name and not_only_patronymic(name)
    ]

    names = []
    for name in candidates:
        parts = name.split()
        tags_list = [first_tag(part) for part in parts]

        # First kind that occurs at least twice among the words, if any.
        target_tag = next(
            (
                kind
                for kind in repeated_kind_order
                if sum(kind in tags for tags in tags_list) >= 2
            ),
            None,
        )

        if target_tag is None:
            names.append(name)
            continue

        # Keep only the first word of the repeated kind.
        for part, tags in zip(parts, tags_list):
            if target_tag in tags:
                names.append(part)
                break

    return names

In [None]:
# Collect all character-name mentions found in the book.
names = get_characters()

In [None]:
# Longest names first, so the grouping below starts each group from them.
sorted_names = sorted(names, key=len, reverse=True)

In [None]:
# Merging different variants of a name into groups.
tags_cache = {}

def get_tags(word):
    """Return the whitespace-separated chunks of *word*'s tag as a set.

    The tag of the first pymorphy parse is converted to a string and split
    on whitespace. Results are memoized in the module-level ``tags_cache``.
    """
    try:
        return tags_cache[word]
    except KeyError:
        pass
    first_parse = morph.parse(word)[0]
    tag_chunks = set(str(first_parse.tag).split())
    tags_cache[word] = tag_chunks
    return tag_chunks

def are_similar(name1, name2, threshold=81):
    """Decide whether two names are similar enough to share a group.

    The names must share at least one tag chunk (see ``get_tags``); then the
    mean of three fuzzywuzzy scores (partial, token-set and plain ratio)
    must reach *threshold*.
    """
    if get_tags(name1).isdisjoint(get_tags(name2)):
        return False

    # String-similarity check.
    scores = (
        fuzz.partial_ratio(name1, name2),
        fuzz.token_set_ratio(name1, name2),
        fuzz.ratio(name1, name2),
    )
    return sum(scores) / 3 >= threshold


def normalize_name(name):
    """Replace every word of *name* with the normal form of its first parse."""
    return ' '.join(
        morph.parse(word)[0].normal_form for word in name.split()
    )

# Greedy grouping: each name joins the first group whose founding name is
# similar to it, otherwise it starts a new group.
groups = []
# Normalized founding name of each group, parallel to `groups`. Storing it
# avoids re-normalizing the same founding name for every candidate.
group_bases = []
for name in sorted_names:
    name_base = normalize_name(name)
    for group, base_group_name in zip(groups, group_bases):
        if are_similar(base_group_name, name_base):
            group.append(name)
            break
    else:
        # No existing group matched.
        groups.append([name])
        group_bases.append(name_base)
In [None]:
# Print every group of name variants on its own line.
for group in groups:
    print(group)

Поиск главных персонажей романа

In [None]:
# For each group pick the first variant that looks like a nominative form
# and map it to the size of its group.
persons = {}

for group in groups:
    for name in group:
        name = str(name).title()
        parts_name = name.split()

        if len(parts_name) > 1:
            # Multi-word name: the second word must be nominative and the
            # first word must not carry one of the listed endings.
            first_word, second_word = parts_name[0], parts_name[1]
            is_nominative = (
                'nomn' in morph.parse(second_word)[0].tag
                and not first_word.endswith(('ом', 'у', 'ы'))
            )
        else:
            is_nominative = 'nomn' in morph.parse(name)[0].tag

        if is_nominative:
            persons[name] = len(group)
            break

In [None]:
# Order the characters by group size, largest first.
main_persons = dict(sorted(persons.items(), key=lambda item: item[1], reverse=True))

In [None]:
# Keep the first 15 entries of the ordered mapping.
top_15_persons = dict(list(main_persons.items())[:15])

In [None]:
# Horizontal bar chart of the top characters; `keys` and `values` are also
# reused as axis labels by the dispersion plot further down.
keys = list(top_15_persons.keys())
values = list(top_15_persons.values())

plt.figure(figsize=(10, 5))
plt.barh(keys, values)
plt.title('ТОП-15 наиболее упоминаемых героев романа')
# Invert the axis so the first (largest) entry is drawn at the top.
plt.gca().invert_yaxis()
plt.show()

In [None]:
# The 15 largest groups of name variants.
top15_groups = sorted(groups, key=len, reverse=True)[:15]

In [None]:
# For every top group collect the first word of each name variant.
name_variations = []
for group in top15_groups:
    first_words = []
    for name in group:
        name_parts = name.split()
        # An empty name has no first word; skip it instead of failing.
        if name_parts:
            first_words.append(name_parts[0])
    # A list is appended for every group so that rows stay aligned with
    # `top15_groups`.
    name_variations.append(first_words)

In [None]:
# Remove duplicate variants inside each group (order is not preserved).
name_variations = [list(set(variants)) for variants in name_variations]

In [None]:
def get_dispersion(text, name_list, flag=None):
    """Mark the tokens of *text* that contain any of the given names.

    A token counts as a hit when at least one item of *name_list* is a
    substring of it.

    Args:
        text: Sequence of tokens.
        name_list: Name variants to look for; a single string is treated as
            one variant.
        flag: Value stored at every hit. When falsy, the 1-based position of
            the token is stored instead.

    Returns:
        A list as long as *text* holding the hit value at hits and
        ``np.nan`` elsewhere.
    """
    if isinstance(name_list, str):
        # Iterating a bare string would test single characters.
        name_list = [name_list]

    hits = [any(variant in token for variant in name_list) for token in text]

    if flag:
        return [flag if hit else np.nan for hit in hits]
    return [
        position if hit else np.nan
        for position, hit in enumerate(hits, start=1)
    ]

In [None]:
# One dispersion row per group of name variants; hits are marked with 1.
dispersion_results = [
    get_dispersion(words, name_var, flag=1) for name_var in name_variations
]

In [None]:
fig, ax = plt.subplots(figsize=(20, 7))

max_length = max(len(res) for res in dispersion_results)

# Dispersion plot: one horizontal row of dots per character.
for i, res in enumerate(dispersion_results):
    # Hits hold the flag value and misses hold NaN; adding the row index
    # shifts each character onto its own row, NaN stays NaN.
    y_vals = np.asarray(res, dtype=float) + i
    ax.scatter(np.arange(len(y_vals)), y_vals, s=20, alpha=0.3, zorder=2)

# Tick positions are fixed first and labelled afterwards, so the six
# percentage labels are attached to exactly these six positions.
ax.set_xticks(np.linspace(0, max_length, 6).astype(int))
ax.set_xticklabels([f'{x:.0f}%' for x in np.linspace(0, 100, 6)], fontsize=16)

# NOTE(review): row i is drawn from dispersion_results[i] but labelled with
# keys[i] / values[i]; these come from different variables — verify that
# both are in the same order and of the same length.
ax.set_yticks(range(0, len(keys)+1))
ax.set_yticklabels([''] + [f"{n.title()} ({v})" for n, v in zip(keys, values)], fontsize=16)

ax.set_xlim([0, max_length])
ax.set_title('Частота упоминаний персонажей в течение всего романа (%)\n', fontsize=20)
ax.grid(True, linestyle='--', alpha=0.5)
plt.gca().invert_yaxis()
plt.show()

---------------------

#### Sentiments

In [None]:
import pandas as pd
from deeppavlov import build_model, configs

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the sentences saved earlier and drop every row with a missing value.
# Sentences whose cleaned form is empty are read back as NaN and are
# therefore removed here; no further NaN handling is needed afterwards.
df = pd.read_csv('sentences.csv', encoding ='utf-8')
df = df.dropna()

In [None]:
# Build the sentiment classifier from the rusentiment_bert config; with
# download=False nothing is downloaded by this call.
model = build_model(configs.classifiers.rusentiment_bert, download=False)


def get_sentiment(text):
    """Convert the model's prediction for *text* into a numeric score.

    Returns:
        1 for 'positive', -1 for 'negative', 0 for anything else.
    """
    prediction = model([text])
    # The model is called with a one-item batch; take its first item when a
    # list comes back.
    label = prediction[0] if isinstance(prediction, list) else prediction
    if label == 'positive':
        return 1
    if label == 'negative':
        return -1
    return 0

In [None]:
# Score every cleaned sentence (one model call per row).
df['sentiment'] = df['sentences_clear'].apply(get_sentiment)

In [None]:
# Preview the scored sentences.
df.head()

In [None]:
# 3-sentence rolling mean of the sentiment; the leading positions without a
# full window are set to 0.
df['rolling_mean'] = df['sentiment'].rolling(window=3).mean().fillna(0)

In [None]:
# Raw per-sentence sentiment across the text.
plt.figure(figsize=(20, 3))
plt.plot(df.index, df['sentiment'])
plt.ylabel('Sentiment')
plt.title('Изменение тональности текста', fontsize=18)
plt.grid(True)
plt.show()

In [None]:
# Smoothed sentiment: rolling mean over a window of 10 sentences.
plt.figure(figsize=(20, 3))
df['sentiment_smooth'] = df['sentiment'].rolling(window=10).mean()
plt.plot(df.index, df['sentiment_smooth'])
plt.ylabel('Sentiment (скользящее среднее)')
plt.title('Изменение тональности текста (сглаженное)', fontsize=18)
plt.grid(True)
plt.show()

In [None]:
# Extremes of the smoothed sentiment.
max_ = df['sentiment_smooth'].max()
min_ = df['sentiment_smooth'].min()

In [None]:
# Print every sentence at which the smoothed sentiment reaches its maximum.
for sentence in df.loc[df['sentiment_smooth'] == max_, 'sentences']:
    print(sentence)

In [None]:
# Print every sentence at which the smoothed sentiment reaches its minimum.
for sentence in df.loc[df['sentiment_smooth'] == min_, 'sentences']:
    print(sentence)