In [1]:
import nltk
import openpyxl
import pymorphy3
import json
import numpy as np
import pandas as pd
import datetime

In [None]:
nltk.download('punkt')
nltk.download('stopword')

[nltk_data] Downloading package punkt to C:\Users\Pavel
[nltk_data]     Alexeyev\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading stopword: Package 'stopword' not found in
[nltk_data]     index


False

In [None]:
from nltk.corpus import stopwords

## XLSX open

In [None]:
dataset_path = 'news.csv'

## Pandas open

In [None]:
df = pd.read_csv(dataset_path)
df = df.drop_duplicates()

In [None]:
df

## News processing

# Обработка текста

In [2]:
import string
def get_special_chars():
  return set(string.punctuation + '\n\xa0\t'+ '—…«»“”–')

In [4]:
''.join(get_special_chars())

'%-|"&\'<»«=~…(]?{—.`$^>_!#;/}+)\xa0\\*:\n”“\t–,@['

In [None]:
#stop_words = set(stopwords.words('russian')) # слишком мало стоп слов 
# используем русские стоп-слова https://github.com/stopwords-iso/stopwords-ru

stopwords = []

with open('stopwords-ru.json', encoding='utf-8') as f:
    stopwords = list(json.load(f))

In [5]:
custom_stopwords = [
    'ао',
    'оао',
    'ооо',
    'тыс', 
    'млн',
    'млрд',
    'трлн',
    'г',
    'гг',
    'куб',
    'м',
    'км',
    'мм',
    'см',
    'л',
    'руб',
]

def get_stop_words(custom_words_enable = False):
  words = stopwords
  if (custom_words_enable):
    words += custom_stopwords
  words = set(words)
  return words

In [None]:
print(f"Всего стоп-слова {len(get_stop_words(True))}")


In [None]:
STOPWORDS = get_stop_words(True)

## Методы обработки текста

In [None]:
spec_chars = get_special_chars()

In [None]:
def remove_characters(text, special_chars = spec_chars):
  return "".join([ch if ch not in spec_chars else ' ' for ch in text]) 

In [None]:
def remove_stopwords(data, stopwords):
  result = []
  for word in data:
      if not (word in stopwords):
          result.append(word)
  return result

In [None]:
def remove_numbers(data):
  result = []
  for word in data:
    if not word.isnumeric():
      result.append(word)
  return result

In [None]:
def tokenize(text):
  return nltk.word_tokenize(text)

In [None]:
morph = pymorphy3.MorphAnalyzer()
def lemmatize(words):
  result = []
  for word in words:
    result.append(morph.normal_forms(word)[0])
  return result

In [None]:
def process_text(text, remove_numbers_enabled = False):

  # преобразуем символы
  process_text = text.lower()

  # удаляем лишние символы
  process_text = remove_characters(process_text, get_special_chars())

  # токенизация
  process_text = tokenize(process_text)

  # получаем стоп-слова
  stop_words = get_stop_words(True)

  # удаляем стоп-слова
  process_text = remove_stopwords(process_text, stop_words)

  # удаляем номера
  if (remove_numbers_enabled):
    process_text = remove_numbers(process_text)

  # лемматизация
  process_text = lemmatize(process_text)

  return ' '.join(process_text)

In [None]:
SPECIAL_CHARS = get_special_chars()
STOP_WORDS = get_stop_words(True)
def process_text_optimized_1(text, remove_numbers_enabled = False):
  if (remove_numbers_enabled):
    return ' '.join(lemmatize(remove_numbers(remove_stopwords(tokenize(remove_characters(text.lower(), SPECIAL_CHARS)), STOP_WORDS))))
  return ' '.join(lemmatize(remove_stopwords(tokenize(remove_characters(text.lower(), SPECIAL_CHARS)), STOP_WORDS)))

In [None]:
def process_text_optimized(text, custom_words_enable = False, remove_numbers_enabled = False):
  text = text.lower()

  text = "".join([ch if ch not in spec_chars else ' ' for ch in text ]) 

  stopwords = getStopWords()
  
  text_tokens = nltk.word_tokenize(text)

  filtered = []
  for w in text_tokens:
      if (w.isnumeric()):
        print(w)
      if (filterNumbers and not w.isnumeric()) and w not in stopwords:
          filtered.append(w)
  return filtered


In [None]:
text = df.iloc[20]['news_body']
text

In [None]:
processed = process_text(text, True)
processed

# Подготовка данных

## Процесс подготовки данных

Процесс подготовки данных зависит от того, какие это были данные: архивные или полученные недавно.

---

**Архивные данные** сохраняются в выгрузке датасета. Из-за большого количества данных обработка на лету невозможна, поэтому данных обрабатываются и выгружаются в **CSV**.

**Новые данные** могут получаться через RSS Finam.ru и дополнять исходный датасет, либо также парситься при помощи **Selenium**.

In [None]:
#df['processed_news'] = df['news_body'].apply(lambda text: process_text(text, True, True))
#process_vectorize = np.vectorize(lambda text: process_text_optimized_1(text, True))

In [None]:
#news = df['news_body'].to_numpy()
#len(news)

In [None]:
df_partial = df.copy()

In [None]:
df_partial

In [None]:
df_partial['news_body'] = df_partial['news_body'].apply(lambda text: process_text_optimized_1(text, True))

In [None]:
df_partial

In [None]:
df_partial = df_partial.drop_duplicates()

In [None]:
df_partial

In [None]:
datetimes = pd.DataFrame(pd.DataFrame(df_partial['datetime'].apply(lambda x: x[:10])))
counts_datetimes= pd.DataFrame(datetimes.groupby('datetime')['datetime'].count().reset_index(name='count'))#.sort_values(['count'], ascending=False))
counts_datetimes.columns = ['datetime', 'counts']
counts_datetimes['datetime'] = counts_datetimes['datetime'].apply(lambda date: datetime.datetime.strptime(str(date), '%Y-%m-%d').date())
counts_datetimes

In [None]:
def get_max_skipped(df):
  m = -1
  s = 0
  for i in range(1, len(df)):
    days_skipped = (df.iloc[i]['datetime'] - df.iloc[i - 1]['datetime']).days - 1
    m = max(m, days_skipped)
    if (days_skipped > 0):
      s += days_skipped
      #print(df.iloc[i]['datetime'], days_skipped)
  return (m, s)

In [None]:
get_max_skipped(counts_datetimes)

In [None]:
import gc
gc.collect()

In [None]:
df_partial.to_csv('news_processed.csv',index = False)