In [None]:
import pandas as pd
from app.config.configuration import Config
from app.core.scrapers.telegram import consts
import os
import re
import stanza
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords

In [None]:
# Load the raw Telegram export from the configured files directory.
raw_csv_path = os.path.join(Config.FILES_PATH, 'full_telegram_data.csv')
df = pd.read_csv(raw_csv_path, encoding='utf-8')

In [None]:
# Parse timestamps as UTC, shift them to Kyiv local time, drop the timezone so
# the values are naive local datetimes, then truncate to the start of the hour.
# `dt.floor('h')` is vectorised and also clears sub-second components, which
# the previous per-row `replace(minute=0, second=0)` left in place.
df['date'] = (
    pd.to_datetime(df['date'], utc=True)
    .dt.tz_convert('Europe/Kiev')
    .dt.tz_localize(None)
    .dt.floor('h')
)

# Tag every message using the channel lookup from the scraper constants.
# NOTE(review): assumes consts.CHANNEL_IDS maps channel id -> language code
# ('uk' is compared against later in the notebook) — confirm.
df['lang'] = df['channel_id'].map(consts.CHANNEL_IDS)

In [None]:
# Discard the 'id' column, then order the messages chronologically.
df = df.drop('id', axis=1)
df = df.sort_values('date')

In [None]:
# Persist the date-normalised, sorted frame (the row index is written too).
# NOTE(review): this is a relative path, so the file lands in the current
# working directory, whereas the input was read from Config.FILES_PATH — confirm
# that this is the intended destination.
df.to_csv('full_telegram_data.csv', index=True)

In [None]:
def clean_text(text):
    """Reduce a raw message to lower-case Cyrillic words separated by single spaces.

    Steps, in order: strip anything starting with ``http`` up to the next
    whitespace, delete every character that is neither whitespace nor a
    Russian/Ukrainian Cyrillic letter (digits, Latin letters, punctuation and
    apostrophes are removed without inserting a space), collapse whitespace
    runs, then trim and lower-case.

    Args:
        text: The message text. Non-string values (``None``, ``NaN`` for
            missing cells) are treated as an empty message.

    Returns:
        The cleaned string; ``""`` when nothing remains or the input is not a
        string.
    """
    # Missing cells come through pandas as float NaN; re.sub would raise
    # TypeError on them, so short-circuit to an empty message instead.
    if not isinstance(text, str):
        return ""
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^\sА-Яа-яЁёЇїІіЄєҐґ]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip().lower()

# Run the cleaner over every message body, keeping the result in a new column.
df['cleaned_content'] = df['content'].map(clean_text)

In [None]:
# The raw columns are no longer needed once 'lang' and 'cleaned_content' exist.
df = df.drop(columns=['channel_id', 'content'])
# 'id' is already removed earlier in the notebook (before sorting by date), so
# an unconditional drop raises KeyError on a fresh top-to-bottom run; tolerate
# its absence while still removing it if it is present.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Fetch the Stanza models for Ukrainian and Russian without verbose logging.
for model_lang in ('uk', 'ru'):
    stanza.download(model_lang, verbose=False)

In [None]:
# CPU-only Stanza pipelines used for lemmatisation.
# The Ukrainian pipeline additionally runs the multi-word-token (mwt) processor.
nlp_uk = stanza.Pipeline(
    lang='uk',
    processors='tokenize,mwt,pos,lemma',
    use_gpu=False,
)
nlp_ru = stanza.Pipeline(
    lang='ru',
    processors='tokenize,pos,lemma',
    use_gpu=False,
)

In [None]:
# Fetch NLTK's stop-word corpus; the Russian list is read from it below.
nltk.download('stopwords')

In [None]:
def clean_stopwords(stopword_list):
    """Normalise stop-word entries to lower-case, Cyrillic-only tokens.

    Each entry has every character that is neither whitespace nor a
    Russian/Ukrainian Cyrillic letter removed, is trimmed and lower-cased, and
    is then split on whitespace, so a multi-word entry contributes several
    tokens and an entry with no Cyrillic letters contributes none.

    Args:
        stopword_list: Iterable of stop-word strings.

    Returns:
        A flat list of tokens in input order (duplicates are kept).
    """
    non_cyrillic = re.compile(r"[^\sА-Яа-яЁёЇїІіЄєҐґ]")
    return [
        token
        for entry in stopword_list
        for token in non_cyrillic.sub("", entry).strip().lower().split()
    ]

# Russian stop words come from the NLTK corpus.
ru_stopwords = clean_stopwords(stopwords.words('russian'))

# Ukrainian stop words are read from a headerless text file in the configured
# files directory, loaded as a single 'stopwords' column.
stopwords_ua = pd.read_csv(
    os.path.join(Config.FILES_PATH, 'stopwords_ua.txt'),
    header=None,
    names=['stopwords'],
)
uk_stopwords = clean_stopwords(stopwords_ua['stopwords'].tolist())

# One combined lookup set for both languages.
all_stopwords = set(uk_stopwords).union(ru_stopwords)

In [None]:
def lemmatize_stanza(text, lang):
    """Run ``text`` through a Stanza pipeline and return the lemma of every word.

    Args:
        text: Text to analyse.
        lang: ``'uk'`` selects the Ukrainian pipeline; any other value selects
            the Russian one.

    Returns:
        A list with one lemma per word, in document order across all sentences.
    """
    if lang == 'uk':
        pipeline = nlp_uk
    else:
        pipeline = nlp_ru

    lemmas = []
    for sentence in pipeline(text).sentences:
        for word in sentence.words:
            lemmas.append(word.lemma)
    return lemmas

In [None]:
# Preview the first five rows of the frame at this stage.
df.head(n=5)

In [None]:
def _drop_stopwords(words):
    """Join the purely alphabetic tokens of ``words`` that are not stop words."""
    kept = []
    for token in words:
        if not token.isalpha():
            continue
        if token in all_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)


# Rebuild 'content' from the cleaned text with stop words filtered out.
df['content'] = df['cleaned_content'].str.split().apply(_drop_stopwords)

In [None]:
# The intermediate cleaned text has been folded into 'content'; remove it and
# preview the result.
df = df.drop(columns=['cleaned_content'])
df.head(n=5)

In [None]:
from collections import defaultdict
from tqdm import tqdm

def build_lemmatizer():
    """Create a memoising single-word lemmatiser.

    Returns:
        A function ``lemmatize_word(word, lang)`` that returns the first lemma
        ``lemmatize_stanza`` produces for ``word`` (or ``word`` itself when no
        lemma comes back). Results are remembered per language in a cache
        private to the returned function.
    """
    cache = defaultdict(dict)

    def lemmatize_word(word, lang):
        known = cache[lang]
        try:
            return known[word]
        except KeyError:
            pass

        candidates = lemmatize_stanza(word, lang)
        if candidates:
            result = candidates[0]
        else:
            result = word
        known[word] = result
        return result

    return lemmatize_word

lemmatize_word = build_lemmatizer()

# Collect every distinct (word, language) combination so that each one is sent
# through Stanza only once. Zipping the two columns avoids DataFrame.iterrows(),
# which builds a full Series object for every row just to read two values.
unique_pairs = set()
for text, lang in tqdm(zip(df['content'], df['lang']), total=len(df)):
    unique_pairs.update((word, lang) for word in text.split())

# Lemma lookup table keyed by (word, language). It is filled in a plain loop so
# partial progress stays inspectable if this long-running step is interrupted.
lemma_dict = {}
for word, lang in tqdm(unique_pairs, desc="Lemmatizing"):
    lemma_dict[(word, lang)] = lemmatize_word(word, lang)

def lemmatize_text(text, lang):
    """Replace each whitespace-separated word of ``text`` by its stored lemma.

    Lemmas are looked up in ``lemma_dict`` under ``(word, lang)``; a word with
    no entry is kept unchanged. The result is re-joined with single spaces.
    """
    pieces = []
    for token in text.split():
        pieces.append(lemma_dict.get((token, lang), token))
    return ' '.join(pieces)

def _lemmatize_row(row):
    """Lemmatise one row's 'content' using that row's 'lang'."""
    return lemmatize_text(row['content'], row['lang'])


# Store the lemmatised text alongside the filtered content.
df['l_content'] = df.apply(_lemmatize_row, axis=1)

In [None]:
# Final preview: first five rows including the lemmatised column.
df.head(n=5)