# Установка/импорт библиотек

In [None]:
%pip install beautifulsoup4 requests fake-useragent lxml
%pip install pandas scikit-learn joblib pymorphy2 nltk
%pip install telebot

In [1]:
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import pandas as pd
import csv
import datetime
import warnings as wr
wr.filterwarnings('ignore')

# Парсинг данных

In [4]:
ua = UserAgent()

headers = {
    'accept': 'application/json, text/plain, */*',
    'user-Agent': ua.google,
}

article_dict = {}
themes = ['programming',
'artificial_intelligence',
'robot',
'design',
'gamedev',
'sound']

for i in themes:
    for j in range(1, 41):
        try:
            url = f'https://habr.com/ru/hubs/{i}/articles/page{j}/'
            req = requests.get(url, headers=headers).text
            soup = BeautifulSoup(req, 'lxml')
            all_hrefs_articles = soup.find_all('a', class_='tm-title__link')  # получаем статьи

            for article in all_hrefs_articles:  # проходимся по статьям
                article_name = article.find('span').text  # собираем названия статей
                article_link = f'https://habr.com{article.get("href")}'  # ссылки на статьи
                article_dict[article_name] = article_link
        except:
            break

In [3]:
%%time
karma = []
rating = []
description = []
difficulty = []
read_time = []
views = []
liked = []
favorite = []
comments = []
tags = []
hubs = []
them = []

text = ''
t_tags = ''
t_hubs = ''

for i, j in article_dict.items():
    req = requests.get(j, headers=headers).text
    soup = BeautifulSoup(req, 'lxml')
    content = soup.find('div', id='post-content-body')
    try:
        try:
            for k in content.find_all('p'):
                text += k.text
            description.append(text)
            text = ''
        except:
            for k in content.find_all('br'):
                text += k.text
            description.append(text)
            text = ''
    except:
        continue
    
    stats = soup.find('div', class_='tm-article-snippet__stats')
    difficulty.append('Неопределенный' if stats.find('span', class_='tm-article-complexity__label') == None else stats.find('span', class_='tm-article-complexity__label').text)
    read_time.append('Неопределенно' if stats.find('span', class_='tm-article-reading-time__label') == None else stats.find('span', class_='tm-article-reading-time__label').text)
    views.append(stats.find('span', class_='tm-icon-counter__value').text)
    try:
        user = soup.find('div', class_='tm-user-card__meta')
        karma.append('0' if user.find('div', class_='tm-karma__votes tm-karma__votes_positive') == None else user.find('div', class_='tm-karma__votes tm-karma__votes_positive').text)
        rating.append(user.find('span').text)
    except:
        continue
    info = soup.find('div', 'tm-article-sticky-panel')
    liked.append(info.find('div', 'tm-article-rating tm-data-icons__item').find('span').text)
    favorite.append(info.find('span', 'bookmarks-button__counter').text)
    comments.append(info.find('div', 'tm-article-comments-counter-link tm-data-icons__item').text.split()[-1])
    
    keywords = soup.find('div', 'tm-article-presenter__meta').find_all('ul', 'tm-separated-list__list')
    for i in keywords[0].find_all('a'):
        t_tags += i.text + ', '
    tags.append(t_tags)
    for i in keywords[1].find_all('a'):
        t_hubs += i.text + ', '
    hubs.append(t_hubs)
    t_tags = ''
    t_hubs = ''

CPU times: total: 3min 54s
Wall time: 55min 10s


In [4]:
with open(f"habr_data_{datetime.datetime.now().strftime('%d_%m_%Y')}.csv", "w", encoding='utf-8') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    wr.writerow(['post_name', 'karma', 
    'rating', 'post_content', 
    'views', 'difficulty', 
    'favorite', 'read_time',
    'liked', 'comments', 'tags', 'hubs', 'post_link'])
    
    for i in range(len(karma)):
        wr.writerow([list(article_dict.keys())[i],
        karma[i], rating[i],
        description[i], views[i],
        difficulty[i], favorite[i],
        read_time[i], liked[i],
        comments[i],tags[i], hubs[i],
        list(article_dict.values())[i]])
    print('Статьи были успешно получены')

Статьи были успешно получены


# Обработка данных

In [3]:
import numpy as np
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

df = pd.read_csv('habr_data_16_12_2023.csv')

df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
df['difficulty'] = df['difficulty'].apply(lambda x: x.replace('\r\n', '').strip())
df['read_time'] = df['read_time'].apply(lambda x: x.replace('\r\n', '').strip())
df['views'] = df['views'].apply(lambda x: str(float(x.replace('K', ' K').split()[0])*1000) if 'K' in x else x)
df['views'] = df['views'].apply(lambda x: float(x.replace('M', ' M').split()[0])*1000000 if 'M' in x else x)

In [3]:
df['hubs'] = df['hubs'].apply(lambda x: 'Робототехника' if 'Робототехника' in x else x)
df['hubs'] = df['hubs'].apply(lambda x: 'Разработка игр' if 'Разработка игр' in x else x)
df['hubs'] = df['hubs'].apply(lambda x: 'Звук' if 'Звук' in x else x)
df['hubs'] = df['hubs'].apply(lambda x: 'Искусственный интеллект' if 'Искусственный интеллект' in x else x)
df['hubs'] = df['hubs'].apply(lambda x: 'Программирование' if 'Программирование' in x else x)
df['hubs'] = df['hubs'].apply(lambda x: 'Дизайн' if 'Дизайн' in x else x)

# 14_12_23
# df['hubs'] = df['hubs'].apply(lambda x: 'Искусственный интеллект' if 'Искусственный интеллект' in x else x)
# df['hubs'] = df['hubs'].apply(lambda x: 'Программирование' if 'Программирование' in x else x)
# df['hubs'] = df['hubs'].apply(lambda x: 'Научно-популярное' if 'Научно-популярное' in x else x)
# df['hubs'] = df['hubs'].apply(lambda x: 'Разработка игр' if 'Разработка игр' in x else x)
# df['hubs'] = df['hubs'].apply(lambda x: 'Анализ и проектирование систем' if 'Анализ и проектирование систем' in x else x)

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivve2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ivve2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
%%time
def remove_stopwords(text, language):
    stop_words = set(stopwords.words(language))
    words = word_tokenize(text)
    filtered_text = ' '.join([word for word in words if word.lower() not in stop_words])
    return filtered_text

df['post_content'] = df['post_content'].apply(lambda x: remove_stopwords(x, 'english'))
df['post_content'] = df['post_content'].apply(lambda x: remove_stopwords(x, 'russian'))

CPU times: total: 1min 17s
Wall time: 1min 17s


In [6]:
import re
import pymorphy2
def df_preprocess(text):
    reg = re.compile('[^а-яА-яa-zA-Z0-9 ]')
    text = text.lower().replace("ё", "е")
    text = text.replace('ии', 'ai')
    text = re.sub('((www\.[^\s]+)|(http?://[^\s]+))', 'сайт', text)
    text = reg.sub(' ', text)
    
    # Лемматизация
    morph = pymorphy2.MorphAnalyzer(lang='ru')
    text = [morph.parse(word)[0].normal_form for word in text.split()]
    text = [i.replace('сетея', 'сеть') for i in text]
    # Стемминг
    # stemmer = SnowballStemmer("russian")
    # text =[stemmer.stem(word) for word in text.split()]

    return text

In [63]:
%%time
df.post_content.apply(df_preprocess)

CPU times: total: 14min 51s
Wall time: 14min 54s


0       [маленький, снизойти, милость, божый, ниспослать, два, книжка, один, книжка, бейсик, студент, ка...
1       [здравствуй, уважаемый, хабра, этот, статья, описывать, объектный, ориентировать, онтологический...
3       [наверное, слышать, процесс, собеседование, faang, bigtech, leetcode, задача, системный, дизайн,...
4       [давно, работать, сниппет, помощь, создавать, скелет, модуль, функция, конструкция, given, стано...
5       [привет, это, команда, курс, python, разработчик, недавно, провести, рефакторинг, большой, обнов...
                                                       ...                                                 
4549    [материал, статья, взять, мой, дзен, канал, в, прошлый, статья, внести, ясность, корректный, зав...
4552    [материал, статья, взять, мой, дзен, канал, статья, 1, статья, 2, статья, 3, статья, 4, статья, ...
4558    [автоматический, распознавание, речь, stt, asr, пройти, долгий, путь, совершенствование, иметь, ...
4574    [решенai, задача, св

In [154]:
df1 = pd.read_csv('model_data.csv')
df1.drop(columns='Unnamed: 0', inplace=True)
df1.dropna(inplace=True)

In [155]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

x_train_h, x_test_h, y_train_h, y_test_h = train_test_split(df1.post_content, df1.hubs, test_size=0.1, random_state=42)

In [156]:
vectorizer = CountVectorizer(ngram_range=(1,2), analyzer='word')
x_train_BOW_bi = vectorizer.fit_transform(x_train_h)
x_test_BOW_bi = vectorizer.transform(x_test_h)

In [10]:
vectorizer = TfidfVectorizer(ngram_range=(1,3))

x_train_TFIDF = vectorizer.fit_transform(x_train_h)
x_test_TFIDF = vectorizer.transform(x_test_h)

In [11]:
%%time
no_topics = 20
no_top_words=10

LDA = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=42)
LDA.fit(x_train_BOW_bi)

nmf = NMF(n_components=no_topics, random_state=42, l1_ratio=.5, init='nndsvd')
nmf.fit(x_train_BOW_bi)

LSA = TruncatedSVD(n_components=no_topics, random_state=42)
LSA.fit(x_train_BOW_bi)

CPU times: total: 22min 4s
Wall time: 7min 32s


In [12]:
def topics_max(mod_data,model):
    top_n=[]
    for i in range(mod_data.shape[0]):
        top_max=[]
        topic_list=model.transform(x_train_BOW_bi[i,:])
        top_max.append(np.max(topic_list))
        top_max.append(np.argmax(topic_list))  
        top_n.append(top_max)
    return np.array(top_n)

In [13]:
top_lda=topics_max(x_train_BOW_bi, LDA)

ds_topic = pd.DataFrame(x_train_BOW_bi)

ds_topic['topic_lda']=top_lda[:,1]
ds_topic['probability_lda']=top_lda[:,0]
ds_topic['y']=y_train_h
ds_topic

# Обучение и тесты модели

In [257]:
from sklearn.cluster import KMeans

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(x_train_h)

data = pd.DataFrame(df['hubs'].iloc[x_train_h.index])

kmeans = KMeans(n_clusters=6, random_state=42)
data['cluster'] = kmeans.fit_predict(X)

In [252]:
data['cluster'][df['hubs'] == 'Программирование'].value_counts()

cluster
0    457
1    128
Name: count, dtype: int64

In [258]:
prog = data['cluster'][data['hubs'] == 'Программирование'].value_counts()
ai = data['cluster'][data['hubs'] == 'Искусственный интеллект'].value_counts()
game = data['cluster'][data['hubs'] == 'Разработка игр'].value_counts()
design = data['cluster'][data['hubs'] == 'Дизайн'].value_counts()
sound = data['cluster'][data['hubs'] == 'Звук'].value_counts()
robot = data['cluster'][data['hubs'] == 'Робототехника'].value_counts()

pd.DataFrame({'prog':prog, 'ai':ai, 'game':game, 'design':design, 'sound':sound, 'robot':robot})

Unnamed: 0_level_0,prog,ai,game,design,sound,robot
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,315.0,380.0,405.0,362.0,345.0,222.0
1,2.0,5.0,1.0,3.0,,
2,1.0,,,,,
3,47.0,26.0,47.0,20.0,10.0,12.0
4,,1.0,,,,
5,220.0,203.0,177.0,155.0,73.0,86.0


## Отбор особенностей

In [157]:
lsvc = LinearSVC(C=.5, dual=True, random_state=42)
selective_model = SelectFromModel(lsvc, max_features=None)

x_train_BOW_bi_select_features = selective_model.fit_transform(x_train_BOW_bi, y_train_h)
x_test_BOW_bi_select_features = selective_model.transform(x_test_BOW_bi)

In [158]:
clf = LogisticRegression(random_state=42)
clf.fit(x_train_BOW_bi_select_features, y_train_h)
print(clf.score(x_test_BOW_bi_select_features, y_test_h))

0.8213256484149856


In [159]:
from sklearn.metrics import accuracy_score
pd.set_option('display.max_rows', 60)

p = clf.predict(x_test_BOW_bi_select_features)
frame = pd.DataFrame(y_test_h)
frame['predict'] = p
frame.columns = ['hubs', 'predict']
print(accuracy_score(y_test_h, p))
frame

0.8213256484149856


Unnamed: 0,hubs,predict
3064,Звук,Робототехника
410,Программирование,Искусственный интеллект
3182,Звук,Звук
1578,Робототехника,Робототехника
1283,Искусственный интеллект,Искусственный интеллект
...,...,...
3040,Звук,Звук
1779,Дизайн,Дизайн
2429,Разработка игр,Разработка игр
2942,Разработка игр,Разработка игр


In [202]:
from sklearn.model_selection import GridSearchCV
     

grid={
      "C":[i/10 for i in range(1, 3)], 
      "penalty":['l2', 'l1'],
      "solver":['lbfgs', 'liblinear', 'saga']
      }

clf=LogisticRegression()
logreg_cv=GridSearchCV(clf,grid,cv=3)
%time logreg_cv.fit(x_train_BOW_bi_select_features, y_train_h)

print("tuned hpyerparameters :(best parameters) ", logreg_cv.best_params_)
print("accuracy :", logreg_cv.best_score_)

CPU times: total: 25min 1s
Wall time: 5min 38s
tuned hpyerparameters :(best parameters)  {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
accuracy : 0.826173774092446


In [203]:
clf = LogisticRegression(C=0.1, penalty='l2', solver='liblinear', random_state=42)
clf.fit(x_train_BOW_bi_select_features, y_train_h)
clf.score(x_test_BOW_bi_select_features, y_test_h)

0.8328530259365994

In [205]:
scores = cross_val_score(clf, x_train_BOW_bi_select_features, y_train_h, cv=3, scoring='accuracy')
print(clf,'\nCross-validate: ', scores)

LogisticRegression(C=0.1, random_state=42, solver='liblinear') 
Cross-validate:  [0.81634615 0.82001925 0.84215592]


In [167]:
from joblib import dump

dump(clf, 'model.joblib')

['model.joblib']

# Telegram бот

In [166]:
import telebot
from joblib import load

API_TOKEN = 'TOKEN'
bot = telebot.TeleBot(API_TOKEN)
model = load('model.joblib')

@bot.message_handler(commands=['help', 'start'])
def send_welcome(message):
    bot.reply_to(message, """Привет. Я бот для распознавания тем статей, я помогу вам узнать к какой теме относиться текст который вы мне отправите
для того что бы узнать тему напишите: хочу узнать""")

@bot.message_handler(func=lambda message: True if message.text == 'хочу узнать' else False)
def echo_message(message):
    bot.reply_to(message, 'Введите свой текст')
    bot.register_next_step_handler(message, predict_message)
    
def predict_message(message):
    query = remove_stopwords(message.text, 'english')
    query = remove_stopwords(message.text, 'russian')
    
    bot.reply_to(message, model.predict(selective_model.transform(vectorizer.transform([query]))))

bot.infinity_polling()

2023-12-17 22:17:01,984 (__init__.py:966 MainThread) ERROR - TeleBot: "Infinity polling: polling exited"
2023-12-17 22:17:01,985 (__init__.py:968 MainThread) ERROR - TeleBot: "Break infinity polling"
