In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import re
import numpy as np
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kazhidov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kazhidov\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kazhidov\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
subjects = [
    'Здоровье.ру',
    'Проект по использованию технологий компьютерного зрения на базе искусственного интеллекта (ИИ) для анализа медицинских изображений',
    'Skillbox',
    'Gett',
    'Промобот',
    'Иннотех',
    'Cybersport.Metaratings',
    'MMA.Metaratings',
    'Goose Gaming',
    'ESforce Holding',
    'Vinci Agency',
    'Гэллэри Сервис',
    'Студия Артемия Лебедева',
    'Федерация креативных индустрий',
    'Метарейтинг',
    'СберМаркет',
    'Balance Platform',
    'Московская биржа',
    'Samsung Electronics',
    'Нетология',
    'Дневник МЭШ',
    'Цифровое образование'
]
all_href = {}
for subject in tqdm(subjects):
    all_href[subject] = []
    URL_TEMPLATE = f"https://habr.com/ru/search/?q={subject}&target_type=posts&order=relevance"
    r = requests.get(URL_TEMPLATE)
    soup = bs(r.text, "html.parser")
    post_names = soup.find_all('h2', class_='tm-article-snippet__title tm-article-snippet__title_h2')
    for name in post_names:
        href = name.a['href']
        if href.find('post') != -1:
            all_href[subject].append(href)
all_href

100%|██████████| 22/22 [00:13<00:00,  1.62it/s]


{'Здоровье.ру': ['/ru/post/574430/',
  '/ru/post/58324/',
  '/ru/post/375519/',
  '/ru/post/164463/',
  '/ru/post/74505/',
  '/ru/post/475654/',
  '/ru/post/38591/',
  '/ru/post/72148/',
  '/ru/post/83803/',
  '/ru/post/101726/',
  '/ru/post/152309/',
  '/ru/post/287314/'],
 'Проект по использованию технологий компьютерного зрения на базе искусственного интеллекта (ИИ) для анализа медицинских изображений': ['/ru/post/370445/',
  '/ru/post/324694/',
  '/ru/post/450298/',
  '/ru/post/410443/',
  '/ru/post/432324/',
  '/ru/post/374527/'],
 'Skillbox': ['/ru/post/563822/'],
 'Gett': ['/ru/post/295122/',
  '/ru/post/295748/',
  '/ru/post/298338/',
  '/ru/post/296078/',
  '/ru/post/292978/',
  '/ru/post/300250/',
  '/ru/post/299920/'],
 'Промобот': ['/ru/post/395637/',
  '/ru/post/477262/',
  '/ru/post/394371/',
  '/ru/post/593209/',
  '/ru/post/437552/',
  '/ru/post/226291/',
  '/ru/post/397507/'],
 'Иннотех': ['/ru/post/562146/',
  '/ru/post/577594/',
  '/ru/post/529372/',
  '/ru/post/5625

In [4]:
columns = ['header', 'rating_up', 'rating_low', 'time', 'subject', 'text_post']
df = pd.DataFrame(columns=columns)
df

Unnamed: 0,header,rating_up,rating_low,time,subject,text_post


In [5]:
for name_sub in tqdm(all_href):
    for sub_href in all_href[name_sub]:
        data = {
            'header': '',
            'rating_up': 0,
            'rating_low': 0,
            'rating_all': 0,
            'text_post': '',
            'time': '',
            'subject': name_sub
        }
        URL_TEMPLATE = f"https://habr.com{sub_href}"
        r = requests.get(URL_TEMPLATE)
        soup = bs(r.text, "html.parser")
        
        rating_post = soup.find_all('span', class_='tm-votes-meter__value tm-votes-meter__value_positive tm-votes-meter__value_appearance-article tm-votes-meter__value_rating')
        for rating_text in rating_post:
            data['rating_up'] = int(rating_text['title'][rating_text['title'].find('↑')+1:rating_text['title'].find('и')])
            data['rating_low'] = int(rating_text['title'][rating_text['title'].find('↓')+1:len(rating_text['title'])+1])
            data['rating_all'] = int(rating_text['title'][rating_text['title'].find('голосов')+len('голосов'):rating_text['title'].find(':')])
        
        header_post = soup.find_all('h1', class_='tm-article-snippet__title tm-article-snippet__title_h1')
        for header_names in header_post:
            data['header'] = header_names.span.text
        
        text_post = soup.find_all('div', class_='tm-article-body')[0]
        data['text_post'] = text_post.text
        
        time = soup.find_all('time')[0]
        data['time'] = str(time['datetime'])
        
        df = df.append(data, ignore_index=True)

100%|██████████| 22/22 [01:00<00:00,  2.76s/it]


In [6]:
df.head()

Unnamed: 0,header,rating_up,rating_low,time,subject,text_post,rating_all
0,Как работая из дома оставаться здоровым и не с...,12,7,2021-08-24T13:10:21.000Z,Здоровье.ру,"Привет всем удаленщикам, кто остается после р...",19.0
1,О ручках IT-ника замолвите слово,78,26,2009-04-26T19:20:46.000Z,Здоровье.ру,"Когда-то, когда мониторы были большими и не ж...",104.0
2,"Беседа с космонавтом: о здоровье, невесомости ...",119,0,2015-01-16T05:46:41.000Z,Здоровье.ру,\n\r\nПервая часть беседы с космонавтом Павло...,119.0
3,Практический опыт. Как наконец-то начать заним...,164,22,2012-12-31T16:00:00.000Z,Здоровье.ру,Я давний подписчик блога «GTD» (уст. Учись ра...,186.0
4,"ПоискПоБлогам.Ру — Яндекс закрывает рейтинги, ...",0,0,2009-11-06T16:03:32.000Z,Здоровье.ру,"Не секрет, что в ближайшем будущем Яндекс пре...",0.0


In [7]:
df['time'] = pd.to_datetime(df['time'])

In [8]:
df['time_year'] = df['time'].dt.year
df['time_month'] = df['time'].dt.month
df['time_day'] = df['time'].dt.day
df['time_hour'] = df['time'].dt.hour
df['time_minute'] = df['time'].dt.minute

In [9]:
df = df.drop(['time'], axis=1)
df.head()

Unnamed: 0,header,rating_up,rating_low,subject,text_post,rating_all,time_year,time_month,time_day,time_hour,time_minute
0,Как работая из дома оставаться здоровым и не с...,12,7,Здоровье.ру,"Привет всем удаленщикам, кто остается после р...",19.0,2021,8,24,13,10
1,О ручках IT-ника замолвите слово,78,26,Здоровье.ру,"Когда-то, когда мониторы были большими и не ж...",104.0,2009,4,26,19,20
2,"Беседа с космонавтом: о здоровье, невесомости ...",119,0,Здоровье.ру,\n\r\nПервая часть беседы с космонавтом Павло...,119.0,2015,1,16,5,46
3,Практический опыт. Как наконец-то начать заним...,164,22,Здоровье.ру,Я давний подписчик блога «GTD» (уст. Учись ра...,186.0,2012,12,31,16,0
4,"ПоискПоБлогам.Ру — Яндекс закрывает рейтинги, ...",0,0,Здоровье.ру,"Не секрет, что в ближайшем будущем Яндекс пре...",0.0,2009,11,6,16,3


In [10]:
def nlp_processing(text, stemmer = PorterStemmer(), lemmatizer = WordNetLemmatizer()):
    text = text.lower()
    text = ' '.join([word for word in text.split(' ') if not word in set(stopwords.words('russian'))])
    text = ''.join([sym for sym in text if sym.isalpha() or sym == ' '])    
    text = ' '.join([stemmer.stem(word) for word in word_tokenize(text)])
    text = ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(text)])
    text = text.strip()
    return text

In [11]:
df['header'] = df['header'].apply(nlp_processing)

In [12]:
df['text_post'] = df['text_post'].apply(nlp_processing)

In [13]:
df.head()

Unnamed: 0,header,rating_up,rating_low,subject,text_post,rating_all,time_year,time_month,time_day,time_hour,time_minute
0,работая дома оставаться здоровым сойти ума,12,7,Здоровье.ру,привет всем удаленщикам остается рабочего дня ...,19.0,2021,8,24,13,10
1,ручках itника замолвите слово,78,26,Здоровье.ру,когдато мониторы большими жидкокристаллическим...,104.0,2009,4,26,19,20
2,беседа космонавтом здоровье невесомости космич...,119,0,Здоровье.ру,первая часть беседы космонавтом павлом виногра...,119.0,2015,1,16,5,46
3,практический опыт наконецто начать заниматься ...,164,22,Здоровье.ру,давний подписчик блога gtd уст учись работатьс...,186.0,2012,12,31,16,0
4,поискпоблогамру яндекс закрывает рейтинги пред...,0,0,Здоровье.ру,секрет ближайшем будущем яндекс прекратит пока...,0.0,2009,11,6,16,3


In [14]:
df.to_csv('nan_data.csv')

In [15]:
corpus_text_post = df['text_post'].to_numpy().tolist()
corpus_header = df['header'].to_numpy().tolist()

In [16]:
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
vector_text_post = vectorizer.fit_transform(corpus_text_post)
vector_header = vectorizer.fit_transform(corpus_header)

In [17]:
df_ = df.drop(['header', 'text_post'], axis=1)

In [18]:
df_ = pd.concat([df_, pd.DataFrame(vector_text_post.toarray(), columns=[f'text_post_{col}' for col in range(vector_text_post.shape[1])])], sort=False, axis=1)

In [19]:
df_ = pd.concat([df_, pd.DataFrame(vector_header.toarray(), columns=[f'header_{col}' for col in range(vector_header.shape[1])])], sort=False, axis=1)

In [20]:
df_.head()

Unnamed: 0,rating_up,rating_low,subject,rating_all,time_year,time_month,time_day,time_hour,time_minute,text_post_0,...,header_554,header_555,header_556,header_557,header_558,header_559,header_560,header_561,header_562,header_563
0,12,7,Здоровье.ру,19.0,2021,8,24,13,10,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0
1,78,26,Здоровье.ру,104.0,2009,4,26,19,20,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0
2,119,0,Здоровье.ру,119.0,2015,1,16,5,46,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0
3,164,22,Здоровье.ру,186.0,2012,12,31,16,0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0
4,0,0,Здоровье.ру,0.0,2009,11,6,16,3,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.388776,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,9,2,Цифровое образование,11.0,2020,8,3,14,36,0.0,...,0.0,0.0,0.0,0.36064,0.0,0.0,0.0,0.0,0.000000,0.0
104,2,1,Цифровое образование,3.0,2020,7,21,13,31,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0
105,19,0,Цифровое образование,19.0,2018,9,18,21,34,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0
106,0,0,Цифровое образование,0.0,2021,10,21,6,22,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0


In [21]:
from sklearn import preprocessing

In [22]:
le = preprocessing.LabelEncoder()
le.fit(df_['subject'])
df_['subject'] = le.transform(df_['subject'])

In [23]:
df_.head()

Unnamed: 0,rating_up,rating_low,subject,rating_all,time_year,time_month,time_day,time_hour,time_minute,text_post_0,...,header_554,header_555,header_556,header_557,header_558,header_559,header_560,header_561,header_562,header_563
0,12,7,9,19.0,2021,8,24,13,10,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,78,26,9,104.0,2009,4,26,19,20,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,119,0,9,119.0,2015,1,16,5,46,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,164,22,9,186.0,2012,12,31,16,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,9,0.0,2009,11,6,16,3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.388776,0.0


In [26]:
from sklearn.cluster import KMeans

In [27]:
kmeans = KMeans(random_state=0).fit(df_)

In [28]:
kmeans.labels_

array([7, 2, 2, 6, 7, 5, 0, 1, 3, 7, 0, 4, 5, 4, 7, 7, 5, 0, 4, 1, 7, 7,
       1, 7, 1, 1, 0, 4, 4, 4, 1, 0, 1, 4, 1, 7, 7, 0, 1, 7, 1, 4, 1, 4,
       5, 5, 4, 5, 4, 5, 5, 5, 4, 4, 4, 4, 5, 4, 4, 2, 4, 0, 4, 0, 5, 4,
       1, 1, 1, 0, 1, 1, 7, 5, 7, 7, 5, 1, 1, 4, 1, 1, 4, 4, 7, 5, 7, 6,
       1, 4, 7, 7, 0, 1, 0, 4, 5, 1, 7, 1, 0, 0, 5, 4, 4, 1, 4, 7])

In [30]:
df_['cluster'] = kmeans.labels_

In [32]:
df_.head()

Unnamed: 0,rating_up,rating_low,subject,rating_all,time_year,time_month,time_day,time_hour,time_minute,text_post_0,...,header_555,header_556,header_557,header_558,header_559,header_560,header_561,header_562,header_563,cluster
0,12,7,9,19.0,2021,8,24,13,10,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
1,78,26,9,104.0,2009,4,26,19,20,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,119,0,9,119.0,2015,1,16,5,46,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,164,22,9,186.0,2012,12,31,16,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
4,0,0,9,0.0,2009,11,6,16,3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.388776,0.0,7


In [31]:
df_.to_csv('df_.csv')