# Парсер данных с сайта РБК

In [None]:
import tqdm
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import string
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources first: the stopword corpus has to be present on
# disk before `stopwords.words(...)` can read it.
nltk.download('wordnet')
nltk.download('stopwords')
# Rebind `stopwords` from the NLTK corpus reader to a plain set of Russian
# stopwords; `lemmatize` uses it for membership tests.
stopwords = set(stopwords.words('russian'))

import time

import pymorphy2
# Shared pymorphy2 analyzer instance; `lemmatize` calls `morph.parse` on it.
morph = pymorphy2.MorphAnalyzer()

# Блок парсинга текста статьи

In [None]:
# Load the train and test splits; the first CSV column becomes the index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("train_dataset_train.csv", "test_dataset_test.csv")
)

In [None]:
# Stack the title/session columns of both splits into one frame.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the equivalent.
corpus_tag = pd.concat([train_df[["title", "session"]], test_df[["title", "session"]]])

## Функция для удаления HTML-тегов

In [None]:
def striphtml(data):
    """Return ``data`` with every ``<...>`` fragment removed.

    The match is non-greedy and does not cross line breaks, so a ``<``
    without a closing ``>`` on the same line is left untouched.
    """
    return re.sub(r'<.*?>', '', data)

## Функция для выполнения лемматизации текста

In [None]:
def lemmatize(text):
    """Return ``text`` with each kept word replaced by its normal form.

    The text is split on whitespace. Words found in the module-level
    ``stopwords`` collection and words of length 1 are dropped; every other
    word is replaced by the normal form of its first ``morph.parse`` result.
    The surviving words are joined with single spaces.
    """
    lemmas = [
        morph.parse(token)[0].normal_form
        for token in text.split()
        if token not in stopwords and len(token) > 1
    ]
    return " ".join(lemmas)

## Функция загрузки текста статьи с сайта РБК

In [None]:
def article_parser(session, art_id):
    """Download an RBC free-news article and return its lemmatized text.

    The article id is ``art_id`` with ``session`` removed; the page is
    fetched from ``https://www.rbc.ru/rbcfreenews/<id>``. Text is taken from
    every ``<p>`` inside the ``div`` whose class attribute is
    ``article__text article__text_free``.

    Args:
        session: Substring that is removed from ``art_id`` to get the id
            used in the URL.
        art_id: Row identifier that contains the article id.

    Returns:
        The lower-cased, lemmatized article text. An empty string is
        returned when the page has no article container or the container
        has no ``<p>`` elements, so the caller can detect missing articles
        by comparing with ``''``.

    Raises:
        requests.exceptions.RequestException: If the request fails again
            after the single retry.
    """
    url_id = art_id.replace(session, "")
    full_url = "https://www.rbc.ru/rbcfreenews/" + url_id
    try:
        article = requests.get(full_url)
    except requests.exceptions.RequestException:
        # One retry after a pause; only network errors are retried so that
        # KeyboardInterrupt and programming errors are not swallowed.
        time.sleep(5.5)
        article = requests.get(full_url)
    # An explicit parser keeps the result independent of which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(article.text, "html.parser")

    container = soup.find('div', {'class': 'article__text article__text_free'})
    if container is None:
        return ""

    # get_text() keeps the text of links and any other nested markup, so no
    # paragraph is lost regardless of how its links are structured.
    paragraphs = [
        elem.get_text().replace(u'\xa0', u' ').replace(u'\n', u' ')
        for elem in container.find_all('p')
    ]
    # Join with a space so the last word of one paragraph is not glued to
    # the first word of the next.
    text = " ".join(paragraphs)
    text = re.sub(r"[-—()\"#/@;:<>{}=~|?€«»$]", " ", text)
    text = re.sub(' +', ' ', text)
    text = text.lower()
    return lemmatize(text)

## Заполнение датафрейма текстом статей

In [None]:
# Start with an empty "article" column, then download and store the text of
# each article, keyed by its index label.
corpus_tag['article'] = ""
for art_id, session in tqdm(corpus_tag["session"].items()):
    corpus_tag.loc[art_id, 'article'] = article_parser(session, art_id)
    

In [None]:
# Reload both splits; these copies receive the parsed columns.
train_df_to_append, test_df_to_append = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("train_dataset_train.csv", "test_dataset_test.csv")
)

In [None]:
# Attach the parsed article text to each split by matching index labels.
train_df_to_append = train_df_to_append.merge(
    corpus_tag["article"], left_index=True, right_index=True
)
test_df_to_append = test_df_to_append.merge(
    corpus_tag["article"], left_index=True, right_index=True
)

## Загрузка отсутствующих статей

In [None]:
def span_parser(session, art_id):
    """Download an RBC free-news article and return the lemmatized text of its spans.

    Same pipeline as the paragraph-based parser, but the text is taken from
    every ``<span>`` inside the ``div`` whose class attribute is
    ``article__text article__text_free``.

    Args:
        session: Substring that is removed from ``art_id`` to get the id
            used in the URL.
        art_id: Row identifier that contains the article id.

    Returns:
        The lower-cased, lemmatized text. An empty string is returned when
        the page has no article container or the container has no
        ``<span>`` elements.

    Raises:
        requests.exceptions.RequestException: If the request fails again
            after the single retry.
    """
    url_id = art_id.replace(session, "")
    full_url = "https://www.rbc.ru/rbcfreenews/" + url_id
    try:
        article = requests.get(full_url)
    except requests.exceptions.RequestException:
        # One retry after a pause; only network errors are retried so that
        # KeyboardInterrupt and programming errors are not swallowed.
        time.sleep(5.5)
        article = requests.get(full_url)
    # An explicit parser keeps the result independent of which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(article.text, "html.parser")

    container = soup.find('div', {'class': 'article__text article__text_free'})
    if container is None:
        return ""

    # get_text() keeps the text of links and any other nested markup, so no
    # span is lost regardless of how its links are structured.
    fragments = [
        elem.get_text().replace(u'\xa0', u' ').replace(u'\n', u' ')
        for elem in container.find_all('span')
    ]
    # Join with a space so adjacent fragments do not glue words together.
    text = " ".join(fragments)
    text = re.sub(r"[-—()\"#/@;:<>{}=~|?€«»$]", " ", text)
    text = re.sub(' +', ' ', text)
    text = text.lower()
    return lemmatize(text)

In [None]:
# Rebuild the combined frame from the article/session columns of both splits.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the equivalent.
corpus_tag = pd.concat(
    [train_df_to_append[["article", "session"]], test_df_to_append[["article", "session"]]]
)

In [None]:
# Rows whose article text is an empty string, i.e. nothing was extracted.
na_corpus = corpus_tag.loc[corpus_tag["article"].eq('')]
na_corpus

In [None]:
article_texts = []

# Retry the articles with empty text using the span-based parser.
for art_id, session in tqdm(na_corpus["session"].items()):
    corpus_tag.loc[art_id, 'article'] = span_parser(session, art_id)
    

In [None]:
# Count articles that are still missing. Missing text is stored as an empty
# string (that is how `na_corpus` is selected), so count '' as well as NaN.
(corpus_tag["article"].isna() | (corpus_tag["article"] == '')).sum()

In [None]:
# Remove the old "article" column in place so the refreshed one can be merged.
for _frame in (train_df_to_append, test_df_to_append):
    _frame.drop(columns="article", inplace=True)


In [None]:
# Attach the refreshed article text to each split by matching index labels.
train_df_to_append = train_df_to_append.merge(
    corpus_tag["article"], left_index=True, right_index=True
)
test_df_to_append = test_df_to_append.merge(
    corpus_tag["article"], left_index=True, right_index=True
)

## Парсинг дополнительных данных

In [None]:
def data_parser(session, art_id):
    """Download an RBC free-news page and return its metadata as a one-row frame.

    The values are read from the ``div`` whose ``data-id`` equals the
    article id (attributes ``data-type``, ``data-category-nick``,
    ``data-aggregator``, ``data-chars-length``) and from the ``content`` of
    the ``news_keywords`` and ``genre`` meta tags.

    Args:
        session: Substring that is removed from ``art_id`` to get the id
            used in the URL.
        art_id: Row identifier that contains the article id; it becomes the
            index label of the returned frame.

    Returns:
        A ``pandas.DataFrame`` with one row indexed by ``art_id`` and the
        columns ``data_type``, ``categ``, ``aggregator``, ``char_len``,
        ``keyfeatures`` and ``genre``. A value is ``None`` when the
        corresponding element or attribute is absent from the page.

    Raises:
        requests.exceptions.RequestException: If the request fails again
            after the single retry.
    """
    url_id = art_id.replace(session, "")
    full_url = "https://www.rbc.ru/rbcfreenews/" + url_id
    try:
        article = requests.get(full_url)
    except requests.exceptions.RequestException:
        # One retry after a pause; only network errors are retried so that
        # KeyboardInterrupt and programming errors are not swallowed.
        time.sleep(5.5)
        article = requests.get(full_url)
    # An explicit parser keeps the result independent of which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(article.text, "html.parser")

    article_div = soup.find('div', {'data-id': url_id})
    meta_name = soup.find('meta', {'name': 'news_keywords'})
    meta_genre = soup.find('meta', {'itemprop': 'genre'})

    # A missing element yields None values instead of aborting the whole
    # scraping loop on a single unusual page.
    div_attrs = article_div.attrs if article_div is not None else {}
    keyfeatures = meta_name.get("content") if meta_name is not None else None
    genre = meta_genre.get("content") if meta_genre is not None else None

    return pd.DataFrame(data={'data_type': div_attrs.get('data-type'),
                              'categ': div_attrs.get('data-category-nick'),
                              'aggregator': div_attrs.get('data-aggregator'),
                              'char_len': div_attrs.get('data-chars-length'),
                              'keyfeatures': keyfeatures,
                              'genre': genre}, index=[art_id])

In [None]:
# Load the train and test splits; the first CSV column becomes the index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("train_dataset_train.csv", "test_dataset_test.csv")
)

In [None]:
# Stack the title/session columns of both splits into one frame.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the equivalent.
corpus_tag = pd.concat([train_df[["title", "session"]], test_df[["title", "session"]]])

In [None]:
import warnings
warnings.filterwarnings('ignore')
# Collect the one-row frames in a list and concatenate once at the end:
# `DataFrame.append` was removed in pandas 2.0, and appending inside the loop
# copied the whole accumulated frame on every iteration.
frames = []
for art_id, session in tqdm(corpus_tag["session"].items(), total=len(corpus_tag)):
    frames.append(data_parser(session, art_id))
if frames:
    df = pd.concat(frames)
else:
    # Nothing was parsed: keep the empty frame with the expected columns.
    df = pd.DataFrame(columns=["data_type", "categ", "aggregator", "char_len", "keyfeatures", "genre"])

In [None]:
# Attach the parsed page metadata to each split by matching index labels.
train_df_to_append = train_df_to_append.merge(df, left_index=True, right_index=True)
test_df_to_append = test_df_to_append.merge(df, left_index=True, right_index=True)

## Парсинг дополнительных данных: рубрика статьи и адрес из метаданных

In [None]:
import time
def meta_data_parser(session, art_id):
    """Download an RBC free-news page and return its header category and address.

    The category is the string of the ``a`` element with class
    ``article__header__category`` and ``itemprop="articleSection"``; the
    address is the ``content`` of the meta tag with ``itemprop="address"``.

    Args:
        session: Substring that is removed from ``art_id`` to get the id
            used in the URL.
        art_id: Row identifier that contains the article id.

    Returns:
        A ``(header_cat, meta_add)`` tuple of strings. An item is ``None``
        when the corresponding element (or its ``content`` attribute) is
        absent from the page.

    Raises:
        requests.exceptions.RequestException: If the request fails again
            after the single retry.
    """
    url_id = art_id.replace(session, "")
    full_url = "https://www.rbc.ru/rbcfreenews/" + url_id
    try:
        article = requests.get(full_url)
    except requests.exceptions.RequestException:
        # One retry after a pause; only network errors are retried so that
        # KeyboardInterrupt and programming errors are not swallowed.
        time.sleep(5.5)
        article = requests.get(full_url)
    # An explicit parser keeps the result independent of which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(article.text, "html.parser")

    category_link = soup.find('a', {'class': "article__header__category", "itemprop": "articleSection"})
    meta_address = soup.find('meta', {'itemprop': 'address'})

    # A missing element yields None instead of aborting the whole scraping
    # loop on a single unusual page.
    header_cat = str(category_link.string) if category_link is not None else None
    address = meta_address.get('content') if meta_address is not None else None
    meta_add = str(address) if address is not None else None
    # Locals are released when the function returns, so no explicit `del`.
    return header_cat, meta_add

In [None]:
# Load the train and test splits; the first CSV column becomes the index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("train_dataset_train.csv", "test_dataset_test.csv")
)

In [None]:
# Stack the title/session columns of both splits into one frame.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the equivalent.
corpus_tag = pd.concat([train_df[["title", "session"]], test_df[["title", "session"]]])
# The per-split frames are no longer needed once they are combined.
del train_df
del test_df


In [None]:
import warnings
warnings.filterwarnings('ignore')
header_cat_l = []
meta_address_l = []

# Walk the (index label, session) pairs directly instead of enumerating a
# list under a name that the loop variable then overwrites; passing `total`
# lets tqdm show overall progress.
for art_id, session in tqdm(corpus_tag["session"].items(), total=len(corpus_tag)):
    header_cat, meta_address = meta_data_parser(session, art_id)
    header_cat_l.append(header_cat)
    meta_address_l.append(meta_address)
    

In [None]:
# Store the collected values as new columns, in row order.
corpus_tag = corpus_tag.assign(
    header_cat=header_cat_l,
    meta_address=meta_address_l,
)

In [None]:
# Attach the header category and then the meta address to each split, one
# column per merge, matching on index labels.
for _column in ("header_cat", "meta_address"):
    train_df_to_append = train_df_to_append.merge(
        corpus_tag[_column], left_index=True, right_index=True
    )
for _column in ("header_cat", "meta_address"):
    test_df_to_append = test_df_to_append.merge(
        corpus_tag[_column], left_index=True, right_index=True
    )

## Парсинг данных об авторах статьи

In [None]:
def authors_parser(session, art_id):
    """Download an RBC free-news page and return its author information.

    Author names are read from the ``span`` elements with class
    ``article__authors__author__name``. When the page has none, the
    ``content`` of the ``name`` meta tag inside the ``div`` with
    ``itemprop="author"`` is used instead.

    Args:
        session: Substring that is removed from ``art_id`` to get the id
            used in the URL.
        art_id: Row identifier that contains the article id.

    Returns:
        A list of strings (one per author span) when author spans exist;
        otherwise a single string from the fallback meta tag, or ``None``
        when that is absent as well.

    Raises:
        requests.exceptions.RequestException: If the request fails again
            after the single retry.
    """
    url_id = art_id.replace(session, "")
    full_url = "https://www.rbc.ru/rbcfreenews/" + url_id
    try:
        article = requests.get(full_url)
    except requests.exceptions.RequestException:
        # One retry after a pause; only network errors are retried so that
        # KeyboardInterrupt and programming errors are not swallowed.
        time.sleep(5.5)
        article = requests.get(full_url)
    # An explicit parser keeps the result independent of which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(article.text, "html.parser")

    name_spans = soup.find_all('span', {'class': "article__authors__author__name"})
    if name_spans:
        return [str(span.string) for span in name_spans]

    # Fallback: the author block's meta tag. A missing block or tag yields
    # None instead of aborting the whole scraping loop.
    author_block = soup.find('div', {'itemprop': "author"})
    if author_block is None:
        return None
    name_meta = author_block.find('meta', {'itemprop': "name"})
    if name_meta is None or name_meta.get('content') is None:
        return None
    return str(name_meta['content'])
    
    

In [None]:
# Load the train and test splits; the first CSV column becomes the index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("train_dataset_train.csv", "test_dataset_test.csv")
)

In [None]:
# Stack the title/session columns of both splits into one frame.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the equivalent.
corpus_tag = pd.concat([train_df[["title", "session"]], test_df[["title", "session"]]])
# The per-split frames are no longer needed once they are combined.
del train_df
del test_df
# np.log -> predict, then np.exp() on the prediction

In [None]:
import warnings
warnings.filterwarnings('ignore')
authors = []

# Walk the (index label, session) pairs directly instead of enumerating a
# list under a name that the loop variable then overwrites; passing `total`
# lets tqdm show overall progress.
for art_id, session in tqdm(corpus_tag["session"].items(), total=len(corpus_tag)):
    author = authors_parser(session, art_id)
    if isinstance(author, list):
        # Several names become one space-separated string; joining a
        # one-element list returns that element unchanged.
        author = ' '.join(author)
    authors.append(author)
    

In [None]:
# Store the collected author strings as a new column, in row order.
corpus_tag = corpus_tag.assign(new_authors=authors)

In [None]:
# Attach the author column to each split by matching index labels.
train_df_to_append = train_df_to_append.merge(
    corpus_tag["new_authors"], left_index=True, right_index=True
)
test_df_to_append = test_df_to_append.merge(
    corpus_tag["new_authors"], left_index=True, right_index=True
)

In [None]:
# Write the enriched splits to disk.
for _frame, _csv_path in (
    (train_df_to_append, "full_train.csv"),
    (test_df_to_append, "full_test.csv"),
):
    _frame.to_csv(_csv_path)