# Homework 4

## Pavel Belov

In [5]:
import requests
from lxml import html
from abc import ABC, abstractmethod
from datetime import datetime
from pprint import pprint
from pymongo import MongoClient, errors
import zlib



## Constants

In [14]:
# User-Agent string used by the parsers in their request headers

DEFAULT_HEADER = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
    'like Gecko) Chrome/83.0.4103.97 Safari/537.36'
)

# Mail.ru: site root, HTML search page and the search endpoint queried as JSON

MAIL_DOMAIN_URL = 'https://news.mail.ru'
MAIL_SEARCH_URL = MAIL_DOMAIN_URL + '/search'
MAIL_API_URL = MAIL_DOMAIN_URL + '//najax/search'

# Yandex: site root and search page

YANDEX_DOMAIN_URL = 'https://newssearch.yandex.ru'
YANDEX_SEARCH_URL = YANDEX_DOMAIN_URL + '/yandsearch'

# Lenta: site root and search endpoint

LENTA_DOMAIN_URL = 'https://m.lenta.ru'
LENTA_SEARCH_URL = LENTA_DOMAIN_URL + '/search/v2/process'

## Mail.ru

In [10]:
class MailRuParser():
    """Collect news search results from news.mail.ru.

    ``get_news`` parses the HTML held in ``self.response``;
    ``get_news_from_pages_range`` reads further pages from the JSON search
    endpoint. Every article is a dict with the keys ``link``, ``source``,
    ``title`` and ``published_at`` (in that order).
    """

    def __init__(self, params):
        """Prepare the request settings for a search.

        :param params: search query text, sent as the ``q`` request parameter.
        """
        self.url = MAIL_SEARCH_URL
        print(self.url)
        # Build the query from the argument. The previous code read a
        # notebook-level ``query_string`` global and ignored this parameter,
        # which fails with NameError wherever that global is not defined.
        self.params = {'q': params}
        self.header = {
            'User-Agent': DEFAULT_HEADER,
            'Cache-Control': 'private, no-cache, no-store'
        }
        self.response = None
        self.dom = None

    @staticmethod
    def _first_or_none(values):
        """Return the first XPath match, or None when nothing matched."""
        return values[0] if values else None

    def fetch_data(self, url=None):
        """Perform a GET request and keep the response in ``self.response``.

        :param url: optional one-off URL, requested as-is (``self.params`` is
            not sent). When omitted, ``self.url`` is requested with
            ``self.params``.
        :raises Exception: if neither ``url`` nor ``self.url`` is set.
        """
        if self.url is None and url is None:
            raise Exception("No 'url' parameter provided during initialization")

        if url is not None:
            # Leave self.url untouched: overwriting it made every later
            # parameterless fetch_data() request the one-off page URL.
            self.response = requests.get(url=url, headers=self.header)
            return

        self.response = requests.get(url=self.url, headers=self.header, params=self.params)

    def get_dom(self):
        """Parse the last response body as HTML and return the root element.

        :raises Exception: if ``fetch_data`` has not been called yet.
        """
        if self.response is None:
            raise Exception("You need to fetch_data first")
        self.dom = html.fromstring(self.response.text)
        return self.dom

    def get_news(self):
        """Return the articles found in the HTML of the last response."""
        # Key order is kept stable because the stored id is derived from
        # repr() of the article dict.
        return [
            {
                'link': self.get_link_from_node(node),
                'source': self.get_source_from_node(node),
                'title': self.get_title_from_node(node),
                'published_at': self.get_publish_date_from_node(node)
            }
            for node in self.get_nodes_list()
        ]

    def get_news_from_pages_range(self, pages_range):
        """Fetch each page number in ``pages_range`` from the JSON endpoint.

        :param pages_range: iterable of page numbers.
        :return: concatenated list of article dicts.
        """
        total_news = []
        for page_number in pages_range:
            self.fetch_data(self.get_page_link(page_number))
            total_news += self.get_api_news()

        return total_news

    def get_api_news(self):
        """Convert the JSON body of the last response into article dicts."""
        data = self.response.json()
        news_list = []
        for item in data['data']['items']:
            article = {
                'link': f'{MAIL_DOMAIN_URL}{item["url"]}',
                'source': item['source']['name'],
                'title': item['title'],
                'published_at': datetime.fromisoformat(item['published']['rfc3339'])
            }
            news_list.append(article)

        return news_list

    def get_nodes_list(self):
        """Return the news item nodes of the parsed HTML page."""
        dom = self.get_dom()
        return dom.xpath("//div[@class='paging js-module']//div[@class='paging__content js-pgng_cont']/\
        div[@class='newsitem newsitem_height_fixed js-ago-wrapper js-pgng_item']")

    def get_page_link(self, page_number):
        """Build the JSON endpoint URL for the given page of the current query."""
        from urllib.parse import urlencode

        # Encode the query so characters such as '&', '#' or spaces cannot
        # break the query string.
        query = urlencode({'q': self.params['q'], 'page': page_number})
        return f'{MAIL_API_URL}/?{query}'

    def get_source_from_node(self, node):
        """Return the source name of a news item, or None when it is absent."""
        return self._first_or_none(node.xpath(".//span[@class='newsitem__param'][1]/text()"))

    def get_title_from_node(self, node):
        """Return the headline text of a news item, or None when it is absent."""
        # The title is the anchor text; this expression used to be in
        # get_link_from_node (the two were swapped).
        return self._first_or_none(node.xpath("./span[@class='cell']/a/span/text()"))

    def get_link_from_node(self, node):
        """Return the href of a news item, or None when it is absent."""
        return self._first_or_none(node.xpath("./span[@class='cell']/a/@href"))

    def get_publish_date_from_node(self, node):
        """Return the publication time as a datetime, or None when it is absent."""
        published_at_string = self._first_or_none(
            node.xpath(".//div[@class='newsitem__params']/span[@class='newsitem__param js-ago']/@datetime")
        )
        return datetime.fromisoformat(published_at_string) if published_at_string is not None else None


## Yandex

In [7]:
class YandexParser():
    """Collect news search results from the Yandex news search pages.

    Every article is a dict with the keys ``link``, ``source``, ``title`` and
    ``published_at`` (in that order); ``published_at`` is the text taken from
    the page, it is not converted to a datetime.
    """

    def __init__(self, query):
        """Prepare the request settings for a search.

        :param query: search text, sent as the ``text`` request parameter.
        """
        self.url = YANDEX_SEARCH_URL
        self.params = {
             'text': query,
             'rpt': 'nnews2',
             'wiz_no_news': 1,
             'rel': 'rel'
        }
        self.header = {
            'User-Agent': DEFAULT_HEADER,
            'Cache-Control': 'private, no-cache, no-store'
        }
        self.response = None
        self.dom = None

    @staticmethod
    def _first_or_none(values):
        """Return the first XPath match, or None when nothing matched."""
        return values[0] if values else None

    def fetch_data(self, url=None):
        """Perform a GET request and keep the response in ``self.response``.

        :param url: optional one-off URL, requested as-is (``self.params`` is
            not sent). When omitted, ``self.url`` is requested with
            ``self.params``.
        :raises Exception: if neither ``url`` nor ``self.url`` is set.
        """
        if self.url is None and url is None:
            raise Exception("No 'url' parameter provided during initialization")

        if url is not None:
            # Leave self.url untouched: overwriting it made a later
            # parameterless fetch_data() start from the last visited page.
            self.response = requests.get(url=url, headers=self.header)
            return

        self.response = requests.get(url=self.url, headers=self.header, params=self.params)

    def get_dom(self):
        """Parse the last response body as HTML and return the root element.

        :raises Exception: if ``fetch_data`` has not been called yet.
        """
        if self.response is None:
            raise Exception("You need to fetch_data first")
        self.dom = html.fromstring(self.response.text)
        return self.dom

    def get_news(self):
        """Return the articles found in the HTML of the last response."""
        # Key order is kept stable because the stored id is derived from
        # repr() of the article dict.
        return [
            {
                'link': self.get_link_from_node(node),
                'source': self.get_source_from_node(node),
                'title': self.get_title_from_node(node),
                'published_at': self.get_publish_date_from_node(node)
            }
            for node in self.get_nodes_list()
        ]

    def get_news_pages(self, number_of_pages):
        """Read up to ``number_of_pages`` result pages, starting from the first.

        Follows the pager link of each fetched page and stops early when a
        page has no such link.

        :param number_of_pages: maximum total number of pages to read,
            including the first one.
        :return: concatenated list of article dicts.
        """
        total_news = []
        if number_of_pages <= 0:
            return total_news

        self.fetch_data()
        total_news += self.get_news()

        # The first page is already read, so only number_of_pages - 1
        # follow-ups remain (the old loop read one page too many).
        for page_index in range(1, number_of_pages):
            next_page_link = self.get_page_link(page_index)
            if next_page_link is None:
                break
            self.fetch_data(next_page_link)
            total_news += self.get_news()

        return total_news

    def get_nodes_list(self):
        """Return the result list items of the parsed HTML page."""
        dom = self.get_dom()
        return dom.xpath("//ul[@class='search-list']/li")

    def get_page_link(self, page_number):
        """Return the absolute pager link found on the current page, or None.

        :param page_number: unused; the link is read from the current page.
        """
        dom = self.get_dom()
        link_tail = self._first_or_none(
            dom.xpath("//div[@class='pager__content']//span[@class='pager__group'][last()]/a/@href")
        )
        if link_tail is None:
            return None
        return f'{YANDEX_DOMAIN_URL}{link_tail}'

    def get_source_from_node(self, node):
        """Return the provider name of a result item, or None when it is absent."""
        return self._first_or_none(
            node.xpath(".//div[@class='document i-bem']//div[@class='document__provider-name']/text()")
        )

    def get_title_from_node(self, node):
        """Return the headline text of a result item, or None when it is absent."""
        return self._first_or_none(
            node.xpath(".//div[@class='document i-bem']//h2[@class='document__head']//a/text()")
        )

    def get_link_from_node(self, node):
        """Return the href of a result item, or None when it is absent."""
        return self._first_or_none(
            node.xpath(".//div[@class='document i-bem']//h2[@class='document__head']//a/@href")
        )

    def get_publish_date_from_node(self, node):
        """Return the time text of a result item as found on the page, or None."""
        return self._first_or_none(
            node.xpath(".//div[@class='document i-bem']//div[@class='document__time']/text()")
        )

## Lenta.ru

In [8]:
class LentaParser():
    """Collect news search results from the Lenta search endpoint (JSON).

    Every article is a dict with the keys ``link``, ``source``, ``title`` and
    ``published_at`` (in that order); ``source`` is always None.
    """

    def __init__(self, query):
        """Prepare the request settings for a search.

        :param query: search text, sent as the ``query`` request parameter.
        """
        self.url = LENTA_SEARCH_URL
        self.params = {
             'query': query,
             'from': 0,
             'size': 100,
             'sort': 2,
             'title_only': 0,
             'domain': 1
         }
        self.header = {
            'User-Agent': DEFAULT_HEADER,
            'Cache-Control': 'private, no-cache, no-store'
        }
        self.response = None
        self.dom = None

    def fetch_data(self, url=None):
        """Perform a GET request and keep the response in ``self.response``.

        :param url: optional one-off URL, requested as-is (``self.params`` is
            not sent). When omitted, ``self.url`` is requested with
            ``self.params``.
        :raises Exception: if neither ``url`` nor ``self.url`` is set.
        """
        if self.url is None and url is None:
            raise Exception("No 'url' parameter provided during initialization")

        if url is not None:
            # Leave self.url untouched so that later parameterless calls
            # still query the search endpoint.
            self.response = requests.get(url=url, headers=self.header)
            return

        self.response = requests.get(url=self.url, headers=self.header, params=self.params)

    def get_dom(self):
        """Parse the last response body as HTML and return the root element.

        :raises Exception: if ``fetch_data`` has not been called yet.
        """
        if self.response is None:
            raise Exception("You need to fetch_data first")
        self.dom = html.fromstring(self.response.text)
        return self.dom

    def get_news(self):
        """Return the articles found in the JSON body of the last response."""
        # Key order is kept stable because the stored id is derived from
        # repr() of the article dict.
        return [
            {
                'link': self.get_link_from_node(node),
                'source': self.get_source_from_node(node),
                'title': self.get_title_from_node(node),
                'published_at': self.get_publish_date_from_node(node)
            }
            for node in self.get_nodes_list()
        ]

    def get_news_pages(self, pages_range):
        """Fetch each page index in ``pages_range`` and collect its articles.

        The request offset ``self.params['from']`` is set to
        ``page_number * size`` before each request and keeps its last value
        afterwards.

        :param pages_range: iterable of zero-based page indexes.
        :return: concatenated list of article dicts.
        """
        total_news = []
        for page_number in pages_range:
            self.params['from'] = page_number * self.params['size']
            self.fetch_data()
            total_news += self.get_news()

        return total_news

    def get_nodes_list(self):
        """Return the ``matches`` list of the last JSON response.

        :return: the list, or an empty list when the key is absent.
        :raises Exception: if ``fetch_data`` has not been called yet.
        """
        if self.response is None:
            raise Exception("You need to fetch_data first")
        return self.response.json().get('matches', [])

    def get_page_link(self, page_number):
        """Not used: paging is done through the ``from`` request parameter."""
        pass

    def get_source_from_node(self, node):
        """Return None; no source is extracted for these items."""
        return None

    def get_title_from_node(self, node):
        """Return the item's ``title`` value, or None when it is absent."""
        return node.get('title')

    def get_link_from_node(self, node):
        """Return the item's ``url`` value, or None when it is absent."""
        return node.get('url')

    def get_publish_date_from_node(self, node):
        """Return ``pubdate`` converted with ``datetime.fromtimestamp``, or None."""
        timestamp = node.get('pubdate')
        return datetime.fromtimestamp(timestamp) if timestamp else None

## Main code

In [11]:
# MongoDB handles: client -> 'news_db' database -> 'news' collection
client = MongoClient(host='localhost', port=27017)
db = client.news_db
news_db = db['news']


def make_hash(any_dict):
    """Return the Adler-32 checksum of the UTF-8 encoded ``repr`` of the argument."""
    encoded = repr(any_dict).encode('utf-8')
    return zlib.adler32(encoded)


def save_news_to_db(news_list):
    """Insert every article into the ``news_db`` collection, skipping duplicates.

    Each article gets an ``_id`` computed by ``make_hash`` from its content;
    an article whose ``_id`` already exists in the collection is reported on
    stdout and not inserted again.

    :param news_list: iterable of article dicts; each dict receives an
        ``_id`` key as a side effect.
    """
    for article in news_list:
        # Hash the content without any "_id" left by an earlier call;
        # otherwise saving the same list twice yields different ids and the
        # duplicates are inserted again.
        content = {key: value for key, value in article.items() if key != "_id"}
        article["_id"] = make_hash(content)

        try:
            news_db.insert_one(article)
        except errors.DuplicateKeyError:
            print("Duplicate found for article: ", article)


In [12]:
# The empty filter matches every document, so this empties the collection
news_db.delete_many({})

<pymongo.results.DeleteResult at 0x7f7bbe419cc8>

In [16]:


query_string = 'наука'

# news.mail.ru/
# The HTML search page is fetched once for get_news(); the following pages
# are requested by get_news_from_pages_range() itself.
mail_ru_parser = MailRuParser(query_string)
mail_ru_parser.fetch_data()
mail_ru_news_from_page = mail_ru_parser.get_news()
mail_ru_news_from_api = mail_ru_parser.get_news_from_pages_range(range(2, 12))

# yandex.ru/news/
# No explicit fetch_data() here: get_news_pages() requests the first page
# itself, so a separate call only repeated the HTTP request.

yandex_parser = YandexParser(query_string)
yandex_news = yandex_parser.get_news_pages(3)

# lenta.ru/
# Same as above: get_news_pages() performs every request it needs.

lenta_parser = LentaParser(query_string)
lenta_news = lenta_parser.get_news_pages(range(0, 2))

result_news = yandex_news + lenta_news + mail_ru_news_from_page + mail_ru_news_from_api

save_news_to_db(result_news)


https://news.mail.ru/search


In [17]:


def get_all_news():
    """Return every document of the ``news_db`` collection as a list."""
    cursor = news_db.find({})
    return list(cursor)

In [18]:
# Show the first five stored articles
pprint(get_all_news()[:5])

[{'_id': 1394723968,
  'link': 'https://www.pnp.ru/social/akademik-ran-soobshhil-chto-pandemiya-koronavirusa-mozhet-dlitsya-ot-goda-do-tryokh-let.html',
  'published_at': '14\xa0июня\xa0в\xa001:22',
  'source': 'Парламентская газета',
  'title': 'Академик РАН сообщил, что пандемия коронавируса может длиться от '
           'года до трёх лет'},
 {'_id': 239513037,
  'link': 'https://360tv.ru/news/obschestvo/koronavirus-mozhet-mutirovat-v-bolee-agressivnye-formy-akademik-ran/',
  'published_at': '14\xa0июня\xa0в\xa004:35',
  'source': 'Телеканал 360°',
  'title': 'Коронавирус может мутировать в более агрессивные формы — академик '
           'РАН'},
 {'_id': 3770663438,
  'link': 'https://tsargrad.tv/news/vakcina-na-skorost-jeto-prestupno-ukol-ot-koronavirusa-raskritikoval-doktor-medicinskih-nauk_260564',
  'published_at': '11:00',
  'source': 'Царьград',
  'title': 'Вакцина на скорость - это преступно: «укол от коронавируса» '
           'раскритиковал доктор медицинских '},
 {'_id': 22