# GeekBrains University
## Methods of data collection

## Lesson 4
### Parsing HTML. XPath

### The Task 
Написать приложение, которое собирает основные новости с сайтов mail.ru, lenta.ru, yandex-новости.    
    
Для парсинга использовать xpath. Структура данных должна содержать:    
•	название источника,    
•	наименование новости,    
•	ссылку на новость,    
•	дата публикации    

In [365]:
from lxml import html
import requests
from pprint import pprint
import datetime

import pandas as pd
import numpy as np
import re
from pymongo import MongoClient

In [347]:
headers = {'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}

In [348]:
client = MongoClient( 'localhost' , 27017 )
db = client['news']

## news.mail.ru

### Get list of news links from news.mail.ru

In [349]:
def get_mail_news():
    main_link = 'https://news.mail.ru/'
    response = requests.get(main_link, headers= headers)
    root = html.fromstring(response.text)

    topics = ['politics', 'economics', 'society', 'incident'] # Topics list for news-mail.ru

    # Get all news links
    data = []
    for topic in topics:
        links = f"//a[contains(@href, '{topic}')]/@href"
        links = root.xpath(links) # list of news links on the topic
        links = [main_link + x for x in links if re.search('\d/$', x)] # Rid off not news links and add main part
        data += links
    
    return(data)

### Get neccessary info on the news link (article)

In [350]:
def mail_news_name_source_date(link):
    main_link = link
    response = requests.get(link, headers= headers)
    root = html.fromstring(response.text)
    
    info = {'link': link}

    date = "//span[contains(@class, 'breadcrumbs__item')]//span[@datetime]/@datetime"
    info['date'] = root.xpath(date)[0] # news date

    source = "//span[contains(@class, 'breadcrumbs__item')]//span/a//text()"
    info['source'] = root.xpath(source)[0] # Name of the source

    name = "//div[contains(@class, 'article js-article js-module')]//span[contains(@class, 'hdr__text')]//text()"
    info['name'] = root.xpath(name)[0] # Name of the article

    return info

### Main loop for news.mail.ru

In [351]:
info = [*map(mail_news_name_source_date, get_mail_news())]
collection = db.news_mail_ru
collection.insert_many(info)

<pymongo.results.InsertManyResult at 0x1f47d31ba08>

In [352]:
info[:3]

[{'link': 'https://news.mail.ru//politics/41433436/',
  'date': '2020-04-17T17:02:11+03:00',
  'source': 'ТАСС',
  'name': 'Путин заявил, что проблема распространения коронавируса уходит из Москвы в другие регионы',
  '_id': ObjectId('5e99f845bc26fad04b35b3c6')},
 {'link': 'https://news.mail.ru//politics/41435438/',
  'date': '2020-04-17T17:58:45+03:00',
  'source': 'Коммерсантъ',
  'name': '«Скрытность COVID-19 — тайная сила этого противника»',
  '_id': ObjectId('5e99f845bc26fad04b35b3c7')},
 {'link': 'https://news.mail.ru//politics/41435438/',
  'date': '2020-04-17T17:58:45+03:00',
  'source': 'Коммерсантъ',
  'name': '«Скрытность COVID-19 — тайная сила этого противника»',
  '_id': ObjectId('5e99f845bc26fad04b35b3c8')}]

## lenta.ru

### Get list of news links from lenta.ru

In [353]:
def get_lenta_news():
    main_link = 'https://lenta.ru/'
    response = requests.get(main_link, headers= headers)
    root = html.fromstring(response.text)

    # Get all news links
    
    links = f"//a[contains(@href, '/news/')]/@href"
    links = root.xpath(links) # list of news links on the topic
    links = [main_link + x for x in links if re.search('\/\d{4}\/\d{2}\/\d{2}\/', x)] # Rid off not news links and add main part
    
    return(links)

### Get neccessary info on the news link (article)

In [354]:
def lenta_news_name_source_date(link):
    main_link = link
    response = requests.get(link, headers= headers)
    root = html.fromstring(response.text)
    
    info = {'link': link}
    
    top = root.xpath(".//div[contains(@class, 'b-topic__header js-topic__header')]")[0]

    date = ".//time[contains(@class, 'g-date')]/@datetime"
    info['date'] = top.xpath(date)[0] # News date

    info['source'] ="lenta.ru" # lenta.ru is an owner

    name = "//h1[contains(@class, 'b-topic__title')]//text()"
    info['name'] = top.xpath(name)[0] # Name of the article

    return info

### Main loop for lenta.ru

In [355]:
info = [*map(lenta_news_name_source_date, get_lenta_news())]
collection = db.news_lenta_ru
collection.insert_many(info)

<pymongo.results.InsertManyResult at 0x1f47ff97e88>

In [356]:
info[:3]

[{'link': 'https://lenta.ru//news/2020/04/17/gref/',
  'date': '2020-04-17T09:00:00+03:00',
  'source': 'lenta.ru',
  'name': 'Задайте вопрос Грефу для эксклюзивного интервью на\xa0ресурсах Rambler Group',
  '_id': ObjectId('5e99f8c1bc26fad04b35b3e2')},
 {'link': 'https://lenta.ru//news/2020/04/17/gref/',
  'date': '2020-04-17T09:00:00+03:00',
  'source': 'lenta.ru',
  'name': 'Задайте вопрос Грефу для эксклюзивного интервью на\xa0ресурсах Rambler Group',
  '_id': ObjectId('5e99f8c1bc26fad04b35b3e3')},
 {'link': 'https://lenta.ru//news/2020/04/17/effect/',
  'date': '2020-04-17T21:32:00+03:00',
  'source': 'lenta.ru',
  'name': 'Собянин оценил эффект от\xa0введения пропускного режима в\xa0Москве',
  '_id': ObjectId('5e99f8c1bc26fad04b35b3e4')}]

## yandex.ru/news/

### Get all info from yandex.ru/news/ (all info we can find from this page directly)

In [392]:
def get_yandex_news():
    main_link = 'https://yandex.ru/news/'
    response = requests.get(main_link, headers= headers)
    root = html.fromstring(response.text)

    # Get all news links
    
    news = f"//a[contains(@href, '/news/story/')]"
    news = root.xpath(news) # list of news
    
    data = []
    for n in news:
        info = {}
        
        link = n.xpath("./@href")
        info['link'] = main_link[:-6] + link[0]
        
        grandpa = n.xpath("./../../..")[0]
        text = grandpa.xpath(".//div[contains(@class, 'story__date')]//text()")[0]
        
        time = text[-5:]
        info['date'] = datetime.datetime.combine(datetime.date.today(), datetime.datetime.strptime(time, "%H:%M").time())

        info['source'] = text[:-5]

        data.append(info)
    return(data)

### Main loop for yandex/news

In [393]:
info = [*get_yandex_news()]
collection = db.news_yandex_ru
collection.insert_many(info)

<pymongo.results.InsertManyResult at 0x1f47d41f0c8>

In [394]:
info[:3]

[{'link': 'https://yandex.ru/news/story/Sobyanin_dopustil_uzhestochenie_propusknogo_rezhima--11d80bdd961d62432f2abecd8dcb6025?lr=213&lang=ru&stid=8_2R6LHWuktFJD0bXIYR&persistent_id=94813076&rubric=index&from=index',
  'date': datetime.datetime(2020, 4, 17, 20, 42),
  'source': 'РИА Новости ',
  '_id': ObjectId('5e99fd39bc26fad04b35b5c4')},
 {'link': 'https://yandex.ru/news/story/Sobyanin_dopustil_uzhestochenie_propusknogo_rezhima--11d80bdd961d62432f2abecd8dcb6025?lr=213&lang=ru&stid=8_2R6LHWuktFJD0bXIYR&persistent_id=94813076&rubric=index&from=index&comments=1',
  'date': datetime.datetime(2020, 4, 17, 20, 42),
  'source': 'РИА Новости ',
  '_id': ObjectId('5e99fd39bc26fad04b35b5c5')},
 {'link': 'https://yandex.ru/news/story/Nazvany_obyazatelnye_usloviya_dlya_peredvizheniya_po_Moskve_peshkom--2c620e5439b953764399601a27affb4c?lr=213&lang=ru&stid=5LnIyZgRGX62u8wp3I-F&persistent_id=94341741&rubric=index&from=index',
  'date': datetime.datetime(2020, 4, 17, 20, 3),
  'source': 'Известия ',