In [1]:
import lxml.html
import lxml.etree
import requests
from datetime import timedelta, datetime
from sqlalchemy import func, create_engine
from sqlalchemy import Column, Integer, Text, DateTime
from sqlalchemy.schema import Index
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

In [2]:
# Declarative base class shared by every ORM model defined in this notebook.
Base = declarative_base()


class News(Base):
    """ORM model for one scraped headline, stored in the ``news`` table."""

    __tablename__ = 'news'

    id = Column(Integer, primary_key=True)
    # The three payload columns are all required (NOT NULL) text values.
    title = Column(Text, nullable=False)
    category = Column(Text, nullable=False)
    # Kept as text rather than a date type; the scraping loop in this file
    # writes it in '%d.%m.%Y' form.
    date = Column(Text, nullable=False)

    def __init__(self, title, category, date):
        """Build a row from positional values; ``id`` is not set here."""
        self.title, self.category, self.date = title, category, date


def init_db(db_url):
    """Create the tables of all ``Base`` models and return a session factory.

    Args:
        db_url: database URL handed to ``create_engine``.

    Returns:
        A ``sessionmaker`` bound to the newly created engine; call it to
        obtain a session.
    """
    engine = create_engine(db_url)
    # Hand the engine to create_all() explicitly instead of assigning
    # `Base.metadata.bind` first: implicit "bound metadata" was deprecated in
    # SQLAlchemy 1.4 and removed in 2.0, where create_all() requires a bind.
    # The explicit form works on old and new releases alike. Sessions are
    # bound to the engine through the sessionmaker below.
    Base.metadata.create_all(engine)
    return sessionmaker(bind=engine)


# Notebook-wide session: init_db() returns the factory, the trailing call
# instantiates one session on the SQLite file 'ria.ru.db'.
db_session = init_db('sqlite:///ria.ru.db')()

In [3]:
def get(link, cat, timeout=30):
    """Download one ria.ru category page and parse it into an lxml tree.

    Args:
        link: path appended after the category; the caller in this file
            passes a date path such as '/20170313/'.
        cat: category slug placed directly after the host name.
        timeout: seconds handed to ``requests.get`` as its ``timeout``.
            Without it a stalled connection would block the whole scraping
            loop indefinitely.

    Returns:
        The root element produced by ``lxml.html.fromstring``.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP error status (raised by ``raise_for_status``).
    """
    url = 'https://ria.ru/' + cat + link
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx so an error page is not parsed as if it were a
    # (headline-free) archive page.
    response.raise_for_status()
    return lxml.html.fromstring(response.text)

In [5]:
def daterange(start_date, number_of_days):
    """Yield ``number_of_days`` values stepping back one day at a time.

    The first value equals ``start_date`` (offset 0), the next one is a day
    earlier, and so on. Nothing is yielded when ``number_of_days`` is zero
    or negative.
    """
    day_offsets = range(number_of_days)
    yield from (start_date - timedelta(days=back) for back in day_offsets)

In [6]:
# Walk back `amount_of_days` days from today for every category, download the
# category's archive page for each day and store every headline as a News row.
categories = ['politics', 'society', 'economy', 'world', 'incidents', 'science', 'culture', 'religion']
amount_of_days = 400
today = datetime.now()
total = 0

# (category, date) pairs that already have rows in the database. Re-running
# this cell (for example after an interrupted run) skips those days instead of
# downloading them again and inserting every headline a second time.
# Caveat: a day counts as stored as soon as one headline exists for it, so a
# partially published current day is not refreshed by later runs.
stored_days = {
    (category, date)
    for category, date in db_session.query(News.category, News.date).distinct()
}

for cat in categories:
    i = 0  # headlines added for this category during this run
    for d, day in enumerate(daterange(today, amount_of_days), start=1):
        date_label = day.strftime('%d.%m.%Y')
        if (cat, date_label) not in stored_days:
            try:
                tree = get(day.strftime('/%Y%m%d/'), cat)
            except Exception as e:
                # Best effort: report the failed day and move on to the next one.
                print('Exeption', e, 'on', cat, date_label)
                continue
            title_items = tree.xpath('//div[@class="b-list__item "]/a/span[@class="b-list__item-title"]/span')
            # Title spans without any text are ignored.
            day_news = [News(title.text, cat, date_label)
                        for title in title_items if title.text is not None]
            db_session.add_all(day_news)
            try:
                # One commit per day keeps each (category, date) all-or-nothing.
                db_session.commit()
            except Exception:
                # Roll back so the shared session stays usable by later cells,
                # then let the error surface as before.
                db_session.rollback()
                raise
            i += len(day_news)
        # Progress report every 50 days.
        if d % 50 == 0:
            print('+', i)
    total += i
    print(cat, total, '+', i)

+ 797
+ 1602
+ 2318
+ 3044
+ 3708
Exeption HTTPSConnectionPool(host='ria.ru', port=443): Max retries exceeded with url: /politics/20170313/ (Caused by SSLError(SSLError("bad handshake: SysCallError(-1, 'Unexpected EOF')",),)) on politics 13.03.2017
+ 4362
+ 5138
+ 5850
politics 5850 + 5850
+ 600
+ 1199
+ 1799
+ 2392
+ 2990
+ 3573
+ 4158
+ 4741
society 10591 + 4741
+ 557
+ 1115
+ 1653
Exeption HTTPSConnectionPool(host='ria.ru', port=443): Max retries exceeded with url: /economy/20170709/ (Caused by SSLError(SSLError("bad handshake: SysCallError(-1, 'Unexpected EOF')",),)) on economy 09.07.2017
+ 2195
+ 2712
+ 3262
+ 3813
+ 4382
economy 14973 + 4382
+ 1000
+ 2000
+ 3000
+ 4000
+ 5000
+ 6000
+ 7000
+ 8000
world 22973 + 8000
Exeption HTTPSConnectionPool(host='ria.ru', port=443): Max retries exceeded with url: /incidents/20171214/ (Caused by SSLError(SSLError("bad handshake: SysCallError(-1, 'Unexpected EOF')",),)) on incidents 14.12.2017
+ 968
+ 1952
+ 2939
+ 3922
+ 4855
+ 5819
Exeption HT

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Load the training set with ONE ordered query so each title is paired with
# the category of the very same row. Running the query twice, once for the
# features and once for the labels, loads the table twice and relies on both
# passes returning identical rows.
training_rows = db_session.query(News.title, News.category).order_by(News.id).all()
titles = [title.lower() for title, category in training_rows]
labels = [category for title, category in training_rows]

# Uni- and bigram counts -> TF-IDF weighting -> linear classifier trained
# with SGD.
text_clf = Pipeline([('vectorizer', CountVectorizer(max_df=0.2, ngram_range=(1,2))),
                     ('tfidf', TfidfTransformer(norm='l2')),
                     # Fixed seed: SGD shuffles the training data, so without
                     # it every re-run produces a slightly different model.
                     ('clf', SGDClassifier(penalty='l2', random_state=42))])

text_clf = text_clf.fit(titles, labels)

In [25]:
# Hand-picked sample headlines used as a quick sanity check of the trained
# classifier; the predicted category of each one is printed in order.
data = [
    'Ученые выяснили, когда люди больше всего интересуются сексом',
    'Патриарх Кирилл призвал верующих усилить молитву о мире на Украине',
    'Генсек ООН подтвердил право Эр-Рияда на самооборону',
    'Стали известны главные темы инвестфорума в Сочи',
    'Минфин: Украина не заплатила РФ по долгу, дефолт наступит 31 декабря',
    'Путин поздравил писателя Даниила Гранина с днем рождения',
]
predicted_categories = text_clf.predict(data)
print(predicted_categories)

['science' 'religion' 'world' 'economy' 'economy' 'culture']
