In [1]:
import pandas as pd
import numpy as np
import scipy.sparse

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import json

from datetime import datetime

import lightgbm as lgb
import gc

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load the raw article dump. Only the first 800000 rows are read, and rows with a
# wrong field count are skipped (and reported) instead of aborting the load.
csv_options = dict(quotechar='"', escapechar='\\', nrows=800000)
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` is its replacement with the same skip-and-report behaviour,
# so pick whichever keyword the installed pandas understands.
pandas_major_minor = tuple(int(part) for part in pd.__version__.split('.')[:2])
if pandas_major_minor >= (1, 3):
    csv_options['on_bad_lines'] = 'warn'
else:
    csv_options['error_bad_lines'] = False
news = pd.read_csv('gazeta.csv', **csv_options)

# Keep only rows that have a title, and only the two columns used downstream.
# The explicit copy makes the column assignment below write to an independent frame
# rather than to a slice of the raw data.
news = news.loc[news['title'].notna(), ['edition', 'title']].copy()
# Renumber the rows 0..n-1 and expose the same numbering as a stable `doc_id` column.
news = news.reset_index(drop=True)
news['doc_id'] = range(len(news))
news.head(3)

b'Skipping line 437258: expected 17 fields, saw 20\n'
b'Skipping line 591054: expected 17 fields, saw 20\n'
b'Skipping line 780302: expected 17 fields, saw 20\n'
b'Skipping line 793371: expected 17 fields, saw 20\n'
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,edition,title,doc_id
0,Бизнес,Госдума приняла сегодня в первом чтении и сраз...,0
1,Наука,Японские ученые из физического центра RIKEN за...,1
2,Армия,Россия готовится расширить свою военную операц...,2


In [3]:
# Count the articles per edition in the loaded data (before any per-edition subsampling).
news['edition'].value_counts()

Спорт         318908
Общество      163389
Бизнес        105377
Политика      100214
Культура       42290
Армия          25522
Наука          17258
Новости        13251
Стиль          12875
Мнения           586
Авто             314
Технологии        16
Name: edition, dtype: int64

In [4]:
# Subsample the corpus: for each edition listed below keep its first 3000 rows
# (fewer if the edition has fewer), stacked in exactly this order.
# Editions not listed here are dropped; the original index is kept as is.
selected_editions = [
    u'Спорт',
    u'Политика',
    u'Культура',
    u'Армия',
    u'Стиль',
    u'Общество',
    u'Бизнес',
    u'Новости',
    u'Наука',
]
per_edition_frames = [
    news.loc[news['edition'] == edition_name].iloc[:3000]
    for edition_name in selected_editions
]
news = pd.concat(per_edition_frames)

In [5]:
# Write the article catalogue (doc_id and title only) to articles.csv without the index column.
articles_export = news.loc[:, ['doc_id', 'title']]
articles_export.to_csv("articles.csv", index=None)

In [6]:
# Synthetic user base: ids u101000..u108999. The first 7000 users are labelled
# churn=0 and the last 1000 churn=1; `users` itself stays in this id order.
n_retained, n_churned = 7000, 1000
first_user_number = 1000
user_ids = [
    "u10{}".format(number)
    for number in range(first_user_number, first_user_number + n_retained + n_churned)
]
users = pd.DataFrame({'uid': user_ids, 'churn': [0] * n_retained + [1] * n_churned})
# Only the exported file is shuffled. The fixed seed makes the written row order
# identical on every run instead of changing with each execution.
users.sample(frac=1, random_state=42).to_csv("users_churn.csv", index=False)
users.head(3)

Unnamed: 0,uid,churn
0,u101000,0
1,u101001,0
2,u101002,0


In [7]:
# Re-check the per-edition counts now that `news` holds at most 3000 articles per selected edition.
news['edition'].value_counts()

Культура    3000
Стиль       3000
Политика    3000
Общество    3000
Наука       3000
Спорт       3000
Новости     3000
Бизнес      3000
Армия       3000
Name: edition, dtype: int64

In [8]:
def _doc_ids_for(edition_name):
    """Return the `doc_id` values (as an array) of every row of `news` in the given edition."""
    return news.loc[news['edition'] == edition_name, 'doc_id'].values


# One pool of article ids per edition; these pools are sampled from when the
# synthetic reading histories are generated.
sport = _doc_ids_for(u'Спорт')
politics = _doc_ids_for(u'Политика')
culture = _doc_ids_for(u'Культура')
business = _doc_ids_for(u'Бизнес')
army = _doc_ids_for(u'Армия')
society = _doc_ids_for(u'Общество')
news_ = _doc_ids_for(u'Новости')
science = _doc_ids_for(u'Наука')

In [9]:
# Display the subsampled frame. The concat that built it does not reset the index,
# so the index (and doc_id) keep the values assigned at load time and are not contiguous.
news

Unnamed: 0,edition,title,doc_id
6,Спорт,Заместитель председателяnправительства РФnСерг...,6
4896,Спорт,Матч 1/16 финала Кубка России по футболу был п...,4896
4897,Спорт,Форвард «Авангарда» Томаш Заборский прокоммент...,4897
4898,Спорт,Главный тренер «Кубани» Юрий Красножан прокомм...,4898
4899,Спорт,Решением попечительского совета владивостокско...,4899
...,...,...,...
513441,Наука,Ученые Токийского университета морских наук и ...,513441
513442,Наука,Главой кафедры отечественной истории XX века и...,513442
513443,Наука,Американские ученые уточнили возраст расположе...,513443
513444,Наука,За последние 50 лет тропический углеродный цик...,513444


In [11]:
# Synthetic reading histories: every user gets 5 articles from a "main" edition
# followed by 1 article from a "side" edition. Each plan row is
# (main pool of doc_ids, side pool of doc_ids, number of users with this profile).
# The order of the rows matters: `total` is later paired positionally with the
# uid sequence u101000..u108999, so the plan must add up to 8000 histories.
history_plan = [
    (sport, science, 2000),
    (politics, news_, 2000),
    (culture, society, 1000),
    (politics, society, 1000),
    (politics, army, 1000),
    (society, politics, 1000),
]

# A local, seeded generator makes the histories reproducible across runs without
# touching NumPy's global random state.
history_rng = np.random.RandomState(42)

total = []
for main_pool, side_pool, n_profile_users in history_plan:
    for _ in range(n_profile_users):
        # `choice` samples with replacement (its default), so a history may
        # occasionally contain the same main-edition article more than once.
        main_reads = history_rng.choice(main_pool, 5).tolist()
        side_read = history_rng.choice(side_pool, 1).tolist()
        total.append(main_reads + side_read)

In [12]:
# Export the reading histories: uid number k (u101000..u108999) is paired with the
# k-th entry of `total`. pandas raises ValueError if `total` does not hold exactly
# 8000 histories.
user_articles = pd.DataFrame({
    'uid': ["u10{}".format(i) for i in range(1000, 9000)],
    'articles': total,
})
# Shuffle the rows for the export. The fixed seed makes the written row order
# identical on every run instead of changing with each execution.
user_articles.sample(frac=1, random_state=42).to_csv("users_articles.csv", index=False)

In [13]:
# Spot check: look up a single article by doc_id. In the saved run the result is
# empty, i.e. this id is not part of the subsampled `news`.
news[news['doc_id']==492909]

Unnamed: 0,edition,title,doc_id


In [14]:
# Preview the uid -> articles table in shuffled row order. This draws a fresh
# shuffle each time, so the order shown is independent of any earlier shuffle.
preview_uids = ["u10" + str(number) for number in range(1000, 9000)]
preview_table = pd.DataFrame({'uid': preview_uids, 'articles': total})
preview_table.sample(frac=1)

Unnamed: 0,uid,articles
4263,u105263,"[295284, 294492, 290056, 294886, 293817, 1093]"
1873,u102873,"[5780, 5117, 7010, 7823, 6333, 513172]"
6613,u107613,"[323409, 324675, 323532, 324015, 322666, 470715]"
7198,u108198,"[2579, 1864, 2312, 703, 2540, 322150]"
5301,u106301,"[323951, 322288, 323440, 323377, 323651, 2416]"
...,...,...
7934,u108934,"[3322, 2279, 1202, 1819, 666, 324212]"
4646,u105646,"[295249, 293955, 293981, 295057, 294737, 620]"
2274,u103274,"[324294, 323729, 322282, 322723, 322368, 513324]"
3977,u104977,"[324098, 322637, 322263, 323564, 323256, 519863]"
