In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

import scipy

%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
# Read the training and test splits from the local ./data directory.
train_path, test_path = './data/howpop_train.csv', './data/howpop_test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Show the first training row transposed, so every column is listed on its own line.
train_df.head(1).T

Unnamed: 0,0
url,https://habrahabr.ru/post/18284/
domain,habrahabr.ru
post_id,18284
published,2008-01-01 18:19:00
author,@Tapac
flow,develop
polling,False
content_len,4305
title,Новогодний подарок блоггерам — WordPress 2.3.2
comments,0


In [4]:
# (rows, columns) of the training frame and of the test frame.
train_df.shape, test_df.shape

((134137, 17), (3990, 9))

In [None]:
# Plot publication timestamps (integer nanoseconds since the epoch) against row order.
# The column is parsed once, vectorized, instead of calling pd.to_datetime per row.
published_ns = (
    pd.to_datetime(train_df['published'])
    .astype('datetime64[ns]')  # pin nanosecond resolution so values match Timestamp.value
    .astype('int64')
)
published_ns.plot();

In [6]:
# Pairwise Pearson correlation of the numeric and boolean columns.
# Text columns (url, author, title, ...) are dropped explicitly: recent pandas
# versions no longer skip them silently and raise when they cannot be cast to float.
train_df.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,post_id,polling,content_len,comments,favs,views,votes_plus,votes_minus,views_lognorm,favs_lognorm,comments_lognorm
post_id,1.0,0.081628,0.241384,-0.147763,0.089313,0.132385,-0.187885,-0.235594,-0.010376,-0.005845,-0.001605
polling,0.081628,1.0,-0.003337,0.042605,0.006882,0.02566,-0.033505,0.027116,-0.003523,-0.036385,0.071417
content_len,0.241384,-0.003337,1.0,-0.023544,0.308194,0.204101,0.068779,-0.078686,0.246063,0.356481,0.073132
comments,-0.147763,0.042605,-0.023544,1.0,0.164166,0.290035,0.613961,0.457638,0.349568,0.278942,0.66274
favs,0.089313,0.006882,0.308194,0.164166,1.0,0.634304,0.416241,0.062877,0.456097,0.587982,0.263239
views,0.132385,0.02566,0.204101,0.290035,0.634304,1.0,0.396849,0.128654,0.585105,0.406782,0.326427
votes_plus,-0.187885,-0.033505,0.068779,0.613961,0.416241,0.396849,1.0,0.464168,0.414232,0.449712,0.525081
votes_minus,-0.235594,0.027116,-0.078686,0.457638,0.062877,0.128654,0.464168,1.0,0.146609,0.088226,0.355458
views_lognorm,-0.010376,-0.003523,0.246063,0.349568,0.456097,0.585105,0.414232,0.146609,1.0,0.688811,0.54653
favs_lognorm,-0.005845,-0.036385,0.356481,0.278942,0.587982,0.406782,0.449712,0.088226,0.688811,1.0,0.479476


In [12]:
# Convert the 'published' column of both frames to datetime values.
for frame in (train_df, test_df):
    frame['published'] = pd.to_datetime(frame['published'])

In [17]:
# Columns used as model inputs.
features = ['author', 'flow', 'domain', 'title']
# The first 70% of the rows form the training part; the rest is held out for validation.
train_size = int(0.7 * len(train_df))

In [18]:
# Total number of labelled rows vs. the number assigned to the training part.
len(train_df), train_size

(134137, 93895)

In [101]:
# Target: the 'favs_lognorm' column of the training frame.
y = train_df['favs_lognorm']
# Inputs: the selected feature columns of the training and test frames.
X = train_df.loc[:, features]
X_test = test_df.loc[:, features]

In [20]:
# Positional split: the first `train_size` rows train the model, the remainder validates it.
train_rows, valid_rows = slice(None, train_size), slice(train_size, None)
X_train, X_valid = X.iloc[train_rows], X.iloc[valid_rows]
y_train, y_valid = y.iloc[train_rows], y.iloc[valid_rows]

In [23]:
# Word-level TF-IDF over titles using 1- to 3-grams; terms seen in fewer than
# 3 titles or in more than 30% of titles are discarded.
vectorizer_title = TfidfVectorizer(ngram_range=(1, 3), min_df=3, max_df=0.3)

# The vocabulary is learned on the training part only and then reused as-is.
X_train_title = vectorizer_title.fit_transform(X_train['title'])
X_valid_title, X_test_title = (
    vectorizer_title.transform(part['title']) for part in (X_valid, X_test)
)

In [30]:
# Number of distinct word n-grams kept by the word-level vectorizer.
len(vectorizer_title.vocabulary_)

50624

In [32]:
# Column index assigned to the token 'python' (.get yields None if the token is absent).
vectorizer_title.vocabulary_.get('python')

9065

In [33]:
# Character-level TF-IDF over titles (1- to 3-character n-grams), with the same
# document-frequency cut-offs as the word-level vectorizer.
vectorizer_title_ch = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.3,
)

# Fit on the training part only, then apply the fitted vocabulary to the other parts.
X_train_title_ch = vectorizer_title_ch.fit_transform(X_train['title'])
X_valid_title_ch, X_test_title_ch = (
    vectorizer_title_ch.transform(part['title']) for part in (X_valid, X_test)
)

In [34]:
# Number of distinct character n-grams kept by the character-level vectorizer.
len(vectorizer_title_ch.vocabulary_)

32839

In [35]:
# Categorical columns that will be one-hot encoded.
feats = ['author', 'flow', 'domain']
# Peek at the first five training rows of those columns.
X_train[feats].head(5)

Unnamed: 0,author,flow,domain
0,@Tapac,develop,habrahabr.ru
1,@DezmASter,design,habrahabr.ru
2,@DezmASter,design,habrahabr.ru
3,@Taoorus,design,habrahabr.ru
4,@dennydo,,geektimes.ru


In [37]:
# First, fill the missing values with a dash
X_train[feats].head(5).fillna('-')

Unnamed: 0,author,flow,domain
0,@Tapac,develop,habrahabr.ru
1,@DezmASter,design,habrahabr.ru
2,@DezmASter,design,habrahabr.ru
3,@Taoorus,design,habrahabr.ru
4,@dennydo,-,geektimes.ru


In [38]:
# Convert the dataframe into a dict whose keys are the row indices (this is exactly
# why the dataframe is transposed) and whose values are dicts of the form
# 'column_name': 'value'
sample_filled = X_train[feats].head(5).fillna('-')
sample_filled.T.to_dict()

{0: {'author': '@Tapac', 'flow': 'develop', 'domain': 'habrahabr.ru'},
 1: {'author': '@DezmASter', 'flow': 'design', 'domain': 'habrahabr.ru'},
 2: {'author': '@DezmASter', 'flow': 'design', 'domain': 'habrahabr.ru'},
 3: {'author': '@Taoorus', 'flow': 'design', 'domain': 'habrahabr.ru'},
 4: {'author': '@dennydo', 'flow': '-', 'domain': 'geektimes.ru'}}

In [40]:
# DictVectorizer needs a sequence with one 'column_name': 'value' dict per object,
# so we take .values() of the index-keyed dict
sample_by_index = X_train[feats].head(5).fillna('-').T.to_dict()
sample_by_index.values()

dict_values([{'author': '@Tapac', 'flow': 'develop', 'domain': 'habrahabr.ru'}, {'author': '@DezmASter', 'flow': 'design', 'domain': 'habrahabr.ru'}, {'author': '@DezmASter', 'flow': 'design', 'domain': 'habrahabr.ru'}, {'author': '@Taoorus', 'flow': 'design', 'domain': 'habrahabr.ru'}, {'author': '@dennydo', 'flow': '-', 'domain': 'geektimes.ru'}])

In [41]:
# The result is a sparse matrix
dict_vect = DictVectorizer()
sample_records = X_train[feats].head(5).fillna('-').T.to_dict().values()
dict_vect_matrix = dict_vect.fit_transform(sample_records)
dict_vect_matrix

<5x9 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [42]:
# But it can be converted to a numpy array with .toarray()
dict_vect_matrix.toarray()

array([[0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 0., 0., 1., 1., 0., 1., 0., 0.]])

In [43]:
# The resulting matrix has 5 rows (one per object) and 9 columns
# Next we work out why there are exactly 9 columns
dict_vect_matrix.shape

(5, 9)

In [44]:
# Count the distinct values per categorical column in the five-row sample
# (missing values are counted as the '-' placeholder).
for col in feats:
    n_distinct = X_train[col][:5].fillna('-').nunique()
    print(col, n_distinct)

author 4
flow 3
domain 2


In [45]:
# for example, the very first column is named 'author=@DezmASter' - i.e. it takes the value 1 only when the author is @DezmASter
dict_vect.feature_names_

['author=@DezmASter',
 'author=@Taoorus',
 'author=@Tapac',
 'author=@dennydo',
 'domain=geektimes.ru',
 'domain=habrahabr.ru',
 'flow=-',
 'flow=design',
 'flow=develop']

In [55]:
def _categorical_records(frame):
    """Return one ``{column: value}`` dict per row of ``frame[feats]``.

    Missing values are replaced by '-' first. ``to_dict(orient='records')``
    yields the rows in order without transposing the frame, and, unlike the
    index-keyed ``.T.to_dict()``, cannot merge rows that share an index label.
    """
    return frame[feats].fillna('-').to_dict(orient='records')


# One-hot encode the categorical columns; the set of categories is learned
# from the training part only and reused for validation and test.
vectorizer_feats = DictVectorizer()

X_train_feats = vectorizer_feats.fit_transform(_categorical_records(X_train))
X_valid_feats = vectorizer_feats.transform(_categorical_records(X_valid))
X_test_feats = vectorizer_feats.transform(_categorical_records(X_test))

In [56]:
# (rows, one-hot columns) of the encoded categorical training features.
X_train_feats.shape

(93895, 17869)

In [108]:
# Concatenate, column-wise and in the same order for every split:
# word TF-IDF | one-hot categoricals | character TF-IDF.
X_train_new, X_valid_new, X_test_new = (
    scipy.sparse.hstack(parts)
    for parts in (
        [X_train_title, X_train_feats, X_train_title_ch],
        [X_valid_title, X_valid_feats, X_valid_title_ch],
        [X_test_title, X_test_feats, X_test_title_ch],
    )
)

In [109]:
# (rows, total feature columns) of the stacked test matrix.
X_test_new.shape

(3990, 101332)

In [63]:
# Two Ridge regressors that differ only in regularisation strength (0.1 vs 1.0).
model1, model2 = (Ridge(alpha=strength, random_state=1) for strength in (0.1, 1.0))

# Both are fitted on the same training matrix.
tr1 = model1.fit(X_train_new, y_train)
tr2 = model2.fit(X_train_new, y_train)

In [118]:
# Predictions of each model on the training and the validation matrices.
train_preds1, valid_preds1 = (model1.predict(part) for part in (X_train_new, X_valid_new))
train_preds2, valid_preds2 = (model2.predict(part) for part in (X_train_new, X_valid_new))

# Mean squared error per model and split.
# NOTE: the lines labelled 'тесте' ("test") are computed on the validation split (y_valid).
for label, y_true, y_pred in (
    ('Ошибка на трейне1', y_train, train_preds1),
    ('Ошибка на тесте1', y_valid, valid_preds1),
    ('Ошибка на трейне2', y_train, train_preds2),
    ('Ошибка на тесте2', y_valid, valid_preds2),
):
    print(label, mean_squared_error(y_true, y_pred))

Ошибка на трейне1 0.10874234020442716
Ошибка на тесте1 0.9652010375502428
Ошибка на трейне2 0.23647671939171366
Ошибка на тесте2 0.69172165563284


In [103]:
# Full target column, in the original row order of train_df.
y = train_df['favs_lognorm']
# Put the training and validation rows back together (train first, then validation).
X_TrVal = scipy.sparse.vstack([X_train_new, X_valid_new])
X_TrVal.shape, y.shape

((134137, 101332), (134137,))

In [104]:
%%time
model = Ridge().fit(X_TrVal, y)

CPU times: user 21.1 s, sys: 485 ms, total: 21.6 s
Wall time: 24.6 s


In [110]:
# Predict the target for the stacked test matrix with the model fitted on X_TrVal.
test_preds = model.predict(X_test_new)

In [122]:
# Load the submission template, using the post URL as the index.
sample_submission_path = './data/habr_sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path, index_col='url')

In [123]:
# Preview the first rows of the submission template.
sample_submission.head()

Unnamed: 0_level_0,favs_lognorm
url,Unnamed: 1_level_1
https://habrahabr.ru/post/314080/,0.323588
https://habrahabr.ru/company/plesk/blog/313732/,0.560105
https://habrahabr.ru/company/etagi/blog/314000/,0.667039
https://habrahabr.ru/company/knopka/blog/314030/,0.149563
https://geektimes.ru/company/audiomania/blog/282058/,0.670754


In [124]:
# Copy the template and overwrite its target column with the Ridge predictions.
# NOTE(review): this assumes the template rows are in the same order as test_df - confirm.
ridge_submission = sample_submission.assign(favs_lognorm=test_preds)
# this will be the "Simple solution" baseline
ridge_submission.to_csv('ridge_baseline.csv')