## Потапова Софья, 394 группа

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import HashingVectorizer

In [2]:
# Load the tab-separated URL log (no header row), name its three columns
# and keep only the user id and the URL; the 'count' column is dropped.
urls_train_df = pd.read_csv('url_domain_train', header=None, delimiter='\t')
urls_train_df.columns = ['id', 'url', 'count']
urls_train_df = urls_train_df[['id', 'url']]

In [3]:
urls_train_df.head()

Unnamed: 0,id,url
0,000000014B60815F65B38258011B6C01,login.rutracker.org
1,000000014B60815F65B38258011B6C01,rutracker.org
2,000000014C03DA2A47AC433A0C755201,admin.tour-spb.net
3,000000014C03DA2A47AC433A0C755201,czinfo.ru
4,000000014C03DA2A47AC433A0C755201,forumsostav.ru


In [4]:
# Collapse the log to one row per user: 'urls' holds the list of that
# user's URLs, 'id' the user id; rows are renumbered from 0.
urls_train_df = (
    urls_train_df.groupby('id')['url']
    .apply(lambda domains: domains.tolist())
    .reset_index()[['url', 'id']]
    .rename(columns={'url': 'urls'})
)

In [5]:
urls_train_df.head()

Unnamed: 0,urls,id
0,"[id.rambler.ru, mail.rambler.ru, r0.ru]",000000013CB5719C0000A2C90002C101
1,"[1prime.ru, autorambler.ru, chellak.ru, docs.c...",00000001442BE24000001B7D00F50801
2,[bosch-korolev.ru],00000001448580F800003F1B31FB0901
3,"[aptekanizkihcen.ua, colady.ru, gorod.dp.ua, i...",0000000145BDB2FF000157971645E901
4,"[astrorok.ru, diets.ru, edaplus.info, eshzdoro...",000000014602771F0000DB9359714C01


In [6]:
# Load the tab-separated (id, age) table; the file has no header row.
age_train_df = pd.read_csv('age_profile_train', header=None, delimiter='\t')
age_train_df.columns = ['id', 'age']

In [7]:
age_train_df.head()

Unnamed: 0,id,age
0,000000013CB5719C0000A2C90002C101,53
1,00000001442BE24000001B7D00F50801,48
2,00000001448580F800003F1B31FB0901,28
3,0000000145BDB2FF000157971645E901,44
4,000000014602771F0000DB9359714C01,48


In [8]:
# Attach the age to every grouped URL row; a left join keeps users with no age row.
train_df = urls_train_df.merge(age_train_df, on='id', how='left')

In [9]:
train_df.head()

Unnamed: 0,urls,id,age
0,"[id.rambler.ru, mail.rambler.ru, r0.ru]",000000013CB5719C0000A2C90002C101,53
1,"[1prime.ru, autorambler.ru, chellak.ru, docs.c...",00000001442BE24000001B7D00F50801,48
2,[bosch-korolev.ru],00000001448580F800003F1B31FB0901,28
3,"[aptekanizkihcen.ua, colady.ru, gorod.dp.ua, i...",0000000145BDB2FF000157971645E901,44
4,"[astrorok.ru, diets.ru, edaplus.info, eshzdoro...",000000014602771F0000DB9359714C01,48


In [10]:
def read_data(filename, column_name):
    """Load a header-less tab-separated file and keep its first two columns.

    The first column is named 'Id' and the second ``column_name``; any
    further columns receive placeholder names and are discarded.

    Returns a DataFrame with exactly the columns ['Id', column_name].
    """
    raw = pd.read_csv(filename, header=None, delimiter='\t')
    placeholders = [str(k) for k in range(raw.shape[1] - 2)]
    raw.columns = ['Id', column_name] + placeholders
    return raw[['Id', column_name]]


def group_data(filename, column_name):
    """Read ``filename`` with read_data and gather each Id's values into a list.

    Returns a DataFrame with columns [column_name, 'Id'] (in that order),
    one row per distinct Id, indexed 0..n-1.
    """
    flat = read_data(filename, column_name)
    per_id = flat.groupby('Id')[column_name].apply(lambda values: values.tolist())
    # reset_index turns the Id index back into a column and renumbers rows;
    # the selection restores the [values, 'Id'] column order.
    return per_id.reset_index()[[column_name, 'Id']]

def merge_data(dfs, how='left', on='Id'):
    """Merge a sequence of DataFrames pairwise, left to right, on a key column.

    Parameters:
        dfs: non-empty sequence of DataFrames that all contain the ``on`` column.
        how: join type passed to DataFrame.merge; defaults to 'left', which
            is what the original implementation always used.
        on: name of the key column; defaults to 'Id'.

    Returns the merged DataFrame (``dfs[0]`` itself when only one is given).

    Raises IndexError when ``dfs`` is empty.
    """
    merged = dfs[0]
    for other in dfs[1:]:
        merged = merged.merge(other, on=on, how=how)
    return merged

In [11]:
# Training frame: per-user URL lists and title lists joined with the age.
train_df = merge_data([
    group_data('url_domain_train', 'url'),
    group_data('title_unify_train', 'title'),
    read_data('age_profile_train', 'age'),
])
# Test frame: the same features without the age.
test_df = merge_data([
    group_data('url_domain_test', 'url'),
    group_data('title_unify_test', 'title'),
])


In [13]:
# Build separate per-user URL and title frames and attach the age to each.
# NOTE(review): read_data is called here with a list of column names plus a
# list of columns to keep, and group_data with an already-loaded frame. The
# definitions visible in cell In [10] take (filename, column_name); the helper
# versions this cell was run against are not shown — confirm before re-running.
urls_train_df = read_data('url_domain_train', ['id', 'url', '_'], ['id', 'url'])
urls_train_df = group_data(urls_train_df, 'url')
titles_train_df = read_data('title_unify_train', ['id', 'title', '_'], ['id', 'title'])
titles_train_df = group_data(titles_train_df, 'title')
age_train_df = read_data('age_profile_train', ['id', 'age'])


titles_train_df = merge_data([titles_train_df, age_train_df])
urls_train_df = merge_data([urls_train_df, age_train_df])

In [67]:
# Build per-user URL and title frames for the test set and inner-join them.
# NOTE(review): read_data / group_data are called with a different argument
# shape than the (filename, column_name) definitions in cell In [10]; confirm
# which helper versions this cell expects.
urls_test_df = read_data('url_domain_test', ['id', 'url', '_'], ['id', 'url'])
urls_test_df = group_data(urls_test_df, 'url')
titles_test_df = read_data('title_unify_test', ['id', 'title', '_'], ['id', 'title'])
titles_test_df = group_data(titles_test_df, 'title')


test_df = merge_data([titles_test_df, urls_test_df], how='inner')

In [14]:
train_df.head()

Unnamed: 0,url,Id,title,age
0,"[id.rambler.ru, mail.rambler.ru, r0.ru]",000000013CB5719C0000A2C90002C101,[бесплатный надёжный почта рамблер электронный...,53
1,"[1prime.ru, autorambler.ru, chellak.ru, docs.c...",00000001442BE24000001B7D00F50801,[24-х 34-х до договор неделя новость предложит...,48
2,[bosch-korolev.ru],00000001448580F800003F1B31FB0901,"[авто бош контакт королёв сервис, авто бош кор...",28
3,"[aptekanizkihcen.ua, colady.ru, gorod.dp.ua, i...",0000000145BDB2FF000157971645E901,[ua втрать війни донбасі за на новини озвучить...,44
4,"[astrorok.ru, diets.ru, edaplus.info, eshzdoro...",000000014602771F0000DB9359714C01,"[black walnut грецкий орех чёрный, inmoment ru...",48


In [15]:
test_df.head()

Unnamed: 0,url,Id,title
0,"[1000bankov.ru, 1tv.ru, 4put.ru, argumenti.ru,...",000000014A02348E701552980349FF01,[11-й гектар дом коммунизм на набережная недос...
1,"[autorambler.ru, bilettorg.ru, dsol-druzhba.ru...",000000014A10EA183BF8594A0B2AB201,[20-летний выглядеть девушка как королева ната...
2,"[photosight.ru, rambler.ru]",000000014A4FE5C33A929D4C26943601,[медийный портал рамблер]
3,"[base.consultant.ru, dogovor-obrazets.ru, fd.r...",000000014B7BB9957784A9BC0AC9F401,"[бесплатно код онлайн срочно статистика, или и..."
4,"[assessor.ru, audit-it.ru, base.garant.ru, com...",000000014C7749F896D82C2B01E8B801,[bank privat автокредитование банковский банко...


# Снижение размерности

## W2V

In [16]:
from gensim.models import word2vec
# Load pre-trained word vectors stored in the binary word2vec format.
# NOTE(review): the file name suggests RusCorpora-trained vectors — confirm.
model = word2vec.Word2Vec.load_word2vec_format('ruscorpora_russe.model.bin', binary=True)

In [17]:
import nltk
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import TweetTokenizer
# Tokenizer used by review_wordlist, created with strip_handles and reduce_len enabled.
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

# Russian stop words from NLTK.
stops = set(stopwords.words('russian'))
def _all_stopwords():
    """Return the combined English + Russian NLTK stop-word set, built on first use."""
    cache = getattr(_all_stopwords, '_cache', None)
    if cache is None:
        cache = set(stopwords.words('english'))
        cache.update(stopwords.words('russian'))
        # Memoise on the function object: the NLTK lists are loaded once
        # instead of once per tokenized text.
        _all_stopwords._cache = cache
    return cache


def review_wordlist(text, remove_stopwords=True):
    """Tokenize ``text`` into purely alphabetic tokens.

    Parameters:
        text: the string to tokenize (with the module-level ``tknzr``).
        remove_stopwords: when true, drop tokens found in the English or
            Russian NLTK stop-word lists. The comparison is case-sensitive,
            as tokens are not lower-cased here.

    Returns a list of tokens in their original order.
    """
    # The previous body overwrote ``text`` with an undefined name ``review``
    # and raised NameError on every call; the argument is used directly now.
    # The bound method ``tok.isalpha()`` works for both byte and unicode strings.
    words = [tok for tok in tknzr.tokenize(text) if tok.isalpha()]
    if remove_stopwords:
        skip = _all_stopwords()
        words = [tok for tok in words if tok not in skip]
    return words


def summary2vec(summary):
    """Return the mean word vector of the in-vocabulary tokens of ``summary``.

    Tokens come from review_wordlist and are lower-cased before being looked
    up in the module-level word2vec ``model``.

    Returns a numpy array. When no token is in the vocabulary, a zero vector
    is returned; its length is ``model.vector_size`` when the model exposes
    that attribute and 300 (the previous hard-coded size) otherwise.
    """
    lowered = [token.lower() for token in review_wordlist(summary)]
    vectors = [model[token] for token in lowered if token in model]
    if vectors:
        return sum(vectors) / len(vectors)
    # Return an ndarray rather than a Python list: callers reduce object
    # arrays of these results with ``+``, and list + list would concatenate
    # instead of adding element-wise.
    return np.zeros(getattr(model, 'vector_size', 300), dtype=np.float32)


In [18]:
# Element-wise wrapper (one input, one output) so summary2vec can be applied
# to a whole sequence of titles at once. The lambda looks summary2vec up at
# call time, so redefining summary2vec later is picked up.
summaryW2V = np.frompyfunc(lambda title: summary2vec(title), 1, 1)

## PCA

In [None]:
from sklearn import decomposition
# Reduce the features to 3000 principal components.
# NOTE(review): this cell has no execution count. `data_train` is not defined
# in any visible cell, and train_df holds list-valued columns — confirm the
# intended numeric input before running.
pca = decomposition.PCA(n_components=3000)
pca.fit(train_df)
train_df_pca = pca.transform(data_train)

## TF-IDF

In [19]:
from sklearn.feature_extraction.text import HashingVectorizer

def parse(urls):
    """Turn a sequence of URLs into one space-separated string of their parts.

    Every '.' inside a URL becomes a space, and the URLs themselves are
    joined with single spaces.
    """
    spaced = (address.replace('.', ' ') for address in urls)
    return ' '.join(spaced)

# Hash every user's space-separated URL tokens into a 1000-dimensional vector.
# `urls_str` was referenced below but never assigned in this cell; build it
# here as a list so it can be iterated more than once.
urls_str = [parse(user_urls) for user_urls in urls_train_df.url.values]
hw = HashingVectorizer(n_features=1000).fit(urls_str)
urls = hw.transform(urls_str).todense()

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# TF-IDF re-weighting step used by for_urls.
# NOTE(review): no visible cell calls tfidf.fit before tfidf.transform is used — confirm.
tfidf = TfidfTransformer()

# Обучение модели

## Random Forest

In [21]:
# Represent every user by the mean of the vectors of their titles.
titles = titles_train_df.title.values
titles_vec = [np.mean(summaryW2V(user_titles), axis=0) for user_titles in titles]

In [22]:
# Replace the raw title lists with the averaged vectors computed above.
titles_train_df['title'] = titles_vec

In [70]:
# Feature matrix (one averaged title vector per row) and the age target.
X_train_titles = np.asarray(titles_train_df.title.values.tolist())
y_train_titles = titles_train_df.age.values

In [25]:
from sklearn.ensemble import RandomForestRegressor
# Fit a 500-tree random forest (7 parallel jobs) on the title vectors.
rf = RandomForestRegressor(n_estimators=500, n_jobs=7)
rf.fit(X_train_titles, y_train_titles)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=5, n_jobs=7, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [44]:
# Cross-validate the forest with the 'mean_squared_error' scorer.
cross_val_score(rf, X_train_titles, y_train_titles, scoring='mean_squared_error', verbose=True)

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.2min finished


array([-178.4052408 , -175.10825752, -163.16848156])

## Linear Regression

In [46]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary linear regression on the title vectors.
linear = LinearRegression(n_jobs=5)
linear.fit(X_train_titles, y_train_titles)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=5, normalize=False)

In [48]:
# Cross-validate the linear model with the 'mean_squared_error' scorer.
cross_val_score(linear, X_train_titles, y_train_titles, scoring='mean_squared_error', verbose=True)

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   22.1s finished


array([-155.12069991, -153.21563153, -133.78867627])

## KNN

In [71]:
from sklearn.neighbors import KNeighborsRegressor
# Fit a distance-weighted 30-nearest-neighbours regressor on the title vectors.
knn = KNeighborsRegressor(n_jobs=6, weights='distance', n_neighbors=30)
knn.fit(X_train_titles, y_train_titles)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=6, n_neighbors=30, p=2,
          weights='distance')

In [72]:
# Cross-validate the KNN model with the 'mean_squared_error' scorer.
cross_val_score(knn, X_train_titles, y_train_titles, scoring='mean_squared_error', verbose=True)

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 13.0min finished


array([-153.50906046, -160.73018578, -144.69242402])

## Gboost

In [74]:
from sklearn.ensemble import GradientBoostingRegressor
# Fit gradient boosting (10 estimators, learning rate 0.1) on the title vectors.
boost = GradientBoostingRegressor(n_estimators=10, learning_rate=0.1)
boost.fit(X_train_titles, y_train_titles)

GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=3, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, presort='auto',
             random_state=None, subsample=1.0, verbose=0, warm_start=False)

In [None]:
# Cross-validate the boosting model; this cell has no execution count.
cross_val_score(boost, X_train_titles, y_train_titles, scoring='mean_squared_error', verbose=True)

# Stacking

Для titles используется W2V, модель — gboost.
Для urls используется TF-IDF, модель — rf.
Поверх их предсказаний обучается линейная регрессия (linear_reg).


In [34]:
# Split the training frames: the first 110000 rows stay as training data,
# the remaining rows are rebound to the *_test_df names, and those remainders
# are inner-joined into df.
# NOTE(review): merge_data_by_id is not defined in any visible cell — confirm
# which helper this refers to.
urls_train_df, urls_test_df = urls_train_df[:110000], urls_train_df[110000:]
titles_train_df, titles_test_df = titles_train_df[:110000], titles_train_df[110000:]
df = merge_data_by_id([titles_test_df, urls_test_df[['id', 'url']]], how='inner')

In [26]:
def for_titles(df):
    """Predict with the module-level ``boost`` model from the ``title`` column of ``df``.

    Each row's titles are embedded with summaryW2V and averaged into a
    single feature vector before prediction.
    """
    features = [
        np.mean(summaryW2V(user_titles), axis=0)
        for user_titles in df.title.values
    ]
    return boost.predict(features)


def for_urls(df):
    """Predict with the module-level ``rf`` model from the ``url`` column of ``df``.

    Each row's URL list is flattened with parse, hashed into 1000 features
    and re-weighted with the module-level ``tfidf`` transformer.

    NOTE(review): ``tfidf`` must already be fitted by the caller; no fit is
    visible in this notebook.
    """
    # Build a concrete list: the previous code hashed `urls_str`, which is not
    # a local name, rather than the documents computed from `df`, and a lazy
    # `map` object would be exhausted after one pass on Python 3.
    documents = [parse(user_urls) for user_urls in df.url.values]
    # HashingVectorizer is stateless, so no fit step is needed, and the
    # sparse output is passed on without densifying it.
    hashed = HashingVectorizer(n_features=1000).transform(documents)
    features = tfidf.transform(hashed)
    return rf.predict(features)


def set_columns(df, pred):
    """Store predictions in ``df`` and return them as an ['Id', 'age'] frame.

    Parameters:
        df: DataFrame with an id column named 'id' or 'Id'. It is modified
            in place: an 'age' column holding ``pred`` is added or replaced.
        pred: predicted ages, one per row of ``df``.

    Returns a new DataFrame with exactly the columns ['Id', 'age']. The
    previous version built this frame but discarded it (it returned None).

    Raises KeyError when ``df`` has neither an 'id' nor an 'Id' column.
    """
    df['age'] = pred
    # Frames in this notebook use both spellings of the key column.
    id_column = 'id' if 'id' in df.columns else 'Id'
    result = df[[id_column, 'age']].copy()
    result.columns = ['Id', 'age']
    return result
    

In [29]:
# First-level title predictions for the stacking frame.
# NOTE(review): presumably df.title already holds averaged title vectors at
# this point (boost was fitted on such vectors) — confirm.
X_train_titles = np.asarray(df.title.values.tolist())
pred_titles = boost.predict(X_train_titles)

In [65]:
# Refit `rf` as a 5-tree forest on URL features, then predict for df.
# NOTE(review): X_train_urls / y_train_urls are not defined in any visible
# cell — confirm how they were built.
rf = RandomForestRegressor(n_estimators=5, n_jobs=7)
rf.fit(X_train_urls, y_train_urls)
pred_urls = for_urls(df)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=5, n_jobs=7, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [38]:
# Assemble the second-level training table: both first-level predictions
# next to the true age.
df['titles_age'] = pred_titles
df['urls_age'] = pred_urls
# `stacking_df` is not defined in any visible cell; the selected columns are
# the ones just added to `df`, so select from `df`. The selection already
# yields these column names, so the separate renaming step is not needed.
df = df[['id', 'titles_age', 'urls_age', 'age']]
df.head()

Unnamed: 0,id,titles_age,urls_age,age
0,061C5B62540224B100000751708F1301,40.4,31.0415,30
1,061C5C4256F7CB9D0000055C06ACD801,48.4,35.464855,41
2,061C5CD9567BB3CF0000054E46EBFE01,35.0,37.303656,32
3,061C5DDC56E680DF0000054BA6199601,43.0,46.633536,43
4,061C5E3F548EC0C300017BDA5247F801,28.8,31.124153,27


In [39]:
# Second-level features (the two first-level predictions) and the age target.
X_train_age, y_train_age = df[['titles_age', 'urls_age']].values, df['age'].values

In [40]:
# Second-level model that combines the first-level predictions.
clf = LinearRegression(n_jobs=7)

In [41]:
# Cross-validate the stacked model with the 'mean_squared_error' scorer.
cross_val_score(clf, X_train_age, y_train_age, scoring='mean_squared_error', verbose=True)

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


array([-126.30318112, -121.58070577, -131.21677281])

In [42]:
# Fit the stacked model on all second-level rows.
clf.fit(X_train_age, y_train_age)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=7, normalize=False)

# Отправка Решения

In [50]:
# First-level predictions for the test frame, stacked column-wise into the
# two-feature matrix that clf expects.
pred_titles = for_titles(test_df)
pred_urls = for_urls(test_df)
merged_pred = np.hstack([pred_titles[:, None], pred_urls[:, None]])
y_pred = clf.predict(merged_pred)
# Called for its in-place effect on test_df (adds the 'age' column); the
# return value is not used here.
set_columns(test_df, y_pred)
y_pred

array([ 41.81870107,  35.44590914,  38.15391239, ...,  33.54285954,
        34.8411313 ,  42.32888924])

In [53]:
# Ids that appear in the titles frame but not in test_df; they are predicted
# below from titles alone.
# NOTE(review): this cell reads `.id` on the per-source frames and `.Id` on
# test_df — confirm both column spellings exist when it runs.
skiped_titles = set(titles_test_df.id.values) - set(test_df.Id.values)
skiped_titles_df = titles_test_df[titles_test_df['id'].isin(skiped_titles)].copy()

# Likewise for ids that appear in the URL frame but not in test_df.
skiped_urls = set(urls_test_df.id.values) - set(test_df.Id.values)
skiped_urls_df = urls_test_df[urls_test_df['id'].isin(skiped_urls)].copy()

# Single-model predictions: the stacked clf is not applied to these rows.
pred_titles_skiped = for_titles(skiped_titles_df)
pred_urls_skiped = for_urls(skiped_urls_df)

# Called for the in-place 'age' column; return values are not used.
set_columns(skiped_titles_df, pred_titles_skiped)
set_columns(skiped_urls_df, pred_urls_skiped)

# Append the extra rows to test_df, renumbering the index.
test_df = test_df.append(skiped_titles_df, ignore_index=True)
test_df = test_df.append(skiped_urls_df, ignore_index=True)

In [62]:
# Write every column of test_df to solution.csv, without the row index.
test_df.to_csv('solution.csv', index=False)