In [153]:
from tqdm import tqdm_notebook, tqdm
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.sparse

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation

In [96]:
%%time
# Load the pickled mapping whose keys later become `doc_id` and whose values
# become the `text` column (see the DataFrame construction in the next cells).
import pickle
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this on a 'text_dict.p' that comes from a trusted source.
with open('text_dict.p', 'rb') as fp:
    page_dict = pickle.load(fp)

CPU times: user 5.69 s, sys: 4.25 s, total: 9.94 s
Wall time: 13.8 s


In [97]:
# Read the train and test pair tables with a single unpacking expression.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('train_groups.csv', 'test_groups.csv')
)

In [98]:
# Turn the pickled dict into a frame: with orient='index' every dict key
# becomes a row label, and the values land in a single column named 0.
text = pd.DataFrame.from_dict(page_dict, orient='index')

In [99]:
# Expose the row labels as an explicit 'doc_id' column so the frame can be
# joined to the pair tables with on='doc_id'.
text = text.assign(doc_id=text.index)

In [100]:
# Give the value column (named 0 by from_dict) a readable name.
text = text.rename(columns={0: 'text'})

In [101]:
# Preview two rows to confirm the frame now has 'text' and 'doc_id' columns.
text.head(2)

Unnamed: 0,text,doc_id
1,аншин центр репродукц генетик ферт медмоскв ан...,1
2,перевод кив кошельк главн перевод кив кошельк ...,2


In [102]:
# Inspect the last two training pairs (pair_id, group_id, doc_id, target).
train_data.tail(2)

Unnamed: 0,pair_id,group_id,doc_id,target
11688,11689,129,27885,0
11689,11690,129,27987,0


In [103]:
# Attach each pair's page text; the left join keeps every training pair even
# when its doc_id has no entry in `text`.
X_train = train_data.merge(text, how='left', on='doc_id')

In [104]:
# Check that the merge appended the 'text' column to the pair columns.
X_train.head(2)

Unnamed: 0,pair_id,group_id,doc_id,target,text
0,1,1,15731,0,зам подшипник ступиц нив автомануал руководств...
1,2,1,14829,0,ваз опт соч сравн цен куп потребительск товар ...


In [148]:
# List the distinct group ids present in the training set.
X_train['group_id'].unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129])

In [155]:
# Column labels for the bag-of-words counts: 'cv1' ... 'cv800'.
columns_cv = ['cv{}'.format(idx) for idx in range(1, 801)]

# Column labels for the LDA topic weights: 'theme1' ... 'theme20'.
columns_th = ['theme{}'.format(idx) for idx in range(1, 21)]

In [156]:
def preprocess(group, X_t):
    """Build bag-of-words and LDA topic features for a single group.

    The rows of ``X_t`` whose ``group_id`` equals ``group`` are selected, a
    ``CountVectorizer`` is fitted on the ``text`` of those rows only, and an
    online LDA is fitted on the resulting count matrix. Because the
    vectorizer is refitted per group, a given ``cvN`` column refers to a
    different term in each group.

    Parameters
    ----------
    group
        Value compared against ``X_t['group_id']`` to select rows.
    X_t : pandas.DataFrame
        Frame with at least ``group_id`` and ``text`` columns.

    Returns
    -------
    pandas.DataFrame
        The selected rows without ``text``, followed by one column per name
        in the module-level ``columns_cv`` (term counts) and ``columns_th``
        (topic weights). The original row index is preserved.
    """
    X = X_t[X_t['group_id'] == group].copy()
    # The upstream left merge can leave rows without text; CountVectorizer
    # raises on NaN documents, so treat missing text as an empty document.
    text = X['text'].fillna('')
    X = X.drop('text', axis=1)
    ind = X.index

    # Tie the vocabulary size to the number of available column names rather
    # than repeating the literal.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                 max_features=len(columns_cv))
    features_Count = vectorizer.fit_transform(text)

    # pd.SparseDataFrame was removed in pandas 1.0, so build a regular frame.
    # A group may yield fewer terms than len(columns_cv): name only the
    # columns that exist, then pad with zero columns so every group returns
    # the same schema and the per-group frames stack cleanly.
    n_terms = features_Count.shape[1]
    Countdf = pd.DataFrame(features_Count.toarray(),
                           index=ind,
                           columns=columns_cv[:n_terms])
    Countdf = Countdf.reindex(columns=columns_cv, fill_value=0)

    # fit_transform returns the per-document topic weights (one row per
    # document, one column per topic).
    topic_weights = LatentDirichletAllocation(
        n_components=len(columns_th), max_iter=5,
        learning_method='online', learning_offset=50.,
        random_state=0).fit_transform(features_Count)
    themes = pd.DataFrame(data=topic_weights, index=ind, columns=columns_th)

    res_frame = pd.concat([X, Countdf, themes], axis=1)
    return res_frame

In [157]:
# Build the feature frame for training group 1.
X_train_new = preprocess(1,  X_train)

In [159]:
# Build features for every training group and stack them with one concat.
# - Iterates over the group ids actually present instead of assuming they are
#   the contiguous integers 2..N.
# - Collects the per-group frames first: calling pd.concat inside the loop
#   re-copies the accumulated frame on every iteration (quadratic work).
# - Rebuilds X_train_new from scratch (all groups, including the first), so
#   re-running this cell cannot append duplicate rows.
train_group_ids = X_train['group_id'].unique()
X_train_new = pd.concat(
    [preprocess(group_id, X_train) for group_id in tqdm(train_group_ids)],
    axis=0,
)

100%|██████████| 128/128 [06:10<00:00,  2.90s/it]


In [161]:
# Sanity check: the row count should match X_train (compare with next cell).
X_train_new.shape

(11690, 824)

In [162]:
# Reference shape of the merged training frame before feature engineering.
X_train.shape

(11690, 5)

In [163]:
# Persist the engineered training features; index=False leaves the row index
# out of the CSV.
X_train_new.to_csv('X_train_new.csv', index=False)

In [165]:
# Attach each test pair's page text; the left join keeps every test pair even
# when its doc_id has no entry in `text`.
X_test = test_data.merge(text, how='left', on='doc_id')

In [167]:
# List the distinct group ids present in the test set.
X_test['group_id'].unique()

array([130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
       182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
       195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
       208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
       221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
       234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
       247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
       260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
       273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
       286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
       299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309])

In [168]:
# Build the feature frame for test group 130.
X_test_new = preprocess(130,  X_test)

In [171]:
# Build features for every test group and stack them with one concat.
# - Iterates over the group ids actually present instead of the hard-coded
#   range(131, 310), so a changed or non-contiguous id set still works.
# - Collects the per-group frames first: calling pd.concat inside the loop
#   re-copies the accumulated frame on every iteration (quadratic work).
# - Rebuilds X_test_new from scratch (all groups, including the first), so
#   re-running this cell cannot append duplicate rows.
test_group_ids = X_test['group_id'].unique()
X_test_new = pd.concat(
    [preprocess(group_id, X_test) for group_id in tqdm(test_group_ids)],
    axis=0,
)

100%|██████████| 179/179 [08:43<00:00,  2.92s/it]


In [172]:
# Inspect the last rows to confirm the final group's cv*/theme* features
# were appended.
X_test_new.tail(4)

Unnamed: 0,pair_id,group_id,doc_id,cv1,cv2,cv3,cv4,cv5,cv6,cv7,...,theme11,theme12,theme13,theme14,theme15,theme16,theme17,theme18,theme19,theme20
16623,28314,309,16759,0,0,2,0,2,0,0,...,0.000111,0.000111,0.000111,0.000111,0.000111,0.000111,0.000111,0.000111,0.000111,0.000111
16624,28315,309,15358,0,0,0,0,2,0,0,...,8.3e-05,8.3e-05,8.3e-05,8.3e-05,8.3e-05,8.3e-05,8.3e-05,8.3e-05,8.3e-05,8.3e-05
16625,28316,309,17287,0,0,0,0,2,0,0,...,9e-05,9e-05,9e-05,9e-05,9e-05,9e-05,9e-05,9e-05,9e-05,9e-05
16626,28317,309,16026,6,3,0,0,0,0,0,...,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05


In [174]:
# Reference shape of the merged test frame before feature engineering.
X_test.shape

(16627, 4)

In [175]:
# Sanity check: the row count should match X_test (compare with previous cell).
X_test_new.shape

(16627, 823)

In [173]:
# Persist the engineered test features; index=False leaves the row index out
# of the CSV.
X_test_new.to_csv('X_test_new.csv', index=False)