In [1]:
# Mount Google Drive at /content/drive; the data paths used by this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip uninstall nltk -y
!pip install texthero

Uninstalling nltk-3.2.5:
  Successfully uninstalled nltk-3.2.5
Collecting texthero
  Downloading https://files.pythonhosted.org/packages/1f/5a/a9d33b799fe53011de79d140ad6d86c440a2da1ae8a7b24e851ee2f8bde8/texthero-1.0.9-py3-none-any.whl
Collecting nltk>=3.3
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 7.2MB/s 
Collecting unidecode>=1.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/9e/25/723487ca2a52ebcee88a34d7d1f5a4b80b793f179ee0f62d5371938dfa01/Unidecode-1.2.0-py2.py3-none-any.whl (241kB)
[K     |████████████████████████████████| 245kB 19.9MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.5-cp37-none-any.whl size=1434675 sha256=7ba58bf6b58be536ab0982132dbae9974f4884aa55669b1c7635829b4b15f45f
  Stored in directory: /root

In [13]:
import os

import numpy as np
import pandas as pd

import dill
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture
import gc
from tqdm import tqdm
import time
import nltk
import random
import re

import texthero as hero
from texthero import preprocessing

# Seed handed to set_seed() and to the random_state of the estimators below.
SEED = 42
# Number of Gaussian mixture components the word vectors are soft-clustered into.
clusters_num = 60
# Width of the slice that each cluster occupies inside a word-topic vector.
features_num = 200
def set_seed(seed=42):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed as a string in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed value applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

# Seed the global generators before any stochastic step runs.
set_seed(seed=SEED)

# Google Drive folder that the input artifacts are read from and the
# generated feature file is written to.
INPUT_DIR = '/content/drive/MyDrive/citation_prediction/input/'

データの読み込み

In [4]:
# Read the title/abstract table from the input folder and preview its first rows.
title_abstract_path = os.path.join(INPUT_DIR, 'all_title_abstract_df.feather')
all_title_abstract = pd.read_feather(title_abstract_path)
all_title_abstract.head()

Unnamed: 0,title,abstract,cites,clean_title,clean_abstract
0,Heavy-Flavour Production at HERA,We review the theoretical and experimental s...,,heavy flavour production hera,review theoretical experimental status hea...
1,Spectropolarimetric Constraints on the Nature ...,While it is well recognized that interstella...,7.0,spectropolarimetric constraints nature inte...,well recognized interstellar grains made ...
2,A joint analysis of Planck and BICEP2 B modes ...,We analyze BICEP2 and Planck data using a mo...,188.0,joint analysis planck bicep2 b modes includ...,analyze bicep2 planck data using model inc...
3,Molecular movie of ultrafast coherent rotation...,Recording molecular movies on ultrafast time...,8.0,molecular movie ultrafast coherent rotational...,recording molecular movies ultrafast timescal...
4,A Modified Mixed Domain Method for Modeling Ac...,"In this paper, phase correction and amplitud...",,modified mixed domain method modeling acoust...,paper phase correction amplitude compensati...


In [5]:
# Load the serialized word2vec model trained on the abstracts.
# NOTE(review): dill, like pickle, can execute arbitrary code while
# deserializing, so this file must come from a trusted source.
file = os.path.join(INPUT_DIR, 'w2v_model_abstract.dill')
# The context manager closes the file handle as soon as loading finishes
# (or fails) instead of leaving it open for the life of the kernel.
with open(file, 'rb') as model_file:
    abstract_w2vmodel = dill.load(model_file)

In [6]:
# Vectorization: fit a TF-IDF model on the cleaned abstracts, ignoring terms
# that occur in fewer than 20 documents. fit() returns the fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(min_df=20).fit(
    all_title_abstract['clean_abstract'].values
)
tfidf_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=20, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [7]:
# Cluster the word2vec word vectors with a Gaussian mixture that shares one
# covariance matrix across all components. fit() returns the fitted model.
abstract_vectors = abstract_w2vmodel.wv.vectors
gmm = GaussianMixture(
    n_components=clusters_num,
    covariance_type='tied',
    max_iter=50,
    random_state=SEED,
).fit(abstract_vectors)
gmm

GaussianMixture(covariance_type='tied', init_params='kmeans', max_iter=50,
                means_init=None, n_components=60, n_init=1,
                precisions_init=None, random_state=42, reg_covar=1e-06,
                tol=0.001, verbose=0, verbose_interval=10, warm_start=False,
                weights_init=None)

In [8]:
# Build the word-topic vectors: for every word in the word2vec vocabulary,
# concatenate (word vector * P(cluster | word) * idf(word)) over all clusters.

# IDF weight for each TF-IDF vocabulary term. `vocabulary_` maps a term to its
# column index and `idf_` is indexed by that column, so the pairing does not
# depend on `get_feature_names()`, which newer scikit-learn releases removed.
# Sorting by column index keeps the same term order the feature-name list had.
idf_values = tfidf_vectorizer.idf_
idf_dic = {
    term: idf_values[col]
    for term, col in sorted(tfidf_vectorizer.vocabulary_.items(), key=lambda kv: kv[1])
}

# Word list that is zipped with the rows of `abstract_vectors`. Both attribute
# names are tried because gensim 4 renamed `index2word` to `index_to_key`
# (assumes the model is a gensim Word2Vec -- TODO confirm).
wv = abstract_w2vmodel.wv
vocab_words = getattr(wv, 'index_to_key', None)
if vocab_words is None:
    vocab_words = wv.index2word

# Hard (most likely cluster) and soft (per-cluster probability) assignments.
assign_dic = dict(zip(vocab_words, gmm.predict(abstract_vectors)))
soft_assign_dic = dict(zip(vocab_words, gmm.predict_proba(abstract_vectors)))

word_topic_vecs = {}
for word, cluster_probs in soft_assign_dic.items():
    topic_vec = np.zeros(features_num * clusters_num, dtype=np.float32)
    idf = idf_dic.get(word)
    # Words that are not in the TF-IDF vocabulary keep an all-zero vector.
    # This is the only case that is skipped; any other problem now surfaces
    # as an error instead of silently producing zeros.
    if idf is not None:
        # Row i of the outer product is wv[word] * P(cluster i | word);
        # flattening row-major places it at [i*features_num:(i+1)*features_num].
        # A word vector whose length is not features_num raises a ValueError here.
        topic_vec[:] = (np.outer(cluster_probs, wv[word]) * idf).ravel()
    word_topic_vecs[word] = topic_vec

In [9]:
def analyzer(text):
    """Split a text into lowercase word tokens and drop noise tokens.

    The text is lowercased, newlines and tabs are deleted (they are not
    replaced by a space, so the characters on either side are joined), every
    ASCII punctuation character is replaced by a space, and the result is
    split on single spaces.

    Args:
        text: String to tokenize.

    Returns:
        List of tokens in their original order, without tokens that contain
        an ASCII digit, tokens that are stop words, and tokens shorter than
        two characters (which includes empty strings).
    """
    stop_words = {'i', 'a', 'an', 'the', 'to', 'and', 'or', 'if', 'is', 'are', 'am', 'it', 'this', 'that', 'of', 'from', 'in', 'on'}
    has_digit = re.compile(r'^.*[0-9]+.*$')

    # Lowercase, then delete newlines and tabs.
    normalized = text.lower().replace('\n', '').replace('\t', '')
    # Replace symbols with spaces.
    normalized = re.sub(r'[!-\/:-@[-`{-~]', ' ', normalized)

    return [
        token
        for token in normalized.split(' ')
        if has_digit.fullmatch(token) is None  # drop tokens containing digits
        and token not in stop_words            # drop stop words
        and len(token) >= 2                    # drop empty and 1-character tokens
    ]

In [10]:
# The first TRAIN_ROWS rows of the table form the train split; the rest form the test split.
TRAIN_ROWS = 851524
train_title_abstract = all_title_abstract.iloc[:TRAIN_ROWS, :]
test_title_abstract = all_title_abstract.iloc[TRAIN_ROWS:, :]

# Keep only the train rows that have a value in 'cites', followed by every test row.
labelled_train = train_title_abstract[train_title_abstract['cites'].notnull()]
second_all_title_abstract = pd.concat(
    [labelled_train, test_title_abstract],
    axis=0,
).reset_index(drop=True)
print(train_title_abstract.shape, test_title_abstract.shape, second_all_title_abstract.shape)

(851524, 5) (59084, 5) (74201, 5)


In [11]:
# Drop the names of the frames that are no longer needed, then run a garbage
# collection pass; the displayed value is the count returned by gc.collect().
del all_title_abstract
del train_title_abstract, test_title_abstract
gc.collect()

150

In [12]:
# Build one document vector per abstract: sum the word-topic vectors of the
# abstract's tokens, L2-normalize the sum, then zero out near-zero entries.
n_docs = len(second_all_title_abstract.clean_abstract)
scdv_dim = clusters_num * features_num
scdvs = np.zeros((n_docs, scdv_dim), dtype=np.float32)

# Running sums of the per-document minimum and maximum entry.
a_min = 0
a_max = 0

for i, text in tqdm(enumerate(second_all_title_abstract.clean_abstract), total=n_docs):
    tmp = np.zeros(scdv_dim, dtype=np.float32)
    words = analyzer(text)
    for word in words:
        if word in word_topic_vecs:
            tmp += word_topic_vecs[word]
    norm = np.sqrt(np.sum(tmp**2))
    # Documents without any known word keep an all-zero vector.
    if norm > 0:
        tmp /= norm
    # ndarray.min()/max() reduce inside NumPy; the builtin min()/max() would
    # iterate over every element of the vector in Python for each document.
    a_min += tmp.min()
    a_max += tmp.max()
    scdvs[i] = tmp

# Sparsification threshold: fraction `p` of the mean of |average minimum| and
# |average maximum| over all documents.
p = 0.04
a_min = a_min*1.0 / n_docs
a_max = a_max*1.0 / n_docs
thres = (abs(a_min)+abs(a_max)) / 2
thres *= p

# Entries whose magnitude is below the threshold are set to zero.
scdvs[abs(scdvs) < thres] = 0
scdvs.shape  # (number of documents, clusters_num * features_num)

100%|██████████| 74201/74201 [04:31<00:00, 273.64it/s]


(74201, 12000)

In [14]:
# Project the document vectors onto 20 principal components and give each
# component column a 'pca_scdv_abstract_<index>' name.
pca = PCA(n_components=20, random_state=SEED)
scdv_components = pca.fit_transform(scdvs)
pca_abstract_df = pd.DataFrame(
    scdv_components,
    columns=['pca_scdv_abstract_' + str(idx) for idx in range(scdv_components.shape[1])],
)

In [15]:
# Write the PCA-reduced features to a feather file in the input folder.
scdv_feature_path = os.path.join(INPUT_DIR, 'scdv_abstract.feather')
pca_abstract_df.to_feather(scdv_feature_path)