In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip uninstall nltk -y
!pip install texthero

Uninstalling nltk-3.2.5:
  Successfully uninstalled nltk-3.2.5
Collecting texthero
  Downloading https://files.pythonhosted.org/packages/1f/5a/a9d33b799fe53011de79d140ad6d86c440a2da1ae8a7b24e851ee2f8bde8/texthero-1.0.9-py3-none-any.whl
Collecting unidecode>=1.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/9e/25/723487ca2a52ebcee88a34d7d1f5a4b80b793f179ee0f62d5371938dfa01/Unidecode-1.2.0-py2.py3-none-any.whl (241kB)
[K     |████████████████████████████████| 245kB 6.3MB/s 
[?25hCollecting nltk>=3.3
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 9.6MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.5-cp37-none-any.whl size=1434677 sha256=f056bd3895862511e65e3bb01392f56b84e23bf3e017b43048b263dd43a21b7e
  Stored in directory: 

In [3]:
import os

import numpy as np
import pandas as pd

import dill
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
from tqdm import tqdm
import time

import gensim
from gensim.models import Word2Vec
import hashlib
import nltk

import texthero as hero
from texthero import preprocessing


INPUT_DIR = '/content/drive/MyDrive/citation_prediction/input/'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


データの読み込み

In [4]:
all_title_abstract = pd.read_feather(os.path.join(INPUT_DIR, 'all_title_abstract_df.feather'))
all_title_abstract.head()

Unnamed: 0,title,abstract,cites,clean_title,clean_abstract
0,Heavy-Flavour Production at HERA,We review the theoretical and experimental s...,,heavy flavour production hera,review theoretical experimental status hea...
1,Spectropolarimetric Constraints on the Nature ...,While it is well recognized that interstella...,7.0,spectropolarimetric constraints nature inte...,well recognized interstellar grains made ...
2,A joint analysis of Planck and BICEP2 B modes ...,We analyze BICEP2 and Planck data using a mo...,188.0,joint analysis planck bicep2 b modes includ...,analyze bicep2 planck data using model inc...
3,Molecular movie of ultrafast coherent rotation...,Recording molecular movies on ultrafast time...,8.0,molecular movie ultrafast coherent rotational...,recording molecular movies ultrafast timescal...
4,A Modified Mixed Domain Method for Modeling Ac...,"In this paper, phase correction and amplitud...",,modified mixed domain method modeling acoust...,paper phase correction amplitude compensati...


In [5]:
# ベクトル化
vec_tfidf = TfidfVectorizer(min_df=20)
abstract_tfidf = vec_tfidf.fit_transform(all_title_abstract['clean_abstract'].values)

In [6]:
start = time.time()

sentences = []
print ("Parsing sentences from training set...")

# Loop over each news article.
for review in tqdm(all_title_abstract['clean_abstract']):
    try:
        # Split a review into parsed sentences.
        result = review
        h = result.split(" ")
        h = list(filter(("").__ne__, h))
        sentences.append(h)
    except:
        continue

num_features = 200     # Word vector dimensionality
min_word_count = 20   # Minimum word count
num_workers = 1       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words
seed = 42

hashfxn = lambda x: int(hashlib.md5(str(x).encode()).hexdigest(), 16)

print ("Training Word2Vec model...")
# Train Word2Vec model.
model = Word2Vec(sentences, workers=num_workers, hs = 0, sg = 1, negative = 10, iter = 5,\
            size=num_features, min_count = min_word_count, hashfxn=hashfxn,\
            window = context, sample = downsampling, seed=seed)

model_name = str(num_features) + "features_" + str(min_word_count) + "minwords_" \
+ str(context) + "context_len2alldata"
model.init_sims(replace=True)
endmodeltime = time.time()

print ("time : ", endmodeltime-start)

  1%|          | 5089/910608 [00:00<00:17, 50886.65it/s]

Parsing sentences from training set...


100%|██████████| 910608/910608 [00:27<00:00, 32579.74it/s]


Training Word2Vec model...
time :  15595.895079135895


In [7]:
tqdm.pandas()

def des_to_mean_vec(text):
    
    try:
        text = text.strip()
        text_ls = [s for s in text.split(' ') if '' != s]
        return np.mean([model.wv[word] for word in text_ls if word in model.wv.index2word], axis=0)
    except:
        return np.nan

df = all_title_abstract['clean_abstract'].progress_apply(lambda x: des_to_mean_vec(x))
df.head()


The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version


Mean of empty slice.

100%|██████████| 910608/910608 [3:48:06<00:00, 66.54it/s]


0    [-0.0010560127, -0.046459645, 0.0032742675, -0...
1    [-0.04712007, -0.034458704, 0.030664742, -0.01...
2    [-0.048914954, -0.012871409, 0.020217177, -0.0...
3    [-0.06298661, -0.031603467, 0.023065962, -0.01...
4    [-0.030056866, -0.0566545, 0.029133623, 0.0146...
Name: clean_abstract, dtype: object

In [8]:
abstract_df = pd.DataFrame()
for num in tqdm(list(range(0, 920000, 10000))):
    abstract_df = pd.concat([abstract_df, 
                             df.iloc[num:num+10000].apply(pd.Series)])
abstract_df.columns = ['w2v_abstract_' + str(col) for col in abstract_df.columns]
abstract_df.head()

100%|██████████| 92/92 [03:56<00:00,  2.57s/it]


Unnamed: 0,w2v_abstract_0,w2v_abstract_1,w2v_abstract_2,w2v_abstract_3,w2v_abstract_4,w2v_abstract_5,w2v_abstract_6,w2v_abstract_7,w2v_abstract_8,w2v_abstract_9,w2v_abstract_10,w2v_abstract_11,w2v_abstract_12,w2v_abstract_13,w2v_abstract_14,w2v_abstract_15,w2v_abstract_16,w2v_abstract_17,w2v_abstract_18,w2v_abstract_19,w2v_abstract_20,w2v_abstract_21,w2v_abstract_22,w2v_abstract_23,w2v_abstract_24,w2v_abstract_25,w2v_abstract_26,w2v_abstract_27,w2v_abstract_28,w2v_abstract_29,w2v_abstract_30,w2v_abstract_31,w2v_abstract_32,w2v_abstract_33,w2v_abstract_34,w2v_abstract_35,w2v_abstract_36,w2v_abstract_37,w2v_abstract_38,w2v_abstract_39,...,w2v_abstract_160,w2v_abstract_161,w2v_abstract_162,w2v_abstract_163,w2v_abstract_164,w2v_abstract_165,w2v_abstract_166,w2v_abstract_167,w2v_abstract_168,w2v_abstract_169,w2v_abstract_170,w2v_abstract_171,w2v_abstract_172,w2v_abstract_173,w2v_abstract_174,w2v_abstract_175,w2v_abstract_176,w2v_abstract_177,w2v_abstract_178,w2v_abstract_179,w2v_abstract_180,w2v_abstract_181,w2v_abstract_182,w2v_abstract_183,w2v_abstract_184,w2v_abstract_185,w2v_abstract_186,w2v_abstract_187,w2v_abstract_188,w2v_abstract_189,w2v_abstract_190,w2v_abstract_191,w2v_abstract_192,w2v_abstract_193,w2v_abstract_194,w2v_abstract_195,w2v_abstract_196,w2v_abstract_197,w2v_abstract_198,w2v_abstract_199
0,-0.001056,-0.04646,0.003274,-0.013795,0.048204,-0.007932,-0.095589,0.064571,-0.047574,-0.119232,0.04194,0.07835,0.040321,0.022759,0.030207,0.079702,0.042888,-0.081458,0.018531,-0.087164,0.048175,0.048038,-0.043655,-0.044508,0.00928,-0.031213,0.0181,0.010299,0.052719,0.023484,0.074915,-0.053559,0.017279,0.012867,-0.028437,0.060122,0.054427,0.040474,-0.005466,-0.062328,...,-0.027911,-0.035519,-0.006945,-0.005699,-0.040278,-0.025527,-0.021409,-0.013538,0.010735,-0.02371,-0.032937,-0.022408,0.028008,-0.031761,-0.075443,0.023496,0.038975,0.019033,-0.026649,-0.091361,0.01014,-0.003911,-0.027287,0.076928,0.012844,0.032848,-0.006803,-0.008888,-0.05646,0.046832,0.091426,0.029151,0.068026,0.015019,0.028284,-0.007886,0.013118,-0.035541,-0.014077,0.044765
1,-0.04712,-0.034459,0.030665,-0.012719,0.001387,-0.021903,-0.047224,0.035825,-0.034872,0.002469,0.045118,0.025159,-0.057636,0.017195,0.024891,0.019204,0.004612,0.005142,-0.009118,0.004241,-0.009758,0.031842,-0.023433,0.007433,0.015904,0.019857,0.040863,0.010192,0.04078,-0.006385,0.067007,-0.02803,0.038644,-0.047907,0.00321,-0.015345,0.024103,-0.013152,0.023318,0.034172,...,-0.055829,0.012705,-0.018732,-0.030804,-0.03523,-0.013734,-0.025836,-0.000394,-0.009652,0.002357,-0.000966,-0.020243,-0.065869,-0.010787,-0.062465,0.010522,0.029266,-0.049421,-0.03553,-0.064029,-0.064277,-0.003009,-0.014602,0.036263,0.011102,0.050588,-0.029466,-0.005773,-0.068247,0.077857,0.026311,0.013406,0.040993,0.067597,0.01575,-0.035001,-0.003701,-0.002696,0.018488,0.004892
2,-0.048915,-0.012871,0.020217,-0.01667,0.014561,-0.011071,-0.068235,0.038016,-0.03553,-0.021843,0.034472,0.012416,-0.012794,0.016559,-0.016953,-0.002736,0.009183,0.035518,-0.010164,0.003821,-0.005181,0.03714,-0.009072,-0.004474,0.002423,0.005733,0.034452,0.018697,0.057757,-0.000454,0.042549,-0.010243,0.038217,-0.010108,0.015855,-0.01332,0.043414,-0.015731,0.021628,0.010209,...,-0.063871,-0.026039,-0.022905,-0.06702,0.007118,-0.011643,-0.014329,0.007401,0.026204,0.007012,0.018228,-0.011937,-0.043741,-0.056231,-0.054684,-0.005667,0.018483,-0.010489,-0.050738,-0.066802,-0.045162,-0.015257,-0.001546,0.014218,0.01079,0.031142,0.004087,-0.011872,-0.048609,0.037381,0.029513,0.031669,0.064438,0.054666,0.006905,0.006245,-0.007139,-0.011783,-0.017488,0.004566
3,-0.062987,-0.031603,0.023066,-0.017808,0.025432,0.012372,-0.070937,0.010742,-0.061986,-0.025708,0.007267,-0.000881,-0.015726,0.009994,0.040477,0.00189,0.020964,0.04017,-0.002972,-0.040752,0.010589,0.013642,-0.043241,-0.030126,-0.040279,-0.010424,0.055366,0.046101,0.056563,-0.001701,0.061817,-0.0118,0.02695,-0.034033,0.017941,-0.025504,0.02662,0.014143,0.037483,0.023668,...,-0.018705,-0.011568,-0.001551,-0.0644,0.010583,0.008388,-0.023575,0.000229,0.002386,-0.021714,-0.000198,-0.021825,-0.077663,-0.030297,-0.043095,0.030463,0.018655,0.005165,-0.027871,-0.071837,-0.073613,-0.005149,-0.012791,0.034533,0.004011,0.040754,-0.006486,-0.001473,-0.02545,0.036722,0.015028,-0.014692,0.033351,0.029402,0.004243,0.001865,-0.019383,0.010167,-0.027028,0.014289
4,-0.030057,-0.056655,0.029134,0.014615,0.033908,0.014324,-0.061549,0.050868,-0.010088,-0.015368,0.048054,-0.006527,0.002445,-0.0042,0.042328,0.016586,0.026816,0.042247,0.009279,-0.025786,0.007069,0.022887,-0.031566,-0.036458,-0.02889,-0.015938,0.053175,0.02201,0.058568,-0.022432,0.067211,-0.030177,0.006294,-0.023081,0.011848,0.002077,0.042912,0.029883,0.02546,0.044215,...,-0.045942,-0.028339,0.006572,-0.054786,-0.014808,0.00849,-0.020497,0.009809,0.024761,-0.012747,-0.016571,-0.034113,-0.047288,-0.043393,-0.056403,0.008626,0.026536,-0.020007,-0.036358,-0.065779,-0.073248,0.006333,-0.026384,0.034116,0.011944,0.025195,0.003966,0.023356,-0.038195,0.063629,0.003056,0.017751,0.044247,0.019455,-0.002189,-0.010977,-0.002454,0.015168,-0.026725,-0.003094


In [9]:
abstract_df.to_feather(os.path.join(INPUT_DIR, 'abstract_df.feather'))

In [10]:
file = os.path.join(INPUT_DIR, 'w2v_model_abstract.dill')
dill.dump(model, open(file,'wb'))