<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Read-data" data-toc-modified-id="Read-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Read data</a></span></li></ul></div>

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly
import seaborn as sns

import os
from tqdm.notebook import tqdm
from glob import glob
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Read data

In [None]:
# Investigate the time distribution of papers.

def count_papers_by_month(root='clean_data'):
    """Count the entries found under each ``<root>/<year>/<month>`` directory.

    Parameters
    ----------
    root : str
        Directory whose two levels of sub-directories are named by year and
        month number.

    Returns
    -------
    dict
        Maps ``(year, month)`` integer tuples to the number of entries matched
        by ``<root>/<year>/<month>/*``.

    Raises
    ------
    ValueError
        If a year or month directory name is not an integer.
    """
    counts = {}
    for month_dir in glob(os.path.join(root, '*', '*')):
        # Use the last two path components instead of splitting on '/', so the
        # parsing works whatever separator glob returns on this platform.
        yr, month = os.path.normpath(month_dir).split(os.sep)[-2:]
        counts[(int(yr), int(month))] = len(glob(os.path.join(month_dir, '*')))
    return counts


cnt = count_papers_by_month()

In [None]:
# Put the per-month counts into a DataFrame.
# The (year, month) keys are sorted first: the bar chart uses a categorical
# x axis that follows row order, and the dict's order is whatever glob
# returned, so without sorting the bars would not be chronological.
x = pd.DataFrame(sorted(cnt.items()), columns=['date', 'cnt'])
# Bar labels are the string form of the (year, month) tuple.
x['date'] = x['date'].astype(str)
# draw
px.bar(x, x='date', y='cnt', labels={'date': '(year, month)', 'cnt': 'number of papers'})

In [None]:
# Read every paper: one entry per file with its year, month, title and text.
data = dict(yr=[], month=[], text=[], title=[])
for path in tqdm(glob(os.path.join('clean_data', '*', '*'))):
    # The last two path components are the year and month directories; taking
    # them via os.sep keeps the parsing independent of the path separator.
    yr, month = os.path.normpath(path).split(os.sep)[-2:]
    files = glob(os.path.join(path, '*'))
    for file in files:
        # Read first so a failed read cannot leave the four lists with
        # different lengths. The context manager closes each file promptly
        # instead of leaking one handle per paper.
        # NOTE(review): assumes the cleaned files are UTF-8 encoded - confirm.
        with open(file, encoding='utf-8') as fh:
            text = fh.read()
        data['yr'].append(int(yr))
        data['month'].append(int(month))
        # The title is the file name without its extension, whatever its length.
        data['title'].append(os.path.splitext(os.path.basename(file))[0])
        data['text'].append(text)

In [None]:
# Transform into a DataFrame with the text columns stored as pandas strings.
# The conversion goes through DataFrame.astype because assigning with
# `df.loc[:, col] = ...` writes into the existing object column in place on
# recent pandas versions, which silently keeps the old dtype.
df = pd.DataFrame(data).astype({'text': pd.StringDtype(), 'title': pd.StringDtype()})

In [None]:
# Tally every purely alphabetic token longer than two characters,
# across the text of all papers.
vocab_cnt = Counter(
    token
    for document in df.text.tolist()
    for token in document.split()
    if token.isalpha() and len(token) > 2
)

In [None]:
# The 20 most frequent words with their counts.
vocab_cnt.most_common(20)

In [None]:
# Histogram of how often each word occurs; the y axis is log-scaled.
word_frequencies = vocab_cnt.values()
px.histogram(x=word_frequencies, log_y=True)

In [None]:
# Transform the paper texts into tf-idf vectors.
# min_df=0.01 is a float; per the scikit-learn docs a float min_df is read as a
# proportion of documents, so terms in fewer than 1% of papers are dropped.
tfidf_transformer = TfidfVectorizer(min_df=0.01)
# Fit the vocabulary on df.text and vectorise it in one step.
tfidf = tfidf_transformer.fit_transform(df.text)

In [None]:
# Convert the tf-idf matrix to a dense NumPy array for the steps below.
# The guard keeps the cell safe to re-run: once `tfidf` is already an ndarray
# it has no `.toarray()` method and an unconditional call would raise.
if hasattr(tfidf, 'toarray'):
    tfidf = tfidf.toarray()

In [None]:
# Inspect the dimensions of the tf-idf array.
tfidf.shape

In [None]:
# Terms of the fitted vectoriser, as a NumPy string array.
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; use the new method when present and fall back to
# the old one on older installations.
if hasattr(tfidf_transformer, 'get_feature_names_out'):
    feature_names = tfidf_transformer.get_feature_names_out()
else:
    feature_names = tfidf_transformer.get_feature_names()
# dtype=str gives a unicode array in both cases (the new method returns an
# object array, the old one a plain list).
vocabulary = np.array(feature_names, dtype=str)

In [None]:
# Keep only terms with at least three characters, dropping the matching
# tf-idf columns so both arrays stay aligned.
indexes = np.array([len(term) >= 3 for term in vocabulary])
vocabulary = vocabulary[indexes]
tfidf = tfidf[:, indexes]

In [None]:
# Inspect how many terms remain in the vocabulary after the length filter.
vocabulary.shape

In [None]:
# Mean tf-idf weight of each term over all papers.
tfidf_mean = tfidf.mean(axis=0)
# Indices of the 20 terms with the highest mean weight, largest first.
ind_max = np.argsort(tfidf_mean)[::-1][:20]

In [None]:
# Print each top term next to its mean tf-idf weight, rounded to 3 decimals.
for term, weight in zip(vocabulary[ind_max], tfidf_mean[ind_max]):
    print(f'{term}\t{np.round(weight, 3)}')

In [None]:
# Map the high-dimensional tf-idf vectors onto 50 dimensions with PCA.
# The seed is fixed because scikit-learn may choose a randomized SVD solver,
# in which case the projection would differ from one run to the next.
PCA_SEED = 0
pca_obj = PCA(n_components=50, random_state=PCA_SEED)
pca = pca_obj.fit_transform(tfidf)

In [None]:
# How much variance do the retained components explain in total?
pca_obj.explained_variance_ratio_.sum()

In [None]:
# Which words contribute most to a given PCA component?
component = 0
print('explained variance:', np.round(pca_obj.explained_variance_ratio_[component], 3))
# Rank by the squared loading, which is also the value printed. Ranking by the
# signed loading would leave out words with a large negative loading even
# though their squared contribution is just as big.
loadings_sq = pca_obj.components_[component, :] ** 2
ind_1pca = np.argsort(loadings_sq)[::-1]
for i in ind_1pca[:20]:
    print(f'{vocabulary[i]:15}{np.round(loadings_sq[i], 2)}')

In [None]:
# Inspect the dimensions of the PCA-projected data.
pca.shape

In [None]:
# Reduce dimensionality further to 2D with t-SNE.
# t-SNE is stochastic, so the seed is fixed to keep the embedding reproducible.
TSNE_SEED = 0
# scikit-learn 1.5 renamed `n_iter` to `max_iter` and 1.7 removed the old
# name; use whichever keyword the installed version accepts.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
tsne = TSNE(
    n_components=2,
    perplexity=30,
    random_state=TSNE_SEED,
    **{tsne_iter_kwarg: 3000},
).fit_transform(pca)

In [243]:
# Preview the first rows of the papers DataFrame (columns: yr, month, text, title).
df.head()

Unnamed: 0,yr,month,text,title
0,2015,9,vacuo xray data collection graphenewrapped pro...,In vacuo X‐ray data collection from graphene‐w...
1,2015,9,"fast , parameterized model upper atmospheric i...","A fast, parameterized model of upper atmospher..."
2,2015,9,multivariate analysis extremely large tofsims ...,Multivariate analysis of extremely large ToFSI...
3,2015,9,reef flatten effect total richness specie resp...,Reef flattening effects on total richness and ...
4,2015,9,ecosystemlevel effect globally spread inverteb...,Ecosystem‐level effects of a globally spreadin...


In [None]:
# Scatter plot of the 2D t-SNE embedding: one small marker per paper,
# coloured by the paper's row position in `df`.
# Hover titles stay disabled, as in the original, where the hovertext
# arguments were commented out.
embedding_trace = go.Scatter(
    x=tsne[:, 0],
    y=tsne[:, 1],
    mode='markers',
    marker=dict(size=3, color=df.index),
)
fig = go.Figure(data=[embedding_trace])
fig.update_layout(width=700, height=700)