# 'Recommendation of similar articles from journal abstract analysis'  
# Modeling
## 2019, Misty M. Giles
### https://github.com/OhThatMisty/astro_categories/

In [30]:
import gensim
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import logging
import os
import pandas as pd
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import unicodedata

# Configure logging once for the notebook: INFO and above, with level and clock time on each record
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

In [52]:
def normalize(text):
    '''Clean every string in an iterable (e.g. a df column) and return the results as a list.

       Each string is reduced to ASCII, stripped of the characters associated with LaTeX
       markup, and has every run of characters other than letters and apostrophes
       collapsed into a single space.'''
    def _clean(raw):
        # NFKD separates accented letters into base letter + combining mark, so the
        # ASCII round trip keeps the base letter and silently drops the rest
        ascii_bytes = unicodedata.normalize('NFKD', raw).encode('ascii', 'ignore')
        cleaned = ascii_bytes.decode('utf-8', 'ignore')
        # Backslashes turn into spaces so LaTeX commands detach from their neighbours
        cleaned = re.sub('\\\\', ' ', cleaned)
        # Math/grouping delimiters and double quotes are deleted outright
        cleaned = re.sub('[${}()"]', '', cleaned)
        # Whatever is left that is not a letter or apostrophe becomes one space per run
        return re.sub('[^A-Za-z\']+', ' ', cleaned)

    return [_clean(entry) for entry in text]

# This function flags tokens to drop after spaCy parsing: punctuation, whitespace, and stopwords.

def remove(token):
    '''Report whether a token should be dropped because it is punctuation,
       whitespace, or a stopword'''
    # Step-wise fall-through: keep the first truthy flag, otherwise end on is_stop
    verdict = token.is_punct
    if not verdict:
        verdict = token.is_space
    if not verdict:
        verdict = token.is_stop
    return verdict

# This function ensures that all printouts use the same formula

def join_tokens(doc):
    '''Rebuild a parsed doc as one space-separated string of lemmas, leaving out every
       token that remove() rejects; a pronoun is written as its lowercased text
       instead of the -PRON- placeholder lemma'''
    kept = []
    for token in doc:
        if remove(token):
            continue
        if token.lemma_ == '-PRON-':
            # The placeholder lemma carries no information, so fall back to the surface form
            kept.append(token.text.lower())
        else:
            kept.append(token.lemma_)
    return ' '.join(kept)

In [45]:
# Read only the 'abstract' column of the intermediate CSV (relative path) and preview it
file = os.path.join('..', 'data', 'astro_intermediate.csv')
df = pd.read_csv(
    file,
    usecols=['abstract'],
    index_col=False,
)
df.head()

Unnamed: 0,abstract
0,We have constructed a Fourier-transform spectr...
1,We report possible interaction between multipl...
2,"Young, low-mass stars in the solar neighborhoo..."
3,We investigate the energy extraction by the Pe...
4,We investigate the effect cosmological constan...


In [53]:
%%time
# Lemmatize a 50-abstract sample: normalize the raw text, parse it with nlp.pipe in
# batches, then rebuild each parsed doc as a cleaned string with join_tokens.
# NOTE(review): `nlp` is not defined in any visible cell (spacy is imported but no
# pipeline is loaded) -- presumably a spacy.load(...) cell was removed; confirm and
# restore it before re-running this notebook from a fresh kernel.
test = df.abstract[:50]
text = [join_tokens(doc) for doc in nlp.pipe(normalize(test), batch_size=1000)]
# Full-dataset variant of the line above, disabled while working on the sample
#text = [join_tokens(doc) for doc in nlp.pipe(normalize(df.abstract), batch_size=1000)]

Wall time: 5.01 s


In [48]:
# TF-IDF over unigrams through trigrams; terms appearing in more than 85% of the
# documents or in fewer than 2 documents are left out of the vocabulary
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.85,
)

In [58]:
%%time
# Learn the vocabulary from the cleaned abstracts and build their TF-IDF matrix in one pass
X = tfidf.fit_transform(text)

Wall time: 258 ms


In [59]:
# Matrix dimensions: (number of abstracts, number of vocabulary terms)
X.shape

(50, 895)

In [60]:
# List the learned vocabulary (unigrams, bigrams and trigrams that survived the df cut-offs).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() when it exists; .tolist() keeps the displayed value a plain
# list of str, matching what the old method returned.
(tfidf.get_feature_names_out().tolist()
 if hasattr(tfidf, 'get_feature_names_out')
 else tfidf.get_feature_names())

['ability', 'able', 'absorption', 'acceleration', 'accessible', 'account', 'active', 'activity', 'add', 'additional', 'address', 'affect', 'age', 'agreement', 'aim', 'al', 'algorithm', 'all', 'allow', 'alma', 'alpha', 'altitude', 'amplitude', 'an', 'analysis', 'analytic', 'angle', 'angular', 'antenna', 'anti', 'apparent', 'appear', 'applicable', 'application', 'apply', 'approach', 'approx', 'approximate', 'approximation', 'area', 'array', 'array cta', 'array pta', 'article', 'as', 'association', 'assume', 'astrometry', 'asymmetry', 'at', 'atmosphere', 'atmospheric', 'atmospheric structure', 'au', 'author', 'available', 'average', 'background', 'band', 'bao', 'baryonic', 'baryonic matter', 'base', 'behavior', 'behaviour', 'belong', 'beta', 'bf', 'bias', 'binari', 'binary', 'binary system', 'black', 'black hole', 'black hole smbh', 'blue', 'body', 'body the', 'box', 'break', 'bright', 'brightness', 'build', 'burst', 'by', 'by combine', 'calculate', 'camera', 'canonical', 'capability', 'c




In [None]:
# Stream the data from disk through gensim's LineSentence
# NOTE(review): `infile` is not defined in any visible cell and this cell has not been
# executed (In [None]) -- assign the path of the text file to stream before running.
abstracts = LineSentence(infile)