In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pandas as pd
from nltk.util import ngrams
from nltk import FreqDist
from collections import Counter
from nltk.tokenize import sent_tokenize

import json
from data_preprocess import *
import pickle
from gensim.parsing.preprocessing import remove_stopwords
from tqdm import tqdm,trange


In [3]:
# Directory holding the per-year corpus files ("<year>.txt") and "unprocessed.json"
# that the helper functions in this notebook read.
data_path = "./data_augmented/"

# Extra text loaded from extra_s2orc.pickle; Data_Augmentation looks entries up
# by year and joins each entry with spaces.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only load pickles that come from a trusted source.
with open("extra_s2orc.pickle", "rb") as pickle_file:
    augment_data = pickle.load(pickle_file)

In [11]:
def Build_mapping(year:str):
    '''
    Build index-to-word and word-to-index maps for one year or the whole corpus.

    Args:
        year: stem of the file to read (``data_path + "<year>.txt"``), or the
            literal string "all" to scan every ``.txt`` file under ``data_path``.

    Returns:
        A tuple ``(index2word, word2index)``. Indices start at 0 and are handed
        out in order of first appearance of each whitespace-separated token.

    Raises:
        OSError: if a corpus file (or the directory, for "all") cannot be read.
    '''
    if year == "all":
        # Only the "<year>.txt" corpus files belong in the vocabulary; the same
        # directory also holds unprocessed.json (read by Get_sentences).
        # os.listdir returns names in arbitrary order, so sort them to make the
        # assigned indices reproducible from run to run.
        paths = [data_path + name
                 for name in sorted(os.listdir(data_path))
                 if name.endswith(".txt")]
    else:
        paths = [data_path + f"{year}.txt"]

    index2word = {}
    word2index = {}
    for path in paths:
        # The files are written as UTF-8 by Data_Augmentation, so read them the same way.
        with open(path, "r", encoding="utf-8") as handle:
            words = handle.read().split()
        for word in words:
            if word not in word2index:
                idx = len(word2index)
                word2index[word] = idx
                index2word[idx] = word

    return index2word, word2index

def Build_freq_dict(year:str):
    '''
    Build the token frequency distribution for a given year.

    Args:
        year: stem of the corpus file, read from ``data_path + "<year>.txt"``.

    Returns:
        A ``FreqDist`` mapping each whitespace-separated token to its count.

    Raises:
        OSError: if the year file cannot be read.
    '''
    # The year files are written as UTF-8 by Data_Augmentation; read them the
    # same way, and close the handle as soon as the text is in memory.
    with open(data_path + f"{year}.txt", "r", encoding="utf-8") as handle:
        tokens = handle.read().split()
    return FreqDist(tokens)


def Get_Unigram(word:str, year:str):
    '''
    Return how often a word occurs in the text of a given year.

    Args:
        word: token to count; it is compared against whole whitespace-separated tokens.
        year: stem of the corpus file, read from ``data_path + "<year>.txt"``.

    Returns:
        The number of occurrences as an int (0 when the word never appears).

    Raises:
        OSError: if the year file cannot be read.
    '''
    # The year files are written as UTF-8 by Data_Augmentation; read them the same way.
    with open(data_path + f"{year}.txt", "r", encoding="utf-8") as handle:
        tokens = handle.read().split()
    # Counting one token directly avoids building a full frequency table.
    return tokens.count(word)

def Get_topk(k:int, year:str):
    '''
    Get the k most common words in the text of a given year.

    Args:
        k: number of entries to return.
        year: stem of the corpus file, read from ``data_path + "<year>.txt"``.

    Returns:
        The result of ``FreqDist.most_common(k)``: a list of ``(word, count)``
        pairs, most frequent first.

    Raises:
        OSError: if the year file cannot be read.
    '''
    # The year files are written as UTF-8 by Data_Augmentation; read them the
    # same way, and close the handle before counting.
    with open(data_path + f"{year}.txt", "r", encoding="utf-8") as handle:
        tokens = handle.read().split()
    return FreqDist(tokens).most_common(k)

def Get_sentences(year:str):
    '''
    Get the cleaned sentences from the unprocessed abstracts of a given year.

    Loads ``data_path + 'unprocessed.json'``, joins the abstracts stored under
    ``year``, splits the text with ``sent_tokenize`` and cleans every sentence:
    newlines become spaces, then ``remove_punctuation``, ``to_lower_case``,
    ``remove_stopwords``, ``lemmatise_verbs`` and ``remove_numbers`` are applied
    in that order and the result is joined with single spaces.

    Args:
        year: key into the JSON object; its value is iterated as records that
            each have an "abstract" entry.

    Returns:
        A list of cleaned sentence strings. Sentences left with fewer than two
        tokens after cleaning are dropped.

    Raises:
        OSError: if the JSON file cannot be read.
        KeyError: if ``year`` is not in the file or a record has no "abstract".
    '''
    with open(data_path+'unprocessed.json', encoding="utf-8") as handle:
        data = json.load(handle)

    # Join with a space so the last sentence of one abstract is not fused with
    # the first sentence of the next; records with an empty/None abstract are skipped.
    abstract = " ".join(record["abstract"] for record in data[year] if record["abstract"])

    cleaned = []
    for sentence in sent_tokenize(abstract):
        temp = remove_punctuation(sentence.replace("\n"," "))
        temp = to_lower_case(temp)
        # NOTE(review): remove_stopwords receives a token list here but a plain
        # string in Data_Augmentation - confirm which form the imported helper expects.
        temp = remove_stopwords(temp.split())
        temp = lemmatise_verbs(temp)
        temp = remove_numbers(temp)
        text = " ".join(temp)
        # Collect into a new list rather than deleting from the list being
        # iterated, which skips the element after every removal.
        if len(text.split()) >= 2:
            cleaned.append(text)
    return cleaned


def Data_Augmentation(year:int):
    '''
    Append the cleaned extra text for a year to that year's corpus file.

    Reads ``./data/<year>.txt``, joins ``augment_data[year]`` with spaces, runs
    it through ``remove_punctuation``, ``to_lower_case``, ``remove_stopwords``,
    ``lemmatise_verbs`` and ``remove_numbers`` (in that order), and writes the
    original text followed by the cleaned extra text to
    ``./data_augmented/<year>.txt`` encoded as UTF-8.

    Args:
        year: key into ``augment_data`` and stem of the input/output file names.

    Raises:
        OSError: if the input file cannot be read or the output cannot be written.
        KeyError: if ``augment_data`` has no entry for ``year`` (assuming it is a dict).
    '''
    # NOTE(review): the input is read with the platform default encoding while
    # the output is written as UTF-8 - confirm how ./data/*.txt was encoded.
    with open(f"./data/{year}.txt") as source:
        lines = source.read()

    extra = " ".join(augment_data[year])
    extra = remove_punctuation(extra)
    extra = to_lower_case(extra)
    extra = remove_stopwords(extra)
    extra = lemmatise_verbs(extra)
    extra = remove_numbers(extra)
    # NOTE(review): joining with "" is only right if remove_numbers returns a
    # string or a sequence of characters; Get_sentences joins the same helper's
    # output with " " - confirm what the helper returns for this input.
    extra = "".join(extra)

    # Keep the last original token and the first appended token apart when the
    # input file does not already end in whitespace.
    if lines and extra and not (lines[-1].isspace() or extra[0].isspace()):
        lines += " "
    lines += extra

    # Context managers close both files even if reading or writing fails.
    with open(f"./data_augmented/{year}.txt", "w",encoding='utf-8') as out_file:
        out_file.write(lines)




In [26]:
def Get_common_words(threshold, start_year:int=1994, end_year:int=2020):
    '''
    Get the words whose frequency exceeds the threshold in every year of a range.

    Args:
        threshold: a word is kept for a year only if its count in that year is
            strictly greater than this value.
        start_year: first year to scan (inclusive); defaults to 1994.
        end_year: last year to scan (inclusive); defaults to 2020.

    Returns:
        The set of words that pass the threshold in every scanned year.

    Raises:
        ValueError: if ``end_year`` is earlier than ``start_year``.
    '''
    if end_year < start_year:
        raise ValueError(
            f"end_year ({end_year}) must not be earlier than start_year ({start_year})"
        )

    def frequent_words(yr):
        # Words whose count in that year's file is strictly above the threshold.
        return {word for word, count in Build_freq_dict(year=f"{yr}").items()
                if count > threshold}

    # The first year seeds the set; every later year can only shrink it.
    common_words = frequent_words(start_year)
    for yr in trange(start_year + 1, end_year + 1):
        common_words.intersection_update(frequent_words(yr))

    return common_words
            


In [27]:
# Words that occur more than 100 times in every year file scanned by Get_common_words.
a=Get_common_words(threshold=100)

100%|██████████| 26/26 [03:11<00:00,  7.35s/it]


In [31]:
# Display the set of common words computed by Get_common_words.
a

{'integrating',
 'possible',
 'recently',
 'lms',
 'personal',
 'uncertain',
 'forming',
 'introduction',
 'allowed',
 'display',
 'perspectives',
 'contact',
 'detected',
 'accurately',
 'larger',
 'steering',
 'transformations',
 'retrieved',
 'collaboration',
 'parameter',
 'contour',
 'respectively',
 'lengths',
 'defines',
 'graph',
 'intermediate',
 'analog',
 'bit',
 'avoids',
 'constraint',
 'optimal',
 'ensure',
 'message',
 'profile',
 'question',
 'ratio',
 'included',
 'air',
 'channel',
 'formally',
 'exists',
 'reliability',
 'statement',
 'advantages',
 'strategy',
 'paradigm',
 'depth',
 'manage',
 'levels',
 'materials',
 'computational',
 'fraction',
 'curves',
 'drawing',
 'strength',
 'variations',
 'motivated',
 'manufacturing',
 'attempt',
 'arise',
 'equally',
 'spatiotemporal',
 'receive',
 'methods',
 'definition',
 'cmos',
 'motivation',
 'demonstrating',
 'complicated',
 'inverse',
 'change',
 'roles',
 'shape',
 'highlevel',
 'factors',
 'standardized',
 'st

In [None]:
# Write the augmented corpus file for every year from 1994 through 2021
# (the upper bound of range() is exclusive, hence the +1).
first_year, last_year = 1994, 2021
for yr in range(first_year, last_year + 1):
    Data_Augmentation(year=yr)