In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pandas as pd
from nltk.util import ngrams
from nltk import FreqDist
from collections import Counter
from nltk.tokenize import sent_tokenize
import json
from data_preprocess import *

In [4]:
data_path = "./data/"

In [27]:
def Build_mapping(year:str):
    '''
    Build index to word and word to index map for given year
    If year is not available, build map for entire corpus
    '''
    target_files = [x for x in os.listdir(data_path)]
    index2word={}
    word2index={}
    if year == "all":
        for f in target_files:
            temp = open(data_path+f,"r").read()
            for word in temp.split():
                if word not in word2index:
                    word2index[word] = len(word2index)
                    index2word[len(word2index)-1] = word
    else:
        temp = open(data_path+f"{year}.txt","r").read()
        for word in temp.split():
            if word not in word2index:
                word2index[word] = len(word2index)
                index2word[len(word2index)-1] = word

    return index2word, word2index

def Get_Unigram(word:str, year:str):
    '''
    return frequency of a word in text from a given year
    '''
    f = open(data_path+f"{year}.txt","r").read()
    unigram = Counter(ngrams(f.split(),1))
    return unigram[(word,)] if (word,) in unigram else "Not Exist"

def Get_topk(k:int, year:str):
    '''
    Get top k common words in the text from given year
    '''
    f = open(data_path+f"{year}.txt","r").read()
    freq = FreqDist(f.split())
    return freq.most_common(k)

def Get_sentences(year:str):
    '''
    get sentences from unprocessed data for a given year
    '''
    data = json.load(open(data_path+'unprocessed.json'))
    abstract = ""
    for record in data[year]:
        abstract += record["abstract"]
    sentences = sent_tokenize(abstract)
    
    for i in range(len(sentences)):
        sentences[i] = sentences[i].replace("\n"," ")
        temp = remove_punctuation(sentences[i])
        temp = to_lower_case(temp)
        temp = remove_stopwords(temp.split())
        temp = lemmatise_verbs(temp)
        temp = remove_numbers(temp)
        sentences[i] = " ".join(temp)
        
    for s in sentences:    
        if (not s) or (len(s.split())<2):
            sentences.remove(s)
    return sentences

In [6]:
index2word, word2index = Build_mapping(year="1994")
print(len(word2index))  #unique tokens in each year, or all years

2794


In [7]:
word2index

{'precise': 0,
 'formulation': 1,
 'derivation': 2,
 'tree': 3,
 'adjoin': 4,
 'grammars': 5,
 'important': 6,
 'ramifications': 7,
 'wide': 8,
 'variety': 9,
 'use': 10,
 'formalism': 11,
 'syntactic': 12,
 'analysis': 13,
 'semantic': 14,
 'interpretation': 15,
 'statistical': 16,
 'language': 17,
 'model': 18,
 'argue': 19,
 'definition': 20,
 'must': 21,
 'reformulate': 22,
 'order': 23,
 'manifest': 24,
 'proper': 25,
 'linguistic': 26,
 'dependencies': 27,
 'derivations': 28,
 'particular': 29,
 'proposal': 30,
 'precisely': 31,
 'characterizable': 32,
 'tag': 33,
 'equivalence': 34,
 'class': 35,
 'computationally': 36,
 'operational': 37,
 'virtue': 38,
 'compilation': 39,
 'linear': 40,
 'index': 41,
 'together': 42,
 'efficient': 43,
 'algorithm': 44,
 'recognition': 45,
 'parse': 46,
 'accord': 47,
 'compile': 48,
 'grammar': 49,
 'report': 50,
 'recent': 51,
 'loebner': 52,
 'prize': 53,
 'competition': 54,
 'inspire': 55,
 'turing': 56,
 'test': 57,
 'intelligent': 58,
 'b

In [8]:
Get_topk(k=5,year=1999)

[('use', 53), ('model', 51), ('base', 44), ('language', 42), ('paper', 38)]

In [9]:
Get_Unigram(word="use",year="1994")  # get frequency of a word in a given year

200

In [32]:
sentences=Get_sentences("1999")

In [33]:
sentences

['schedule dialogs people negotiate time appointments common everyday life',
 'paper report result depth empirical investigation resolve explicit temporal reference schedule dialogs',
 'four phase work data annotation evaluation model development system implementation evaluation model evaluation analysis',
 'system model develop primarily one set data apply later much complex data set assess generalizability model task perform',
 'many different type empirical methods apply pinpoint strengths weaknesses approach',
 'detail annotation instructions develop intercoder reliability study perform show naive annotators reliably perform target annotations',
 'fully automatic system develop evaluate unseen test data good result data set',
 'adopt pure realization recency base focus model identify precisely adequate task address',
 'addition system result depth evaluation model present base detail manual annotations',
 'result errors occur specifically due model focus use set anaphoric relations

In [39]:
"02g155".isnumeric()

False

In [35]:
tf_idf = TfidfVectorizer()
tf_idf_matrix=tf_idf.fit_transform(sentences)
tf_idf.get_feature_names()
#pd.DataFrame(data = td_idf_matrix.toarray(),columns=vacab)

['021',
 '15',
 '1970',
 '1994',
 '1995',
 '1996',
 '1997',
 '20',
 '58',
 '63',
 '636',
 '66',
 '68',
 '69',
 '698previous',
 '71',
 '73',
 '771',
 '773',
 '77verbs',
 '78',
 '79',
 '790',
 '80',
 '800',
 '82',
 '85',
 '87',
 '938',
 '947',
 '95',
 '95the',
 'abandon',
 'ability',
 'able',
 'abney',
 'abstract',
 'abstraction',
 'abstractly',
 'accept',
 'acceptance',
 'accomplish',
 'accordingly',
 'account',
 'accuracy',
 'accurate',
 'accurately',
 'achieve',
 'acquisition',
 'act',
 'acyclic',
 'adapt',
 'adaptable',
 'adaptation',
 'add',
 'addition',
 'additional',
 'address',
 'adequate',
 'adjectives',
 'adjunct',
 'adjuncts',
 'adopt',
 'adult',
 'advance',
 'advantage',
 'adverbial',
 'ai',
 'aid',
 'aim',
 'algebraic',
 'algorithm',
 'algorithms',
 'alike',
 'allow',
 'almost',
 'alphabet',
 'already',
 'also',
 'alternative',
 'although',
 'ambiguity',
 'among',
 'amount',
 'analyse',
 'analysesa',
 'analysis',
 'analytic',
 'analyze',
 'anaphora',
 'anaphoric',
 'annotate