In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pandas as pd
from nltk.util import ngrams
from nltk import FreqDist
from collections import Counter
from nltk.tokenize import sent_tokenize
import json
from data_preprocess import *

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\royxj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\royxj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\royxj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data_path = "./data/"

In [3]:
def Build_mapping(year:str):
    '''
    Build index to word and word to index map for given year
    If year is not available, build map for entire corpus
    '''
    target_files = [x for x in os.listdir(data_path)]
    index2word={}
    word2index={}
    if year == "all":
        for f in target_files:
            temp = open(data_path+f,"r").read()
            for word in temp.split():
                if word not in word2index:
                    word2index[word] = len(word2index)
                    index2word[len(word2index)-1] = word
    else:
        temp = open(data_path+f"{year}.txt","r").read()
        for word in temp.split():
            if word not in word2index:
                word2index[word] = len(word2index)
                index2word[len(word2index)-1] = word

    return index2word, word2index

def Get_Unigram(word:str, year:str):
    '''
    return frequency of a word in text from a given year
    '''
    f = open(data_path+f"{year}.txt","r").read()
    unigram = Counter(ngrams(f.split(),1))
    return unigram[(word,)] if (word,) in unigram else "Not Exist"

def Get_topk(k:int, year:str):
    '''
    Get top k common words in the text from given year
    '''
    f = open(data_path+f"{year}.txt","r").read()
    freq = FreqDist(f.split())
    return freq.most_common(k)

def Get_sentences(year:str):
    '''
    get sentences from unprocessed data for a given year
    '''
    data = json.load(open(data_path+'unprocessed.json'))
    abstract = ""
    for record in data[year]:
        abstract += record["abstract"]
    sentences = sent_tokenize(abstract)
    
    for i in range(len(sentences)):
        sentences[i] = sentences[i].replace("\n"," ")
        temp = remove_punctuation(sentences[i])
        temp = to_lower_case(temp)
        temp = remove_stopwords(temp.split())
        temp = lemmatise_verbs(temp)
        temp = remove_numbers(temp)
        sentences[i] = " ".join(temp)
        
    for s in sentences:    
        if (not s) or (len(s.split())<2):
            sentences.remove(s)
    return sentences

In [4]:
index2word, word2index = Build_mapping(year="1994")
print(len(word2index))  #unique tokens in each year, or all years

2794


In [5]:
word2index

{'precise': 0,
 'formulation': 1,
 'derivation': 2,
 'tree': 3,
 'adjoin': 4,
 'grammars': 5,
 'important': 6,
 'ramifications': 7,
 'wide': 8,
 'variety': 9,
 'use': 10,
 'formalism': 11,
 'syntactic': 12,
 'analysis': 13,
 'semantic': 14,
 'interpretation': 15,
 'statistical': 16,
 'language': 17,
 'model': 18,
 'argue': 19,
 'definition': 20,
 'must': 21,
 'reformulate': 22,
 'order': 23,
 'manifest': 24,
 'proper': 25,
 'linguistic': 26,
 'dependencies': 27,
 'derivations': 28,
 'particular': 29,
 'proposal': 30,
 'precisely': 31,
 'characterizable': 32,
 'tag': 33,
 'equivalence': 34,
 'class': 35,
 'computationally': 36,
 'operational': 37,
 'virtue': 38,
 'compilation': 39,
 'linear': 40,
 'index': 41,
 'together': 42,
 'efficient': 43,
 'algorithm': 44,
 'recognition': 45,
 'parse': 46,
 'accord': 47,
 'compile': 48,
 'grammar': 49,
 'report': 50,
 'recent': 51,
 'loebner': 52,
 'prize': 53,
 'competition': 54,
 'inspire': 55,
 'turing': 56,
 'test': 57,
 'intelligent': 58,
 'b

In [6]:
Get_topk(k=5,year=1999)

[('use', 53), ('model', 51), ('base', 44), ('language', 42), ('paper', 38)]

In [7]:
Get_Unigram(word="use",year="1994")  # get frequency of a word in a given year

200

In [8]:
sentences=Get_sentences("1999")
print(len(sentences))
print(sentences[:3])

243
['schedule dialogs people negotiate time appointments common everyday life', 'paper report result depth empirical investigation resolve explicit temporal reference schedule dialogs', 'four phase work data annotation evaluation model development system implementation evaluation model evaluation analysis']


In [12]:
tf_idf = TfidfVectorizer()
tf_idf_matrix=tf_idf.fit_transform(sentences)
vacab=tf_idf.get_feature_names()
df = pd.DataFrame(data = tf_idf_matrix.toarray(),columns=vacab)
df



Unnamed: 0,698previous,77verbs,95the,abandon,ability,able,abney,abstract,abstraction,abstractly,...,wrong,wrongly,wsd,wsj,year,years,yet,yield,young,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
240,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
list(df["use"])

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.08904217996017656,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.059263837649583234,
 0.0,
 0.0959708432088431,
 0.11999617221225646,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.08384971167325699,
 0.0,
 0.0,
 0.121452138967634,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.10167952297640827,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.13037142532015406,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.251689625814478,
 0.0,
 0.10515898744748146,
 0.10871960621635372,
 0.0,
 0.0,
 0.0,
 0.09595018202192329,
 0.0,
 0.18837820093795163,
 0.0,
 0.13111265084126023,
 0.20183553923964556,
 0.0,
 0.0,
 0.18482149715652346,
 0.1447002043840909,
 0.0,
 0.0,
 0.0,
 0.1278340912606314