In [60]:
import json
import string
import re
import nltk
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from warnings import filterwarnings
filterwarnings("ignore")

In [62]:
train = pd.read_json("train-1.json")
test = pd.read_json("test.json")

In [66]:
df_train = pd.DataFrame(train)
df_test = pd.DataFrame(test)

In [69]:
print(df_train.shape)
print(df_test.shape)

(9658, 11)
(1000, 10)


In [72]:
df_train.describe()

Unnamed: 0,year,references,citations
count,9655.0,9658.0,9658.0
mean,2015.384878,31.994202,36.994823
std,7.562034,24.189335,191.394827
min,1979.0,0.0,0.0
25%,2015.0,19.0,2.0
50%,2018.0,29.0,8.0
75%,2020.0,41.0,25.0
max,2021.0,668.0,9094.0


In [73]:
df_test.describe()

Unnamed: 0,year,references
count,1000.0,1000.0
mean,2015.886,32.429
std,6.922494,22.417259
min,1979.0,0.0
25%,2015.0,20.0
50%,2018.0,29.0
75%,2020.0,41.0
max,2021.0,304.0


In [74]:
df_train.isnull().sum()

doi                  0
title                0
abstract           159
authors              0
venue                0
year                 3
references           0
topics               0
is_open_access       0
fields_of_study    136
citations            0
dtype: int64

In [75]:
df_test.isnull().sum()

is_open_access      0
doi                 0
title               0
fields_of_study    13
abstract           19
year                0
venue               0
references          0
authors             0
topics              0
dtype: int64

In [76]:
#fill null values on abstract for train and test data
df_train["abstract"] = df_train["abstract"].fillna("")
df_test["abstract"] = df_test["abstract"].fillna("")

In [77]:
df_train["abstract"][1]

'Word embeddings are an active topic in the NLP research community. State-of-the-art neural models achieve high performance on downstream tasks, albeit at the cost of computationally expensive training. Cost aware solutions require cheaper models that still achieve good performance. We present several reproduction studies of intrinsic evaluation tasks that evaluate non-contextual word representations in multiple languages. Furthermore, we present 50-8-8, a new data set for the outlier identification task, which avoids limitations of the original data set, such as ambiguous words, infrequent words, and multi-word tokens, while increasing the number of test cases. The data set is expanded to contain semantic and syntactic tests and is multilingual (English, German, and Italian). We provide an in-depth analysis of word embedding models with a range of hyper-parameters. Our analysis shows the suitability of different models and hyper-parameters for different tasks and the greater difficult

In [78]:
df_test["abstract"][1]

'Pretraining NLP models with variants of Masked Language Model (MLM) objectives has recently led to a significant improvements on many tasks. This paper examines the benefits of pretrained models as a function of the number of training samples used in the downstream task. On several text classification tasks, we show that as the number of training examples grow into the millions, the accuracy gap between finetuning BERT-based model and training vanilla LSTM from scratch narrows to within 1%. Our findings indicate that MLM-based models might reach a diminishing return point as the supervised data size increases significantly.'

In [79]:
Punctuations = string.punctuation
print(Punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [80]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [81]:
Stopwords = set(stopwords.words('english'))

In [82]:
def clean_text(text):
    text_clean = "".join([word.lower() for word in text if word not in string.punctuation])
    text_rc = re.sub('[0-9]+', '', text_clean)
    tokens = re.split('\W+', text_rc)
    text = [word for word in tokens if word not in Stopwords]
    return text

In [83]:
abstract_train = df_train[["abstract"]]
abstract_train['clean_abstract_train'] = abstract_train["abstract"].apply(lambda x: clean_text(x))

In [84]:
abstract_test = df_test[["abstract"]]
abstract_test['clean_abstract_test'] = abstract_test["abstract"].apply(lambda x: clean_text(x))

In [85]:
abstract_test.head()

Unnamed: 0,abstract,clean_abstract_test
0,Sequence-to-sequence models usually transfer a...,"[sequencetosequence, models, usually, transfer..."
1,Pretraining NLP models with variants of Masked...,"[pretraining, nlp, models, variants, masked, l..."
2,"According to the wide-spread belief, although ...","[according, widespread, belief, although, ngan..."
3,A number of psycholinguistic studies have fact...,"[number, psycholinguistic, studies, factoriall..."
4,This paper describes our submission to theSemE...,"[paper, describes, submission, thesemeval, tas..."


In [86]:
#delete the other abstract
abstract_train.drop(['abstract'], inplace = True, axis = 1)
abstract_test.drop(['abstract'], inplace = True, axis = 1)

In [87]:
abstract_train.head()

Unnamed: 0,clean_abstract_train
0,"[semantic, role, labeling, srl, crucial, natur..."
1,"[word, embeddings, active, topic, nlp, researc..."
2,"[propose, new, shared, task, tactical, datatot..."
3,"[evaluate, semantic, parser, based, characterb..."
4,"[paper, gauge, utility, generalpurpose, opendo..."


In [88]:
#add the cleaned abstract to the train and test dataset 
df_train['abstract'] = abstract_train['clean_abstract_train']
df_test['abstract'] = abstract_test['clean_abstract_test']

In [89]:
df_train.head()

Unnamed: 0,doi,title,abstract,authors,venue,year,references,topics,is_open_access,fields_of_study,citations
0,10.3115/v1/P15-1039,Generating High Quality Proposition Banks for ...,"[semantic, role, labeling, srl, crucial, natur...","[A. Akbik, Laura Chiticariu, Marina Danilevsky...",ACL,2015.0,39,[Semantic role labeling],True,[Computer Science],60
1,10.18653/v1/2020.eval4nlp-1.12,One of these words is not like the other: a re...,"[word, embeddings, active, topic, nlp, researc...","[Jesper Brink Andersen, Mikkel Bak Bertelsen, ...",EVAL4NLP,2020.0,44,[],True,[Computer Science],1
2,10.18653/v1/W17-3516,The Code2Text Challenge: Text Generation in So...,"[propose, new, shared, task, tactical, datatot...","[Kyle Richardson, Sina Zarrieß, Jonas Kuhn]",INLG,2017.0,30,"[Natural language generation, Library (computi...",True,[Computer Science],5
3,10.18653/v1/S17-2160,The Meaning Factory at SemEval-2017 Task 9: Pr...,"[evaluate, semantic, parser, based, characterb...","[Rik van Noord, Johan Bos]",SemEval@ACL,2017.0,11,"[Parsing, Convolutional neural network, Text-b...",True,[Computer Science],5
4,10.18653/v1/W15-2205,Semantic Parsing for Textual Entailment,"[paper, gauge, utility, generalpurpose, opendo...","[Elisabeth Lien, Milen Kouylekov]",IWPT,2015.0,26,"[Textual entailment, Parsing, SemEval, Semanti...",True,[Computer Science],10


In [90]:
df_test.head()

Unnamed: 0,is_open_access,doi,title,fields_of_study,abstract,year,venue,references,authors,topics
0,False,10.18653/v1/2021.findings-acl.255,On Sparsifying Encoder Outputs in Sequence-to-...,[Computer Science],"[sequencetosequence, models, usually, transfer...",2021,FINDINGS,47,"[Biao Zhang, Ivan Titov, Rico Sennrich]","[Encoder, Transformer, Automatic summarization..."
1,True,10.18653/v1/2020.acl-main.200,To Pretrain or Not to Pretrain: Examining the ...,"[Computer Science, Mathematics]","[pretraining, nlp, models, variants, masked, l...",2020,ACL,18,"[Sinong Wang, Madian Khabsa, Hao Ma]","[Supervised learning, Language model, Document..."
2,True,10.18653/v1/W18-0211,Utilization of Nganasan digital resources: a s...,[Computer Science],"[according, widespread, belief, although, ngan...",2018,,8,[L. Fejes],"[Rounding, Lexicon, Tracer, Body of uterus, Cl..."
3,True,10.18653/v1/N19-1413,A large-scale study of the effects of word fre...,[Computer Science],"[number, psycholinguistic, studies, factoriall...",2019,NAACL,52,[Cory Shain],"[Word lists by frequency, Text corpus, Sentenc..."
4,False,10.18653/v1/2021.semeval-1.168,Amherst685 at SemEval-2021 Task 7: Joint Model...,[Computer Science],"[paper, describes, submission, thesemeval, tas...",2021,SEMEVAL,17,"[Brian Zylich, Akshay Gugnani, Gabriel Brookma...",[]


# bag of words method

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [101]:
documentA = df_train['abstract']
documentB = df_test['abstract']

In [102]:
bagOfWordsA = [",".join(i) for i in documentA] 
bagOfWordsB = [",".join(i) for i in documentB] 

In [103]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

In [104]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1

In [105]:
print(numOfWordsB)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# TF*IDF

# Term Frequency (TF)
the number of times a word appears in a document divided by the total number of words in the document. Every document has its own term frequency.


In [106]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

In [107]:
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

# Inverse Data Frequency (IDF)
The log of the number of documents divided by the number of documents that contain the word w. Inverse data frequency determines the weight of rare words across all documents in the corpus.

In [117]:
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
                
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

In [118]:
idfs = computeIDF([numOfWordsA, numOfWordsB])

# TF*IDF 
TF-IDF is simply the TF multiplied by IDF

In [119]:
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

In [121]:
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
df = pd.DataFrame([tfidfA, tfidfB])

In [122]:
df

Unnamed: 0,Unnamed: 1,"rover,widely,used,method,combine,output,multiple,automatic,speech,recognition,asr,systems,though,effective,basic,approach,variants,suffer,potential,drawbacks,results,depend,order,hypotheses,used,feed,combination,process,ii,applied,combine,long,hypotheses,disregard,possible,differences,transcription,quality,local,level,iii,often,rely,word,confidence,information,address,issues,proposing,segmentbased,rover,hypothesis,ranking,obtained,confidenceindependent,asr,quality,estimation,method,results,english,data,iwslt,iwslt,evaluation,campaigns,significantly,outperform,standard,rover,approximate,two,strong,oracles","propose,joint,event,temporal,relation,extraction,model,shared,representation,learning,structured,prediction,proposed,method,two,advantages,existing,work,first,improves,event,representation,allowing,event,relation,modules,share,contextualized,embeddings,neural,representation,learner,second,avoids,error,propagation,conventional,pipeline,systems,leveraging,structured,inference,learning,methods,assign,event,labels,temporal,relation,labels,jointly,experiments,show,proposed,method,improve,event,extraction,temporal,relation,extraction,stateoftheart,systems,endtoend,f,improved,two,benchmark,datasets,respectively","introduce,new,features,incorporating,semantic,predicateargument,structures,machine,translation,mt,methods,focus,completeness,semantic,structures,translations,well,order,translated,semantic,roles,experiment,translation,rules,contain,core,arguments,predicates,source,side,mt,system,observe,using,rules,significantly,improves,translation,quality,also,present,new,semantic,feature,resembles,language,model,results,show,language,model,feature,also,significantly,improve,mt,results","image,description,new,natural,language,generation,task,aim,generate,humanlike,description,image,evaluation,computergenerated,text,notoriously,difficult,problem,however,quality,image,descriptions,typically,measured,using,unigram,bleu,human,judgements,focus,paper,determine,correlation,automatic,measures,human,judgements,task,estimate,correlation,unigram,smoothed,bleu,ter,rougesu,meteor,human,judgements,two,data,sets,main,finding,unigram,bleu,weak,correlation,meteor,strongest,correlation,human,judgements","paper,propose,architecture,machine,translation,mt,capable,obtaining,multilingual,sentence,representations,incorporating,intermediate,attention,bridge,shared,across,languages,train,model,languagespecific,encoders,decoders,connected,innerattention,layer,encoder,side,attention,bridge,exploits,semantics,language,translation,develops,languageagnostic,meaning,representation,efficiently,used,transfer,learning,present,new,framework,efficient,development,multilingual,neural,machine,translation,nmt,using,model,scheduled,training,tested,approach,systematic,way,multiparallel,data,set,model,achieves,substantial,improvements,strong,bilingual,models,performs,well,zeroshot,translation,demonstrates,ability,abstraction,transfer,learning","ezafe,grammatical,particle,iranian,languages,links,two,words,together,regardless,important,information,conveys,almost,always,indicated,persian,script,resulting,mistakes,reading,complex,sentences,errors,natural,language,processing,tasks,paper,experiment,different,machine,learning,methods,achieve,stateoftheart,results,task,ezafe,recognition,transformerbased,methods,bert,xlmroberta,achieve,best,results,latter,achieving,fscore,previous,stateoftheart,moreover,use,ezafe,information,improve,persian,partofspeech,tagging,results,show,information,useful,transformerbased,methods,explain,might,case","various,difficulties,accomodating,traditional,masscount,distinction,grammar,english,goal,production,logical,form,semantic,translations,initial,english,sentences,present,paper,surveys,difficulties,one,puzzle,whether,distinction,syntactic,one,semantic,one,ie,whether,wellformedness,constraint,whether,description,semantic,translations,produced,another,puzzle,whether,applied,simple,words,occur,lexicon,whether,apply,longer,units,entire,nps,wide,variety,possible,theories,two,seem,produce,required,results,plausible,inferences,intuitively,satisfying,semantic,representations,two,theories,developed,compared","differently,phrasebased,paradigm,neural,machine,translation,nmt,operates,word,sentence,representations,continuous,space,makes,decoding,process,difficult,interpret,also,harder,influence,external,knowledge,latter,problem,effective,solutions,like,xmlmarkup,used,phrasebased,models,inject,fixed,translation,options,constraints,decoding,time,yet,available,propose,guide,mechanism,enhances,existing,nmt,decoder,ability,prioritize,adequately,handle,translation,options,presented,form,xml,annotations,source,words,positive,results,obtained,two,different,translation,tasks,indicate,effectiveness,approach","training,models,map,natural,language,instructions,programs,given,target,world,supervision,requires,searching,good,programs,training,time,search,commonly,done,using,beam,search,space,partial,programs,program,trees,length,instructions,grows,finding,good,program,becomes,difficult,work,propose,search,algorithm,uses,target,world,state,known,training,time,train,critic,network,predicts,expected,reward,every,search,state,score,search,states,beam,interpolating,expected,reward,likelihood,programs,represented,search,state,moreover,search,space,programs,compressed,state,program,executions,augmented,recent,entities,actions,scone,dataset,show,algorithm,dramatically,improves,performance,three,domains,compared,standard,beam,search,baselines",...,"deep,neural,networks,excel,learning,labeled,data,achieve,stateoftheart,results,wide,array,natural,language,processing,tasks,contrast,learning,unlabeled,data,especially,domain,shift,remains,challenge,motivated,latest,advances,survey,review,neural,unsupervised,domain,adaptation,techniques,require,labeled,target,domain,data,challenging,yet,widely,applicable,setup,outline,methods,early,traditional,nonneural,methods,pretrained,model,transfer,also,revisit,notion,domain,uncover,bias,type,natural,language,processing,tasks,received,attention,lastly,outline,future,directions,particularly,broader,need,outofdistribution,generalization,future,nlp","fewshot,classification,requires,classifiers,adapt,new,classes,training,instances,stateoftheart,metalearning,approaches,maml,learn,initialize,fast,adapt,parameters,limited,instances,shown,promising,results,fewshot,classification,however,existing,metalearning,models,solely,rely,implicit,instancebased,statistics,thus,suffer,instance,unreliability,weak,interpretability,solve,problem,propose,novel,metainformation,guided,metalearning,miml,framework,semantic,concepts,classes,provide,strong,guidance,metalearning,initialization,adaptation,effect,model,establish,connections,instancebased,information,semanticbased,information,enables,effective,initialization,faster,adaptation,comprehensive,experimental,results,fewshot,relation,classification,demonstrate,effectiveness,proposed,framework,notably,miml,achieves,comparable,superior,performance,humans,one,shot,fewrel,evaluation","recent,years,large,pretrained,models,demonstrated,stateoftheart,performance,many,nlp,tasks,however,deployment,models,devices,limited,resources,challenging,due,models,large,computational,consumption,memory,requirements,moreover,need,considerable,amount,labeled,training,data,also,hinders,realworld,deployment,scenarios,model,distillation,shown,promising,results,reducing,model,size,computational,load,data,efficiency,paper,test,boundaries,bert,model,distillation,terms,model,compression,inference,efficiency,data,scarcity,show,classification,tasks,require,capturing,general,lexical,semantics,successfully,distilled,simple,efficient,models,require,relatively,small,amount,labeled,training,data,also,show,distillation,large,pretrained,models,effective,reallife,scenarios,limited,amounts,labeled,training,available","isolating,domaindependent,information,within,large,natural,language,system,offers,general,advantages,modular,design,greatly,enhances,portability,system,new,domains,explored,problem,isolating,domain,dependencies,within,two,large,natural,language,systems,one,generating,tabular,data,base,text,information,formatting,retrieving,information,data,base,describe,domain,information,schema,used,capture,domainspecific,information,indicate,information,used,throughout,two,systems","languages,vary,placement,multiple,adjectives,surrounding,noun,typically,exhibit,strong,intralanguage,tendencies,relative,order,adjectives,eg,preference,big,blue,box,english,grande,boîte,bleue,french,alsundūq,al,azraq,alkabı,r,arabic,advance,new,quantitative,account,adjective,order,across,typologicallydistinct,languages,based,maximizing,information,gain,model,addresses,leftright,asymmetry,frenchtype,ana,sequences,approach,aan,naa,orderings,without,appeal,mechanisms,find,across,languages,preferred,order,adjectives,mirrors,efficient,algorithm,maximizing,information,gain","chinese,spelling,check,csc,task,detect,correct,spelling,errors,chinese,text,stateoftheart,works,csc,task,adopt,bertbased,nonautoregressive,language,model,relies,output,independence,assumption,inappropriate,independence,assumption,prevents,bertbased,models,learning,dependencies,among,target,tokens,resulting,incoherent,problem,address,issue,propose,novel,architecture,named,dynamic,connected,networks,dcn,generates,candidate,chinese,characters,via,pinyin,enhanced,candidate,generator,utilizes,attentionbased,network,model,dependencies,two,adjacent,chinese,characters,experimental,results,show,proposed,method,achieves,new,stateoftheart,performance,three,humanannotated,datasets","natural,language,processing,many,methods,proposed,solve,ambiguity,problems,paper,propose,technique,combine,method,interactive,disambiguation,automatic,one,ambiguous,words,characteristic,method,accuracy,interactive,disambiguation,considered,method,solves,two,following,problems,combining,disambiguation,methods,interactive,disambiguation,executed,ambiguous,word,disambiguated,one,ambiguous,words,exist,sentence,method,defines,condition,executing,interaction,users,order,disambiguation,based,strategy,accuracy,result,maximized,considering,accuracy,interactive,disambiguation,automatic,one,using,method,user,interaction,controlled,holding,accuracy,results","help,individuals,express,better,quotation,recommendation,receiving,growing,attention,nevertheless,prior,efforts,focus,modeling,quotations,queries,separately,ignore,relationship,quotations,queries,work,introduce,transformation,matrix,directly,maps,query,representations,quotation,representations,better,learn,mapping,relationship,employ,mapping,loss,minimizes,distance,two,semantic,spaces,one,quotation,another,mappedquery,furthermore,explore,using,words,history,queries,interpret,figurative,language,quotations,quotationaware,attention,applied,top,history,queries,highlight,indicator,words,experiments,two,datasets,english,chinese,show,model,outperforms,previous,stateoftheart,models","recent,years,large,neural,networks,natural,language,generation,nlg,made,leaps,bounds,ability,generate,fluent,text,however,tasks,evaluating,quality,differences,nlg,systems,understanding,humans,perceive,generated,text,remain,crucial,difficult,system,demonstration,present,real,fake,text,roft,website,tackles,challenges,inviting,users,try,hand,detecting,machinegenerated,text,variety,domains,introduce,novel,evaluation,task,based,detecting,boundary,text,passage,starts,humanwritten,transitions,machinegenerated,show,preliminary,results,using,roft,evaluate,detection,machinegenerated,news,articles","paper,presents,results,wmt,shared,tasks,included,five,machine,translation,mt,tasks,standard,news,itdomain,biomedical,multimodal,pronoun,three,evaluation,tasks,metrics,tuning,runtime,estimation,mt,quality,automatic,postediting,task,bilingual,document,alignment,task,year,mt,systems,institutions,plus,anonymized,online,systems,submitted,translation,directions,news,translation,task,itdomain,task,received,submissions,institutions,directions,biomedical,task,received,submissions,systems,institutions,evaluation,automatic,manual,relative,ranking,point,scale,assessments,quality,estimation,task,three,subtasks,total,teams,submitting,entries,automatic,postediting,task,total,teams,submitting,entries"
0,0.0,7.2e-05,7.2e-05,7.2e-05,7.2e-05,7.2e-05,7.2e-05,7.2e-05,7.2e-05,7.2e-05,...,7.2e-05,7.2e-05,7.2e-05,7.2e-05,7.2e-05,7.2e-05,7.2e-05,7.2e-05,7.2e-05,7.2e-05
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
