In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import re
import pandas as pd
import textract
import numpy as np
from nltk.corpus import stopwords

# Get the Paper Text

In [2]:
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

In [3]:
path = 'ACMPapers/3106328.3106330.pdf'
words = convert_pdf_to_txt(path)

print(words)

copycat: Testing Differential Treatment
of New Transport Protocols in the Wild

Korian Edeline∗, Mirja Kühlewind‡, Brian Trammell‡, Benoit Donnet∗

∗ Université de Liège, Montefiore Institute – Belgium
‡ ETH Zurich, Networked Systems Group – Switzerland

ABSTRACT

Recent years have seen the development of multiple transport
solutions to address the ossification of TCP in the Internet, and
to ease transport-layer extensibility and deployability. Recent ap-
proaches, such as PLUS and Google’s QUIC, introduce an upper
transport layer atop UDP; their deployment therefore relies on UDP
not being disadvantaged with respect to TCP by the Internet.

This paper introduces copycat, a generic transport protocol test-
ing tool that highlights differential treatment by the path in terms
of connectivity and QoS between TCP and a non-TCP transport pro-
tocol. copycat generates TCP-shaped traffic with custom headers,
and compares its performance in terms of loss and delay with TCP.
We present a proof-

# Remove Unnecessary Data

In [None]:
lines = words.split("\n")
print(lines)

In [None]:
for line in lines:
    if line == "":
        lines.remove(line)
    
lines

In [28]:
# remove stopwords
stopwords = list(set(stopwords))

new_lines = []
for line in lines:
    words = line.split(" ")
    new_words = []
    for word in words:
        if word not in stopwords:
            new_words.append(word)
    new_line = " ".join(new_words)
    new_lines.append(new_line)
print(new_lines)

words = " ".join(new_lines)

['copycat: Testing Differential Treatment', 'New Transport Protocols Wild', 'Korian Edeline∗, Mirja Kühlewind‡, Brian Trammell‡, Benoit Donnet∗', '∗ Université de Liège, Montefiore Institute – Belgium', '‡ ETH Zurich, Networked Systems Group – Switzerland', 'ABSTRACT', 'Recent years seen development multiple transport', 'solutions address ossification TCP Internet,', 'ease transport-layer extensibility deployability. Recent ap-', 'proaches, PLUS Google’s QUIC, introduce upper', 'transport layer atop UDP; deployment therefore relies UDP', 'disadvantaged respect TCP Internet.', 'This paper introduces copycat, generic transport protocol test-', 'ing tool highlights differential treatment path terms', 'connectivity QoS TCP non-TCP transport pro-', 'tocol. copycat generates TCP-shaped traffic custom headers,', 'compares performance terms loss delay TCP.', 'We present proof-of-concept case study (UDP vs. TCP) order', 'answer questions deployability current transport', 'evolution approaches, 

In [29]:
# convert it back into words

text = "".join(words)
text = text.replace("\n", " ")
text

'copycat: Testing Differential Treatment New Transport Protocols Wild Korian Edeline∗, Mirja Kühlewind‡, Brian Trammell‡, Benoit Donnet∗ ∗ Université de Liège, Montefiore Institute – Belgium ‡ ETH Zurich, Networked Systems Group – Switzerland ABSTRACT Recent years seen development multiple transport solutions address ossification TCP Internet, ease transport-layer extensibility deployability. Recent ap- proaches, PLUS Google’s QUIC, introduce upper transport layer atop UDP; deployment therefore relies UDP disadvantaged respect TCP Internet. This paper introduces copycat, generic transport protocol test- ing tool highlights differential treatment path terms connectivity QoS TCP non-TCP transport pro- tocol. copycat generates TCP-shaped traffic custom headers, compares performance terms loss delay TCP. We present proof-of-concept case study (UDP vs. TCP) order answer questions deployability current transport evolution approaches, demonstrate extent copycat’s capabilities possible applica

# Extract Keywords

In [30]:

keywords = re.findall(r'[a-zA-Z]\w+',text)
len(keywords)

3683

In [31]:
df = pd.DataFrame(list(set(keywords)),columns=['keywords'])  #Dataframe with unique keywords to avoid repetition in rows
df.head(50)

Unnamed: 0,keywords
0,illuminating
1,unused
2,advances
3,AS
4,modularity
5,Passive
6,Net
7,addressed
8,inside
9,node


In [9]:
def weightage(word,text,number_of_documents=1):
    word_list = re.findall(word,text)
    number_of_times_word_appeared =len(word_list)
    tf = number_of_times_word_appeared/float(len(text))
    idf = np.log((number_of_documents)/float(number_of_times_word_appeared))
    tf_idf = tf*idf
    return number_of_times_word_appeared,tf,idf ,tf_idf

In [10]:
df['number_of_times_word_appeared'] = df['keywords'].apply(lambda x: weightage(x,text)[0])
df['tf'] = df['keywords'].apply(lambda x: weightage(x,text)[1])
df['idf'] = df['keywords'].apply(lambda x: weightage(x,text)[2])
df['tf_idf'] = df['keywords'].apply(lambda x: weightage(x,text)[3])

In [11]:
df = df.sort_values('tf_idf',ascending=True)
df.to_csv('Keywords.csv')
df.head(25)

Unnamed: 0,keywords,number_of_times_word_appeared,tf,idf,tf_idf
959,th,451,0.012349,-6.111467,-0.075471
1486,an,435,0.011911,-6.075346,-0.072363
857,in,385,0.010542,-5.953243,-0.062758
249,on,360,0.009857,-5.886104,-0.058021
478,or,356,0.009748,-5.874931,-0.057268
863,re,350,0.009584,-5.857933,-0.05614
388,at,328,0.008981,-5.793014,-0.052028
438,nd,258,0.007064,-5.55296,-0.039228
1291,the,234,0.006407,-5.455321,-0.034954
1429,as,230,0.006298,-5.438079,-0.034248
