In [4]:
import nltk
import sklearn
import numpy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd

#Exploring Dataset

In [5]:
df = pd.read_fwf("/content/corpus.train.en", header=None, encoding='utf-8')
  
# display DataFrame
print(df)

                                                        0      1   \
0        better o . Guaranteeing that every individual ...    NaN   
1                                                    trump    NaN   
2                                                overrides    NaN   
3            "They warm the heart and ease the daily load.    NaN   
4                                              I love you.    NaN   
...                                                    ...    ...   
2659718  He also cites an incident where the Ecuadorian...    NaN   
2659719                                                Hue    NaN   
2659720                  Satan 'has means to cause death'?    NaN   
2659721  Such mating contradicts the will of Nature tow...  allow   
2659722  A 1% provision was taken for all non-classifie...    NaN   

                             2    3    4      5    6    7    8    9    10  
0                           NaN  NaN  NaN    NaN  NaN  NaN  NaN  NaN  NaN  
1                  

In [6]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2659723 entries, 0 to 2659722
Data columns (total 11 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   0       object
 1   1       object
 2   2       object
 3   3       object
 4   4       object
 5   5       object
 6   6       object
 7   7       object
 8   8       object
 9   9       object
 10  10      object
dtypes: object(11)
memory usage: 223.2+ MB
None
                                                  0    1    2    3    4    5   \
0  better o . Guaranteeing that every individual ...  NaN  NaN  NaN  NaN  NaN   
1                                              trump  NaN  NaN  NaN  NaN  NaN   
2                                          overrides  NaN  NaN  NaN  NaN  NaN   
3      "They warm the heart and ease the daily load.  NaN  NaN  NaN  NaN  NaN   
4                                        I love you.  NaN  NaN  NaN  NaN  NaN   

    6    7    8    9    10  
0  NaN  NaN  NaN  NaN  NaN  
1  NaN  NaN  NaN  NaN  NaN  
2  Na

In [7]:
text = df[0]

In [8]:
print(text)

0          better o . Guaranteeing that every individual ...
1                                                      trump
2                                                  overrides
3              "They warm the heart and ease the daily load.
4                                                I love you.
                                 ...                        
2659718    He also cites an incident where the Ecuadorian...
2659719                                                  Hue
2659720                    Satan 'has means to cause death'?
2659721    Such mating contradicts the will of Nature tow...
2659722    A 1% provision was taken for all non-classifie...
Name: 0, Length: 2659723, dtype: object


#Pre-processing

In [9]:
#removing punctuations
processed = text.str.replace(r'[^\w\d\s]', ' ')

#replacing whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

#replace leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

  
  """
  


In [10]:
processed = processed.str.lower()
print(processed)

0          better o guaranteeing that every individual wi...
1                                                      trump
2                                                  overrides
3                they warm the heart and ease the daily load
4                                                 i love you
                                 ...                        
2659718    he also cites an incident where the ecuadorian...
2659719                                                  hue
2659720                       satan has means to cause death
2659721    such mating contradicts the will of nature tow...
2659722    a 1 provision was taken for all non classified...
Name: 0, Length: 2659723, dtype: object


#Stemming and Lemmatisation

In [11]:
#remove stop words from text messages

from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in str(x).split() if term not in stop_words))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
#remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in str(x).split()))

In [13]:
print(processed)

0          better guarante everi individu free wish inevi...
1                                                      trump
2                                                    overrid
3                                  warm heart eas daili load
4                                                       love
                                 ...                        
2659718    also cite incid ecuadorian televis channel pre...
2659719                                                  hue
2659720                                satan mean caus death
2659721    mate contradict natur toward select improv lif...
2659722    1 provis taken non classifi outstand loan rate...
Name: 0, Length: 2659723, dtype: object


In [14]:
processed = processed.str.lower()
print(processed)

0          better guarante everi individu free wish inevi...
1                                                      trump
2                                                    overrid
3                                  warm heart eas daili load
4                                                       love
                                 ...                        
2659718    also cite incid ecuadorian televis channel pre...
2659719                                                  hue
2659720                                satan mean caus death
2659721    mate contradict natur toward select improv lif...
2659722    1 provis taken non classifi outstand loan rate...
Name: 0, Length: 2659723, dtype: object


In [17]:
#remove word stems using a Porter stemmer
from nltk.stem import 	WordNetLemmatizer
nltk.download('wordnet')
lm = WordNetLemmatizer()

processed = processed.apply(lambda x: ' '.join(lm.lemmatize(term) for term in str(x).split()))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [18]:
processed = processed.str.lower()
print(processed)

0          better guarante everi individu free wish inevi...
1                                                      trump
2                                                    overrid
3                                   warm heart ea daili load
4                                                       love
                                 ...                        
2659718    also cite incid ecuadorian televis channel pre...
2659719                                                  hue
2659720                                satan mean caus death
2659721    mate contradict natur toward select improv lif...
2659722    1 provis taken non classifi outstand loan rate...
Name: 0, Length: 2659723, dtype: object


#Sentence/Instance Repetition

In [19]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

all_sentences = []

for text in processed:
  sentences = sent_tokenize(text)
  for s in sentences:
    all_sentences.append(s)

all_sentences = nltk.FreqDist(all_sentences)    

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [20]:
#print the total number of words
print('Number of sentencess: {}'.format(len(all_sentences)))
print('Most common sentences: {}'.format(all_sentences.most_common()))

Number of sentencess: 1743452


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [21]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
#create a bag-of-words
all_words = []

for text in processed:
  words = word_tokenize(text)
  for w in words:
    all_words.append(w)

all_words = nltk.FreqDist(all_words)    

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
#print the total number of words
print('Number of words: {}'.format(len(all_words)))


Number of words: 256242


In [23]:
len(all_words)

256242

In [24]:
print('Most common words: {}'.format(all_words.most_common(100)))

Most common words: [('one', 86076), ('1', 71381), ('jehovah', 68224), ('god', 67456), ('time', 61180), ('peopl', 55769), ('also', 53579), ('u', 53511), ('year', 51362), ('2', 51157), ('use', 49332), ('first', 42479), ('work', 42090), ('may', 41894), ('would', 41407), ('3', 40129), ('day', 39007), ('like', 37802), ('make', 36649), ('world', 36421), ('mani', 36262), ('jesu', 34818), ('bangladesh', 34647), ('new', 34621), ('bibl', 34227), ('4', 34182), ('5', 34132), ('life', 34109), ('two', 33665), ('govern', 33264), ('go', 32521), ('state', 31231), ('say', 30576), ('help', 30387), ('name', 30065), ('come', 30015), ('nation', 29987), ('know', 29350), ('take', 28853), ('good', 28768), ('6', 28479), ('right', 27994), ('even', 27814), ('way', 27735), ('get', 27703), ('could', 27376), ('10', 27330), ('need', 27174), ('see', 27101), ('made', 26518), ('love', 26440), ('countri', 26214), ('person', 26136), ('live', 26008), ('call', 25885), ('well', 25639), ('child', 25528), ('said', 24777), ('fo

#Bi-Grams & Tri-Grams

In [25]:
def n_grams(text, n):
  tokens = word_tokenize(text)
  n_grams=[]

  for i in range(len(tokens)-n+1):
    temp = [tokens[j] for j in range(i, i+n)]
    n_grams.append(" ".join(temp))
    
  return n_grams  

In [26]:
text = str(processed)

In [27]:
bi_grams= n_grams(text, 2)

In [28]:
bi_grams

['0 better',
 'better guarante',
 'guarante everi',
 'everi individu',
 'individu free',
 'free wish',
 'wish inevi',
 'inevi ...',
 '... 1',
 '1 trump',
 'trump 2',
 '2 overrid',
 'overrid 3',
 '3 warm',
 'warm heart',
 'heart ea',
 'ea daili',
 'daili load',
 'load 4',
 '4 love',
 'love ...',
 '... 2659718',
 '2659718 also',
 'also cite',
 'cite incid',
 'incid ecuadorian',
 'ecuadorian televis',
 'televis channel',
 'channel pre',
 'pre ...',
 '... 2659719',
 '2659719 hue',
 'hue 2659720',
 '2659720 satan',
 'satan mean',
 'mean caus',
 'caus death',
 'death 2659721',
 '2659721 mate',
 'mate contradict',
 'contradict natur',
 'natur toward',
 'toward select',
 'select improv',
 'improv lif',
 'lif ...',
 '... 2659722',
 '2659722 1',
 '1 provis',
 'provis taken',
 'taken non',
 'non classifi',
 'classifi outstand',
 'outstand loan',
 'loan rate',
 'rate ...',
 '... Name',
 'Name :',
 ': 0',
 '0 ,',
 ', Length',
 'Length :',
 ': 2659723',
 '2659723 ,',
 ', dtype',
 'dtype :',
 ': obje

In [29]:
tri_grams= n_grams(text, 3)

In [30]:
tri_grams

['0 better guarante',
 'better guarante everi',
 'guarante everi individu',
 'everi individu free',
 'individu free wish',
 'free wish inevi',
 'wish inevi ...',
 'inevi ... 1',
 '... 1 trump',
 '1 trump 2',
 'trump 2 overrid',
 '2 overrid 3',
 'overrid 3 warm',
 '3 warm heart',
 'warm heart ea',
 'heart ea daili',
 'ea daili load',
 'daili load 4',
 'load 4 love',
 '4 love ...',
 'love ... 2659718',
 '... 2659718 also',
 '2659718 also cite',
 'also cite incid',
 'cite incid ecuadorian',
 'incid ecuadorian televis',
 'ecuadorian televis channel',
 'televis channel pre',
 'channel pre ...',
 'pre ... 2659719',
 '... 2659719 hue',
 '2659719 hue 2659720',
 'hue 2659720 satan',
 '2659720 satan mean',
 'satan mean caus',
 'mean caus death',
 'caus death 2659721',
 'death 2659721 mate',
 '2659721 mate contradict',
 'mate contradict natur',
 'contradict natur toward',
 'natur toward select',
 'toward select improv',
 'select improv lif',
 'improv lif ...',
 'lif ... 2659722',
 '... 2659722 1'

#Word Frequency using Zipf's Law

In [31]:
import re
from operator import itemgetter    
 
frequency = {}
open_file = open('/content/corpus.train.en', 'r')
file_to_string = open_file.read()
words = re.findall(r'(\b[A-Za-z][a-z]{2,9}\b)', file_to_string)
 
for word in words:
    count = frequency.get(word,0)
    frequency[word] = count + 1
     
for key, value in reversed(sorted(frequency.items(), key = itemgetter(1))):
    print(value, key)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1 Tetanous
1 bacchanals
1 soppiest
1 Naajayaz
1 weller
1 ryes
1 Sibaak
1 Aoyami
1 Oldboy
1 Barfe
1 Griddliss
1 Eppothum
1 Engaeyum
1 Denielle
1 Cyberjaya
1 Bimetallic
1 Bagchar
1 troups
1 Cunt
1 Radlett
1 Griffth
1 sixed
1 Aathi
1 Ubah
1 Stegner
1 Rivail
1 Denizard
1 Hyppolyte
1 iesus
1 Pontoise
1 Cergy
1 Causae
1 Kashmar
1 Leonis
1 Chequeado
1 Baghban
1 dysgenesis
1 Gonadal
1 Mishtar
1 Mishtara
1 Safae
1 Birkuli
1 Kardakadh
1 Nawinabad
1 Jondiu
1 Darzanidan
1 Hanbu
1 snd
1 Sumepter
1 Smushed
1 katray
1 Bandhudevi
1 hadjis
1 archimedes
1 hieratism
1 garmenting
1 knaveries
1 cynicals
1 Membranous
1 Poilu
1 Yajneswar
1 prescinded
1 Santerre
1 Lampang
1 decliner
1 Fundation
1 conchoid
1 arised
1 Mortified
1 Flune
1 Kushka
1 rahmad
1 malingerer
1 Bipa
1 Mohtarimun
1 Kabona
1 Cosmas
1 Guzargah
1 Hidimba
1 Perspolis
1 binning
1 Hyfryd
1 Hermoso
1 Trevelin
1 Dolavon
1 Vehanouch
1 yoctometre
1 pinda
1 Shanzaf
1 tazeen
1 anthemed


#Tf-Idf Measurements

In [32]:
text1 = processed[:200]
print(text1)

0      better guarante everi individu free wish inevi...
1                                                  trump
2                                                overrid
3                               warm heart ea daili load
4                                                   love
                             ...                        
195                                    quickli open file
196                                           song 48 80
197    spoke aunt live la ceiba said strong scare dec...
198                                     obstruct prevent
199                                  screenshot aliv app
Name: 0, Length: 200, dtype: object


In [33]:
text2 = processed[200:400]
print(text2)

200    gener rel use predict begin univers bodi conta...
201                                                 stop
202    putin simpli repeat predecessor bori yeltsin 1999
203                                                coars
204                                               deceit
                             ...                        
395                                                  dad
396    malaysia mena oper televis network rest world ...
397    sourc drink water tube well 88 41 tap 2 95 pon...
398    mani peopl serv solemn remind man inabl despit...
399     traffic congest drought past month disprov claim
Name: 0, Length: 200, dtype: object


In [34]:
bagOfWords1 = str(text1).split()
bagOfWords2 = str(text2).split()

In [35]:
uniqueWords = set(bagOfWords1).union(set(bagOfWords2))
print(uniqueWords)

{'200,', 'sourc', 'yeltsin', 'dad', 'Length:', 'predict', 'trump', 'mani', '...', 'scare', '4', 'traffic', 'la', 'live', 'drought', '399', 'screenshot', '196', '2', 'network', 'inevi...', 'free', 'dec...', 'pon...', 'Name:', '1999', '200', 'simpli', 'song', 'mena', 'stop', 'open', 'tube', '3', 'love', 'world', 'remind', 'claim', 'conta...', '80', '398', 'univers', 'rel', '88', 'wish', 'oper', 'aliv', 'despit...', 'prevent', '201', 'ceiba', '198', 'dtype:', '95', '203', 'bodi', 'solemn', 'said', '1', 'everi', 'month', 'serv', 'app', 'deceit', 'well', 'repeat', '41', 'overrid', 'aunt', 'strong', 'putin', 'spoke', 'use', 'congest', 'better', 'load', 'warm', 'ea', 'peopl', 'gener', 'obstruct', '202', 'drink', '397', '197', 'daili', 'object', 'water', 'past', 'bori', 'individu', 'man', 'rest', '204', 'malaysia', '395', '0', 'begin', '195', 'inabl', 'coars', 'guarante', 'disprov', 'predecessor', 'tap', 'file', '48', 'heart', '199', '396', 'televis', 'quickli', '0,'}


In [36]:
numberOfWords1 = dict.fromkeys(uniqueWords, 0)
for word in bagOfWords1:
  numberOfWords1[word] += 1
numberOfWords2 = dict.fromkeys(uniqueWords, 0)
for word in bagOfWords2:
  numberOfWords2[word] += 1  

In [37]:
print(numberOfWords1)
print(numberOfWords2)

{'200,': 1, 'sourc': 0, 'yeltsin': 0, 'dad': 0, 'Length:': 1, 'predict': 0, 'trump': 1, 'mani': 0, '...': 1, 'scare': 1, '4': 1, 'traffic': 0, 'la': 1, 'live': 1, 'drought': 0, '399': 0, 'screenshot': 1, '196': 1, '2': 1, 'network': 0, 'inevi...': 1, 'free': 1, 'dec...': 1, 'pon...': 0, 'Name:': 1, '1999': 0, '200': 0, 'simpli': 0, 'song': 1, 'mena': 0, 'stop': 0, 'open': 1, 'tube': 0, '3': 1, 'love': 1, 'world': 0, 'remind': 0, 'claim': 0, 'conta...': 0, '80': 1, '398': 0, 'univers': 0, 'rel': 0, '88': 0, 'wish': 1, 'oper': 0, 'aliv': 1, 'despit...': 0, 'prevent': 1, '201': 0, 'ceiba': 1, '198': 1, 'dtype:': 1, '95': 0, '203': 0, 'bodi': 0, 'solemn': 0, 'said': 1, '1': 1, 'everi': 1, 'month': 0, 'serv': 0, 'app': 1, 'deceit': 0, 'well': 0, 'repeat': 0, '41': 0, 'overrid': 1, 'aunt': 1, 'strong': 1, 'putin': 0, 'spoke': 1, 'use': 0, 'congest': 0, 'better': 1, 'load': 1, 'warm': 1, 'ea': 1, 'peopl': 0, 'gener': 0, 'obstruct': 1, '202': 0, 'drink': 0, '397': 0, '197': 1, 'daili': 1, 'obj

In [38]:
def computeTF(wordDict, bagOfWords):
  tfDict = {}
  bagOfWordsCount = len(bagOfWords)
  for word, count in wordDict.items():
    tfDict[word] = count / float(bagOfWordsCount)
  return tfDict  

In [39]:
tf1 = computeTF(numberOfWords1, bagOfWords1)
tf2 = computeTF(numberOfWords2, bagOfWords2)
print(tf1)
print(tf2)

{'200,': 0.019230769230769232, 'sourc': 0.0, 'yeltsin': 0.0, 'dad': 0.0, 'Length:': 0.019230769230769232, 'predict': 0.0, 'trump': 0.019230769230769232, 'mani': 0.0, '...': 0.019230769230769232, 'scare': 0.019230769230769232, '4': 0.019230769230769232, 'traffic': 0.0, 'la': 0.019230769230769232, 'live': 0.019230769230769232, 'drought': 0.0, '399': 0.0, 'screenshot': 0.019230769230769232, '196': 0.019230769230769232, '2': 0.019230769230769232, 'network': 0.0, 'inevi...': 0.019230769230769232, 'free': 0.019230769230769232, 'dec...': 0.019230769230769232, 'pon...': 0.0, 'Name:': 0.019230769230769232, '1999': 0.0, '200': 0.0, 'simpli': 0.0, 'song': 0.019230769230769232, 'mena': 0.0, 'stop': 0.0, 'open': 0.019230769230769232, 'tube': 0.0, '3': 0.019230769230769232, 'love': 0.019230769230769232, 'world': 0.0, 'remind': 0.0, 'claim': 0.0, 'conta...': 0.0, '80': 0.019230769230769232, '398': 0.0, 'univers': 0.0, 'rel': 0.0, '88': 0.0, 'wish': 0.019230769230769232, 'oper': 0.0, 'aliv': 0.0192307

In [40]:
def computeIDF(documents):
  import math
  N = len(documents)

  idfDict = dict.fromkeys(documents[0].keys(), 0)
  for document in documents:
    for word, val in document.items():
      if val > 0 :
        idfDict[word] += 1

  for word, val in idfDict.items():
    idfDict[word] = math.log(N / float(val))
  return idfDict        

In [41]:
idfs = computeIDF([numberOfWords1, numberOfWords2])
print(idfs)

{'200,': 0.0, 'sourc': 0.6931471805599453, 'yeltsin': 0.6931471805599453, 'dad': 0.6931471805599453, 'Length:': 0.0, 'predict': 0.6931471805599453, 'trump': 0.6931471805599453, 'mani': 0.6931471805599453, '...': 0.0, 'scare': 0.6931471805599453, '4': 0.6931471805599453, 'traffic': 0.6931471805599453, 'la': 0.6931471805599453, 'live': 0.6931471805599453, 'drought': 0.6931471805599453, '399': 0.6931471805599453, 'screenshot': 0.6931471805599453, '196': 0.6931471805599453, '2': 0.0, 'network': 0.6931471805599453, 'inevi...': 0.6931471805599453, 'free': 0.6931471805599453, 'dec...': 0.6931471805599453, 'pon...': 0.6931471805599453, 'Name:': 0.0, '1999': 0.6931471805599453, '200': 0.6931471805599453, 'simpli': 0.6931471805599453, 'song': 0.6931471805599453, 'mena': 0.6931471805599453, 'stop': 0.6931471805599453, 'open': 0.6931471805599453, 'tube': 0.6931471805599453, '3': 0.6931471805599453, 'love': 0.6931471805599453, 'world': 0.6931471805599453, 'remind': 0.6931471805599453, 'claim': 0.69

In [42]:
def computeTFIDF(tfBagOfWords, idfs):
  tfidf = {}
  for word, val in tfBagOfWords.items():
    tfidf[word] = val * idfs[word]
  return tfidf  

In [43]:
tfidf1 = computeTFIDF(tf1, idfs)
tfidf2 = computeTFIDF(tf2, idfs)
df = pd.DataFrame([tfidf1, tfidf2])

In [44]:
df

Unnamed: 0,"200,",sourc,yeltsin,dad,Length:,predict,trump,mani,...,scare,....1,predecessor,tap,file,48,heart,199,396,televis,quickli,"0,"
0,0.0,0.0,0.0,0.0,0.0,0.0,0.01333,0.0,0.0,0.01333,...,0.0,0.0,0.01333,0.01333,0.01333,0.01333,0.0,0.0,0.01333,0.0
1,0.0,0.009902,0.009902,0.009902,0.0,0.009902,0.0,0.009902,0.0,0.0,...,0.009902,0.009902,0.0,0.0,0.0,0.0,0.009902,0.009902,0.0,0.0


#Extras as per No. 7 Instruction

In [45]:
word_features = list(all_words.keys())
#define a find_features function
def find_features(text):
  words = word_tokenize(text)
  features = {}
  for word in word_features:
    features[word] = (word in words)
    
  return features 
features = find_features(text)  

In [46]:
features

{'better': True,
 'guarante': True,
 'everi': True,
 'individu': True,
 'free': True,
 'wish': True,
 'inevit': False,
 'short': False,
 'chang': False,
 'equal': False,
 'trump': True,
 'overrid': True,
 'warm': True,
 'heart': True,
 'ea': True,
 'daili': True,
 'load': True,
 'love': True,
 'port': False,
 'compani': False,
 'limit': False,
 'kpcl': False,
 'commot': False,
 'clemenceau': False,
 'also': True,
 'express': False,
 'skeptic': False,
 'frustrat': False,
 'wilson': False,
 'fourteen': False,
 'point': False,
 'mr': False,
 'bore': False,
 'complain': False,
 'twitter': False,
 'user': False,
 'went': False,
 'comment': False,
 'jalloud': False,
 'interview': False,
 'al': False,
 'jazeera': False,
 'attitud': False,
 'club': False,
 'let': False,
 'u': False,
 'investig': False,
 'trial': False,
 '2012': False,
 'alrawahi': False,
 'featur': False,
 'global': False,
 'voic': False,
 'advocaci': False,
 'detain': False,
 'critic': False,
 'sultan': False,
 'qaboo': False

##Parts of Speech

In [47]:
text = str(processed)
import nltk
nltk.download('tagsets')
nltk.help.upenn_tagset("MD")

[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.
MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would


In [48]:
tokens = nltk.word_tokenize(text)
print(tokens)

['0', 'better', 'guarante', 'everi', 'individu', 'free', 'wish', 'inevi', '...', '1', 'trump', '2', 'overrid', '3', 'warm', 'heart', 'ea', 'daili', 'load', '4', 'love', '...', '2659718', 'also', 'cite', 'incid', 'ecuadorian', 'televis', 'channel', 'pre', '...', '2659719', 'hue', '2659720', 'satan', 'mean', 'caus', 'death', '2659721', 'mate', 'contradict', 'natur', 'toward', 'select', 'improv', 'lif', '...', '2659722', '1', 'provis', 'taken', 'non', 'classifi', 'outstand', 'loan', 'rate', '...', 'Name', ':', '0', ',', 'Length', ':', '2659723', ',', 'dtype', ':', 'object']


In [49]:
nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('0', 'CD'),
 ('better', 'JJR'),
 ('guarante', 'NN'),
 ('everi', 'FW'),
 ('individu', 'JJ'),
 ('free', 'JJ'),
 ('wish', 'NN'),
 ('inevi', 'NN'),
 ('...', ':'),
 ('1', 'CD'),
 ('trump', 'NN'),
 ('2', 'CD'),
 ('overrid', 'JJ'),
 ('3', 'CD'),
 ('warm', 'JJ'),
 ('heart', 'NN'),
 ('ea', 'NN'),
 ('daili', 'NN'),
 ('load', 'NN'),
 ('4', 'CD'),
 ('love', 'NN'),
 ('...', ':'),
 ('2659718', 'CD'),
 ('also', 'RB'),
 ('cite', 'JJ'),
 ('incid', 'NN'),
 ('ecuadorian', 'JJ'),
 ('televis', 'NN'),
 ('channel', 'NN'),
 ('pre', 'NN'),
 ('...', ':'),
 ('2659719', 'CD'),
 ('hue', 'NN'),
 ('2659720', 'CD'),
 ('satan', 'JJ'),
 ('mean', 'NN'),
 ('caus', 'NN'),
 ('death', 'NN'),
 ('2659721', 'CD'),
 ('mate', 'NN'),
 ('contradict', 'NN'),
 ('natur', 'RB'),
 ('toward', 'IN'),
 ('select', 'JJ'),
 ('improv', 'NN'),
 ('lif', 'NN'),
 ('...', ':'),
 ('2659722', 'CD'),
 ('1', 'CD'),
 ('provis', 'NN'),
 ('taken', 'VBN'),
 ('non', 'RB'),
 ('classifi', 'JJ'),
 ('outstand', 'NN'),
 ('loan', 'NN'),
 ('rate', 'NN'),
 ('...

#The End