Cleaning Text Data

 text wrangling

In [None]:
import csv

with open('example.csv','rb') as f:
    reader = csv.reader(f,delimiter=',',quotechar='"')
    for line in reader :
        print line[1] # assuming the second field is the raw sting

json data

In [None]:
import json

jsonfile = open('example.json')
data = json.load(jsonfile)
print data['string']

Sentence splitter

In [1]:
inputstring = ' This is an example sent. The sentence splitter will split on sent markers. Ohh really !!'

from nltk.tokenize import sent_tokenize

all_sent = sent_tokenize(inputstring)
print (all_sent)

[' This is an example sent.', 'The sentence splitter will split on sent markers.', 'Ohh really !', '!']


In [2]:
import nltk.tokenize.punkt
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

Tokenization

In [3]:
s = "Hi Everyone ! hola gr8" # simplest tokenizer
print (s.split())
# ['Hi', 'Everyone', '!', 'hola', 'gr8']

['Hi', 'Everyone', '!', 'hola', 'gr8']


In [4]:
from nltk.tokenize import word_tokenize
word_tokenize(s)
# ['Hi', 'Everyone', '!', 'hola', 'gr8']

['Hi', 'Everyone', '!', 'hola', 'gr8']

In [6]:
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize

regexp_tokenize(s, pattern='\w+')
# ['Hi', 'Everyone', 'hola', 'gr8']

['Hi', 'Everyone', 'hola', 'gr8']

In [7]:
regexp_tokenize(s, pattern='\d+')
# ['8']

['8']

In [8]:
wordpunct_tokenize(s)
# ['Hi', ',', 'Everyone', '!!', 'hola', 'gr8']

['Hi', 'Everyone', '!', 'hola', 'gr8']

In [9]:
blankline_tokenize(s)
# ['Hi, Everyone !! hola gr8']

['Hi Everyone ! hola gr8']

stemming

In [11]:
from nltk.stem import PorterStemmer # import Porter stemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

pst = PorterStemmer() # create obj of the PorterStemmer
lst = LancasterStemmer() # create obj of LancasterStemmer

lst.stem("eating")
# eat

'eat'

In [13]:
pst.stem("shopping")
# shop

'shop'

lemmatization

In [14]:
from nltk.stem import WordNetLemmatizer

wlem = WordNetLemmatizer()
wlem.lemmatize("ate")

'ate'

stopwords remover

In [15]:
from nltk.corpus import stopwords

stoplist = stopwords.words('english') # config the language name
# NLTK supports 22 languages for removing the stop words
text = "This is just a test"

cleanwordlist = [word for word in text.split() if word not in
stoplist]

In [16]:
cleanwordlist

['This', 'test']

rare word removal

In [17]:
# tokens is a list of all tokens in corpus
freq_dist = nltk.FreqDist(token)
rarewords = freq_dist.keys()[-50:]
after_rare_words = [ word for word in token not in rarewords]

NameError: name 'token' is not defined

spell correction

In [18]:
from nltk.metrics import edit_distance

edit_distance("rain","shine")

3

Part of Speech Tagging

In [19]:
import nltk
from nltk import word_tokenize
s = "I was watching TV"
print( nltk.pos_tag(word_tokenize(s)))

[('I', 'PRP'), ('was', 'VBD'), ('watching', 'VBG'), ('TV', 'NN')]


In [20]:
tagged = nltk.pos_tag(word_tokenize(s))
allnoun = [word for word,pos in tagged if pos in ['NN','NNP'] ]

In [21]:
allnoun

['TV']

Stanford tagger

In [23]:
from nltk.tag.stanford import POSTagger
import nltk

stan_tagger = POSTagger('models/english-bidirectional-distdim.tagger','standford-postagger.jar')

tokens = nltk.word_tokenize(s)
stan_tagger.tag(tokens)

TypeError: Can't instantiate abstract class StanfordTagger with abstract methods _cmd

deep diving tagger

In [24]:
from nltk.corpus import brown
import nltk

tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
print (nltk.FreqDist(tags))

<FreqDist with 218 samples and 100554 outcomes>


In [25]:
brown_tagged_sents = brown.tagged_sents(categories='news')
default_tagger = nltk.DefaultTagger('NN')
print (default_tagger.evaluate(brown_tagged_sents))

0.13089484257215028


n-gram tagger

In [26]:
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# we are dividing the data into a test and train to evaluate our taggers.
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

unigram_tagger = UnigramTagger(train_data,backoff=default_tagger)
print( unigram_tagger.evaluate(test_data))

0.8363400777434467


In [28]:
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)

print( bigram_tagger.evaluate(test_data))

0.84570915977275


In [29]:
trigram_tagger = TrigramTagger(train_data,backoff=bigram_tagger)

print (trigram_tagger.evaluate(test_data))

0.8442140934914781


Regex tagger

In [30]:
from nltk.tag.sequential import RegexpTagger

regexp_tagger = RegexpTagger(
 [( r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
 ( r'(The|the|A|a|An|an)$', 'AT'), # articles
 ( r'.*able$', 'JJ'), # adjectives
 ( r'.*ness$', 'NN'), # nouns formed from adj
 ( r'.*ly$', 'RB'), # adverbs
 ( r'.*s$', 'NNS'), # plural nouns
 ( r'.*ing$', 'VBG'), # gerunds
 (r'.*ed$', 'VBD'), # past tense verbs
 (r'.*', 'NN') # nouns (default)
 ])

print (regexp_tagger.evaluate(test_data))

0.31306687929831556


NER tagger

In [32]:
import nltk
from nltk import ne_chunk

sent = "Mark is studying at Stanford University in California"
print(ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=False))
# (S
#  (PERSON Mark/NNP)
#  is/VBZ
#  studying/VBG
#  at/IN
#  (ORGANIZATION Stanford/NNP University/NNP)
#  in/IN
#  NY(GPE California/NNP)))

(S
  (PERSON Mark/NNP)
  is/VBZ
  studying/VBG
  at/IN
  (ORGANIZATION Stanford/NNP University/NNP)
  in/IN
  (GPE California/NNP))


In [36]:
from nltk.tag.stanford import CoreNLPNERTagger

st = CoreNLPNERTagger('<PATH>/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz', '<PATH>/stanford-ner/stanford-ner.jar')

st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) 

LookupError: unknown encoding: <PATH>/stanford-ner/stanford-ner.jar

Parsing Structure in Text

In [38]:
# toy CFG
from nltk import CFG

toy_grammar = nltk.CFG.fromstring(
"""
 S -> NP VP # S indicate the entire sentence
 VP -> V NP # VP is verb phrase the
 V -> "eats" | "drinks" # V is verb
 NP -> Det N # NP is noun phrase (chunk that has noun in it)
 Det -> "a" | "an" | "the" # Det is determiner used in the sentences
 N -> "president" |"Obama" |"apple"| "coke" # N some example nouns
 """)

toy_grammar.productions()

ERROR:root:An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 1))



ValueError: Unable to parse line 2: S -> NP VP # S indicate the entire sentence
Expected a nonterminal, found: # S indicate the entire sentence

A regex parser

In [39]:
# Regex parser
chunk_rules=ChunkRule("<.*>+","chunk everything")

import nltk
from nltk.chunk.regexp import *
reg_parser = RegexpParser('''
 NP: {<DT>? <JJ>* <NN>*} # NP
 P: {<IN>} # Preposition
 V: {<V.*>} # Verb
 PP: {<P> <NP>} # PP -> P NP
 VP: {<V> <NP|PP>*} # VP -> V (NP|PP)*
 ''')

test_sent="Mr. Obama played a big role in the Health insurance bill"
test_sent_pos=nltk.pos_tag(nltk.word_tokenize(test_sent))
paresed_out=reg_parser.parse(test_sent_pos)
print (paresed_out)

NameError: name 'ChunkRule' is not defined

Named-entity recognition (NER)

In [None]:
# NP chunking (NER)
# f=open(# absolute path for the file of text for which we want NER)
# )
# text=f.read()
    
sentences = nltk.sent_tokenize(b)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in
sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]

for sent in tagged_sentences:
    print nltk.ne_chunk(sent)

Relation extraction

In [45]:
import re

IN = re.compile(r'.*\bin\b(?!\b.+ing)')

for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
     for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN):

        print(nltk.sem.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']


NLP Applications

In [46]:
import sys

f=open('nyt.txt','r')
news_content=f.read()

FileNotFoundError: [Errno 2] No such file or directory: 'nyt.txt'

In [None]:
import nltk

results=[]

for sent_no,sentence in enumerate(nltk.sent_tokenize(news_content)):
     no_of_tokens=len(nltk.word_tokenize(sentence))
     #print no_of_toekns
     # Let's do POS tagging
     tagged=nltk.pos_tag(nltk.word_tokenize(sentence))
     # Count the no of Nouns in the sentence
     no_of_nouns=len([word for word,pos in tagged if pos in ["NN","NNP"] ])
     #Use NER to tag the named entities.
     ners=nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)),
    binary=False)
     no_of_ners= len([chunk for chunk in ners if hasattr(chunk,
    'node')])
     score=(no_of_ners+no_of_nouns)/float(no_of_toekns)

     results.append((sent_no,no_of_tokens,no_of_ners,\
    no_of_nouns,score,sentence))

In [None]:
for sent in sorted(results,key=lambda x: x[4],reverse=True):

print (sent[5])

In [48]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

results=[]

news_content="""Mr. Obama planned to promote the effort on Monday during
a visit to Camden, N.J. The ban is part of Mr. Obama's push to ease
tensions between law enforcement and minority \communities in reaction to
the crises in Baltimore; Ferguson, Mo. We are, without a doubt, sitting
at a defining moment in American policing, Ronald L. Davis, the director
of the Office of Community Oriented Policing Services at the Department
of Justice, told reporters in a conference call organized by the White
House"""

sentences=nltk.sent_tokenize(news_content)

vectorizer = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True)

sklearn_binary=vectorizer.fit_transform(sentences)
print (countvectorizer.get_feature_names())

print (sklearn_binary.toarray())

for i in sklearn_binary.toarray():
     results.append(i.sum()/float(len(i.nonzero()[0]))

SyntaxError: unexpected EOF while parsing (<ipython-input-48-f6938cb31711>, line 25)

Text Classification

In [49]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv

def preprocessing(text):
     text = text.decode("utf8")
     # tokenize into words
     tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    # remove stopwords
     stop = stopwords.words('english')
     tokens = [token for token in tokens if token not in stop]
     # remove words less than three letters
     tokens = [word for word in tokens if len(word) >= 3]

     # lower capitalization
     tokens = [word.lower() for word in tokens]
     # lemmatize
     lmtzr = WordNetLemmatizer()
     tokens = [lmtzr.lemmatize(word) for word in tokens]
     preprocessed_text= ' '.join(tokens)
     return preprocessed_text

In [None]:
>>>smsdata = open('SMSSpamCollection') # check the structure of this
file!
>>>smsdata_data = []
>>>sms_labels = []
>>>csv_reader = csv.reader(sms,delimiter='\t')
>>>for line in csv_reader:
>>> # adding the sms_id
>>> sms_labels.append( line[0])
>>> # adding the cleaned text We are calling preprocessing method
>>> sms_data.append(preprocessing(line[1]))
>>>sms.close()

In [None]:
>>>import sklearn

In [None]:
>>>trainset_size = int(round(len(sms_data)*0.70))
>>># i chose this threshold for 70:30 train and test split.
>>>print 'The training set size for this classifier is ' + str(trainset_
size) + '\n'
>>>x_train = np.array([''.join(el) for el in sms_data[0:trainset_size]])
>>>y_train = np.array([el for el in sms_labels[0:trainset_size]])
>>>x_test = np.array([''.join(el) for el in sms_data[trainset_
size+1:len(sms_data)]])
>>>y_test = np.array([el for el in sms_labels[trainset_size+1:len(sms_
labels)]])or el in sms_labels[trainset_size+1:len(sms_labels)]])
>>>print x_train
>>>print y_train

In [None]:
>>>from sklearn.feature_extraction.text import CountVectorizer
>>>sms_exp=[ ]
>>>for line in sms_list:
>>> sms_exp.append(preprocessing(line[1]))
>>>vectorizer = CountVectorizer(min_df=1)
>>>X_exp = vectorizer.fit_transform(sms_exp)
>>>print "||".join(vectorizer.get_feature_names())
>>>print X_exp.toarray()


In [None]:
>>>from sklearn.feature_extraction.text import TfidfVectorizer
>>>vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_
words='english', strip_accents='unicode', norm='l2')
>>>X_train = vectorizer.fit_transform(x_train)
>>>X_test = vectorizer.transform(x_test)

In [None]:
>>>from sklearn.naive_bayes import MultinomialNB
>>>clf = MultinomialNB().fit(X_train, y_train)
>>>y_nb_predicted = clf.predict(X_test)
>>>print y_nb_predicted

In [None]:
>>>print ' \n confusion_matrix \n '
>>>cm = confusion_matrix(y_test, y_pred)
>>>print cm
>>>print '\n Here is the classification report:'
>>>print classification_report(y_test, y_nb_predicted)

In [None]:
>>>feature_names = vectorizer.get_feature_names()
>>>coefs = clf.coef_
>>>intercept = clf.intercept_
>>>coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
>>>n = 10
>>>top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
>>>for (coef_1, fn_1), (coef_2, fn_2) in top:
>>> print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2,
fn_2))

Decision trees

In [None]:
>>>from sklearn import tree
>>>clf = tree.DecisionTreeClassifier().fit(X_train.toarray(), y_train)
>>>y_tree_predicted = clf.predict(X_test.toarray())
>>>print y_tree_predicted
>>>print ' \n Here is the classification report:'
>>>print classification_report(y_test, y_tree_predicted)

In [None]:
>>>from sklearn.linear_model import SGDClassifier
>>>from sklearn.metrics import confusion_matrix
>>>clf = SGDClassifier(alpha=.0001, n_iter=50).fit(X_train, y_train)
>>>y_pred = clf.predict(X_test)
>>>print '\n Here is the classification report:'
>>>print classification_report(y_test, y_pred)
>>>print ' \n confusion_matrix \n '
>>>cm = confusion_matrix(y_test, y_pred)
>>>print cm

The Random forest algorithm

In [None]:
>>>from sklearn.ensemble import RandomForestClassifier
>>>RF_clf = RandomForestClassifier(n_estimators=10)
>>>predicted = RF_clf.predict(X_test)
>>>print '\n Here is the classification report:'
>>>print classification_report(y_test, predicted)
>>>cm = confusion_matrix(y_test, y_pred)
>>>print cm

K-means

In [None]:
>>>from sklearn.cluster import KMeans, MiniBatchKMeans
>>>true_k=5
>>>km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_
init=1)
>>>kmini = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
init_size=1000, batch_size=1000, verbose=opts.verbose)
>>># we are using the same test,train data in TFIDF form as we did in
text classification
>>>km_model=km.fit(X_train)
>>>kmini_model=kmini.fit(X_train)
>>>print "For K-mean clustering "
>>>clustering = collections.defaultdict(list)
>>>for idx, label in enumerate(km_model.labels_):
>>> clustering[label].append(idx)
>>>print "For K-mean Mini batch clustering "
>>>clustering = collections.defaultdict(list)
>>>for idx, label in enumerate(kmini_model.labels_):
>>> clustering[label].append(idx)

Topic modeling in text

In [None]:
>>>from gensim import corpora, models, similarities
>>>from itertools import chain
>>>import nltk

In [None]:
>>>from nltk.corpus import stopwords
>>>from operator import itemgetter
>>>import re
>>>documents = [document for document in sms_data]
>>>stoplist = stopwords.words('english')
>>>texts = [[word for word in document.lower().split() if word not in
stoplist] \ for document in documents]