In [37]:
import numpy as np
import nltk, textblob

from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn import metrics, datasets, preprocessing

import scipy.stats as stats

# Representing Text Data

## Bag of Words

We shall first create a list of documents and then tokenize each of them into words. Tokenizing refers to splitting a document (a string) into its individual words (tokens); counting those tokens is what lets us represent each document as a word vector.

In [38]:
# Three short example documents used throughout the bag-of-words section.
raw_docs = [
    "Here are some very simple basic sentences.",
    "They won’t be very interesting , I’m afraid.",
    "The point of these examples is to learn how basic text cleaning works on *very simple* data.",
]

In [39]:
# Inspect the second document (list indices start at 0).
raw_docs[1]

'They won’t be very interesting , I’m afraid.'

In [40]:
# Learn the vocabulary from the documents and build the document-term count
# matrix in a single step.
vec1 = CountVectorizer()
out = vec1.fit_transform(raw_docs)
# Displayed as a sparse-matrix summary rather than the full array.
out

<3x26 sparse matrix of type '<class 'numpy.int64'>'
	with 30 stored elements in Compressed Sparse Row format>

In [41]:
# Equivalent two-step approach: fit() learns the vocabulary and returns the
# fitted vectorizer, then transform() encodes the documents with it.
vec1 = CountVectorizer().fit(raw_docs)
vec1.transform(raw_docs)

<3x26 sparse matrix of type '<class 'numpy.int64'>'
	with 30 stored elements in Compressed Sparse Row format>

The vector for the first document in `raw_docs` (index 0) can be extracted from the sparse matrix, in dense form, with

In [42]:
# A sparse representation stores only the non-zero values and omits the zeros.

In [43]:
# Convert the sparse matrix to a dense array and take row 0, i.e. the counts
# for the first document.
out.toarray()[0]

array([0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0], dtype=int64)

Each value is the number of times the corresponding feature (word) occurs in the document; in this example every word occurs at most once, so the values are all 0 or 1. Match up the features found with the vector above.

In [44]:
# List the vocabulary terms in the column order of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps the displayed result a plain Python list of strings.
if hasattr(vec1, "get_feature_names_out"):
    feature_names = vec1.get_feature_names_out().tolist()
else:
    feature_names = vec1.get_feature_names()
feature_names

['afraid',
 'are',
 'basic',
 'be',
 'cleaning',
 'data',
 'examples',
 'here',
 'how',
 'interesting',
 'is',
 'learn',
 'of',
 'on',
 'point',
 'sentences',
 'simple',
 'some',
 'text',
 'the',
 'these',
 'they',
 'to',
 'very',
 'won',
 'works']

In [45]:
# Full dense document-term matrix: one row per document, one column per
# vocabulary term.
out.toarray()

array([[0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
        0, 1, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 1, 0],
       [0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
        1, 1, 0, 1]], dtype=int64)

A vocabulary of words has been set up using these three documents. It can be accessed via the following dictionary. The numbers refer to the id of the word in the sparse matrix representation.

In [46]:
# Dictionary mapping each vocabulary term to its column index in the matrix.
vec1.vocabulary_

{'afraid': 0,
 'are': 1,
 'basic': 2,
 'be': 3,
 'cleaning': 4,
 'data': 5,
 'examples': 6,
 'here': 7,
 'how': 8,
 'interesting': 9,
 'is': 10,
 'learn': 11,
 'of': 12,
 'on': 13,
 'point': 14,
 'sentences': 15,
 'simple': 16,
 'some': 17,
 'text': 18,
 'the': 19,
 'these': 20,
 'they': 21,
 'to': 22,
 'very': 23,
 'won': 24,
 'works': 25}

In [47]:
# Column index of the term 'afraid'.
vec1.vocabulary_['afraid']

0

In [48]:
# Column index of the term 'very'.
vec1.vocabulary_['very']

23

The process of converting documents into word vectors is known as tokenizing. To obtain the tokenized version of a particular document, we can do the following:

In [49]:
# build_analyzer() returns the callable the vectorizer applies to a raw
# string to produce its tokens; call it on the third document to see them.
vec1a = vec1.build_analyzer()
vec1a(raw_docs[2])

['the',
 'point',
 'of',
 'these',
 'examples',
 'is',
 'to',
 'learn',
 'how',
 'basic',
 'text',
 'cleaning',
 'works',
 'on',
 'very',
 'simple',
 'data']

This process is known as tokenizing, which is the conversion of documents-as-strings into word vectors.

### Normalisation

In [50]:
# Fit a TF-IDF vectorizer on the same documents and show the weighted vector
# for the first document.
tfer = TfidfVectorizer()
out2 = tfer.fit_transform(raw_docs)
out2.toarray()[0]

array([0.        , 0.4261835 , 0.32412354, 0.        , 0.        ,
       0.        , 0.        , 0.4261835 , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.4261835 , 0.32412354, 0.4261835 , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.25171084, 0.        ,
       0.        ])

In [51]:
# Raw counts for the first document, shown again for comparison with the
# TF-IDF weights.
out.toarray()[0]

array([0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0], dtype=int64)

The vector representing the first document now contains weighted values instead of raw frequencies.

In [52]:
# ngram_range=(1, 2) makes the vectorizer extract both single words
# (unigrams) and pairs of consecutive words (bigrams) as features.
tfer_ng = TfidfVectorizer(ngram_range=(1, 2))
out3 = tfer_ng.fit_transform(raw_docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps the displayed result a plain Python list of strings.
if hasattr(tfer_ng, "get_feature_names_out"):
    ngram_feature_names = tfer_ng.get_feature_names_out().tolist()
else:
    ngram_feature_names = tfer_ng.get_feature_names()
ngram_feature_names

['afraid',
 'are',
 'are some',
 'basic',
 'basic sentences',
 'basic text',
 'be',
 'be very',
 'cleaning',
 'cleaning works',
 'data',
 'examples',
 'examples is',
 'here',
 'here are',
 'how',
 'how basic',
 'interesting',
 'interesting afraid',
 'is',
 'is to',
 'learn',
 'learn how',
 'of',
 'of these',
 'on',
 'on very',
 'point',
 'point of',
 'sentences',
 'simple',
 'simple basic',
 'simple data',
 'some',
 'some very',
 'text',
 'text cleaning',
 'the',
 'the point',
 'these',
 'these examples',
 'they',
 'they won',
 'to',
 'to learn',
 'very',
 'very interesting',
 'very simple',
 'won',
 'won be',
 'works',
 'works on']

# Sentiment Analysis with TextBlob

### TextBlob

TextBlob is an interesting Python library that offers a simple API to access its methods and perform some basic NLP tasks.

Let's take a look at how to use TextBlob to perform the same tasks that we discussed previously.

Before the next step, be sure to run the following commands from a terminal, where the virtual environment has been activated:

`$ /Applications/Python\ 3.6/Install\ Certificates.command`

`$ python -m textblob.download_corpora`

In [53]:
# Adjacent string literals are joined into a single string by Python; the
# parentheses only group the expression.
("4352345" "435")

'4352345435'

In [54]:
#tokenization
doc1 = ("Data science is pretty awesome! \n "
       "There's so many discussions that we can hold about it.")
blob = TextBlob(doc1)

In [55]:
# Second sentence detected in the blob.
blob.sentences[1]

Sentence("There's so many discussions that we can hold about it.")

In [56]:
# Word tokens of the second sentence.
blob.sentences[1].words

WordList(['There', "'s", 'so', 'many', 'discussions', 'that', 'we', 'can', 'hold', 'about', 'it'])

In [57]:
# Show every sentence found in the blob, then each word of the first
# sentence on its own line.
print(blob.sentences)
for word in blob.sentences[0].words:
    print(word)

[Sentence("Data science is pretty awesome!"), Sentence("There's so many discussions that we can hold about it.")]
Data
science
is
pretty
awesome


Notice how we can tokenize the TextBlob into sentences, and further into words. We can perform another function called noun phrase extraction, which extracts just the noun phrases.

In [58]:
#noun phrase extraction 
for np in blob.noun_phrases:
    print (np)

data
pretty awesome


In [59]:
# All sentences detected in the blob.
blob.sentences

[Sentence("Data science is pretty awesome!"),
 Sentence("There's so many discussions that we can hold about it.")]

Now let's look at how lemmatization can be done.   

In [60]:
#lemmatization 
print (blob.sentences[1].words[4])
print (blob.sentences[1].words[4].singularize())
print (blob.sentences[0].words[1])
print (blob.sentences[0].words[1].pluralize())
#lemmatizing single words
from textblob import Word
w = Word('running')
w.lemmatize("v") ## v here represents verb

discussions
discussion
science
sciences


'run'

### TextBlob Output

In [61]:
# Overall sentiment of the blob, reported as a (polarity, subjectivity) pair.
blob.sentiment

Sentiment(polarity=0.5833333333333334, subjectivity=0.8333333333333334)

In [62]:
# polarity: positive (the score is above 0)
# subjectivity: subjective (the score is close to 1)

In [63]:
# Same polarity and subjectivity scores, plus the individual word-level
# assessments listed alongside them.
blob.sentiment_assessments

Sentiment(polarity=0.5833333333333334, subjectivity=0.8333333333333334, assessments=[(['pretty'], 0.25, 1.0, None), (['awesome', '!'], 1.0, 1.0, None), (['many'], 0.5, 0.5, None)])