### Import the required modules

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

### Text data and how to make it ready for Machine Learning

#### A very small test text dataset

In [2]:
# Three short documents used as a toy corpus for the vectorizer demo.
test_text_data = [
    'Check out this link',
    'Lets go get a drink',
    'This is the best video you will ever see',
]

#### Instantiate `CountVectorizer`

In [3]:
# Bag-of-words vectorizer with all default settings.
vectorizer = CountVectorizer()

#### Fitting our data

In [4]:
# Learn the vocabulary from the toy corpus; the fitted estimator is returned and displayed.
vectorizer.fit(test_text_data)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

#### Let's look at the vectorized word tokens

In [5]:
# List the learned vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it and returns a NumPy array, so .tolist() keeps the plain-list display.
vectorizer.get_feature_names_out().tolist()

['best',
 'check',
 'drink',
 'ever',
 'get',
 'go',
 'is',
 'lets',
 'link',
 'out',
 'see',
 'the',
 'this',
 'video',
 'will',
 'you']

#### Transform to Document Term Matrix

In [6]:
# Encode each document as a row of token counts over the fitted vocabulary.
dtm = vectorizer.transform(test_text_data)
# Show the sparse-matrix summary (shape, dtype, number of stored elements).
dtm

<3x16 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [7]:
# Densify the small matrix and label its columns with the vocabulary terms so the
# counts can be read as a table: one row per document, one column per term.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,best,check,drink,ever,get,go,is,lets,link,out,see,the,this,video,will,you
0,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0
1,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,1,0,0,0,1,1,1,1,1,1


#### Test a new record against the bag of words

In [8]:
# A single unseen document, wrapped in a list because transform() expects an iterable of documents.
test_record = [
    'This is amazing. Check it out.',
]

In [9]:
# Encode the new record with the already-fitted vocabulary (no re-fitting here).
test_dtm = vectorizer.transform(test_record)

In [10]:
# Show the new record's counts under the same vocabulary columns as the toy corpus.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(test_dtm.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,best,check,drink,ever,get,go,is,lets,link,out,see,the,this,video,will,you
0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0


### Real Dataset

Dataset based on:
* University of Michigan Sentiment Analysis competition on Kaggle
* Twitter Sentiment Corpus by Niek Sanders (Sentiment140)


#### Read in the dataset

In [11]:
# Load the tab-separated tweets file from the working directory.
data = pd.read_table(
    'tweets.tsv',
    usecols=[0, 1],  # keep only the first two columns of the file
)

#### Examine the dataset

In [12]:
# Preview the first five rows to check the columns loaded as expected.
data.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL friend.............
1,0,I missed the New Moon trailer...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I've been at...
4,0,i think mi bf is cheating on me!!! T_T


In [13]:
# (rows, columns) of the loaded frame.
data.shape

(140222, 2)

In [14]:
# Column dtypes and non-null counts, to spot missing values before vectorizing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140222 entries, 0 to 140221
Data columns (total 2 columns):
Sentiment        140222 non-null int64
SentimentText    140222 non-null object
dtypes: int64(1), object(1)
memory usage: 1.6+ MB


In [15]:
# Count how many rows carry each sentiment label.
data['Sentiment'].value_counts()

1    80891
0    59331
Name: Sentiment, dtype: int64

#### Take out features and labels from the data

In [16]:
# Features are the raw tweet text; labels are the sentiment column.
X = data['SentimentText']
y = data['Sentiment']

# Both are 1-D Series and should report the same length.
print(X.shape)
print(y.shape)

(140222,)
(140222,)


#### Instantiate `CountVectorizer`

In [17]:
# Fresh, unfitted vectorizer for the tweet data.
# NOTE: this rebinds `vectorizer`; re-running the toy-example cells above after this
# point would use this object instead of the one fitted on the toy corpus.
vectorizer = CountVectorizer()

#### Fit it with the text data

In [18]:
# Learn the vocabulary from every tweet in X; the fitted estimator is returned and displayed.
vectorizer.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

#### Transform the data to a document term matrix

In [19]:
# Encode every tweet as a row of token counts over the learned vocabulary.
X_dtm = vectorizer.transform(X)

In [20]:
# Show the sparse-matrix summary (shape, dtype, stored elements) instead of densifying it.
X_dtm

<140222x140612 sparse matrix of type '<class 'numpy.int64'>'
	with 1699101 stored elements in Compressed Sparse Row format>