### Bag of words model

In [5]:
# load all necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('max_colwidth', 100)

#### Let's build a basic bag of words model on three sample documents

In [2]:
documents = ["Gangs of Wasseypur is a great movie.", "The success of a movie depends on the performance of the actors.", "There are no new movies releasing this week."]
print(documents)

['Gangs of Wasseypur is a great movie.', 'The success of a movie depends on the performance of the actors.', 'There are no new movies releasing this week.']


In [3]:
def preprocess(document):
    'changes document to lower case and removes stopwords'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    # join words to make sentence
    document = " ".join(words)
    
    return document

In [4]:
documents = [preprocess(document) for document in documents]
print(documents)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/home/shakeeb/nltk_data'
    - '/home/shakeeb/anaconda3/nltk_data'
    - '/home/shakeeb/anaconda3/share/nltk_data'
    - '/home/shakeeb/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


#### Creating bag of words model using count vectorizer function

In [None]:
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(documents)
print(bow_model)  # returns the rown and column number of cells which have 1 as value

In [None]:
# print the full sparse matrix
print(bow_model.toarray())

In [None]:
print(bow_model.shape)
print(vectorizer.get_feature_names())

### Let's create a bag of words model on the spam dataset.

In [6]:
# load data
spam = pd.read_csv("SMSSpamCollection.txt", sep = "\t", names=["label", "message"])
# spam.head()

##### Let's take a subset of data (first 50 rows only) and create bag of word model on that.

In [7]:
spam = spam.iloc[0:100,:]
# print(spam)

In [8]:
# extract the messages from the dataframe
messages = spam.message
# print(messages)

In [9]:
# convert messages into list
messages = [message for message in messages]
# print(messages)

In [15]:
# preprocess messages using the preprocess function
messages = [preprocess(message) for message in messages]
# print(messages)

In [16]:
# bag of words model
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)

In [17]:
bow_model.shape

(100, 640)

In [19]:
sum(sum(bow_model.toarray()))

934

In [None]:
# look at the dataframe
pd.DataFrame(bow_model.toarray(), columns = vectorizer.get_feature_names())

In [None]:
print(vectorizer.get_feature_names())

* A lot of duplicate tokens such as 'win'and 'winner'; 'reply' and 'replying'; 'want' and 'wanted' etc. 

In [None]:
documents = ["there was a place on my ankle that was itching",
             "but I did not scratch it", 
             "and then my ear began to itch",
             "and next my back"]

vec = CountVectorizer()
bom = vec.fit_transform(documents)
bom.shape

In [None]:
len(vec.get_feature_names())

In [None]:
index = vec.get_feature_names().index("was")

In [None]:
bom.toarray()[0][index]