### Bag of words model

In [2]:
# load all necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# show up to 100 characters of each text column before pandas truncates it
pd.set_option("display.max_colwidth", 100)

#### Let's build a basic bag of words model on three sample documents

In [3]:
# three short sample documents that the bag-of-words model is built on
documents = [
    "Gangs of Wasseypur is a great movie.",
    "The success of a movie depends on the performance of the actors.",
    "There are no new movies releasing this week.",
]
print(documents)

['Gangs of Wasseypur is a great movie.', 'The success of a movie depends on the performance of the actors.', 'There are no new movies releasing this week.']


In [4]:
def preprocess(document):
    """Lower-case a document and drop English stop words.

    The text is lower-cased, split into tokens with ``word_tokenize``,
    filtered against NLTK's English stop-word list, and re-joined with single
    spaces. Punctuation tokens are not removed, so a trailing "." survives.

    Parameters
    ----------
    document : str
        Raw text of one document.

    Returns
    -------
    str
        The remaining tokens joined by a single space.
    """
    # Build the stop-word collection once per call instead of once per token,
    # and use a set so each membership test is a hash lookup, not a list scan.
    stop_words = set(stopwords.words("english"))

    # lower-case first so the comparison against the stop-word list matches
    tokens = word_tokenize(document.lower())

    kept_tokens = [token for token in tokens if token not in stop_words]

    return " ".join(kept_tokens)

# run every document through preprocess(); the cleaned strings replace the
# raw ones under the same name, which the vectorizer cell reads later
documents = list(map(preprocess, documents))
print(documents)


['gangs wasseypur great movie .', 'success movie depends performance actors .', 'new movies releasing week .']


#### Creating a bag-of-words model using scikit-learn's `CountVectorizer`

In [5]:
# fit the vectorizer on the preprocessed documents and build the
# document-by-word count matrix in one step
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(documents)
print(bow_model)  # prints one "(row, column)  count" line per non-zero cell

  (0, 2)	1
  (0, 10)	1
  (0, 3)	1
  (0, 4)	1
  (1, 4)	1
  (1, 9)	1
  (1, 1)	1
  (1, 7)	1
  (1, 0)	1
  (2, 6)	1
  (2, 5)	1
  (2, 8)	1
  (2, 11)	1


In [6]:
# convert the sparse matrix to a dense array and print every cell, zeros included
print(bow_model.toarray())

[[0 0 1 1 1 0 0 0 0 0 1 0]
 [1 1 0 0 1 0 0 1 0 1 0 0]
 [0 0 0 0 0 1 1 0 1 0 0 1]]


In [7]:
# (number of documents, size of the learned vocabulary)
print(bow_model.shape)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. It returns a NumPy array, so convert it to a plain list
# to keep the printed output in the same list format as before.
print(vectorizer.get_feature_names_out().tolist())

(3, 12)
['actors', 'depends', 'gangs', 'great', 'movie', 'movies', 'new', 'performance', 'releasing', 'success', 'wasseypur', 'week']




In [8]:
# One row per document, one column per vocabulary word.
# get_feature_names() was removed in scikit-learn 1.2, so the column labels
# come from its replacement, get_feature_names_out().
pd.DataFrame(bow_model.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,actors,depends,gangs,great,movie,movies,new,performance,releasing,success,wasseypur,week
0,0,0,1,1,1,0,0,0,0,0,1,0
1,1,1,0,0,1,0,0,1,0,1,0,0
2,0,0,0,0,0,1,1,0,1,0,0,1


### Stemming and Lemmatization

In [13]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [14]:
# sample words used to compare stemming against lemmatization
words = [
    "desire", "desires", "desirable", "desiring", "desiri",
    "eat", "eating", "ate",
    "feet",
]

In [17]:
# one stemmer and one lemmatizer instance, reused for every word in later cells
# NOTE(review): the lemmatizer presumably needs NLTK's WordNet data installed;
# no download step is visible in this notebook - confirm before a fresh run
ps = PorterStemmer()
lem = WordNetLemmatizer()

In [20]:
# stem and lemmatize each sample word, keeping the results in the same order
# as `words`; lemmatize() is called with the word only (no part-of-speech given)
stem_words = list(map(ps.stem, words))
lemma_words = list(map(lem.lemmatize, words))

In [21]:
# side-by-side comparison: one row per input word, with its stem and its lemma
pd.DataFrame(
    data={"word": words, "stem": stem_words, "lemma": lemma_words},
)

Unnamed: 0,word,stem,lemma
0,desire,desir,desire
1,desires,desir,desire
2,desirable,desir,desirable
3,desiring,desir,desiring
4,desiri,desiri,desiri
5,eat,eat,eat
6,eating,eat,eating
7,ate,ate,ate
8,feet,feet,foot
