In [1]:
# Importing the library
import nltk

In [2]:
# Input text data: a multi-sentence football match report kept as one string.
# The cells below split it into sentences and clean, stem and lemmatize them.
# The line breaks and leading spaces inside the literal are part of the string.
paragraph = """Real Madrid simply show no sign of letting up. The LaLiga table-toppers saw off Alavés at 
               the Di Stéfano to make it eight wins on the bounce and retain the four-point buffer at the 
               summit with three games to go. The Madrid goals came from Karim Benzema, who converted 
               from the spot, whilst Marco Asensio was also on the mark for the hosts, who recorded a fifth 
               successive shutout. Ferland Mendy started at left wing-back, with Lucas Vázquez occupying 
               the right wing-back berth and inside the first minute, the pair were involved in the madridistas' 
               first forward foray, which culminated in Luka Modric sending his effort wide of the target. 
               The Alavés response wasn't long in coming and Joselu's headed effort struck the crossbar, 
               whilst Raphaël Varane cleared a Lucas Pérez's follow-up off the line. It looked as if we were 
               in store for a high-tempo affair and just after the 10-minute mark, Mendy once again showed what a 
               threat he is down the left. Ximo Navarro upended the Frenchman in the area and Benzema stepped 
               up to make it 1-0. With 12 minutes gone, Toni Kroos’ did his best to find the top corner, before a 
               fierce Mendy cross nearly forced Camarasa to turn into his own net on 17’. The Blanquiazules refused
               to roll over though, with Oliver Burke proving a constant nuisance for the defence and testing
               Thibaut Courtois, despite the final chances before the break falling to Rodrygo and Benzema. 
               After the restart, referee Gil Manzano retired injured and by the time the 50th minute came around, 
               Madrid had added to their advantage. Benzema and Asensio raced through on goal, up against Roberto, 
               and the Balearic Island-born forward stroked home with ease, though his goal was originally ruled 
               out for offside before being correctly awarded by VAR."""

In [3]:
import re # Used for cleaning the text such as commas, full-stops, question mark, etc
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Shared text-normalization helpers used by the cells below:
# `wordnet` maps words to their WordNet lemma (Lemmatization section),
# `ps` reduces words to their Porter stem (Stemming section).
wordnet = WordNetLemmatizer()
ps = PorterStemmer()

### Stemming

In [4]:
# Converting the paragraph into sentences
sentences = nltk.sent_tokenize(paragraph)

# Build the English stop-word set once. The previous version rebuilt it for
# every single token, although it never changes inside the loop.
stop_words = set(stopwords.words('english'))

# Cleaned text: one space-joined string of stemmed words per sentence
corpus = []
for sentence in sentences:
    # Replace everything except the ASCII letters 'a-z' / 'A-Z' with a space,
    # lowercase the sentence and split it into a list of words.
    # NOTE: accented letters are replaced as well, so names such as 'Stéfano'
    # are broken into fragments ('st', 'fano').
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop the stop words and reduce every remaining word to its Porter stem
    stemmed = [ps.stem(word) for word in words if word not in stop_words]
    corpus.append(' '.join(stemmed))


In [8]:
# Display the list of sentences produced by nltk.sent_tokenize
sentences

['Real Madrid simply show no sign of letting up.',
 'The LaLiga table-toppers saw off Alavés at \n               the Di Stéfano to make it eight wins on the bounce and retain the four-point buffer at the \n               summit with three games to go.',
 'The Madrid goals came from Karim Benzema, who converted \n               from the spot, whilst Marco Asensio was also on the mark for the hosts, who recorded a fifth \n               successive shutout.',
 "Ferland Mendy started at left wing-back, with Lucas Vázquez occupying \n               the right wing-back berth and inside the first minute, the pair were involved in the madridistas' \n               first forward foray, which culminated in Luka Modric sending his effort wide of the target.",
 "The Alavés response wasn't long in coming and Joselu's headed effort struck the crossbar, \n               whilst Raphaël Varane cleared a Lucas Pérez's follow-up off the line.",
 'It looked as if we were \n               in store for a hi

In [7]:
# Display the cleaned corpus: one lowercased, stop-word-free, stemmed string per sentence
corpus

['real madrid simpli show sign let',
 'laliga tabl topper saw alav di st fano make eight win bounc retain four point buffer summit three game go',
 'madrid goal came karim benzema convert spot whilst marco asensio also mark host record fifth success shutout',
 'ferland mendi start left wing back luca v zquez occupi right wing back berth insid first minut pair involv madridista first forward foray culmin luka modric send effort wide target',
 'alav respons long come joselu head effort struck crossbar whilst rapha l varan clear luca p rez follow line',
 'look store high tempo affair minut mark mendi show threat left',
 'ximo navarro upend frenchman area benzema step make',
 'minut gone toni kroo best find top corner fierc mendi cross nearli forc camarasa turn net',
 'blanquiazul refus roll though oliv burk prove constant nuisanc defenc test thibaut courtoi despit final chanc break fall rodrygo benzema',
 'restart refere gil manzano retir injur time th minut came around madrid ad advantag

### Lemmatization

In [12]:
# Converting the paragraph into sentences
sentences = nltk.sent_tokenize(paragraph)

# Build the English stop-word set once. The previous version rebuilt it for
# every single token, although it never changes inside the loop.
stop_words = set(stopwords.words('english'))

# Cleaned text: one space-joined string of lemmatized words per sentence
corpus = []
for sentence in sentences:
    # Replace everything except the ASCII letters 'a-z' / 'A-Z' with a space,
    # lowercase the sentence and split it into a list of words.
    # NOTE: accented letters are replaced as well, so names such as 'Stéfano'
    # are broken into fragments ('st', 'fano').
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop the stop words and lemmatize every remaining word. No part-of-speech
    # tag is passed, so verb forms such as 'converted' come back unchanged.
    lemmas = [wordnet.lemmatize(word) for word in words if word not in stop_words]
    corpus.append(' '.join(lemmas))

In [13]:
# Display the list of sentences produced by nltk.sent_tokenize
sentences

['Real Madrid simply show no sign of letting up.',
 'The LaLiga table-toppers saw off Alavés at \n               the Di Stéfano to make it eight wins on the bounce and retain the four-point buffer at the \n               summit with three games to go.',
 'The Madrid goals came from Karim Benzema, who converted \n               from the spot, whilst Marco Asensio was also on the mark for the hosts, who recorded a fifth \n               successive shutout.',
 "Ferland Mendy started at left wing-back, with Lucas Vázquez occupying \n               the right wing-back berth and inside the first minute, the pair were involved in the madridistas' \n               first forward foray, which culminated in Luka Modric sending his effort wide of the target.",
 "The Alavés response wasn't long in coming and Joselu's headed effort struck the crossbar, \n               whilst Raphaël Varane cleared a Lucas Pérez's follow-up off the line.",
 'It looked as if we were \n               in store for a hi

In [14]:
# Display the cleaned corpus: one lowercased, stop-word-free, lemmatized string per sentence
corpus

['real madrid simply show sign letting',
 'laliga table topper saw alav di st fano make eight win bounce retain four point buffer summit three game go',
 'madrid goal came karim benzema converted spot whilst marco asensio also mark host recorded fifth successive shutout',
 'ferland mendy started left wing back lucas v zquez occupying right wing back berth inside first minute pair involved madridistas first forward foray culminated luka modric sending effort wide target',
 'alav response long coming joselu headed effort struck crossbar whilst rapha l varane cleared lucas p rez follow line',
 'looked store high tempo affair minute mark mendy showed threat left',
 'ximo navarro upended frenchman area benzema stepped make',
 'minute gone toni kroos best find top corner fierce mendy cross nearly forced camarasa turn net',
 'blanquiazules refused roll though oliver burke proving constant nuisance defence testing thibaut courtois despite final chance break falling rodrygo benzema',
 'restart 

### Applying Bag of Words model

In [15]:
# Importing the library
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-Words vectorizer with its default settings
cv = CountVectorizer()
# Learn the vocabulary from the corpus and count every term per sentence
bow_counts = cv.fit_transform(corpus)
# Dense array form of the counts: one row per sentence, one column per term
X = bow_counts.toarray()

In [25]:
# Shape of the matrix: (number of sentences in the corpus, number of distinct terms in the vocabulary)
X.shape

(11, 152)

In [36]:
import numpy as np
import sys
# Raise NumPy's summarization threshold so the whole array is printed instead of
# being abbreviated with "...". This is a global setting and stays in effect
# for every later cell.
np.set_printoptions(threshold=sys.maxsize)
# View the matrix
print(X)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
  0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
  0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0
  0 0 0 0 1 0 0 0]
 [0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0

##### This is how we apply the Bag of Words model to text data. Note, however, that it has a major drawback: the representation matrix above contains only raw word counts (almost all of them 0s and 1s), so every word that occurs carries practically the same weight and we cannot tell which words are more important than others. To solve this problem we can use TF-IDF, i.e. Term Frequency–Inverse Document Frequency.

### Now, it's your turn to try this out by yourself. Till then, PEACE...✌️ 