# Chapter4: Transforming Text into Data Structures

## Understanding vectors and matrices

In [1]:
#building vectors and matrices from text data
from sklearn.feature_extraction.text import CountVectorizer
X = ("Computers can analyze text", "They do it using vectors and matrices", 
     "Computers can process massive amounts of text data")
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)
print(vectorizer.vocabulary_)
print(X_vec.todense())

{'computers': 2, 'analyze': 1, 'text': 7, 'using': 8, 'vectors': 9, 'matrices': 5, 'process': 6, 'massive': 4, 'amounts': 0, 'data': 3}
[[0 1 1 0 0 0 0 1 0 0]
 [0 0 0 0 0 1 0 0 1 1]
 [1 0 1 1 1 0 1 1 0 0]]


## Exploring the Bag-of-Words architecture

#### Required imports

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np

#### Take a list of sentences

In [3]:
sentences = ["We are reading about Natural Language Processing Here",
             "Natural Language Processing making computers comprehend language data",
             "The field of Natural Language Processing is evolving everyday"]

#### Create a pandas series object from the list of sentences,

In [4]:
corpus = pd.Series(sentences)
corpus

0    We are reading about Natural Language Processi...
1    Natural Language Processing making computers c...
2    The field of Natural Language Processing is ev...
dtype: object

#### Preprocess the corpus using the NLP pipeline

In [5]:
#The different process present in the pipeline

#tokenizing
def tokenizer(corpus, keep_list = []):
    cleaned_rows = []
    for row in corpus:
        qs = []
        for word in row.split():
            if word not in keep_list:
                p1 = re.sub('[^a-zA-Z0-9]', ' ', word).lower()
                qs.append(p1)
            else : qs.append(word)
        cleaned_rows.append(' '.join(qs))
    return pd.Series(cleaned_rows)

#removing stopwords
def remove_stops(corpus):
    stop = set(stopwords.words('english'))
    corpus = [[word for word in sentence.split() if word not in stop] for sentence in corpus]
    return corpus

#stemming
def stemmer(corpus, stem_type):
    if stem_type == 'Porter':
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(word) for word in sentence] for sentence in corpus]          

    if stem_type == 'Snowball':
        stemmer = SnowballStemmer(language='english')
        corpus = [' '.join([stemmer.stem(word) for word in sentence]) for sentence in corpus] 

        return corpus
#lemmatization
def lemmatizer(corpus):
    lemmatizer = WordNetLemmatizer()
    corpus = [' '.join([lemmatizer.lemmatize(x, pos = 'v') for x in x]) for x in corpus]
    return corpus

In [6]:
def preprocess(corpus, keep_list, stemming, stem_type, lemmatization, remove_stopwords):
    
    corpus = tokenizer(corpus, keep_list)
    
    if remove_stopwords:
        corpus = remove_stops(corpus)
    
    if stemming:
        corpus = stemmer(corpus, stem_type)
        
    if lemmatization:
        corpus = lemmatizer(corpus)
    
    return corpus    

In [7]:
preprocessed_corpus = preprocess(corpus,keep_list = [], stemming = False, \
                                 stem_type = None, lemmatization = True,remove_stopwords = True)
preprocessed_corpus

['read natural language process',
 'natural language process make computers comprehend language data',
 'field natural language process evolve everyday']

#### Building Vocabulary

In [8]:
set_of_words = set()
for sentence in preprocessed_corpus:
    for word in sentence.split():
        set_of_words.add(word)
vocab = list(set_of_words)
print(vocab)

['language', 'make', 'field', 'natural', 'everyday', 'computers', 'evolve', 'process', 'comprehend', 'data', 'read']


#### Fetching the position/index of each token in the vocabulary

In [9]:
position = {}
for i, token in enumerate(vocab):
    position[token] = i
print(position)

{'language': 0, 'make': 1, 'field': 2, 'natural': 3, 'everyday': 4, 'computers': 5, 'evolve': 6, 'process': 7, 'comprehend': 8, 'data': 9, 'read': 10}


#### Create a placeholder matrix for holding the BoW.

In [10]:
bow_matrix = np.zeros((len(preprocessed_corpus), len(vocab)))

#### Increase the positional index of every word by 1 if it appears in a sentence

In [11]:
for i, preprocessed_sentence in enumerate(preprocessed_corpus):
    for token in preprocessed_sentence.split():
        bow_matrix[i][position[token]] = bow_matrix[i][position[token]] + 1

In [12]:
bow_matrix

array([[1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.],
       [2., 1., 0., 1., 0., 1., 0., 1., 1., 1., 0.],
       [1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0.]])

## Understanding a basic CountVectorizer

In [13]:
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(preprocessed_corpus)

In [14]:
print(vectorizer.get_feature_names())
print(bow_matrix.toarray())

['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']
[[0 0 0 0 0 0 1 0 1 1 1]
 [1 1 1 0 0 0 2 1 1 1 0]
 [0 0 0 1 1 1 1 0 1 1 0]]


## Out-of-the-box features offered by CountVectorizer

#### Prebuilt dictionary and support for n-grams

In [15]:
vectorizer_ngram_range = CountVectorizer(analyzer='word', ngram_range=(1,3))
bow_matrix_ngram = vectorizer_ngram_range.fit_transform(preprocessed_corpus)
print(vectorizer_ngram_range.get_feature_names())
print(bow_matrix_ngram.toarray())

['comprehend', 'comprehend language', 'comprehend language data', 'computers', 'computers comprehend', 'computers comprehend language', 'data', 'everyday', 'evolve', 'evolve everyday', 'field', 'field natural', 'field natural language', 'language', 'language data', 'language process', 'language process evolve', 'language process make', 'make', 'make computers', 'make computers comprehend', 'natural', 'natural language', 'natural language process', 'process', 'process evolve', 'process evolve everyday', 'process make', 'process make computers', 'read', 'read natural', 'read natural language']
[[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1]
 [1 1 1 1 1 1 1 0 0 0 0 0 0 2 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0]
 [0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0]]


#### max_features

In [16]:
vectorizer_max_features = CountVectorizer(analyzer='word', ngram_range=(1,3),
max_features = 6)
bow_matrix_max_features = vectorizer_max_features.fit_transform(preprocessed_corpus)
print(vectorizer_max_features.get_feature_names())
print(bow_matrix_max_features.toarray())

['language', 'language process', 'natural', 'natural language', 'natural language process', 'process']
[[1 1 1 1 1 1]
 [2 1 1 1 1 1]
 [1 1 1 1 1 1]]


#### Min_df and Max_df thresholds

In [17]:
vectorizer_max_features = CountVectorizer(analyzer='word', ngram_range=(1,3), max_df
= 3, min_df = 2)
bow_matrix_max_features = vectorizer_max_features.fit_transform(preprocessed_corpus)
print(vectorizer_max_features.get_feature_names())
print(bow_matrix_max_features.toarray())

['language', 'language process', 'natural', 'natural language', 'natural language process', 'process']
[[1 1 1 1 1 1]
 [2 1 1 1 1 1]
 [1 1 1 1 1 1]]


## TF-IDF vectors

#### Building a basic TF-IDF vectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
#Instansiating basic TF-IDF vectorizer
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(preprocessed_corpus)

In [19]:
#results on the preprocessed corpus after TF-IDF vectorization
print(vectorizer.get_feature_names())
print(tf_idf_matrix.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix.shape)

['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']
[[0.         0.         0.         0.         0.         0.
  0.41285857 0.         0.41285857 0.41285857 0.69903033]
 [0.40512186 0.40512186 0.40512186 0.         0.         0.
  0.478543   0.40512186 0.2392715  0.2392715  0.        ]
 [0.         0.         0.         0.49711994 0.49711994 0.49711994
  0.29360705 0.         0.29360705 0.29360705 0.        ]]

The shape of the TF-IDF matrix is:  (3, 11)


In [20]:
#when the norm is changed to l1
vectorizer_l1_norm = TfidfVectorizer(norm="l1")
tf_idf_matrix_l1_norm = vectorizer_l1_norm.fit_transform(preprocessed_corpus)
print(vectorizer_l1_norm.get_feature_names())
print(tf_idf_matrix_l1_norm.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix_l1_norm.shape)

['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']
[[0.         0.         0.         0.         0.         0.
  0.21307663 0.         0.21307663 0.21307663 0.3607701 ]
 [0.1571718  0.1571718  0.1571718  0.         0.         0.
  0.1856564  0.1571718  0.0928282  0.0928282  0.        ]
 [0.         0.         0.         0.2095624  0.2095624  0.2095624
  0.12377093 0.         0.12377093 0.12377093 0.        ]]

The shape of the TF-IDF matrix is:  (3, 11)


#### N-grams and maximum features in the TF-IDF vectorizer

In [21]:
vectorizer_n_gram_max_features = TfidfVectorizer(norm="l2", analyzer='word',ngram_range=(1,3), max_features = 6)
tf_idf_matrix_n_gram_max_features = vectorizer_n_gram_max_features.fit_transform(preprocessed_corpus)
print(vectorizer_n_gram_max_features.get_feature_names())
print(tf_idf_matrix_n_gram_max_features.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix_n_gram_max_features.shape)

['language', 'language process', 'natural', 'natural language', 'natural language process', 'process']
[[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]
 [0.66666667 0.33333333 0.33333333 0.33333333 0.33333333 0.33333333]
 [0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]

The shape of the TF-IDF matrix is:  (3, 6)


## Distance/similarity calculation between document vectors

#### Cosine similarity

In [22]:
def cosine_similarity(vector1, vector2):
    vector1 = np.array(vector1)
    vector2 = np.array(vector2)
    return np.dot(vector1, vector2) / (np.sqrt(np.sum(vector1**2)) * np.sqrt(np.sum(vector2**2)))

In [23]:
for i in range(bow_matrix.shape[0]):
    for j in range(i + 1, bow_matrix.shape[0]):
        print("The cosine similarity between the documents ", i, "and", j, "is: ", \
              cosine_similarity(bow_matrix.toarray()[i], bow_matrix.toarray()[j]))

The cosine similarity between the documents  0 and 1 is:  0.6324555320336759
The cosine similarity between the documents  0 and 2 is:  0.6123724356957946
The cosine similarity between the documents  1 and 2 is:  0.5163977794943223


#### Cosine similarity on vectors developed using TfIdfVectorizers tool

In [24]:
for i in range(tf_idf_matrix.shape[0]):
    for j in range(i + 1, tf_idf_matrix.shape[0]):
        print("The cosine similarity between the documents ", i, "and", j, "is: ", \
              cosine_similarity(tf_idf_matrix.toarray()[i], tf_idf_matrix.toarray()[j]))

The cosine similarity between the documents  0 and 1 is:  0.39514115766749125
The cosine similarity between the documents  0 and 2 is:  0.36365455673761865
The cosine similarity between the documents  1 and 2 is:  0.2810071916500233


## One-hot vectorization

#### Example corpus

In [25]:
sentence = ["We are reading about Natural Language Processing Here"]
corpus = pd.Series(sentence)
corpus

0    We are reading about Natural Language Processi...
dtype: object

#### Preprocessing the corpus

In [26]:
# Preprocessing with Lemmatization here
preprocessed_corpus = preprocess(corpus, keep_list = [], stemming = False, stem_type = None,
                                 lemmatization = True, remove_stopwords = True)
preprocessed_corpus

['read natural language process']

####  Building the vocabulary

In [27]:
set_of_words = set()
for word in preprocessed_corpus[0].split():
    set_of_words.add(word)
vocab = list(set_of_words)
print(vocab)

['language', 'natural', 'read', 'process']


#### Maintaining the position of each token in the vocabulary

In [28]:
position = {}
for i, token in enumerate(vocab):
    position[token] = i
print(position)

{'language': 0, 'natural': 1, 'read': 2, 'process': 3}


#### Instantiating the one-hot matrix

In [29]:
one_hot_matrix = np.zeros((len(preprocessed_corpus[0].split()),
len(vocab)))
one_hot_matrix.shape

(4, 4)

#### Building One-Hot Vectors

In [30]:
for i, token in enumerate(preprocessed_corpus[0].split()):
    one_hot_matrix[i][position[token]] = 1

#### Visualizing the one-hot matrix

In [31]:
one_hot_matrix

array([[0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]])

## Building a basic chatbot

#### Iterating through each dictionary to extract and store questions and answers in separate lists

In [32]:
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


#loading questions and answers in separate lists
import ast 
questions = []
answers = [] 
with open('qa_Electronics.json','r') as f:
    for line in f:
        data = ast.literal_eval(line)
        questions.append(data['question'].lower())
        answers.append(data['answer'].lower())

#### Using CountVectorizer to convert the questions list into a sparse matrix and apply TF-IDF transformation

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(questions)
tfidf = TfidfTransformer() #by default applies "l2" normalization
X_tfidf = tfidf.fit_transform(X_vec)

#### Finding Cosine Similarity

In [34]:
def conversation(im):
    global tfidf, answers, X_tfidf
    Y_vec = vectorizer.transform(im)
    Y_tfidf = tfidf.fit_transform(Y_vec)
    cos_sim = np.rad2deg(np.arccos(max(cosine_similarity(Y_tfidf, X_tfidf)[0])))
    if cos_sim > 60 :
        return "sorry, I did not quite understand that"
    else:
        return answers[np.argmax(cosine_similarity(Y_tfidf, X_tfidf)[0])]

#### Implementing the chat

In [35]:
def main():
    usr = input("Please enter your username: ")
    print("support: Hi, welcome to Q&A support. How can I help you?")
    while True:
        im = input("{}: ".format(usr))
        if im.lower() == 'bye':
            print("Q&A support: bye!")
            break
        else:
            print("Q&A support: "+conversation([im]))

In [36]:
main()

Please enter your username: Jeevitesh
support: Hi, welcome to Q&A support. How can I help you?
Jeevitesh: My battery life is decreasing
Q&A support: so far after i charge the battery it will last about 90 minutes. i have not had any issues with the battery.
Jeevitesh: Where can i get original parts
Q&A support: there are flaps that push into pockets on the middle part (the notebook). there is a strip of surprisingly strong velcro that secures it. my son carried an ipad in one flap compartmant and his lunch...including a bottle of water in the opposite flap.
Jeevitesh: bye
Q&A support: bye!
