# Information retrieval using word embeddings.

In [5]:
Doc1 = ["""With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will 
have to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders.""" ]

In [6]:
Doc2 = ["""Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the
interactions between computers and human (natural) languages, in particular how to program computers to process and analyze
large amounts of natural language data."""]

In [7]:
Doc3 = ["""He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban
and metro rail systems."""]

In [8]:
Doc4 = ["""But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni,
India captain Rohit Sharma promptly asked for a review. Sure enough, the ball would have clipped the top of middle and leg."""]

In [17]:
fin= Doc1+Doc2+Doc3+Doc4
fin

['With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will \nhave to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders.',
 'Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the\ninteractions between computers and human (natural) languages, in particular how to program computers to process and analyze\nlarge amounts of natural language data.',
 'He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban\nand metro rail systems.',
 'But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni,\nIndia captain Rohit Sharma promptly asked for a review. Sure enough, the ball would have clipped the top of middle and leg.']

In [69]:
import gensim
from gensim.models import Word2Vec
import numpy as np
import nltk
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy
from scipy import spatial
from nltk.tokenize.toktok import ToktokTokenizer
import re
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

In [11]:
# As mentioned earlier, we are going to use the word embeddings to solve
# this problem. Download word2vec from the below link:
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit


In [16]:
#load the model
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

In [1]:
# list(model.key_to_index)

In [36]:
# Now we build the information retrieval system:

#Preprocessing
def remove_stopwords(text, is_lower_case=False):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', ''.join(text))
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [37]:
# Function to get the embedding vector for n dimension, we have used "300"
def get_embedding(word):
    if word in list(model.key_to_index):
        return model[word]
    else:
        return np.zeros(300)


In [38]:
# For every document, we will get a lot of vectors based on the number of
# words present. We need to calculate the average vector for the document
# through taking a mean of all the word vectors.


# Getting average vector for each document
out_dict = {}
for sen in fin:
    average_vector = (np.mean(np.array([get_embedding(x) for x in nltk.word_tokenize(remove_stopwords(sen))]), axis=0))
    dict = { sen : (average_vector) }
    out_dict.update(dict)

In [40]:
# Function to calculate the similarity between the query vector and document vector

def get_sim(query_embedding, average_vector_doc):
    sim = [(1 - scipy.spatial.distance.cosine(query_embedding,
    average_vector_doc))]
    return sim

# Rank all the documents based on the similarity to get relevant docs
def Ranked_documents(query):
    query_words = (np.mean(np.array([get_embedding(x) for x in nltk.word_tokenize(query.lower())],dtype=float), axis=0))
    rank = []
    for k,v in out_dict.items():
        rank.append((k, get_sim(query_words, v)))
    rank = sorted(rank,key=lambda t: t[1], reverse=True)
    print('Ranked Documents :')
    return rank

In [41]:
# Let’s see how the information retrieval system we built is working with a
# couple of examples.

# Call the IR function with a query
Ranked_documents("cricket")

Ranked Documents :


[('But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni,\nIndia captain Rohit Sharma promptly asked for a review. Sure enough, the ball would have clipped the top of middle and leg.',
  [0.44954328830341783]),
 ('He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban\nand metro rail systems.',
  [0.23973446930269127]),
 ('With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will \nhave to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders.',
  [0.1832371201201335]),
 ('Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the\ninteractions between computers and human (natural) languages, in particular how to program computers to process and analyze\nlarge amounts of natural language data.',
  [0.

In [None]:
 #If you see, doc4 (on top in result), this will be most relevant for the
# query “cricket” even though the word “cricket” is not even mentioned once
# with the similarity of 0.449.


In [None]:
# Let’s see how the information retrieval system we built is working with a
# couple of examples.

In [42]:
Ranked_documents("driving")

Ranked Documents :


[('With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will \nhave to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders.',
  [0.3594728772380067]),
 ('But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni,\nIndia captain Rohit Sharma promptly asked for a review. Sure enough, the ball would have clipped the top of middle and leg.',
  [0.19042557661139026]),
 ('He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban\nand metro rail systems.',
  [0.1706653724240128]),
 ('Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the\ninteractions between computers and human (natural) languages, in particular how to program computers to process and analyze\nlarge amounts of natural language data.',
  [0.0

We can use the same approach and scale it up for as many documents as possible. For more accuracy, we can build our own embeddings,  the one we are using is generalized.

This is the fundamental approach that can be used for many applications like the following:

• Search engines\
• Document retrieval\
• Passage retrieval\
• Question and answer

# Classifying Text with Deep Learning

In [87]:
import pandas as pd
from nltk.corpus import stopwords
# from nltk import *
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, Conv1D, SimpleRNN
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from keras.layers.recurrent import SimpleRNN


In [101]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [56]:
# build a text classification model using CNN, RNN, and LSTM.
# Email classification (spam or ham).

In [57]:
# The approach and NLP pipeline would remain the same as discussed
# earlier. The only change would be that instead of using machine learning
# algorithms, we would be building models using deep learning algorithms.

In [58]:
#read file
file_content = pd.read_csv('spam.csv', encoding = "ISO-8859-1")
#check sample content in the email
file_content

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [59]:
file_content['v2'][1]

'Ok lar... Joking wif u oni...'

In [60]:
# Remove stop words
stop = stopwords.words('english') 
file_content['v2'] = file_content['v2'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
# Delete unwanted columns
Email_Data = file_content[['v1', 'v2']]
# Rename column names
Email_Data = Email_Data.rename(columns={"v1":"Target", "v2":"Email"})
Email_Data.head()

Unnamed: 0,Target,Email
0,ham,"Go jurong point, crazy.. Available bugis n gre..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say early hor... U c already say...
4,ham,"Nah I think goes usf, lives around though"


In [61]:
#Delete punctuations, convert text in lower case and delete the double space

Email_Data['Email'] = Email_Data['Email'].apply(lambda x:re.sub('[!@#$:).;,?&]', '', x.lower()))
Email_Data['Email'] = Email_Data['Email'].apply(lambda x:re.sub(' ', ' ', x))
Email_Data['Email'].head(5)

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry 2 wkly comp win fa cup final tkts 2...
3                  u dun say early hor u c already say
4             nah i think goes usf lives around though
Name: Email, dtype: object

In [62]:
#Separating text(input) and target classes
list_sentences_rawdata = Email_Data["Email"].fillna("_na_").values
list_classes = ["Target"]
target = Email_Data[list_classes].values
target

array([['ham'],
       ['ham'],
       ['spam'],
       ...,
       ['ham'],
       ['ham'],
       ['ham']], dtype=object)

In [63]:
To_Process=Email_Data[['Email', 'Target']]
To_Process

Unnamed: 0,Email,Target
0,go jurong point crazy available bugis n great ...,ham
1,ok lar joking wif u oni,ham
2,free entry 2 wkly comp win fa cup final tkts 2...,spam
3,u dun say early hor u c already say,ham
4,nah i think goes usf lives around though,ham
...,...,...
5567,this 2nd time tried 2 contact u u å£750 pound ...,spam
5568,will ì_ b going esplanade fr home,ham
5569,pity * mood that soany suggestions,ham
5570,the guy bitching i acted like i'd interested b...,ham


In [64]:
#Train and test split with 80:20 ratio
train, test = train_test_split(To_Process, test_size=0.2)


In [83]:
# Define the sequence lengths, max number of words and embedding dimensions
# Sequence length of each sentence. If more, truncate. If less, pad with zeros
MAX_SEQUENCE_LENGTH = 300
# Top 20000 frequently occurring words
MAX_NB_WORDS = 20000


# Get the frequently occurring words
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train.Email)
train_sequences = tokenizer.texts_to_sequences(train.Email)
test_sequences = tokenizer.texts_to_sequences(test.Email)



In [2]:
# test_sequences

In [85]:
# dictionary containing words and their index
word_index = tokenizer.word_index
# print(tokenizer.word_index)
# total words in the corpus
print('Found %s unique tokens.' % len(word_index))


Found 8463 unique tokens.


In [86]:
# get only the top frequent words on train
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
# get only the top frequent words on test
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(train_data.shape)
print(test_data.shape)

(4457, 300)
(1115, 300)


In [88]:
train_labels = train['Target']
test_labels = test['Target']

# converts the character array to numeric array. Assigns levels to unique labels.
le = LabelEncoder()
le.fit(train_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)
print(le.classes_)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))

['ham' 'spam']
(array([0, 1]), array([3848,  609], dtype=int64))
(array([0, 1]), array([977, 138], dtype=int64))


In [89]:
# changing data types
labels_train = to_categorical(np.asarray(train_labels))
labels_test = to_categorical(np.asarray(test_labels))
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', labels_train.shape)
print('Shape of label tensor:', labels_test.shape)

Shape of data tensor: (4457, 300)
Shape of label tensor: (4457, 2)
Shape of label tensor: (1115, 2)


In [90]:
labels_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [91]:
EMBEDDING_DIM = 100
print(MAX_SEQUENCE_LENGTH)

300


In [98]:
# Model building and predicting

# define a single hidden layer with 128 memory units. The
# network uses a dropout with a probability of 0.5. The output layer is a
# dense layer using the softmax activation function to output a probability
# prediction.

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Dropout(0.5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(2, activation='softmax'))


In [99]:
model.compile(loss='categorical_crossentropy',optimizer='rmsprop', metrics=['acc'])

In [102]:
model.fit(train_data, labels_train, batch_size=64, epochs=5, validation_data=(test_data, labels_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x16d6d08ab80>

In [94]:
#predictions on test data
predicted=model.predict(test_data)
predicted

array([[0.5217743 , 0.47822574],
       [0.52167493, 0.4783251 ],
       [0.52386904, 0.476131  ],
       ...,
       [0.5217612 , 0.4782388 ],
       [0.5330771 , 0.46692288],
       [0.5264083 , 0.47359166]], dtype=float32)

In [97]:
#model evaluation
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report

precision, recall, fscore, support = score(labels_test, predicted.round())
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(classification_report(labels_test, predicted.round()))

precision: [0.936721 1.      ]
recall: [1.         0.52173913]
fscore: [0.96732673 0.68571429]
support: [977 138]
############################
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       977
           1       1.00      0.52      0.69       138

   micro avg       0.94      0.94      0.94      1115
   macro avg       0.97      0.76      0.83      1115
weighted avg       0.94      0.94      0.93      1115
 samples avg       0.94      0.94      0.94      1115



In [106]:
# RNN model
from tensorflow.keras.layers import SimpleRNN
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(SimpleRNN(2, input_shape=(None,1)))
model.add(Dense(2,activation='softmax'))


In [107]:
model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [108]:
model.fit(train_data, labels_train, batch_size=16, epochs=5, validation_data=(test_data, labels_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x16d73f57850>

In [110]:
# prediction on test data
predicted_Srnn=model.predict(test_data)
predicted_Srnn

array([[0.990229  , 0.00977101],
       [0.9979056 , 0.0020944 ],
       [0.89674115, 0.10325881],
       ...,
       [0.9576678 , 0.04233219],
       [0.9935895 , 0.00641052],
       [0.9971961 , 0.00280388]], dtype=float32)

In [111]:
precision, recall, fscore, support = score(labels_test,predicted_Srnn.round())
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(classification_report(labels_test, predicted_Srnn.round()))

precision: [0.94322709 0.72972973]
recall: [0.96929376 0.58695652]
fscore: [0.95608279 0.65060241]
support: [977 138]
############################
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       977
           1       0.73      0.59      0.65       138

   micro avg       0.92      0.92      0.92      1115
   macro avg       0.84      0.78      0.80      1115
weighted avg       0.92      0.92      0.92      1115
 samples avg       0.92      0.92      0.92      1115



In [None]:
# LSTM Model 

In [None]:
from tensorflow.keras.layers import LSTM
model = Sequential()
model.add(Embedding(MAX_NB_WORDS,EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(output_dim=16, activation='relu', inner_activation='hard_sigmoid',return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(2,activation='softmax'))


In [None]:
model.compile(loss = 'binary_crossentropy',optimizer='adam',metrics = ['accuracy'])


In [None]:
model.fit(train_data, labels_train,nbatch_size=16,epochs=5, validation_data=(test_data, labels_test))

In [None]:
#prediction on text data
predicted_lstm=model.predict(test_data)
predicted_lstm

In [None]:
precision, recall, fscore, support = score(labels_test,
predicted_lstm.round())
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(sklearn.metrics.classification_report(labels_test,
predicted_lstm.round()))

In [109]:
#  Bidirectional LSTM

# As we know, LSTM preserves information from inputs using the
# hidden state. In bidirectional LSTMs, inputs are fed in two ways: one
# from previous to future and the other going backward from future to
# past, helping in learning future representation as well. Bidirectional
# LSTMs are known for producing very good results as they are capable of
# understanding the context better.

#  Bidirectional LSTM must outperform  the rest of the algorithms.

In [115]:
from tensorflow.keras.layers import LSTM

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Bidirectional(LSTM(16, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))
model.add(Conv1D(16, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform"))
model.add(GlobalMaxPool1D())
model.add(Dense(50, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(2,activation='softmax'))




In [118]:
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

In [120]:
# model.fit(train_data, labels_train, batch_size = 32, epochs =1, validation_data=(test_data, labels_test))

In [None]:
# prediction on test data
predicted_blstm=model.predict(test_data)
predicted_blstm

In [None]:
#model evaluation
from sklearn.metrics import precision_recall_fscore_support as
score
precision, recall, fscore, support = score(labels_test,
predicted_blstm.round())
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(classification_report(labels_test, predicted_blstm.round()))

# Next Word Prediction