In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import nltk
import numpy as np
import re
from nltk.stem import wordnet # to perform lemmitization
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words

In [2]:
nltk.download('punkt')

nltk.download('wordnet')

nltk.download('averaged_perceptron_tagger')

nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# File Input

In [3]:
# df = pd.read_excel('/content/drive/MyDrive/Colab_Notebooks/NLP/ChatBot/Input/dialog_talk_agent.xlsx')

In [18]:
df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/NLP/ChatBot/Input/OPTED-Dictionary.csv', on_bad_lines='skip')

In [19]:
df

Unnamed: 0,Word,Count,POS,Definition
0,A,1,"""""","""The first letter of the English and of many o..."
1,A,1,"""""","""The name of the sixth tone in the model major..."
2,A,1,"""""","""An adjective commonly called the indefinite ..."
3,A,1,"""""","""In each; to or for each; as """"""""twenty leagu..."
4,A,1,"""prep.""","""In; on; at; by."""
...,...,...,...,...
176004,Zymotic,7,"""a.""","""Of pertaining to or caused by fermentation."""
176005,Zymotic,7,"""a.""","""Designating or pertaining to a certain class..."
176006,Zythem,6,"""n.""","""See Zythum."""
176007,Zythepsary,10,"""n.""","""A brewery."""


In [20]:
# Generate random indices
indices = np.random.choice(df.index, size=170000, replace=False)

# Remove rows at random
df = df.drop(indices)

In [21]:
df

Unnamed: 0,Word,Count,POS,Definition
19,Ab,2,"""n.""","""The fifth month of the Jewish year according ..."
21,Abacinate,9,"""v. t.""","""To blind by a red-hot metal plate held before..."
38,Abacus,6,"""n.""","""The uppermost member or division of the capit..."
96,Abatable,8,"""a.""","""Capable of being abated; as an abatable writ..."
100,Abate,5,"""v. t.""","""To bring down or reduce from a higher to a lo..."
...,...,...,...,...
175918,Zoosperm,8,"""n.""","""One of the spermatic particles; spermatozoid."""
175930,Zope,4,"""n.""","""A European fresh-water bream (Abramis balleru..."
175937,Zoroastrism,11,"""n.""","""Same as Zoroastrianism."""
175949,Zuisin,6,"""n.""","""The American widgeon."""


# Data Preprocessing

In [22]:
df.shape

(6009, 4)

In [23]:
df.drop(columns=['Count', 'POS'], inplace=True, axis=1)

In [24]:
df.isnull().values.any()

False

In [25]:
df = df.dropna()

In [26]:
# df.ffill(axis = 0,inplace=True) # fills the null value with the previous value.

In [27]:
df1 = df.head(10) # copy of first ten rows of dataset

In [28]:
df1

Unnamed: 0,Word,Definition
19,Ab,"""The fifth month of the Jewish year according ..."
21,Abacinate,"""To blind by a red-hot metal plate held before..."
38,Abacus,"""The uppermost member or division of the capit..."
96,Abatable,"""Capable of being abated; as an abatable writ..."
100,Abate,"""To bring down or reduce from a higher to a lo..."
165,Abdest,"""Purification by washing the hands before pray..."
201,Abduction,"""The movement which separates a limb or other ..."
246,Abetment,"""The act of abetting; as an abetment of treas..."
306,Abiogenous,"""Produced by spontaneous generation."""
370,Ablet,"""Alt. of Ablen"""


# LowerCase Convertion

In [29]:
# function that converts text into lower case and removes special characters

def lower_case(x):
    for i in x:
        a=str(i).lower()
        p=re.sub(r'[^a-z0-9]',' ',a)
        print(p)

In [30]:
lower_case(df1['Word'])

ab
abacinate
abacus
abatable
abate
abdest
abduction
abetment
abiogenous
ablet


# Understadning Tokenizer, Lemmatizer, POS

In [31]:
# Tokenizer

# s='tell me about your personality'
# words=word_tokenize(s)
# print(words)

In [32]:
# Lemmatizer

# lemma = wordnet.WordNetLemmatizer() # intializing lemmatizer
# lemma.lemmatize('absorbed', pos = 'v')

In [33]:
# Part Of Speach

# pos_tag(nltk.word_tokenize(s),tagset = None) # returns the parts of speech of every word

# Normalization

In [34]:
# function that performs text normalization steps

def text_normalization(text):
    text = str(text).lower() # text to lower case
    spl_char_text=re.sub(r'[^ a-z]','',text) # removing special characters

    tokens=nltk.word_tokenize(spl_char_text) # word tokenizing

    lema=wordnet.WordNetLemmatizer() # intializing lemmatization

    tags_list=pos_tag(tokens,tagset=None) # parts of speech

    lema_words=[]   # empty list
    for token,pos_token in tags_list:
        if pos_token.startswith('V'):  # Verb
            pos_val='v'
        elif pos_token.startswith('J'): # Adjective
            pos_val='a'
        elif pos_token.startswith('R'): # Adverb
            pos_val='r'
        else:
            pos_val='n' # Noun
        lema_token=lema.lemmatize(token,pos_val) # performing lemmatization
        lema_words.append(lema_token) # appending the lemmatized token into a list

    return " ".join(lema_words) # returns the lemmatized tokens as a sentence

In [35]:
# Query Question
# text_normalization('telling you some stuff about me')

In [36]:
df['lemmatized_text']=df['Word'].apply(text_normalization) # applying the fuction to the dataset to get clean text

In [37]:
df

Unnamed: 0,Word,Definition,lemmatized_text
19,Ab,"""The fifth month of the Jewish year according ...",ab
21,Abacinate,"""To blind by a red-hot metal plate held before...",abacinate
38,Abacus,"""The uppermost member or division of the capit...",abacus
96,Abatable,"""Capable of being abated; as an abatable writ...",abatable
100,Abate,"""To bring down or reduce from a higher to a lo...",abate
...,...,...,...
175918,Zoosperm,"""One of the spermatic particles; spermatozoid.""",zoosperm
175930,Zope,"""A European fresh-water bream (Abramis balleru...",zope
175937,Zoroastrism,"""Same as Zoroastrianism.""",zoroastrism
175949,Zuisin,"""The American widgeon.""",zuisin


# Stop Words

In [38]:
stop = stopwords.words('english')

In [39]:
# stop

# Bag Of Words

Count Vectorizer

In [40]:
cv = CountVectorizer() # intializing the count vectorizer

X = cv.fit_transform(df['lemmatized_text']).toarray()

In [41]:
# returns all the unique word from data

features = cv.get_feature_names_out()

In [42]:
df_bow = pd.DataFrame(X, columns = features)

df_bow.head()

Unnamed: 0,ab,abacinate,abacus,abatable,abate,abdest,abduction,abetment,abiogenous,ablet,...,zincking,zone,zoogony,zooid,zoomelanin,zoosperm,zope,zoroastrism,zuisin,zygoma
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
Question ='What is the meaning of abacus' # considering an example query

In [44]:
# checking for stop words

Q=[]
a=Question.split()
for i in a:
    if i in stop:
        continue
    else:
        Q.append(i)
    b=" ".join(Q)

Normalizer

In [45]:
Question_lemma = text_normalization(b) # applying the function that we created for text normalizing

Question_bow = cv.transform([Question_lemma]).toarray() # applying bow

In [46]:
Question_bow

array([[0, 0, 1, ..., 0, 0, 0]])

# Cosine Similarity

In [47]:
# cosine similarity for the above question we considered.

cosine_value = 1- pairwise_distances(df_bow, Question_bow, metric = 'cosine' )

In [48]:
cosine_value

array([[0.        ],
       [0.        ],
       [0.70710678],
       ...,
       [0.        ],
       [0.        ],
       [0.        ]])

##  creating a new column to compare Definition and Cosine Similarity

In [49]:
df['similarity_bow'] = cosine_value # creating a new column

In [50]:
df_simi = pd.DataFrame(df, columns=['Definition','similarity_bow']) # taking similarity value of responses for the question we took

In [51]:
df_simi

Unnamed: 0,Definition,similarity_bow
19,"""The fifth month of the Jewish year according ...",0.000000
21,"""To blind by a red-hot metal plate held before...",0.000000
38,"""The uppermost member or division of the capit...",0.707107
96,"""Capable of being abated; as an abatable writ...",0.000000
100,"""To bring down or reduce from a higher to a lo...",0.000000
...,...,...
175918,"""One of the spermatic particles; spermatozoid.""",0.000000
175930,"""A European fresh-water bream (Abramis balleru...",0.000000
175937,"""Same as Zoroastrianism.""",0.000000
175949,"""The American widgeon.""",0.000000


In [52]:
df_simi_sort = df_simi.sort_values(by='similarity_bow', ascending=False) # sorting the values, Larger to smaller

In [53]:
df_simi_sort

Unnamed: 0,Definition,similarity_bow
38,"""The uppermost member or division of the capit...",0.707107
94494,"""of Mean""",0.707107
19,"""The fifth month of the Jewish year according ...",0.000000
116468,"""The state of being poachy; marshiness.""",0.000000
116767,"""A native or inhabitant of Poland; a Pole.""",0.000000
...,...,...
58840,"""A form of melody or accompaniment kept up thr...",0.000000
58750,"""of Fife""",0.000000
58749,"""of Fife""",0.000000
58708,"""An estate held of a superior on condition of ...",0.000000


## Adding Threshold

In [54]:
# considering the value of p=smiliarity to be greater than 0.2

threshold = 0.2

In [55]:
df_threshold = df_simi_sort[df_simi_sort['similarity_bow'] > threshold]

In [56]:
df_threshold

Unnamed: 0,Definition,similarity_bow
38,"""The uppermost member or division of the capit...",0.707107
94494,"""of Mean""",0.707107


- Finally using bow for the question 'Will you help me and tell me about yourself more' , the above are the responses we got using bow and the smiliarity value of responses, we consider the response with highest similarity

In [62]:
index_value = cosine_value.argmax() # returns the index number of highest value

In [63]:
index_value

2

In [64]:
Question

'What is the meaning of abacus'

In [66]:
df['Definition'].iloc[index_value] # The text at the above index becomes the response for the question

'"The uppermost member or division of the capital of a column  immediately under the architrave. See Column."'

# TF_IDF

In [67]:
# intializing tf-id
tfidf = TfidfVectorizer()

x_tfidf = tfidf.fit_transform(df['lemmatized_text']).toarray() # transforming the data into array

In [68]:
Question1 ='What is the meaning of abacus'

In [69]:
Question_lemma1 = text_normalization(Question1)

In [70]:
Question_tfidf = tfidf.transform([Question_lemma1]).toarray() # applying tf-idf

In [71]:
# returns all the unique word from data with a score of that word

df_tfidf = pd.DataFrame(x_tfidf,columns=tfidf.get_feature_names_out())

In [72]:
df_tfidf.head()

Unnamed: 0,ab,abacinate,abacus,abatable,abate,abdest,abduction,abetment,abiogenous,ablet,...,zincking,zone,zoogony,zooid,zoomelanin,zoosperm,zope,zoroastrism,zuisin,zygoma
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Cosine Similarity

In [73]:
cos = 1 - pairwise_distances(df_tfidf, Question_tfidf, metric='cosine')  # applying cosine similarity

In [74]:
cos

array([[0.        ],
       [0.        ],
       [0.45815541],
       ...,
       [0.        ],
       [0.        ],
       [0.        ]])

## Creating a new column to compare Text Responce and Similarity Score

In [75]:
df['similarity_tfidf'] = cos

In [77]:
df_simi_tfidf = pd.DataFrame(df, columns=['Definition','similarity_tfidf']) # taking similarity value of responses for the question we took


In [78]:
df_simi_tfidf

Unnamed: 0,Definition,similarity_tfidf
19,"""The fifth month of the Jewish year according ...",0.000000
21,"""To blind by a red-hot metal plate held before...",0.000000
38,"""The uppermost member or division of the capit...",0.458155
96,"""Capable of being abated; as an abatable writ...",0.000000
100,"""To bring down or reduce from a higher to a lo...",0.000000
...,...,...
175918,"""One of the spermatic particles; spermatozoid.""",0.000000
175930,"""A European fresh-water bream (Abramis balleru...",0.000000
175937,"""Same as Zoroastrianism.""",0.000000
175949,"""The American widgeon.""",0.000000


## Sort the values from max similarity to min

In [79]:
df_simi_tfidf_sort = df_simi_tfidf.sort_values(by='similarity_tfidf', ascending=False)

In [80]:
df_simi_tfidf_sort

Unnamed: 0,Definition,similarity_tfidf
157731,"""By that; by how much; by so much; on that acc...",0.458155
38,"""The uppermost member or division of the capit...",0.458155
94494,"""of Mean""",0.458155
104619,"""Denoting part of an aggregate or whole; belon...",0.437533
104623,"""Denoting identity or equivalence; -- used wit...",0.437533
...,...,...
58863,"""A native of the Fiji islands.""",0.000000
58840,"""A form of melody or accompaniment kept up thr...",0.000000
58750,"""of Fife""",0.000000
58749,"""of Fife""",0.000000


## Considering the value of p=smiliarity to be greater than 0.2

In [81]:
threshold = 0.2

In [82]:
df_threshold = df_simi_tfidf_sort[df_simi_tfidf_sort['similarity_tfidf'] > threshold]


In [83]:
df_threshold

Unnamed: 0,Definition,similarity_tfidf
157731,"""By that; by how much; by so much; on that acc...",0.458155
38,"""The uppermost member or division of the capit...",0.458155
94494,"""of Mean""",0.458155
104619,"""Denoting part of an aggregate or whole; belon...",0.437533
104623,"""Denoting identity or equivalence; -- used wit...",0.437533
163942,"""A contraction of it was.""",0.422901
13530,"""of Be""",0.422901
159523,"""A common contraction of it is.""",0.422901


- by using tfidf for the question 'Will you help me and tell me about yourself more' , the above are the responses we got and the smiliarity value of responses, we consider the response with highest similarity

# Index Value

In [84]:
index_value_1 = cos.argmax() # returns the index number of highest value

In [85]:
index_value_1

2

In [86]:
Question1

'What is the meaning of abacus'

In [87]:
df['Definition'].iloc[index_value_1]  # returns the text at that index

'"The uppermost member or division of the capital of a column  immediately under the architrave. See Column."'

# Model Using Bag of Words

Text Normalizer

In [88]:
# Function that removes stop words and process the text

def stopword_(text):
    tag_list = pos_tag(nltk.word_tokenize(text),tagset=None)
    stop = stopwords.words('english')

    lema = wordnet.WordNetLemmatizer()

    lema_word = []
    for token,pos_token in tag_list:
        if token in stop:
            continue
        if pos_token.startswith('V'):
            pos_val='v'
        elif pos_token.startswith('J'):
            pos_val='a'
        elif pos_token.startswith('R'):
            pos_val='r'
        else:
            pos_val='n'
        lema_token=lema.lemmatize(token,pos_val)
        lema_word.append(lema_token)

    return " ".join(lema_word)

vectorizer

In [89]:
# defining a function that returns response to query using bow

def chat_bow(text):
    s = stopword_(text)

    lemma = text_normalization(s) # calling the function to perform text normalization

    bow = cv.transform([lemma]).toarray() # applying bow

    cosine_value = 1 - pairwise_distances(df_bow, bow, metric = 'cosine' )

    index_value = cosine_value.argmax() # getting index value

    return df['Definition'].iloc[index_value]

In [90]:
chat_bow('What is the meaning of abacus')

'"The uppermost member or division of the capital of a column  immediately under the architrave. See Column."'

In [91]:
chat_bow('What is zoosperm')

'"One of the spermatic particles; spermatozoid."'

In [92]:
chat_bow('What is called as zooning')

'"To summon to the discharge of a particular duty; to designate for an office  or employment especially of a religious character; -- often used of a divine summons; as to be called to the ministry; sometimes to invite; as to call a minister to be the pastor of a church."'

# Model Using Tf-Idf



In [93]:
# defining a function that returns response to query using tf-idf

def chat_tfidf(text):
    lemma = text_normalization(text) # calling the function to perform text normalization

    tf = tfidf.transform([lemma]).toarray() # applying tf-idf

    cos = 1-pairwise_distances(df_tfidf, tf, metric='cosine') # applying cosine similarity

    index_value = cos.argmax() # getting index value

    return df['Definition'].iloc[index_value]

In [94]:
chat_tfidf('What is the meaning of abacus')

'"The uppermost member or division of the capital of a column  immediately under the architrave. See Column."'

In [95]:
chat_tfidf('What is zoosperm')

'"One of the spermatic particles; spermatozoid."'

In [96]:
chat_tfidf('What is called as zooning')

'"To summon to the discharge of a particular duty; to designate for an office  or employment especially of a religious character; -- often used of a divine summons; as to be called to the ministry; sometimes to invite; as to call a minister to be the pastor of a church."'