In [1]:
import pandas as pd
import nltk 
import numpy as np
import re
from nltk.stem import wordnet # to perform lemmitization
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words

In [2]:
# Download the NLTK data packages used by this notebook.
# `nltk` is already imported in the first cell, so it is not re-imported here.
for resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Kept as the cell's final expression so its result is still displayed.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shubhamrathod/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shubhamrathod/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shubhamrathod/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shubhamrathod/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(pd.__version__)

2.1.1


In [4]:
# Load the dictionary CSV; lines pandas cannot parse are skipped instead of raising.
df = pd.read_csv('OPTED-Dictionary.csv', on_bad_lines='skip')

In [5]:
df.head()

Unnamed: 0,Word,Count,POS,Definition
0,A,1,"""""","""The first letter of the English and of many o..."
1,A,1,"""""","""The name of the sixth tone in the model major..."
2,A,1,"""""","""An adjective commonly called the indefinite ..."
3,A,1,"""""","""In each; to or for each; as """"""""twenty leagu..."
4,A,1,"""prep.""","""In; on; at; by."""


In [6]:
df.shape

(176009, 4)

# Data Preprocessing

In [7]:
# Keep only the word and its definition.
# Reassigning (instead of inplace=True) and errors='ignore' make this cell
# safe to re-run after the columns have already been removed.
df = df.drop(columns=['Count', 'POS'], errors='ignore')

In [8]:
df.head()

Unnamed: 0,Word,Definition
0,A,"""The first letter of the English and of many o..."
1,A,"""The name of the sixth tone in the model major..."
2,A,"""An adjective commonly called the indefinite ..."
3,A,"""In each; to or for each; as """"""""twenty leagu..."
4,A,"""In; on; at; by."""


In [9]:
df.shape

(176009, 2)

In [10]:
df.isnull().values.any() # check for null/ NaN value

True

In [11]:
# Drop rows with missing values, then renumber the index so that row labels
# equal row positions again (dropna leaves gaps in the original labels).
df = df.dropna().reset_index(drop=True)

In [12]:
df.shape

(176005, 2)

In [13]:
df.dtypes # check for data type

Word          object
Definition    object
dtype: object

# Lower Case Conversion

In [14]:
# function that converts text into lower case and removes special characters
def txt_to_lower(x):
    """Print each item of ``x`` lower-cased with special characters blanked out.

    Args:
        x: Iterable of values; each item is converted with ``str()`` first.

    Returns:
        None. One cleaned line is printed per item.
    """
    for item in x:
        lowered = str(item).lower()
        # Every character other than a-z / 0-9 is replaced by a single space.
        cleaned = re.sub(r'[^a-z0-9]', ' ', lowered)
        # Print the cleaned text; previously the un-cleaned string was printed
        # and the substitution result was never used.
        print(cleaned)

# Text Normalization

In [15]:
def text_normalization(text):
    """Lower-case ``text``, delete non-letter characters and lemmatize each token.

    Args:
        text: Value to normalize; converted with ``str()`` first.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # First letter of the POS tag -> lemmatizer POS code; any other tag is
    # treated as a noun.
    prefix_to_pos = {'V': 'v', 'J': 'a', 'R': 'r'}

    # Keep only lower-case letters and spaces; other characters are deleted
    # (not replaced by a space).
    letters_only = re.sub(r'[^ a-z]', '', str(text).lower())

    tagged_tokens = pos_tag(nltk.word_tokenize(letters_only), tagset=None)
    lemmatizer = wordnet.WordNetLemmatizer()

    return " ".join(
        lemmatizer.lemmatize(word, prefix_to_pos.get(tag[:1], 'n'))
        for word, tag in tagged_tokens
    )

In [16]:
# test Query
text_normalization('telling you some stuff about me')

'tell you some stuff about me'

In [17]:
# Normalize every headword into a new column; the vectorizers below are fitted on it.
df['lemmatized_text']=df['Word'].apply(text_normalization) # applying the function to the dataset to get clean text

In [18]:
df.head()

Unnamed: 0,Word,Definition,lemmatized_text
0,A,"""The first letter of the English and of many o...",a
1,A,"""The name of the sixth tone in the model major...",a
2,A,"""An adjective commonly called the indefinite ...",a
3,A,"""In each; to or for each; as """"""""twenty leagu...",a
4,A,"""In; on; at; by.""",a


# Remove Stop Words

In [19]:
# English stop-word list from NLTK; used further down to filter query words.
stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Bag of Words

In [20]:
cv = CountVectorizer() # initializing the count vectorizer

# Fit on the normalized headwords and convert the resulting counts to a dense array.
# NOTE(review): .toarray() materializes a rows x vocabulary matrix; with ~176k
# rows this can be very large — consider keeping the sparse result instead.
X = cv.fit_transform(df['lemmatized_text']).toarray()

In [21]:
# returns all the unique word from data 

features = cv.get_feature_names_out()


In [22]:
df_bow = pd.DataFrame(X, columns = features)
df_bow.head()

Unnamed: 0,aam,aardvark,aardwolf,aaron,aaronic,aaronical,ab,abaca,abacinate,abacination,...,zymometer,zymophyte,zymose,zymosimeter,zymosis,zymotic,zyophyte,zythem,zythepsary,zythum
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
Question ='Meaning of a AAM' # considering an example query

In [24]:
# Remove stop words from the query.
# The check is case-insensitive (the stop-word list is lower-case), and `b` is
# built once after filtering so it is defined even when the query is empty.
a = Question.split()
Q = [word for word in a if word.lower() not in stop]
b = " ".join(Q)

In [25]:
# Normalize the filtered query with the same function that was applied to the corpus.
Question_lemma = text_normalization(b)

# Encode the query using the vocabulary of the already-fitted CountVectorizer.
Question_bow = cv.transform([Question_lemma]).toarray()

In [26]:
Question_bow

array([[1, 0, 0, ..., 0, 0, 0]])

# Cosine Similarity

In [27]:
# cosine similarity for the above question we considered.

# cosine_value = 1- pairwise_distances(df_bow, Question_bow, metric = 'cosine' )

In [28]:
# index_value = cosine_value.argmax() # returns the index number of highest value

In [29]:
# using tf-idf

tfidf = TfidfVectorizer() # initializing tf-idf
x_tfidf = tfidf.fit_transform(df['lemmatized_text']).toarray() # fit on the normalized headwords and convert to a dense array

In [30]:
# returns all the unique word from data with a score of that word

df_tfidf = pd.DataFrame(x_tfidf,columns=tfidf.get_feature_names_out()) 

df_tfidf.head()

Unnamed: 0,aam,aardvark,aardwolf,aaron,aaronic,aaronical,ab,abaca,abacinate,abacination,...,zymometer,zymophyte,zymose,zymosimeter,zymosis,zymotic,zyophyte,zythem,zythepsary,zythum
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# defining a function that returns response to query using tf-idf

def chat_tfidf(text):
    """Return the definition of the entry whose headword best matches ``text``.

    The query is normalized with ``text_normalization``, encoded with the
    fitted ``tfidf`` vectorizer and compared against every row of
    ``df_tfidf`` by cosine similarity.

    Args:
        text: The user's query string.

    Returns:
        The ``Definition`` value of the row with the highest cosine similarity.
    """
    lemma = text_normalization(text)  # same normalization as the corpus

    query_vec = tfidf.transform([lemma]).toarray()  # tf-idf encoding of the query

    # pairwise_distances yields cosine distance; 1 - distance is the similarity.
    cos = 1 - pairwise_distances(df_tfidf, query_vec, metric='cosine')
    index_value = cos.argmax()  # row position of the best match

    # `df` has no 'Text Response' column; the answer lives in 'Definition'.
    # argmax is a row position, so use .iloc: the labels of `df` have gaps
    # after dropna() and .loc could raise or return the wrong row.
    return df['Definition'].iloc[index_value]

In [None]:
chat_tfidf("What is AAM")