##### Author: Ardit Sulce, Automate Everything with Python, Udemy
##### Course URL: https://www.udemy.com/course/automate-everything-with-python/

## What is natural language processing?

In [1]:
x = 'was'
y = 'is'
x == y

False

## Lemmatization of words

In [3]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

lemma1 = lemmatizer.lemmatize('vegetables', 'n')
lemma2 = lemmatizer.lemmatize('vegetable', 'v')

lemma1

'vegetable'

## Lemmatization of Sentences

In [None]:
#import nltk
#nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [5]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
sentence = 'Vegetables are types of plants.'

Tokenizing sentences

In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
sentence_tokens = nltk.word_tokenize(sentence.lower())
sentence_tokens

['vegetables', 'are', 'types', 'of', 'plants', '.']

In [26]:
#is_noun = lambda pos: pos[:2] == 'NN'
#nouns = [word for (word, pos) in nltk.pos_tag(sentence_tokens) if is_noun(pos)]
#print(nouns) # ['future', 'past']

nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/apple/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [27]:
pos_tags = pos_tag(sentence_tokens)
pos_tags

[('vegetables', 'NNS'),
 ('are', 'VBP'),
 ('types', 'NNS'),
 ('of', 'IN'),
 ('plants', 'NNS'),
 ('.', '.')]

In [28]:
import nltk 
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemma_me(sent):
    sentence_tokens = nltk.word_tokenize(sent.lower())
    pos_tags = nltk.pos_tag(sentence_tokens)

    sentence_lemmas = []
    for token, pos_tag in zip(sentence_tokens, pos_tags):
        #noun verb adjective adverbs
        if pos_tag[1][0].lower() in ['n', 'v', 'a', 'r']:
            lemma = lemmatizer.lemmatize(token, pos_tag[1][0].lower())
            sentence_lemmas.append(lemma)

    return sentence_lemmas

In [29]:
l1 = lemma_me('Vegetables are types of plants.')
l1

['vegetable', 'be', 'type', 'plant']

In [30]:
l2 = lemma_me('A vegetable is a type of plant')
l2 

['vegetable', 'be', 'type', 'plant']

In [31]:
l1 == l2

True

## Find the most similar sentence

In [32]:
text = 'Originally, vegetables were collected from the wild by hunter-gatherers. Vegetables are all plants. Vegetables can be eaten either raw or cooked.'
question = 'What are vegetables?' 

In [33]:
import nltk 

def lemma_me(sent):
    sentence_tokens = nltk.word_tokenize(sent.lower())
    pos_tags = nltk.pos_tag(sentence_tokens)

    sentence_lemmas = []
    for token, pos_tag in zip(sentence_tokens, pos_tags):
        if pos_tag[1][0].lower() in ['n', 'v', 'a', 'r']:
            lemma = lemmatizer.lemmatize(token, pos_tag[1][0].lower())
            sentence_lemmas.append(lemma)

    return sentence_lemmas

In [34]:
sentence_tokens = nltk.sent_tokenize(text)
sentence_tokens.append(question)
sentence_tokens

['Originally, vegetables were collected from the wild by hunter-gatherers.',
 'Vegetables are all plants.',
 'Vegetables can be eaten either raw or cooked.',
 'What are vegetables?']

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
tv = TfidfVectorizer(tokenizer=lemma_me)
tv

In [37]:
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/apple/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [39]:
tf = tv.fit_transform(sentence_tokens)

In [40]:
tf

<4x8 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [41]:
tf.toarray()

array([[0.27717414, 0.53114624, 0.        , 0.        , 0.53114624,
        0.53114624, 0.        , 0.27717414],
       [0.41988018, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.8046125 , 0.41988018],
       [0.32713399, 0.        , 0.62688384, 0.62688384, 0.        ,
        0.        , 0.        , 0.32713399],
       [0.70710678, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678]])

In [43]:
import pandas
df = pandas.DataFrame(tf.toarray(), columns=tv.get_feature_names_out())
df

Unnamed: 0,be,collect,cook,eat,hunter-gatherer,originally,plant,vegetable
0,0.277174,0.531146,0.0,0.0,0.531146,0.531146,0.0,0.277174
1,0.41988,0.0,0.0,0.0,0.0,0.0,0.804612,0.41988
2,0.327134,0.0,0.626884,0.626884,0.0,0.0,0.0,0.327134
3,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.707107


In [44]:
from sklearn.metrics.pairwise import cosine_similarity
values = cosine_similarity(tf[-1], tf)
values

#similarity 1 means 100%

array([[0.39198343, 0.59380024, 0.46263733, 1.        ]])

In [46]:
text = 'Originally, vegetables were collected from the wild by hunter-gatherers. Vegetables are all plants. Vegetables can be eaten either raw or cooked.'
question = 'What are vegetables?' 

In [47]:
index = values.argsort()[0][-2]
index

1

In [48]:
values_flat = values.flatten()
values_flat

array([0.39198343, 0.59380024, 0.46263733, 1.        ])

In [49]:
values_flat.sort()
values_flat

array([0.39198343, 0.46263733, 0.59380024, 1.        ])

In [50]:
coeff = values_flat[-2]
coeff

0.593800244493221

In [51]:
if coeff > 0.3:
    print(sentence_tokens[index])

Vegetables are all plants.


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b07786ef-bd35-4ac6-b274-00a3f2361065' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>