## Lemmatizing

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

In [3]:
# Create one WordNetLemmatizer instance; the cells below reuse it for every lemmatize() call.
# NOTE(review): presumably requires the WordNet corpus to be installed
# (e.g. nltk.download('wordnet')) — confirm on a fresh environment.
wordnet_lemmatizer = WordNetLemmatizer()

In [6]:
# Sanity check: lemmatize two plural nouns (no explicit pos argument) and show the results as a tuple.
tuple(wordnet_lemmatizer.lemmatize(plural) for plural in ('cats', 'mangoes'))

('cat', 'mango')

In [7]:
# Example sentence with several inflected forms ("used", "known", "doing") to run through the pipeline below.
sample_text = "Python programmers used to program in python and they are known as pythoners and what they are doing is pythoning."

In [34]:
## step - 1: split the text into word tokens
tokens = word_tokenize(sample_text)

## step - 2: remove stopwords
# NLTK's English stop-word list is lower-case, so compare case-insensitively
# (otherwise a capitalised "They" or "The" would slip through). A set keeps
# each membership test O(1) instead of scanning a list.
stop_words = set(stopwords.words('english'))
final_words = [token for token in tokens if token.lower() not in stop_words]

## step - 3: lemmatize
# pos='v' treats every token as a verb, e.g. 'used' -> 'use', 'known' -> 'know'.
lemmatized_words = [
    wordnet_lemmatizer.lemmatize(word, pos='v') for word in final_words
]

## step - 4: rebuild a single sentence from the lemmatized tokens
lemmatized_sentence = ' '.join(lemmatized_words)

In [36]:
# Display the original text next to its stop-word-free, lemmatized version.
sample_text, lemmatized_sentence

('Python programmers used to program in python and they are known as pythoners and what they are doing is pythoning.',
 'Python programmers use program python know pythoners pythoning .')

In [40]:
# Optional inspection cell: uncomment to view the filtered and lemmatized token lists.
# final_words, lemmatized_words

In [39]:
# Optional inspection cell: uncomment to view the raw tokens.
# tokens

In [38]:
# Optional inspection cell: uncomment to view the stop-word collection.
# stop_words

### Similarity between two sentences

In [44]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def calculate_sentence_similarity(sentence1, sentence2):
    """Return the Jaccard similarity between the content words of two sentences.

    Each sentence is tokenized, lower-cased, stripped of English stop words
    and punctuation-only tokens, and lemmatized (no explicit part of speech).
    The score is ``1 - nltk.jaccard_distance`` over the two resulting sets of
    lemmas. Word frequencies are not needed because the Jaccard coefficient
    only compares set membership.

    Args:
        sentence1: First sentence, as a string.
        sentence2: Second sentence, as a string.

    Returns:
        A float between 0.0 (no shared content words) and 1.0 (identical
        content-word sets). Returns 0.0 when neither sentence contains any
        content word, since there is nothing to compare.
    """
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    def _content_lemmas(sentence):
        """Return the set of lemmatized, lower-cased content words in ``sentence``."""
        lemmas = set()
        for token in word_tokenize(sentence):
            word = token.lower()
            # Drop stop words and tokens without any letter or digit ('.', ',', ...):
            # shared punctuation would otherwise count as a shared "word" and
            # inflate the similarity of unrelated sentences.
            if word in stop_words or not any(ch.isalnum() for ch in word):
                continue
            lemmas.add(lemmatizer.lemmatize(word))
        return lemmas

    lemmas1 = _content_lemmas(sentence1)
    lemmas2 = _content_lemmas(sentence2)

    # nltk.jaccard_distance divides by the size of the union, so two empty
    # sets would raise ZeroDivisionError; report "no similarity" instead.
    if not lemmas1 and not lemmas2:
        return 0.0

    # Jaccard similarity is the complement of the Jaccard distance.
    return 1 - nltk.jaccard_distance(lemmas1, lemmas2)

# Example usage: score two sentences that share most of their content words.
sentence1, sentence2 = (
    "That boy is good in nature but he does not like to study much.",
    "That boy who is well mannered, does not like to study much.",
)
similarity_score = calculate_sentence_similarity(sentence1, sentence2)
print(f"Similarity score: {similarity_score}")


Similarity score: 0.5


In [43]:
# Print the built-in documentation for nltk.FreqDist (a collections.Counter subclass, per the output below).
help(nltk.FreqDist)

Help on class FreqDist in module nltk.probability:

class FreqDist(collections.Counter)
 |  FreqDist(samples=None)
 |  
 |  A frequency distribution for the outcomes of an experiment.  A
 |  frequency distribution records the number of times each outcome of
 |  an experiment has occurred.  For example, a frequency distribution
 |  could be used to record the frequency of each word type in a
 |  document.  Formally, a frequency distribution can be defined as a
 |  function mapping from each sample to the number of times that
 |  sample occurred as an outcome.
 |  
 |  Frequency distributions are generally constructed by running a
 |  number of experiments, and incrementing the count for a sample
 |  every time it is an outcome of an experiment.  For example, the
 |  following code will produce a frequency distribution that encodes
 |  how often each word occurs in a text:
 |  
 |      >>> from nltk.tokenize import word_tokenize
 |      >>> from nltk.probability import FreqDist
 |      >

In [45]:
# Print the built-in documentation for nltk.jaccard_distance, the set-distance metric used by calculate_sentence_similarity.
help(nltk.jaccard_distance)

Help on function jaccard_distance in module nltk.metrics.distance:

jaccard_distance(label1, label2)
    Distance metric comparing set-similarity.

