In [25]:
import nltk

# Fetch the NLTK resources used later in this notebook:
#   punkt                      -> tokenization
#   wordnet                    -> lemmatization
#   averaged_perceptron_tagger -> POS tagging
for resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Stop-word lists. Kept as the cell's final expression so that its return
# value is still displayed as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nishigandha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nishigandha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nishigandha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nishigandha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Sample Text

In [5]:
# Sample paragraph (three sentences) used by every step below.
text = (
    'Real madrid is set to win the UCL for the season . '
    'Benzema might win Balon dor . '
    'Salah might be the runner up'
)

In [8]:
from nltk import sent_tokenize  # sentence-level tokenizer

# Split the sample paragraph into a list of sentences and show the result.
tokenized_sent = sent_tokenize(text)
print(tokenized_sent)

['Real madrid is set to win the UCL for the season .', 'Benzema might win Balon dor .', 'Salah might be the runner up']


# Tokenization

In [9]:
from nltk import word_tokenize  # word-level tokenizer

# Split the sample paragraph into individual word / punctuation tokens.
tokenized_word = word_tokenize(text)
print(tokenized_word)

['Real', 'madrid', 'is', 'set', 'to', 'win', 'the', 'UCL', 'for', 'the', 'season', '.', 'Benzema', 'might', 'win', 'Balon', 'dor', '.', 'Salah', 'might', 'be', 'the', 'runner', 'up']


# POS Tagging

In [12]:
# Part-of-speech tagging: pos_tag() receives the list of tokens and returns
# (token, tag) pairs, where the tag names the token's grammatical category.
pos_tags = nltk.pos_tag(tokenized_word)

print("POS Tags:", pos_tags)

POS Tags: [('Real', 'JJ'), ('madrid', 'NN'), ('is', 'VBZ'), ('set', 'VBN'), ('to', 'TO'), ('win', 'VB'), ('the', 'DT'), ('UCL', 'NNP'), ('for', 'IN'), ('the', 'DT'), ('season', 'NN'), ('.', '.'), ('Benzema', 'NNP'), ('might', 'MD'), ('win', 'VB'), ('Balon', 'NNP'), ('dor', 'NN'), ('.', '.'), ('Salah', 'NNP'), ('might', 'MD'), ('be', 'VB'), ('the', 'DT'), ('runner', 'NN'), ('up', 'RP')]


# Stopword

We use the set() function to create a set data structure from the list of stop words retrieved 
using stopwords.words("english"). 

The argument "english" specifies that we want the stop words for the English language.

In [13]:
from nltk.corpus import stopwords

# Load the English stop-word list, then turn it into a set for membership tests.
english_stopword_list = stopwords.words("english")
stop_words = set(english_stopword_list)

print(stop_words)

{'than', 'so', 'haven', 'ain', 'now', 'only', 'how', 'at', 'more', 'don', 'because', 'yourself', 'own', 'through', 'of', "isn't", 'has', 'been', 'won', 'will', 'didn', 'you', 'where', "hasn't", 's', 'were', "mustn't", "couldn't", 'hers', 'during', "should've", "weren't", 've', 'his', 'from', 'your', 'or', 'some', "aren't", 'yours', 'weren', 'll', 'into', 'it', "doesn't", 'isn', 'if', "wasn't", 'about', 'over', 'again', 'under', "shouldn't", 'shan', 'wouldn', "don't", 'myself', 'her', 'very', 'hasn', 'wasn', 'above', 'few', 'we', 'ma', 'mightn', 'this', "that'll", 'having', "haven't", 'mustn', 'me', 'nor', 'too', 'most', 'between', 'then', 'is', 'herself', 'being', 'that', 'be', 'himself', 'until', 'both', 'below', 'its', 'against', 'aren', 'my', 'had', 'them', 'd', 'which', 'whom', 'to', 'doesn', 'these', "didn't", "wouldn't", 'needn', 'ourselves', "needn't", 'after', 'further', 'for', 'not', 'on', 'theirs', 'but', 'themselves', 'couldn', 'there', 'i', 'once', 'who', 'any', 'm', 'have'

# Stopwords Removal

1 filtered_sent = []: An empty list named filtered_sent is initialized to store the tokens that are not stop words.

2 for w in tokenized_word:: This loop iterates over each token in the tokenized_word list.

3 if w not in stop_words:: This condition checks if the current token w is not present in the stop_words set.

4 If the token is not a stop word, filtered_sent.append(w) adds the token to the filtered_sent list.

5 After the loop completes, filtered_sent contains the tokens that are not stop words.

The code then prints the original tokenized sentence (tokenized_word) and 
the filtered sentence (filtered_sent) using the print function.

In [15]:
# Keep only the tokens that are not stop words.
# The entries of stop_words are lowercase, so each token is lowercased for the
# membership test only; without this, a capitalised stop word (for example a
# sentence-initial "The") would never match and would slip through the filter.
# Tokens that are kept retain their original casing.
filtered_sent = [w for w in tokenized_word if w.lower() not in stop_words]

print("Tokenized Sentence:",tokenized_word)
print("\nFilterd Sentence:",filtered_sent)

Tokenized Sentence: ['Real', 'madrid', 'is', 'set', 'to', 'win', 'the', 'UCL', 'for', 'the', 'season', '.', 'Benzema', 'might', 'win', 'Balon', 'dor', '.', 'Salah', 'might', 'be', 'the', 'runner', 'up']

Filterd Sentence: ['Real', 'madrid', 'set', 'win', 'UCL', 'season', '.', 'Benzema', 'might', 'win', 'Balon', 'dor', '.', 'Salah', 'might', 'runner']


# Stemming

1. stemmer = PorterStemmer(): This line initializes an instance of the PorterStemmer from the nltk.stem module.

2. stemmed_words = []: This line initializes an empty list named stemmed_words that will store the stemmed words.

3. The loop iterates over each word w in the filtered_sent list, which contains the words after stop word removal.

4. stemmer.stem(w) applies the stemming process to each word w using the stem() method of the stemmer object. 
   The stemmed word is obtained.

5. The stemmed word is then appended to the stemmed_words list using stemmed_words.append(stemmer.stem(w)).

In [24]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Run every token that survived stop-word removal through the Porter stemmer.
stemmed_words = [stemmer.stem(token) for token in filtered_sent]

print("Filtered Sentence:",filtered_sent)
print("\nStemmed Sentence:",stemmed_words)

Filtered Sentence: ['Real', 'madrid', 'set', 'win', 'UCL', 'season', '.', 'Benzema', 'might', 'win', 'Balon', 'dor', '.', 'Salah', 'might', 'runner']

Stemmed Sentence: ['real', 'madrid', 'set', 'win', 'ucl', 'season', '.', 'benzema', 'might', 'win', 'balon', 'dor', '.', 'salah', 'might', 'runner']


# Lemmatization

1. Import WordNetLemmatizer and initialize an instance of it named lem to perform lemmatization.

2. Import PorterStemmer and initialize an instance of it named stem (it is created here, but the lemmatization loop below does not use it).

3.lemma_word_list = []: This line initializes an empty list named lemma_word_list that will store the lemmatized words.

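4. The loop iterates over each word in the stemmed_words list, which contains the stemmed words obtained from the previous code. Note that stems are often not dictionary words (for example, the Porter stem of "studies" is "studi"), so WordNet frequently cannot look them up and returns them unchanged; lemmatization is normally applied to the original tokens rather than to stems. For each word, the following steps are performed: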

5 lemmat = lem.lemmatize(word, "v"): The lemmatize() method of the lem object is used to lemmatize the word, considering it as a verb ("v"). 

The resulting lemma is stored in the lemmat variable.

lemma_word_list.append(lemmat): The lemmatized word is appended to the lemma_word_list list.


In [27]:
import nltk
nltk.download('wordnet')  # WordNet data is what the lemmatizer looks words up in

from nltk.stem import PorterStemmer, WordNetLemmatizer

lem = WordNetLemmatizer()
stem = PorterStemmer()  # created here but not used by the code in this cell

# Lemmatize each stemmed token, treating every token as a verb ("v").
# NOTE(review): the input is the already-stemmed list, so the lemmatizer sees
# stems rather than the original words - confirm this is intended.
lemma_word_list = [lem.lemmatize(token, "v") for token in stemmed_words]

print("Lemmatized Word:",lemma_word_list)
print("\nStemmed Word:",stemmed_words)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nishigandha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Lemmatized Word: ['real', 'madrid', 'set', 'win', 'ucl', 'season', '.', 'benzema', 'might', 'win', 'balon', 'dor', '.', 'salah', 'might', 'runner']

Stemmed Word: ['real', 'madrid', 'set', 'win', 'ucl', 'season', '.', 'benzema', 'might', 'win', 'balon', 'dor', '.', 'salah', 'might', 'runner']


# Frequency

In [33]:
from nltk.probability import FreqDist

# Count how many times each lemmatized token occurs, then print the
# distribution's summary line.
fdist = FreqDist(lemma_word_list)
print(fdist)

<FreqDist with 13 samples and 16 outcomes>


In [34]:
# Show the most frequent tokens together with their counts.
top_n = 2
fdist.most_common(top_n)

[('win', 2), ('.', 2)]