<a href="https://colab.research.google.com/github/RajasekarMurugan/Explore/blob/main/Text_Processing_with_NLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Text processing using NLTK (Natural **Language** Toolkit)

In [3]:
!pip install nltk gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import  PorterStemmer, WordNetLemmatizer
from gensim.models import word2vec


In [7]:
# Download the NLTK data packages used by the cells below.
# The return value of the final (bare) call is what the cell displays as its output.
nltk.download("stopwords")  # most common words in a given language
nltk.download("wordnet")  # dictionary of a given language (words with their synonyms, antonyms, etc.)
nltk.download("punkt_tab")  # pre-trained unsupervised tokenizer
nltk.download("omw-1.4")  # Open Multilingual Wordnet (multilingual version of WordNet)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
# Sample corpus: a single sentence that is fed to the tokenization step

sentences = [
    "The students today are learning about natural lanaguage Processing in Artificial Intelligence",
]

In [9]:
# Tokenization: lowercase every sentence, then split it into word tokens
tokenized_sentences = [word_tokenize(text) for text in map(str.lower, sentences)]
print("Tokennized: ", tokenized_sentences)

Tokennized:  [['the', 'students', 'today', 'are', 'learning', 'about', 'natural', 'lanaguage', 'processing', 'in', 'artificial', 'intelligence']]


In [10]:
# Stopword removal: keep only the keywords of each sentence.
# A token survives when it is purely alphanumeric AND is not an English stopword.
stop_words = set(stopwords.words("english"))  # a set, for fast membership tests

filtered_sentences = [
    [token for token in tokens if token.isalnum() and token not in stop_words]
    for tokens in tokenized_sentences
]
print("After stopword removal :", filtered_sentences)

After stopword removal : [['students', 'today', 'learning', 'natural', 'lanaguage', 'processing', 'artificial', 'intelligence']]


In [15]:
# Stemming: reduce each keyword to its stem with the Porter algorithm.
# The stem is not guaranteed to be a real word (e.g. "natural" -> "natur").
stemmer = PorterStemmer()
stemmed_sentences = [list(map(stemmer.stem, keywords)) for keywords in filtered_sentences]
print("Stemmed sentence", stemmed_sentences)


Stemmed sentence [['student', 'today', 'learn', 'natur', 'lanaguag', 'process', 'artifici', 'intellig']]


In [14]:
# Lemmatization: map each keyword to a meaningful root word via WordNet.
# No part-of-speech tag is passed, so NLTK's default (noun) applies; that is why
# a verb form such as "learning" comes back unchanged.
lemmatizer = WordNetLemmatizer()
lemmatized_sentences = [list(map(lemmatizer.lemmatize, keywords)) for keywords in filtered_sentences]
print("Lemmatized sentences ", lemmatized_sentences)

Lemmatized sentences  [['student', 'today', 'learning', 'natural', 'lanaguage', 'processing', 'artificial', 'intelligence']]


In [27]:
# Convert text to numbers (vectors)

# The import cell brings in the `word2vec` *module* (not the class), so the model
# class is reached through it. A bare `Word2Vec` raises NameError on a fresh kernel.
model = word2vec.Word2Vec(
    sentences=lemmatized_sentences,
    vector_size=4,  # dimension of the vector generated for each word
    window=1,       # number of words before/after the target word used as its context
    min_count=1,    # keep every word, even those that occur only once
    workers=4,      # number of worker threads used for training
)

print("Vector for the word 'artificial':", model.wv['artificial'])
print("Vector for the word 'student':", model.wv['student'])

Vector for the word 'artificial': [-0.23257375 -0.17792022  0.16147181  0.2243247 ]
Vector for the word 'student': [-0.18804094 -0.09840259 -0.18778956 -0.02325106]
