# ***NLP - NATURAL LANGUAGE PROCESSING***

# ***1.Tokenization***

In [None]:
# Word- and sentence-level tokenizers used by the cells below.
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
# Fetch the 'punkt_tab' resource into the local nltk_data directory.
# NOTE(review): presumably this is the data the two tokenizers load at call
# time on current NLTK releases - confirm against the installed version.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Sample input. There is no whitespace after ',', '.' or '!' in this string,
# and the recorded output shows the consequence: 'Raj.Nice' stays a single
# token and the sentence tokenizer returns the whole string as one sentence.
text =  'Hello,This is Raj.Nice to meet you!Have a good day1'

# Tokenize the same string at word level and at sentence level.
word, sentence = word_tokenize(text), sent_tokenize(text)

print('Word tokenizer: ',word)
print('Sentence tokenizer: ',sentence)

Word tokenizer:  ['Hello', ',', 'This', 'is', 'Raj.Nice', 'to', 'meet', 'you', '!', 'Have', 'a', 'good', 'day1']
Sentence tokenizer:  ['Hello,This is Raj.Nice to meet you!Have a good day1']


# ***2.Stopwords Removal***

In [None]:
from nltk.corpus import stopwords
import nltk
# NOTE(review): the tokenization cell downloads 'punkt_tab' rather than
# 'punkt'; confirm whether this legacy resource is still needed.
nltk.download('punkt')
# Stop-word lists read by stopwords.words('english') in the next cell.
nltk.download('stopwords')
# English word list; it is downloaded again in the NER section.
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
# Tokenize in this cell so it runs on a fresh kernel: `words` is otherwise
# only assigned later, in the stemming section, and reading it here would
# raise NameError under Restart & Run All.
words = word_tokenize(text)

# A set gives constant-time membership tests inside the comprehension.
stop_words = set(stopwords.words('english'))

# The comparison is done on the lowercased token, while the token itself is
# kept in its original casing.
filtered_words = [token for token in words if token.lower() not in stop_words]

print("Original Words:", words)
print("Filtered Words:", filtered_words)

Original Words: ['Hello', ',', 'This', 'is', 'Raj.Nice', 'to', 'meet', 'you', '!', 'Have', 'a', 'good', 'day1']
Filtered Words: ['Hello', ',', 'Raj.Nice', 'meet', '!', 'good', 'day1']


# ***3.Stemming***

In [None]:
from nltk.stem import PorterStemmer

In [None]:
# Reduce every token of `text` to its Porter stem. The recorded output shows
# that the stemmer also lowercases its input and can yield non-words
# ('This' -> 'thi').
stemmer = PorterStemmer()
words = word_tokenize(text)
stemmed_words = [stemmer.stem(token) for token in words]

# Report `words`, the list that was stemmed in this cell, rather than the
# similarly named `word` variable left over from the tokenization cell.
print("Original Words:", words)
print("Stemmed Words:", stemmed_words)

Original Words: ['Hello', ',', 'This', 'is', 'Raj.Nice', 'to', 'meet', 'you', '!', 'Have', 'a', 'good', 'day1']
Stemmed Words: ['hello', ',', 'thi', 'is', 'raj.nic', 'to', 'meet', 'you', '!', 'have', 'a', 'good', 'day1']


# ***4.Lemmatization***

In [None]:
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet corpus.
# NOTE(review): presumably required by the WordNetLemmatizer built in the
# next cell - confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Look up the WordNet lemma of every token. lemmatize() is called without a
# part-of-speech argument, so the library default applies; in the recorded
# output every token of this sample comes back unchanged.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, words))

print("Original Words:", words)
print("Lemmatized Words:", lemmatized_words)


Original Words: ['Hello', ',', 'This', 'is', 'Raj.Nice', 'to', 'meet', 'you', '!', 'Have', 'a', 'good', 'day1']
Lemmatized Words: ['Hello', ',', 'This', 'is', 'Raj.Nice', 'to', 'meet', 'you', '!', 'Have', 'a', 'good', 'day1']


# ***5.Part of Speech (POS)***

In [None]:
# Tagger model used by nltk.pos_tag() in the next cell. The resource name
# depends on the NLTK release: older versions load the pickled
# 'averaged_perceptron_tagger', while 3.9+ (the releases that also need
# 'punkt_tab') look up 'averaged_perceptron_tagger_eng'. Fetching both keeps
# the notebook runnable on either.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Tag each token in `words` with its part of speech. The result is a list of
# (token, tag) pairs, as the recorded output shows, and is reused as the
# input of the NER cell below.
pos_tags = nltk.pos_tag(words)
print("POS Tagging Result:", pos_tags)

POS Tagging Result: [('Hello', 'NNP'), (',', ','), ('This', 'DT'), ('is', 'VBZ'), ('Raj.Nice', 'NNP'), ('to', 'TO'), ('meet', 'VB'), ('you', 'PRP'), ('!', '.'), ('Have', 'VB'), ('a', 'DT'), ('good', 'JJ'), ('day1', 'NN')]


# ***6. Named Entity Recognition (NER)***

In [None]:
# Resources for nltk.ne_chunk() in the next cell. As with the tagger, the
# chunker model was renamed in NLTK 3.9+: older releases load
# 'maxent_ne_chunker', newer ones 'maxent_ne_chunker_tab'. Fetch both so the
# cell works regardless of the installed version.
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
# Chunk the POS-tagged tokens into a tree whose labelled subtrees are the
# detected entities. In the recorded output the only entity found is
# 'Hello', labelled GPE.
named_entities = nltk.ne_chunk(pos_tags)

# Heading and tree on separate lines, emitted with a single print call.
print("Named Entities Tree:", named_entities, sep="\n")

Named Entities Tree:
(S
  (GPE Hello/NNP)
  ,/,
  This/DT
  is/VBZ
  Raj.Nice/NNP
  to/TO
  meet/VB
  you/PRP
  !/.
  Have/VB
  a/DT
  good/JJ
  day1/NN)


# ***7.Bag of Words (BoW)***

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Raw example sentences; the TF-IDF cell below reuses this list.
sentences = ['I like ml', 'ml is interesting', 'I enjoy learning ml']

# Learn the vocabulary and count term occurrences per sentence.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(sentences)

# Print the vocabulary first so the matrix columns can be interpreted (the
# TF-IDF cell does the same). The vocabulary has six entries and does not
# contain 'i': the vectorizer's default tokenization skips it.
print("Vocabulary:", vectorizer.get_feature_names_out())
print("Bag of Words Matrix:\n", bow_matrix.toarray())

Bag of Words Matrix:
 [[0 0 0 0 1 1]
 [0 1 1 0 0 1]
 [1 0 0 1 0 1]]


# ***8.TF-IDF***

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Weight the same `sentences` by TF-IDF. Fitting and transforming are done as
# two explicit steps: learn vocabulary and IDF weights, then encode.
vectorizer = TfidfVectorizer()
vectorizer.fit(sentences)
X = vectorizer.transform(sentences)

# Column order of X follows the vocabulary printed first. 'ml' appears in
# every sentence and receives the lowest weight in each row.
print("Vocabulary:", vectorizer.get_feature_names_out())
print("TF-IDF Matrix:\n", X.toarray())

Vocabulary: ['enjoy' 'interesting' 'is' 'learning' 'like' 'ml']
TF-IDF Matrix:
 [[0.         0.         0.         0.         0.861037   0.50854232]
 [0.         0.65249088 0.65249088 0.         0.         0.38537163]
 [0.65249088 0.         0.         0.65249088 0.         0.38537163]]


# ***9. Word Embeddings (Word2Vec)***

In [None]:
!pip install gensim




In [None]:
from gensim.models import Word2Vec

# Pre-tokenized toy corpus: one list of lowercase tokens per sentence.
# It has its own name so the raw-string `sentences` list used by the
# Bag-of-Words and TF-IDF cells is not replaced by a list of lists, which
# would break those cells if they were re-run afterwards.
tokenized_sentences = [
    ['i', 'love', 'nlp'],
    ['nlp', 'is', 'fun'],
    ['machine', 'learning', 'is', 'awesome'],
    ['deep', 'learning', 'is', 'part', 'of', 'nlp']
]

# Train 10-dimensional vectors with a context window of 2. min_count=1 keeps
# words that occur only once, which this tiny corpus needs. The model gets a
# distinct name because `model` is reused for the classifier further down.
w2v_model = Word2Vec(tokenized_sentences, vector_size=10, window=2, min_count=1, workers=1)

print("Vector for 'nlp':", w2v_model.wv['nlp'])
print("Similarity between 'nlp' and 'learning':", w2v_model.wv.similarity('nlp', 'learning'))


Vector for 'nlp': [ 0.07380505 -0.01533471 -0.04536613  0.06554051 -0.0486016  -0.01816018
  0.0287658   0.00991874 -0.08285215 -0.09448818]
Similarity between 'nlp' and 'learning': 0.32937223


# ***10.Text Preprocessing Pipeline***

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# Step 1 - normalise case so the stop-word comparison below is reliable.
text = "I'm senbagarajan, a first year Computer Science student.".lower()

# Step 2 - split into tokens.
words = word_tokenize(text)

# Step 3 - keep tokens that are purely alphanumeric (this drops punctuation
# and contraction fragments) and are not English stop words.
stop_words = set(stopwords.words('english'))
filtered_words = [token for token in words if token.isalnum() and token not in stop_words]

# Step 4 - lemmatize what is left.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_words))

print("Preprocessed words:", lemmatized_words)


Preprocessed words: ['senbagarajan', 'first', 'year', 'computer', 'science', 'student']


# ***11.Sentiment Analysis***

In [None]:
!pip install textblob





In [None]:
from textblob import TextBlob

In [None]:
def polarity_of(sentence: str) -> float:
    """Return the TextBlob polarity score of ``sentence``."""
    return TextBlob(sentence).sentiment.polarity


# One entry per example; add more sentences here instead of copying code.
samples = ["I'm happy.", "I'm sad.", "I'm ok"]

# Labels are numbered from 1 so the printed lines read sentence1, sentence2, ...
# Note that in the recorded output "I'm ok" scores 0.5, i.e. it is rated as
# clearly positive rather than neutral.
for position, sample in enumerate(samples, start=1):
    print(f"Sentiment of sentence{position}:", polarity_of(sample))

Sentiment of sentence1: 0.8
Sentiment of sentence2: -0.5
Sentiment of sentence3: 0.5


# ***12.Text Classification***

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


In [None]:

# Toy training set: two spam and two ham messages, labels aligned by index.
texts = ["Free money!!!", "Hi, how are you?", "Win a brand new car!", "Let's catch up tomorrow."]
labels = ["spam", "ham", "spam", "ham"]

# Learn the vocabulary from the training texts, then encode them as counts.
vectorizer = CountVectorizer()
vectorizer.fit(texts)
X = vectorizer.transform(texts)

# Multinomial Naive Bayes over the count features; fit() returns the estimator.
model = MultinomialNB().fit(X, labels)

# The unseen message must be encoded with the already fitted vectorizer so
# its columns line up with the training matrix.
test_text = ["free cashback"]
X_test = vectorizer.transform(test_text)
print("Prediction:", model.predict(X_test))


Prediction: ['spam']


# ***13.Language Translation***

In [None]:
!pip install googletrans==4.0.0-rc1




In [None]:
from googletrans import Translator

# Translate one English sentence into Tamil (dest='ta').
# NOTE(review): presumably this performs a network request to an external
# translation service, so the cell needs connectivity - confirm.
translator = Translator()
result = translator.translate("Good morning, have a nice day!", dest='ta')

# The translated string is read from the `.text` attribute of the result.
print("Translated to Tamil:", result.text)

Translated to Tamil: காலை வணக்கம், ஒரு நல்ல நாள்!


# ***14. Text Generation***

In [None]:
!pip install transformers
from transformers import pipeline



In [None]:
from transformers import pipeline, set_seed

# The pipeline samples its continuation, so fix the seed to make the
# generated text reproducible across runs.
set_seed(42)

generator = pipeline('text-generation', model='gpt2')

# Bound the output with max_new_tokens rather than max_length. The pipeline
# already sets a default max_new_tokens, which takes precedence over
# max_length (see the recorded warning), so max_length=50 had no effect and
# the continuation ran far past the intended size.
output = generator(
    "Once upon a time in a village,",
    max_new_tokens=50,
    num_return_sequences=1,
)

# The pipeline returns a list with one dict per requested sequence.
print("Generated Text:", output[0]['generated_text'])


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated Text: Once upon a time in a village, a woman's eyes glittered with a light of red. She looked down on the young woman's face.

"You look a little strange looking at you, you're quite young, you look like such a good girl."

A girl was looking down at her.

"You're so cute, you're so cute, you look so cute."

Her eyes looked into the girl's.

"I'm so happy, I'm so happy, I'm so happy, you're such a good girl."

A girl's eyes looked into the girl's.

"You're so cute, you look so cute, you look so cute."

A girl's eyes looked into the girl's.

"You look such a good girl. I'm so happy, I'm so happy, I'm so happy, you're such a good girl."

A girl's eyes looked into the girl's.

"You like me, you like me, you like me, you like me."

A girl's eyes looked into the girl's.

"You're so cute, you look so cute, you look so cute."

A girl's eyes looked into the girl's
