In [None]:
!pip install nltk



In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
corpus = "One Piece (stylized in all caps) is a Japanese manga series written and illustrated by Eiichiro Oda. It has been serialized in Shueisha's shōnen manga magazine Weekly Shōnen Jump since July 1997, with its individual chapters compiled in 108 tankōbon volumes as of March 2024. The story follows the adventures of Monkey D. Luffy and his crew, the Straw Hat Pirates, where he explores the Grand Line in search of the mythical treasure known as the One Piece in order to become the next King of the Pirates."
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Tokenization**

In [None]:
# Word tokenization
words = word_tokenize(corpus)
print("Words:", words)

# Sentence tokenization
sentences = sent_tokenize(corpus)
print("Sentences:", sentences)

Words: ['One', 'Piece', '(', 'stylized', 'in', 'all', 'caps', ')', 'is', 'a', 'Japanese', 'manga', 'series', 'written', 'and', 'illustrated', 'by', 'Eiichiro', 'Oda', '.', 'It', 'has', 'been', 'serialized', 'in', 'Shueisha', "'s", 'shōnen', 'manga', 'magazine', 'Weekly', 'Shōnen', 'Jump', 'since', 'July', '1997', ',', 'with', 'its', 'individual', 'chapters', 'compiled', 'in', '108', 'tankōbon', 'volumes', 'as', 'of', 'March', '2024', '.', 'The', 'story', 'follows', 'the', 'adventures', 'of', 'Monkey', 'D.', 'Luffy', 'and', 'his', 'crew', ',', 'the', 'Straw', 'Hat', 'Pirates', ',', 'where', 'he', 'explores', 'the', 'Grand', 'Line', 'in', 'search', 'of', 'the', 'mythical', 'treasure', 'known', 'as', 'the', 'One', 'Piece', 'in', 'order', 'to', 'become', 'the', 'next', 'King', 'of', 'the', 'Pirates', '.']
Sentences: ['One Piece (stylized in all caps) is a Japanese manga series written and illustrated by Eiichiro Oda.', "It has been serialized in Shueisha's shōnen manga magazine Weekly Shōn

In [None]:
#TreebankWordTokenizer
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(corpus)
print(tokens)
# '.' will be considered as single word like doing. but except for the last.

['One', 'Piece', '(', 'stylized', 'in', 'all', 'caps', ')', 'is', 'a', 'Japanese', 'manga', 'series', 'written', 'and', 'illustrated', 'by', 'Eiichiro', 'Oda.', 'It', 'has', 'been', 'serialized', 'in', 'Shueisha', "'s", 'shōnen', 'manga', 'magazine', 'Weekly', 'Shōnen', 'Jump', 'since', 'July', '1997', ',', 'with', 'its', 'individual', 'chapters', 'compiled', 'in', '108', 'tankōbon', 'volumes', 'as', 'of', 'March', '2024.', 'The', 'story', 'follows', 'the', 'adventures', 'of', 'Monkey', 'D.', 'Luffy', 'and', 'his', 'crew', ',', 'the', 'Straw', 'Hat', 'Pirates', ',', 'where', 'he', 'explores', 'the', 'Grand', 'Line', 'in', 'search', 'of', 'the', 'mythical', 'treasure', 'known', 'as', 'the', 'One', 'Piece', 'in', 'order', 'to', 'become', 'the', 'next', 'King', 'of', 'the', 'Pirates', '.']


**Normalization**

Normalization typically includes converting all text to lowercase, removing punctuation, and other such cleaning steps.

In [None]:
import string
# text = corpus
lower_text = corpus.lower()
clean_text = lower_text.translate(str.maketrans('', '', string.punctuation))

print("Normalized Text:", clean_text)
print("Character Count:", len(clean_text))
print("Word Count:", len(clean_text.split()))

# updated_corpus = clean_text
# print(updated_corpus)

Normalized Text: one piece stylized in all caps is a japanese manga series written and illustrated by eiichiro oda it has been serialized in shueishas shōnen manga magazine weekly shōnen jump since july 1997 with its individual chapters compiled in 108 tankōbon volumes as of march 2024 the story follows the adventures of monkey d luffy and his crew the straw hat pirates where he explores the grand line in search of the mythical treasure known as the one piece in order to become the next king of the pirates
Character Count: 494
Word Count: 88


**Stop Words Removal**

Removing common words that might not be useful in analyzing text.

It is really important to create your own stopwords, as there are few stopwords which are really important as per the usecase.



In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# Removing stopwords
print(stopwords.words('english'))
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in word_tokenize(clean_text) if word not in stop_words]

print("Filtered Words:", filtered_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Stemming**

Reducing the word to it's word stem that affixes to suffixes and prefixes or to roots of the words know as lemma

eg: Eating, gaten, ate - stem word is eat

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

stemmer = PorterStemmer()

# Stemming
for words in filtered_words:
  print(words+"----->"+ stemmer.stem(words))
stemmed_words = [stemmer.stem(word) for word in filtered_words]
print("Stemmed Words:", stemmed_words)

[nltk_data] Downloading package wordnet to /root/nltk_data...


one----->one
piece----->piec
stylized----->styliz
caps----->cap
japanese----->japanes
manga----->manga
series----->seri
written----->written
illustrated----->illustr
eiichiro----->eiichiro
oda----->oda
serialized----->serial
shueishas----->shueisha
shōnen----->shōnen
manga----->manga
magazine----->magazin
weekly----->weekli
shōnen----->shōnen
jump----->jump
since----->sinc
july----->juli
1997----->1997
individual----->individu
chapters----->chapter
compiled----->compil
108----->108
tankōbon----->tankōbon
volumes----->volum
march----->march
2024----->2024
story----->stori
follows----->follow
adventures----->adventur
monkey----->monkey
luffy----->luffi
crew----->crew
straw----->straw
hat----->hat
pirates----->pirat
explores----->explor
grand----->grand
line----->line
search----->search
mythical----->mythic
treasure----->treasur
known----->known
one----->one
piece----->piec
order----->order
become----->becom
next----->next
king----->king
pirates----->pirat
Stemmed Words: ['one', 'piec', '

This is the disadvantage of stemming, as it don't have any meaning of the word

example: compiled----->compil

we use this stemming, when we have classifation problem like spam/ham

**RegexpStemmer**

In [None]:
from nltk.stem import RegexpStemmer
regexp_stemmer = RegexpStemmer('ing$|s$|e$|able$', min=4)

###Snowball Stemmer

A better version of porter stemmer, it gives a better version of words went compared to other stemmers.

In [None]:
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')
for words in filtered_words:
  print(words+"----->"+ snowball_stemmer.stem(words))

one----->one
piece----->piec
stylized----->styliz
caps----->cap
japanese----->japanes
manga----->manga
series----->seri
written----->written
illustrated----->illustr
eiichiro----->eiichiro
oda----->oda
serialized----->serial
shueishas----->shueisha
shōnen----->shōnen
manga----->manga
magazine----->magazin
weekly----->week
shōnen----->shōnen
jump----->jump
since----->sinc
july----->juli
1997----->1997
individual----->individu
chapters----->chapter
compiled----->compil
108----->108
tankōbon----->tankōbon
volumes----->volum
march----->march
2024----->2024
story----->stori
follows----->follow
adventures----->adventur
monkey----->monkey
luffy----->luffi
crew----->crew
straw----->straw
hat----->hat
pirates----->pirat
explores----->explor
grand----->grand
line----->line
search----->search
mythical----->mythic
treasure----->treasur
known----->known
one----->one
piece----->piec
order----->order
become----->becom
next----->next
king----->king
pirates----->pirat


In [None]:
stemmer.stem("fairly")

'fairli'

In [None]:
snowball_stemmer.stem("fairly")

'fair'

###Lemmatization

We get the root word exactly not the stem. It's more accurate than tokenization.

We can use this for chatbots, Q&A, text summerization.

In [None]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
'''
POS
Noun - n
Verb - v
Adjective - a
Adverb - r
'''
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
print("Lemmatized Words:", lemmatized_words)

Lemmatized Words: ['one', 'piece', 'stylized', 'cap', 'japanese', 'manga', 'series', 'written', 'illustrated', 'eiichiro', 'oda', 'serialized', 'shueishas', 'shōnen', 'manga', 'magazine', 'weekly', 'shōnen', 'jump', 'since', 'july', '1997', 'individual', 'chapter', 'compiled', '108', 'tankōbon', 'volume', 'march', '2024', 'story', 'follows', 'adventure', 'monkey', 'luffy', 'crew', 'straw', 'hat', 'pirate', 'explores', 'grand', 'line', 'search', 'mythical', 'treasure', 'known', 'one', 'piece', 'order', 'become', 'next', 'king', 'pirate']


In [None]:
corpus = """
"One Piece" is a widely acclaimed anime and manga series created by Eiichiro Oda. It debuted in 1997 and quickly rose to immense popularity, captivating audiences worldwide with its thrilling adventures and complex storytelling. The series follows Monkey D. Luffy, a spirited young pirate with the ability to stretch his body like rubber due to consuming a Devil Fruit. Luffy's dream is to find the legendary treasure known as "One Piece" and become the Pirate King. Accompanied by his diverse and lovable crew, the Straw Hat Pirates, Luffy travels across the Grand Line, encountering other pirates, bounty hunters, and various other characters, each with their own unique abilities and backstories. "One Piece" is celebrated for its intricate plot, deep character development, and a perfect blend of humor, action, and drama, making it a cornerstone of anime culture."""


In [None]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
stemmer = PorterStemmer()

In [None]:
sentences = nltk.sent_tokenize(corpus)

In [None]:
#Apply stopwords and filter and then Apply stemming
for i in range(len(sentences)):
  words = nltk.word_tokenize(sentences[i])
  words = [stemmer.stem(word) for word in words if word not in stopwords.words('english')]
  sentences[i] = ' '.join(words)

In [None]:
sentences

["'' one piec '' wide acclaim anim manga seri creat eiichiro oda .",
 'it debut 1997 quickli rose immens popular , captiv audienc worldwid thrill adventur complex storytel .',
 'the seri follow monkey d. luffi , spirit young pirat abil stretch bodi like rubber due consum devil fruit .',
 "luffi 's dream find legendari treasur known `` one piec '' becom pirat king .",
 'accompani divers lovabl crew , straw hat pirat , luffi travel across grand line , encount pirat , bounti hunter , variou charact , uniqu abil backstori .',
 "`` one piec '' celebr intric plot , deep charact develop , perfect blend humor , action , drama , make cornerston anim cultur ."]

In [None]:
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')

In [None]:
#Apply stopwords and filter and then Apply stemming
for i in range(len(sentences)):
  words = nltk.word_tokenize(sentences[i])
  words = [snowball_stemmer.stem(word) for word in words if word not in stopwords.words('english')]
  sentences[i] = ' '.join(words)

#here in snowball it will make the starting word to lower case.

In [None]:
sentences

["'' one piec `` wide acclaim anim manga seri creat eiichiro oda .",
 'debut 1997 quick rose immen popular , captiv audienc worldwid thrill adventur complex storytel .',
 'seri follow monkey d. luffi , spirit young pirat abil stretch bodi like rubber due consum devil fruit .',
 "luffi 's dream find legendari treasur known `` one piec `` becom pirat king .",
 'accompani diver lovabl crew , straw hat pirat , luffi travel across grand line , encount pirat , bounti hunter , variou charact , uniqu abil backstori .',
 '`` one piec `` celebr intric plot , deep charact develop , perfect blend humor , action , drama , make cornerston anim cultur .']

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
for i in range(len(sentences)):
  words = nltk.word_tokenize(sentences[i])
  words = [lemmatizer.lemmatize(word.lower()) for word in words if word not in stopwords.words('english')]
  sentences[i] = ' '.join(words)

In [None]:
sentences

["'' one piec `` wide acclaim anim manga seri creat eiichiro oda .",
 'debut 1997 quick rose immen popular , captiv audienc worldwid thrill adventur complex storytel .',
 'seri follow monkey d. luffi , spirit young pirat abil stretch bodi like rubber due consum devil fruit .',
 "luffi 's dream find legendari treasur known `` one piec `` becom pirat king .",
 'accompani diver lovabl crew , straw hat pirat , luffi travel across grand line , encount pirat , bounti hunter , variou charact , uniqu abil backstori .',
 '`` one piec `` celebr intric plot , deep charact develop , perfect blend humor , action , drama , make cornerston anim cultur .']

***Now this parts of speech tagging plays an important role in lemmatization.***

lemmatizer = WordNetLemmatizer()

'''
POS
Noun - n
Verb - v
Adjective - a
Adverb - r
'''

lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in filtered_words]

In [None]:
corpus = """
"One Piece" is a widely acclaimed anime and manga series created by Eiichiro Oda. It debuted in 1997 and quickly rose to immense popularity, captivating audiences worldwide with its thrilling adventures and complex storytelling. The series follows Monkey D. Luffy, a spirited young pirate with the ability to stretch his body like rubber due to consuming a Devil Fruit. Luffy's dream is to find the legendary treasure known as "One Piece" and become the Pirate King. Accompanied by his diverse and lovable crew, the Straw Hat Pirates, Luffy travels across the Grand Line, encountering other pirates, bounty hunters, and various other characters, each with their own unique abilities and backstories. "One Piece" is celebrated for its intricate plot, deep character development, and a perfect blend of humor, action, and drama, making it a cornerstone of anime culture."""

from nltk.corpus import stopwords
sentences=nltk.sent_tokenize(corpus)

In [None]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
## We will find the Pos Tag

for i in range(len(sentences)):
    words=nltk.word_tokenize(sentences[i])
    words=[word for word in words if word not in set(stopwords.words('english'))]
    #sentences[i]=' '.join(words)# converting all the list of words into sentences
    pos_tag=nltk.pos_tag(words)
    print(pos_tag)

[("''", "''"), ('One', 'CD'), ('Piece', 'NNP'), ("''", "''"), ('widely', 'RB'), ('acclaimed', 'VBD'), ('anime', 'JJ'), ('manga', 'NN'), ('series', 'NN'), ('created', 'VBD'), ('Eiichiro', 'NNP'), ('Oda', 'NNP'), ('.', '.')]
[('It', 'PRP'), ('debuted', 'VBD'), ('1997', 'CD'), ('quickly', 'RB'), ('rose', 'VBD'), ('immense', 'JJ'), ('popularity', 'NN'), (',', ','), ('captivating', 'VBG'), ('audiences', 'NNS'), ('worldwide', 'RB'), ('thrilling', 'VBG'), ('adventures', 'NNS'), ('complex', 'JJ'), ('storytelling', 'NN'), ('.', '.')]
[('The', 'DT'), ('series', 'NN'), ('follows', 'VBZ'), ('Monkey', 'NNP'), ('D.', 'NNP'), ('Luffy', 'NNP'), (',', ','), ('spirited', 'VBD'), ('young', 'JJ'), ('pirate', 'NN'), ('ability', 'NN'), ('stretch', 'VBP'), ('body', 'NN'), ('like', 'IN'), ('rubber', 'NN'), ('due', 'JJ'), ('consuming', 'VBG'), ('Devil', 'NNP'), ('Fruit', 'NNP'), ('.', '.')]
[('Luffy', 'NNP'), ("'s", 'POS'), ('dream', 'NN'), ('find', 'VBP'), ('legendary', 'JJ'), ('treasure', 'NN'), ('known', 'V

###Named Entity Recognition

In [None]:
sentence="The Eiffel Tower was built from 1887 to 1889 by Gustave Eiffel, whose company specialized in building metal frameworks and structures."

In [None]:
import nltk
words=nltk.word_tokenize(sentence)

In [None]:
tag_elements=nltk.pos_tag(words)

In [None]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [None]:
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
nltk.ne_chunk(tag_elements).draw()

TclError: no display name and no $DISPLAY environment variable

###Text to Vector

Bag of words - Used in sentiment analysis, small text classification

Process:

Sentences -> lower case -> stop words removal -> Unique words

Now vocabulary and frequecy of that words

With the frequecy of occurance we can consider the features as needed.

Now vectors are constructed based on the features.

If the word occured more than once, we can increase it's count accordingly


*   Binary Bag of words - Even it has more than one occurance it forces to have only one.
*   Bag of words - you can increase the count to more than one as per it's frequecy of occurance

Sentence - "good boy good"
Features - good boy girl => is acheived by frequency of occureance in the doc

1.   [1 1 0] - Binary BOW
2.   [2 1 0] - BOW





In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample text
corpus = """
"One Piece" is a widely acclaimed anime and manga series created by Eiichiro Oda. It debuted in 1997 and quickly rose to immense popularity, captivating audiences worldwide with its thrilling adventures and complex storytelling. The series follows Monkey D. Luffy, a spirited young pirate with the ability to stretch his body like rubber due to consuming a Devil Fruit. Luffy's dream is to find the legendary treasure known as "One Piece" and become the Pirate King. Accompanied by his diverse and lovable crew, the Straw Hat Pirates, Luffy travels across the Grand Line, encountering other pirates, bounty hunters, and various other characters, each with their own unique abilities and backstories. "One Piece" is celebrated for its intricate plot, deep character development, and a perfect blend of humor, action, and drama, making it a cornerstone of anime culture."""

sentences = nltk.sent_tokenize(corpus)

# Apply stopwords and filter and then Apply lemmatizer
for i in range(len(sentences)):
  words = nltk.word_tokenize(sentences[i])
  words = [lemmatizer.lemmatize(word.lower()) for word in words if word not in stopwords.words('english')]
  sentences[i] = ' '.join(words)

# Creating the BoW model
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(sentences)

# Output the vocabulary and the BoW feature matrix
print("Vocabulary:", vectorizer.vocabulary_)
print("Bag of Words Matrix:\n", bow_matrix.toarray())


Vocabulary: {'one': 54, 'piece': 56, 'widely': 74, 'acclaimed': 2, 'anime': 7, 'manga': 51, 'series': 63, 'created': 20, 'eiichiro': 31, 'oda': 53, 'it': 42, 'debuted': 23, '1997': 0, 'quickly': 60, 'rose': 61, 'immense': 40, 'popularity': 59, 'captivating': 14, 'audience': 8, 'worldwide': 75, 'thrilling': 69, 'adventure': 6, 'complex': 17, 'storytelling': 65, 'the': 68, 'follows': 34, 'monkey': 52, 'luffy': 49, 'spirited': 64, 'young': 76, 'pirate': 57, 'ability': 1, 'stretch': 67, 'body': 12, 'like': 46, 'rubber': 62, 'due': 30, 'consuming': 18, 'devil': 26, 'fruit': 35, 'dream': 29, 'find': 33, 'legendary': 45, 'treasure': 71, 'known': 44, 'become': 10, 'king': 43, 'accompanied': 3, 'diverse': 27, 'lovable': 48, 'crew': 21, 'straw': 66, 'hat': 37, 'travel': 70, 'across': 4, 'grand': 36, 'line': 47, 'encountering': 32, 'bounty': 13, 'hunter': 39, 'various': 73, 'character': 16, 'unique': 72, 'backstories': 9, 'celebrated': 15, 'intricate': 41, 'plot': 58, 'deep': 24, 'development': 2

- Sematic meaning like the importance of the word is not captured.


Example :

sentence1 - The food is good

sentence2 - The food is not good


- Not removing stop words, Vocabulary - The, food, is, not, good

- Vectors - [1, 1, 1, 0, 1], [1, 1, 1, 1, 1]

The consine similarity will be very similar like the angle between them will be very less, it's like both have similar meaning
But, it's not true.

- Sparse matrix or array leads to overfitting
- Ording of the word is getting changed.
- Out of vocabulary

###Ngrams

Used for text generation is required, such as chatbots, predictive typing aids, etc.

In [None]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

nltk.download('punkt')

text = "Hello world, welcome to natural language processing."

# Tokenizing the text
tokens = word_tokenize(text)

# Generating Bigrams
bigrams = list(ngrams(tokens, 2))
print("Bigrams:", bigrams)

# Generating Trigrams
trigrams = list(ngrams(tokens, 3))
print("Trigrams:", trigrams)


Bigrams: [('Hello', 'world'), ('world', ','), (',', 'welcome'), ('welcome', 'to'), ('to', 'natural'), ('natural', 'language'), ('language', 'processing'), ('processing', '.')]
Trigrams: [('Hello', 'world', ','), ('world', ',', 'welcome'), (',', 'welcome', 'to'), ('welcome', 'to', 'natural'), ('to', 'natural', 'language'), ('natural', 'language', 'processing'), ('language', 'processing', '.')]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


###TF-IDF (Term Frequency - Inverse Document Frequency)

Term Frequency - (Number of repeation of the word in sentense/number of words in sentence)

Inverse Document Freqeuncy - (Number of Sentences/Number of sentences the word contains)

Vector = TF*IDF

Main Imp advantage of this is the word importance is captured.

TF-IDF scores represent the importance of a word to a document in a collection. It increases proportionally to the number of times a word appears in the document but is offset by the number of documents that contain the word.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = """
"One Piece" is a widely acclaimed anime and manga series created by Eiichiro Oda. It debuted in 1997 and quickly rose to immense popularity, captivating audiences worldwide with its thrilling adventures and complex storytelling. The series follows Monkey D. Luffy, a spirited young pirate with the ability to stretch his body like rubber due to consuming a Devil Fruit. Luffy's dream is to find the legendary treasure known as "One Piece" and become the Pirate King. Accompanied by his diverse and lovable crew, the Straw Hat Pirates, Luffy travels across the Grand Line, encountering other pirates, bounty hunters, and various other characters, each with their own unique abilities and backstories. "One Piece" is celebrated for its intricate plot, deep character development, and a perfect blend of humor, action, and drama, making it a cornerstone of anime culture."""

sentences = nltk.sent_tokenize(corpus)
print(sentences)
# Creating the TF-IDF model
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

# Output the vocabulary and the TF-IDF feature matrix
print("Vocabulary:", tfidf_vectorizer.vocabulary_)
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())

['\n"One Piece" is a widely acclaimed anime and manga series created by Eiichiro Oda.', 'It debuted in 1997 and quickly rose to immense popularity, captivating audiences worldwide with its thrilling adventures and complex storytelling.', 'The series follows Monkey D. Luffy, a spirited young pirate with the ability to stretch his body like rubber due to consuming a Devil Fruit.', 'Luffy\'s dream is to find the legendary treasure known as "One Piece" and become the Pirate King.', 'Accompanied by his diverse and lovable crew, the Straw Hat Pirates, Luffy travels across the Grand Line, encountering other pirates, bounty hunters, and various other characters, each with their own unique abilities and backstories.', '"One Piece" is celebrated for its intricate plot, deep character development, and a perfect blend of humor, action, and drama, making it a cornerstone of anime culture.']
Vocabulary: {'one': 66, 'piece': 70, 'is': 51, 'widely': 91, 'acclaimed': 3, 'anime': 9, 'and': 8, 'manga': 6

###Word Embeddings

Word embeddings are a type of word representation that allows words with similar meaning to have a similar representation. They are a set of feature learning techniques in natural language processing (NLP) where words or phrases from the vocabulary are mapped to vectors of real numbers in a low-dimensional space relative to the vocabulary size.

Concept: Word embeddings are learned in a way that reflects the semantic and syntactic similarities between words. For instance, words like "king" and "queen" will have embeddings that are closer to each other than to embeddings for unrelated words like "apple."

Two thing what Word2Vec gives is the, very good representation of word to vector and gives the similar meaning to the words which are closer in vector space.

**The word meaning is highlighed only with the efficient word conversion to vector**

All one-hot encoding, tf-idf and bag of words are part of Word Embeddings.
With the correct word conversion to vector, we can find the exact similarity between the words more accuratly.

Two Types:


*   Count / Frequency [One-hot encoding, Bag of Words, TF-IDF]
*   Deep leaning trained models [Word2Vec] (More accurate)

Word2Vec uses a Neural Network model to learn **Word associations from large corpus**

Once trained it can detect synonyous words or suggests additional words for a partial sentence.

Word2Vec represents a distict word with a particular list or numbers called vectors.


*   Continous Bag of words
*   Skipgram




We see zeros in all the embeddings it includes (Bag of words, TI-IDF). But, in case of Word2Vec we have somewhat different

Vocabulary -> Unique words in corpus

***Each word inside Vocabulary will be converted into FEATURE REPRESENTATION***

The Numerical number is assigned based on the Unique word and the features.
Features like (Gender, Royal,...etc) n dimensions

More the window size more better the model performs.

At the end of the day, each Unique word in vocubulary is represented in n dimensions vector.

Recommendations also comes here.

Under **Word2Vec** we have two Continous Bag of words, skipgrams. Parallely, we have **Pretrained Models(Google), Train a model from scratch**


**CBOW - Continous Bag of Words**

Prediction - Focus word

**SkipGram**

Prediction - Context word



In [None]:
pip install gensim



In [None]:
import os
print("Current Working Directory:", os.getcwd())
print("Files in the Directory:", os.listdir())


Current Working Directory: /content
Files in the Directory: ['.config', 'sample_data']


In [None]:
import gensim.downloader as api

# Load the Word2Vec model directly from Gensim
word_vectors = api.load("word2vec-google-news-300")  # This loads the model if it's not already downloaded

# Check if the model is loaded correctly
print("Loaded word vectors:", type(word_vectors))


Loaded word vectors: <class 'gensim.models.keyedvectors.KeyedVectors'>


In [None]:
import gensim.downloader as api
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load the Word2Vec model directly from Gensim
word_vectors = api.load("word2vec-google-news-300")  # This loads the model if it's not already downloaded


Accuracy: 0.0


In [None]:
from sklearn.preprocessing import normalize

documents = ["I love this movie", "I hate this movie", "This movie is great", "This movie is terrible"]
labels = [1, 0, 1, 0]  # 1 for positive, 0 for negative
# Function to convert document to averaged word vector
def document_vector(doc):
    words = doc.split()
    word_vectors_list = [word_vectors[word] for word in words if word in word_vectors]
    return np.mean(word_vectors_list, axis=0) if word_vectors_list else np.zeros(300)

# Convert documents to vectors and normalize
X = np.array([document_vector(doc) for doc in documents])
X = normalize(X)  # Normalize vectors

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.25, random_state=42)

# Train the model with different regularization settings
model = LogisticRegression(max_iter=1000, C=0.1)  # Try adjusting C
model.fit(X_train, y_train)

# Predict and evaluate
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Predictions:", predictions)
print("True Labels:", y_test)
print("Accuracy:", accuracy)


Predictions: [1]
True Labels: [0]
Accuracy: 0.0


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize
import numpy as np
def document_vector(doc):
    words = doc.split()
    word_vectors_list = [word_vectors[word] for word in words if word in word_vectors]
    return np.mean(word_vectors_list, axis=0) if word_vectors_list else np.zeros(300)

# Convert documents to vectors and normalize
X = np.array([document_vector(doc) for doc in documents])
X = normalize(X)  # Normalize vectors

# Labels
labels = [1, 0, 1, 0]  # 1 for positive, 0 for negative

# Use SVM and cross-validation
model = SVC(kernel='linear')
scores = cross_val_score(model, X, labels, cv=2)  # Using 5-fold cross-validation

print("Cross-validated scores:", scores)
print("Average score:", np.mean(scores))

Cross-validated scores: [1. 1.]
Average score: 1.0


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

# Generate a simple sequence data
# Example: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 -> Predict the next number in the sequence

# Create training data
def create_sequence(n_steps):
    X, y = [], []
    for i in range(len(n_steps)):
        end_ix = i + 1
        if end_ix > len(n_steps) - 1:
            break
        seq_x, seq_y = n_steps[i:end_ix], n_steps[end_ix]
        X.append(seq_x)
        y.append(seq_y)
        print(X, y)
    return np.array(X), np.array(y)

raw_seq = np.array([i for i in range(10)])
print("Raw_Seq", raw_seq)
X, y = create_sequence(raw_seq)
print("X:", X)
print("y:", y)

# Reshape input to be [samples, time steps, features]
X = X.reshape((X.shape[0], X.shape[1], 1))
print("X:", X)

# Define the RNN model
model = Sequential([
    SimpleRNN(50, activation='relu', input_shape=(X.shape[1], X.shape[2])),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X, y, epochs=300, verbose=0)

# Predict the next number
test_input = np.array([13]).reshape((1, 1, 1))  # Input sequence: [8]
print(test_input)
predicted_number = model.predict(test_input, verbose=0)
print("Predicted number:", predicted_number)


Raw_Seq [0 1 2 3 4 5 6 7 8 9]
[array([0])] [1]
[array([0]), array([1])] [1, 2]
[array([0]), array([1]), array([2])] [1, 2, 3]
[array([0]), array([1]), array([2]), array([3])] [1, 2, 3, 4]
[array([0]), array([1]), array([2]), array([3]), array([4])] [1, 2, 3, 4, 5]
[array([0]), array([1]), array([2]), array([3]), array([4]), array([5])] [1, 2, 3, 4, 5, 6]
[array([0]), array([1]), array([2]), array([3]), array([4]), array([5]), array([6])] [1, 2, 3, 4, 5, 6, 7]
[array([0]), array([1]), array([2]), array([3]), array([4]), array([5]), array([6]), array([7])] [1, 2, 3, 4, 5, 6, 7, 8]
[array([0]), array([1]), array([2]), array([3]), array([4]), array([5]), array([6]), array([7]), array([8])] [1, 2, 3, 4, 5, 6, 7, 8, 9]
X: [[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]]
y: [1 2 3 4 5 6 7 8 9]
X: [[[0]]

 [[1]]

 [[2]]

 [[3]]

 [[4]]

 [[5]]

 [[6]]

 [[7]]

 [[8]]]
[[[13]]]




Predicted number: [[14.485639]]
