## Imports

In [3]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

## texts

In [1]:
paragraph="Satoru Gojo is a fictional character from Gege Akutami's manga and anime series Jujutsu Kaisen. He was first introduced in Akutami's short series Tokyo Metropolitan Curse Technical School as the mentor of the cursed teenager Yuta Okkotsu, who suffers a curse of Rika teaches him at Tokyo Prefectural Jujutsu High School. This miniseries became the prequel Jujutsu Kaisen 0 of Jujutsu Kaisen. In main series of Jujutsu Kaisen, Gojo takes the same role but mentors the student Yuji Itadori who suffers a curse of Sukuna, helping him become stronger while protecting other characters in the series.Gojo was designed by Akutami to be a formidable yet endearing figure who is passionate about his students. He is voiced by Yūichi Nakamura in Japanese and Kaiji Tang in English and Lohit Sharma in Hindi in the animated adaptations by MAPPA.The character was well-received by the media for his carefree nature and power shown when protecting his students, becoming the series's breakout character. Furthermore, his role in the prequel Jujutsu Kaisen 0 was appreciated by the media due to his hidden traits,[clarification needed] such as his relationship with the antagonist Suguru Geto."

## tokenization

In [4]:
## The paragraph type of text will be converted into
## a sentences form 
sentences=nltk.sent_tokenize(paragraph)

In [5]:
sentences[0]

"Satoru Gojo is a fictional character from Gege Akutami's manga and anime series Jujutsu Kaisen."

In [6]:
len(sentences)

6

## Cleaningt the sentences

In [8]:
corpus=[]
for i in range(len(sentences)):
    text=re.sub('[^a-zA-Z]',' ',sentences[i])
    text=text.lower()
    corpus.append(text)

In [9]:
corpus

['satoru gojo is a fictional character from gege akutami s manga and anime series jujutsu kaisen ',
 'he was first introduced in akutami s short series tokyo metropolitan curse technical school as the mentor of the cursed teenager yuta okkotsu  who suffers a curse of rika teaches him at tokyo prefectural jujutsu high school ',
 'this miniseries became the prequel jujutsu kaisen   of jujutsu kaisen ',
 'in main series of jujutsu kaisen  gojo takes the same role but mentors the student yuji itadori who suffers a curse of sukuna  helping him become stronger while protecting other characters in the series gojo was designed by akutami to be a formidable yet endearing figure who is passionate about his students ',
 'he is voiced by y ichi nakamura in japanese and kaiji tang in english and lohit sharma in hindi in the animated adaptations by mappa the character was well received by the media for his carefree nature and power shown when protecting his students  becoming the series s breakout c

# comparing

In [12]:
for i in range(len(sentences)):
    print(i," : ",sentences[i],"\n")

0  :  Satoru Gojo is a fictional character from Gege Akutami's manga and anime series Jujutsu Kaisen. 

1  :  He was first introduced in Akutami's short series Tokyo Metropolitan Curse Technical School as the mentor of the cursed teenager Yuta Okkotsu, who suffers a curse of Rika teaches him at Tokyo Prefectural Jujutsu High School. 

2  :  This miniseries became the prequel Jujutsu Kaisen 0 of Jujutsu Kaisen. 

3  :  In main series of Jujutsu Kaisen, Gojo takes the same role but mentors the student Yuji Itadori who suffers a curse of Sukuna, helping him become stronger while protecting other characters in the series.Gojo was designed by Akutami to be a formidable yet endearing figure who is passionate about his students. 

4  :  He is voiced by Yūichi Nakamura in Japanese and Kaiji Tang in English and Lohit Sharma in Hindi in the animated adaptations by MAPPA.The character was well-received by the media for his carefree nature and power shown when protecting his students, becoming the

In [13]:
for i in range(len(corpus)):
    print(i," : ",corpus[i],"\n")

0  :  satoru gojo is a fictional character from gege akutami s manga and anime series jujutsu kaisen  

1  :  he was first introduced in akutami s short series tokyo metropolitan curse technical school as the mentor of the cursed teenager yuta okkotsu  who suffers a curse of rika teaches him at tokyo prefectural jujutsu high school  

2  :  this miniseries became the prequel jujutsu kaisen   of jujutsu kaisen  

3  :  in main series of jujutsu kaisen  gojo takes the same role but mentors the student yuji itadori who suffers a curse of sukuna  helping him become stronger while protecting other characters in the series gojo was designed by akutami to be a formidable yet endearing figure who is passionate about his students  

4  :  he is voiced by y ichi nakamura in japanese and kaiji tang in english and lohit sharma in hindi in the animated adaptations by mappa the character was well received by the media for his carefree nature and power shown when protecting his students  becoming the

## stemming

In [18]:
stemmer = nltk.PorterStemmer()
print("Before STEMMING -> After Stemming")
for i in corpus:
    words=nltk.word_tokenize(i)
    for word in words:
        if word not in set(stopwords.words('english')):
            print(word,'->',stemmer.stem(word))

Before STEMMING -> After Stemming
satoru -> satoru
gojo -> gojo
fictional -> fiction
character -> charact
gege -> gege
akutami -> akutami
manga -> manga
anime -> anim
series -> seri
jujutsu -> jujutsu
kaisen -> kaisen
first -> first
introduced -> introduc
akutami -> akutami
short -> short
series -> seri
tokyo -> tokyo
metropolitan -> metropolitan
curse -> curs
technical -> technic
school -> school
mentor -> mentor
cursed -> curs
teenager -> teenag
yuta -> yuta
okkotsu -> okkotsu
suffers -> suffer
curse -> curs
rika -> rika
teaches -> teach
tokyo -> tokyo
prefectural -> prefectur
jujutsu -> jujutsu
high -> high
school -> school
miniseries -> miniseri
became -> becam
prequel -> prequel
jujutsu -> jujutsu
kaisen -> kaisen
jujutsu -> jujutsu
kaisen -> kaisen
main -> main
series -> seri
jujutsu -> jujutsu
kaisen -> kaisen
gojo -> gojo
takes -> take
role -> role
mentors -> mentor
student -> student
yuji -> yuji
itadori -> itadori
suffers -> suffer
curse -> curs
sukuna -> sukuna
helping -> he

## lemmatization

In [21]:
lemmatizer=WordNetLemmatizer()
print("Before Lemmatizing -> After Lemmatizing")
for i in corpus:
    words=nltk.word_tokenize(i)
    for word in words:
        if word not in set(stopwords.words('english')):
            print(word,'->',lemmatizer.lemmatize(word))

Before Lemmatizing -> After Lemmatizing
satoru -> satoru
gojo -> gojo
fictional -> fictional
character -> character
gege -> gege
akutami -> akutami
manga -> manga
anime -> anime
series -> series
jujutsu -> jujutsu
kaisen -> kaisen
first -> first
introduced -> introduced
akutami -> akutami
short -> short
series -> series
tokyo -> tokyo
metropolitan -> metropolitan
curse -> curse
technical -> technical
school -> school
mentor -> mentor
cursed -> cursed
teenager -> teenager
yuta -> yuta
okkotsu -> okkotsu
suffers -> suffers
curse -> curse
rika -> rika
teaches -> teach
tokyo -> tokyo
prefectural -> prefectural
jujutsu -> jujutsu
high -> high
school -> school
miniseries -> miniseries
became -> became
prequel -> prequel
jujutsu -> jujutsu
kaisen -> kaisen
jujutsu -> jujutsu
kaisen -> kaisen
main -> main
series -> series
jujutsu -> jujutsu
kaisen -> kaisen
gojo -> gojo
takes -> take
role -> role
mentors -> mentor
student -> student
yuji -> yuji
itadori -> itadori
suffers -> suffers
curse -> c

## BOW implementation

In [33]:
count_vect = CountVectorizer()
random=[]
random.append(corpus[0])
random.append(corpus[1])
bow_matrix = count_vect.fit_transform(random)

In [48]:
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=count_vect.get_feature_names_out())
bow_df

Unnamed: 0,akutami,and,anime,as,at,character,curse,cursed,fictional,first,...,short,suffers,teaches,technical,teenager,the,tokyo,was,who,yuta
0,1,1,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,1,0,2,1,0,1,...,1,1,1,1,1,2,2,1,1,1


## TF- IDF implementation

In [49]:
tfidf_vect = TfidfVectorizer()
tfidf_matrix = tfidf_vect.fit_transform(random)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vect.get_feature_names_out())
tfidf_df

Unnamed: 0,akutami,and,anime,as,at,character,curse,cursed,fictional,first,...,short,suffers,teaches,technical,teenager,the,tokyo,was,who,yuta
0,0.201094,0.282631,0.282631,0.0,0.0,0.282631,0.0,0.0,0.282631,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.107855,0.0,0.0,0.151587,0.151587,0.0,0.303174,0.151587,0.0,0.151587,...,0.151587,0.151587,0.151587,0.151587,0.151587,0.303174,0.303174,0.151587,0.151587,0.151587


## BOW vs TF - IDF 

In [76]:
for i in bow_df.columns:
    print(i)
    print(bow_df[i],"\n\n")

akutami
0    1
1    1
Name: akutami, dtype: int64 


and
0    1
1    0
Name: and, dtype: int64 


anime
0    1
1    0
Name: anime, dtype: int64 


as
0    0
1    1
Name: as, dtype: int64 


at
0    0
1    1
Name: at, dtype: int64 


character
0    1
1    0
Name: character, dtype: int64 


curse
0    0
1    2
Name: curse, dtype: int64 


cursed
0    0
1    1
Name: cursed, dtype: int64 


fictional
0    1
1    0
Name: fictional, dtype: int64 


first
0    0
1    1
Name: first, dtype: int64 


from
0    1
1    0
Name: from, dtype: int64 


gege
0    1
1    0
Name: gege, dtype: int64 


gojo
0    1
1    0
Name: gojo, dtype: int64 


he
0    0
1    1
Name: he, dtype: int64 


high
0    0
1    1
Name: high, dtype: int64 


him
0    0
1    1
Name: him, dtype: int64 


in
0    0
1    1
Name: in, dtype: int64 


introduced
0    0
1    1
Name: introduced, dtype: int64 


is
0    1
1    0
Name: is, dtype: int64 


jujutsu
0    1
1    1
Name: jujutsu, dtype: int64 


kaisen
0    1
1    0
Name: kai

In [77]:
for i in tfidf_df.columns:
    print(i)
    print(tfidf_df[i],"\n\n")

akutami
0    0.201094
1    0.107855
Name: akutami, dtype: float64 


and
0    0.282631
1    0.000000
Name: and, dtype: float64 


anime
0    0.282631
1    0.000000
Name: anime, dtype: float64 


as
0    0.000000
1    0.151587
Name: as, dtype: float64 


at
0    0.000000
1    0.151587
Name: at, dtype: float64 


character
0    0.282631
1    0.000000
Name: character, dtype: float64 


curse
0    0.000000
1    0.303174
Name: curse, dtype: float64 


cursed
0    0.000000
1    0.151587
Name: cursed, dtype: float64 


fictional
0    0.282631
1    0.000000
Name: fictional, dtype: float64 


first
0    0.000000
1    0.151587
Name: first, dtype: float64 


from
0    0.282631
1    0.000000
Name: from, dtype: float64 


gege
0    0.282631
1    0.000000
Name: gege, dtype: float64 


gojo
0    0.282631
1    0.000000
Name: gojo, dtype: float64 


he
0    0.000000
1    0.151587
Name: he, dtype: float64 


high
0    0.000000
1    0.151587
Name: high, dtype: float64 


him
0    0.000000
1    0.151587
N

In [84]:
comparison_df = pd.DataFrame({
    "BOW": bow_df.iloc[1],
    "TF-IDF": tfidf_df.iloc[1]
})
comparison_df.T

Unnamed: 0,akutami,and,anime,as,at,character,curse,cursed,fictional,first,...,short,suffers,teaches,technical,teenager,the,tokyo,was,who,yuta
BOW,1.0,0.0,0.0,1.0,1.0,0.0,2.0,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0
TF-IDF,0.107855,0.0,0.0,0.151587,0.151587,0.0,0.303174,0.151587,0.0,0.151587,...,0.151587,0.151587,0.151587,0.151587,0.151587,0.303174,0.303174,0.151587,0.151587,0.151587


In [95]:
random[0]


'satoru gojo is a fictional character from gege akutami s manga and anime series jujutsu kaisen '

In [96]:
random[1]

'he was first introduced in akutami s short series tokyo metropolitan curse technical school as the mentor of the cursed teenager yuta okkotsu  who suffers a curse of rika teaches him at tokyo prefectural jujutsu high school '

In [97]:
bow_df['series']

0    1
1    1
Name: series, dtype: int64