# TEXTBLOB


In [30]:
!pip install -U textblob
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


In [32]:
from textblob import TextBlob

# N-Gram method

In [33]:
blob = TextBlob('Hello, Good Morning everyone ! Hopw you all are doing well.')
blob

TextBlob("Hello, Good Morning everyone ! Hopw you all are doing well.")

In [34]:
# Uni- gram
blob.ngrams(n=1)

[WordList(['Hello']),
 WordList(['Good']),
 WordList(['Morning']),
 WordList(['everyone']),
 WordList(['Hopw']),
 WordList(['you']),
 WordList(['all']),
 WordList(['are']),
 WordList(['doing']),
 WordList(['well'])]

In [35]:
# Bi-gram
blob.ngrams(n=2)

[WordList(['Hello', 'Good']),
 WordList(['Good', 'Morning']),
 WordList(['Morning', 'everyone']),
 WordList(['everyone', 'Hopw']),
 WordList(['Hopw', 'you']),
 WordList(['you', 'all']),
 WordList(['all', 'are']),
 WordList(['are', 'doing']),
 WordList(['doing', 'well'])]

In [36]:
# Tri-gram
blob.ngrams(n=3)

[WordList(['Hello', 'Good', 'Morning']),
 WordList(['Good', 'Morning', 'everyone']),
 WordList(['Morning', 'everyone', 'Hopw']),
 WordList(['everyone', 'Hopw', 'you']),
 WordList(['Hopw', 'you', 'all']),
 WordList(['you', 'all', 'are']),
 WordList(['all', 'are', 'doing']),
 WordList(['are', 'doing', 'well'])]

In [37]:
# Tetra-gram
blob.ngrams(n=4)

[WordList(['Hello', 'Good', 'Morning', 'everyone']),
 WordList(['Good', 'Morning', 'everyone', 'Hopw']),
 WordList(['Morning', 'everyone', 'Hopw', 'you']),
 WordList(['everyone', 'Hopw', 'you', 'all']),
 WordList(['Hopw', 'you', 'all', 'are']),
 WordList(['you', 'all', 'are', 'doing']),
 WordList(['all', 'are', 'doing', 'well'])]

In [38]:
from textblob import Word
from textblob.wordnet import VERB

In [39]:
my_sentence = TextBlob("You are good."
                      "He is simply brillent."
                      "I am teaching NLP."
                      "He is expert in NLP."
                      "You are doing well.")

In [40]:
my_sentence

TextBlob("You are good.He is simply brillent.I am teaching NLP.He is expert in NLP.You are doing well.")

In [41]:
my_sentence.words.count("is")

2

In [42]:
my_sentence.words.count("are")

2

In [43]:
my_sentence.sentences

[Sentence("You are good.He is simply brillent.I am teaching NLP.He is expert in NLP.You are doing well.")]

In [48]:
paragraph = TextBlob("The SCES, which had 28 members, including 10 non-official members, was also chaired by Mr. Sen. The panel was mandated to review the framework for economic indicators pertaining to the industrial sector, the services sector and the labour force statistics. This meant that their purview was limited to datasets like the Periodic Labour Force Survey, the Annual Survey of Industries, the Index of Industrial Production and the Economic Census.")

In [49]:
paragraph.word_counts

defaultdict(int,
            {'the': 10,
             'sces': 1,
             'which': 1,
             'had': 1,
             '28': 1,
             'members': 2,
             'including': 1,
             '10': 1,
             'non-official': 1,
             'was': 3,
             'also': 1,
             'chaired': 1,
             'by': 1,
             'mr': 1,
             'sen': 1,
             'panel': 1,
             'mandated': 1,
             'to': 3,
             'review': 1,
             'framework': 1,
             'for': 1,
             'economic': 2,
             'indicators': 1,
             'pertaining': 1,
             'industrial': 2,
             'sector': 2,
             'services': 1,
             'and': 2,
             'labour': 2,
             'force': 2,
             'statistics': 1,
             'this': 1,
             'meant': 1,
             'that': 1,
             'their': 1,
             'purview': 1,
             'limited': 1,
             'datasets': 1,
     

In [50]:
sentence = TextBlob("Lionel Messi walked out from behind a curtain, took a few steps along a rain-slicked runaway set up over the field at Inter Miami's stadium and headed toward David Beckham for a big hug.")

In [55]:
sentence.words[10].pluralize()

'fews'

In [57]:
# Learning purpose
sentence = TextBlob("Jishnu is my best friend")

In [59]:
sentence.words[4].pluralize()

'friends'

In [60]:
w = Word('went')
w1 = Word('go')
w2 = Word('going')

In [61]:
w.lemmatize('v') # V is verb

'go'

In [62]:
w1.lemmatize('v')

'go'

In [63]:
w2.lemmatize('v')

'go'

In [64]:
paragraph

TextBlob("The SCES, which had 28 members, including 10 non-official members, was also chaired by Mr. Sen. The panel was mandated to review the framework for economic indicators pertaining to the industrial sector, the services sector and the labour force statistics. This meant that their purview was limited to datasets like the Periodic Labour Force Survey, the Annual Survey of Industries, the Index of Industrial Production and the Economic Census.")

In [65]:
# Gives what parts of speech
paragraph.tags

[('The', 'DT'),
 ('SCES', 'NNP'),
 ('which', 'WDT'),
 ('had', 'VBD'),
 ('28', 'CD'),
 ('members', 'NNS'),
 ('including', 'VBG'),
 ('10', 'CD'),
 ('non-official', 'JJ'),
 ('members', 'NNS'),
 ('was', 'VBD'),
 ('also', 'RB'),
 ('chaired', 'VBN'),
 ('by', 'IN'),
 ('Mr.', 'NNP'),
 ('Sen', 'NNP'),
 ('The', 'DT'),
 ('panel', 'NN'),
 ('was', 'VBD'),
 ('mandated', 'VBN'),
 ('to', 'TO'),
 ('review', 'VB'),
 ('the', 'DT'),
 ('framework', 'NN'),
 ('for', 'IN'),
 ('economic', 'JJ'),
 ('indicators', 'NNS'),
 ('pertaining', 'VBG'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('industrial', 'JJ'),
 ('sector', 'NN'),
 ('the', 'DT'),
 ('services', 'NNS'),
 ('sector', 'NN'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('labour', 'JJ'),
 ('force', 'NN'),
 ('statistics', 'NNS'),
 ('This', 'DT'),
 ('meant', 'NN'),
 ('that', 'IN'),
 ('their', 'PRP$'),
 ('purview', 'NN'),
 ('was', 'VBD'),
 ('limited', 'VBN'),
 ('to', 'TO'),
 ('datasets', 'NNS'),
 ('like', 'IN'),
 ('the', 'DT'),
 ('Periodic', 'NNP'),
 ('Labour', 'NNP'),
 ('Force', 

In [66]:
for word, pos in paragraph.tags:
  print(word + "==>" +pos)

The==>DT
SCES==>NNP
which==>WDT
had==>VBD
28==>CD
members==>NNS
including==>VBG
10==>CD
non-official==>JJ
members==>NNS
was==>VBD
also==>RB
chaired==>VBN
by==>IN
Mr.==>NNP
Sen==>NNP
The==>DT
panel==>NN
was==>VBD
mandated==>VBN
to==>TO
review==>VB
the==>DT
framework==>NN
for==>IN
economic==>JJ
indicators==>NNS
pertaining==>VBG
to==>TO
the==>DT
industrial==>JJ
sector==>NN
the==>DT
services==>NNS
sector==>NN
and==>CC
the==>DT
labour==>JJ
force==>NN
statistics==>NNS
This==>DT
meant==>NN
that==>IN
their==>PRP$
purview==>NN
was==>VBD
limited==>VBN
to==>TO
datasets==>NNS
like==>IN
the==>DT
Periodic==>NNP
Labour==>NNP
Force==>NNP
Survey==>NNP
the==>DT
Annual==>NNP
Survey==>NNP
of==>IN
Industries==>NNPS
the==>DT
Index==>NNP
of==>IN
Industrial==>NNP
Production==>NNP
and==>CC
the==>DT
Economic==>NNP
Census==>NNP


In [69]:
my_sentence = TextBlob("I havv goood knowldge about python")

In [70]:
my_sentence

TextBlob("I havv goood knowldge about python")

In [72]:
# Correct spellings
print(my_sentence.correct())

I have good knowledge about patron


# Translation

In [77]:
japanes_blob = TextBlob('財務省は2024年度予算の概算要求で設ける特別枠の対象に、賃上げなど人への投資の対策を含める。脱炭素などと合わせ、岸田文雄政権が打ち出す「新しい資本主義」に重点配分する。物価高対策や少子化対策は金額を示さない「事項要求」を認め、年末までに必要額を検討する。特別枠は政府の重要課題と位置づける政策に予算を重点配分する仕組みだ。各省庁が翌年度の予算要求を財務省に出す際のルールである概算要求基準に盛り')

In [78]:
japanes_blob

TextBlob("財務省は2024年度予算の概算要求で設ける特別枠の対象に、賃上げなど人への投資の対策を含める。脱炭素などと合わせ、岸田文雄政権が打ち出す「新しい資本主義」に重点配分する。物価高対策や少子化対策は金額を示さない「事項要求」を認め、年末までに必要額を検討する。特別枠は政府の重要課題と位置づける政策に予算を重点配分する仕組みだ。各省庁が翌年度の予算要求を財務省に出す際のルールである概算要求基準に盛り")

In [79]:
japanes_blob.translate(from_lang='ja-JP', to='hi')

TextBlob("वित्त मंत्रालय में लोगों में निवेश करने के उपाय शामिल हैं, जैसे कि मजदूरी बढ़ाना, वित्त वर्ष 2014 में बजट के लिए बजट अनुरोध में निर्धारित विशेष फ्रेम के लिए। कार कैलोरल के साथ संयोजन में, फुमियो किशिदा प्रशासन द्वारा शुरू किए गए "न्यू कैपिटलिज्म" पर ध्यान केंद्रित करें। मूल्य उपाय और घटते जन्म के उपायों को "अनुरोधों" के लिए मान्यता प्राप्त है जो राशि का संकेत नहीं देते हैं, और वर्ष के अंत तक आवश्यक राशि की जांच करते हैं। विशेष फ्रेम एक ऐसा तंत्र है जो बजट पर ध्यान केंद्रित करता है जो उन नीतियों पर केंद्रित है जो सरकारी महत्वपूर्ण मुद्दों के रूप में तैनात हैं। वित्त मंत्रालय को निम्न वित्तीय वर्ष के लिए बजट अनुरोध करने के लिए प्रत्येक मंत्रालय और एजेंसी के लिए लगभग अनुमानित अनुरोध मानदंड।")

In [80]:
my_sen = TextBlob("On June 15, the Delhi police had filed a 1,000-page charge sheet against Mr. Singh and Mr. Tomar, which includes the testimonies of witnesses, victims and other concerned persons. Mr. Singh had been summoned before the court after hearing the submissions and considering the police report along with its annexed documents, including the statements of the victims, the court took cognisance of the offences committed under Sections 354 (outraging modesty), 354A (sexually coloured remarks) and 354D (stalking), 506 (para 1) (criminal intimidation) and 109 (abetment to offense) of the Indian Penal Code.")

In [81]:
my_sen.translate(from_lang='en', to='hi')

TextBlob("6月15日、デリー警察は、証人、犠牲者、その他の関係者の証言を含むシン氏とトマール氏に対して1,000ページの請求シートを提出しました。シン氏は、提出を聞いた後、裁判所に召喚され、被害者の声明を含む併合された文書とともに警察の報告書を検討した後、裁判所はセクション354（激怒する謙虚さ）、354a（性的にインド刑法の色付きの発言）および354d（ストーキング）、506（刑事脅迫）および109（犯罪への侵害）。")

# Term Frequency-Inverse Document Frequency (TF-IDF)


Term Frequency-Inverse Document Frequency (TF-IDF) is a numerical statistic used in natural language processing and information retrieval to evaluate the importance of a term within a document or a collection of documents.

TF-IDF combines two factors: term frequency (TF) and inverse document frequency (IDF). Term frequency measures the frequency of a term within a document, while inverse document frequency measures how rare or common a term is across all documents in a collection.

The TF-IDF value for a term in a document increases proportionally to the number of times it appears in the document (term frequency) but is offset by the rarity of the term in the entire collection of documents (inverse document frequency). This helps to identify important terms that are frequent within a document but relatively rare across other documents.

The TF-IDF calculation is performed by multiplying the term frequency (TF) by the inverse document frequency (IDF). The resulting value represents the relevance or significance of a term in a specific document.

TF-IDF is commonly used for tasks such as information retrieval, text mining, and document classification. It allows for the extraction of important keywords and the ranking of documents based on their relevance to a given query or topic.


In [83]:
import nltk

In [84]:
paragraph = "Term Frequency-Inverse Document Frequency (TF-IDF) is a numerical statistic used in natural language processing and information retrieval to evaluate the importance of a term within a document or a collection of documents.TF-IDF combines two factors: term frequency (TF) and inverse document frequency (IDF). Term frequency measures the frequency of a term within a document, while inverse document frequency measures how rare or common a term is across all documents in a collection.The TF-IDF value for a term in a document increases proportionally to the number of times it appears in the document (term frequency) but is offset by the rarity of the term in the entire collection of documents (inverse document frequency). This helps to identify important terms that are frequent within a document but relatively rare across other documents.The TF-IDF calculation is performed by multiplying the term frequency (TF) by the inverse document frequency (IDF). The resulting value represents the relevance or significance of a term in a specific document.TF-IDF is commonly used for tasks such as information retrieval, text mining, and document classification. It allows for the extraction of important keywords and the ranking of documents based on their relevance to a given query or topic."

In [85]:
paragraph

'Term Frequency-Inverse Document Frequency (TF-IDF) is a numerical statistic used in natural language processing and information retrieval to evaluate the importance of a term within a document or a collection of documents.TF-IDF combines two factors: term frequency (TF) and inverse document frequency (IDF). Term frequency measures the frequency of a term within a document, while inverse document frequency measures how rare or common a term is across all documents in a collection.The TF-IDF value for a term in a document increases proportionally to the number of times it appears in the document (term frequency) but is offset by the rarity of the term in the entire collection of documents (inverse document frequency). This helps to identify important terms that are frequent within a document but relatively rare across other documents.The TF-IDF calculation is performed by multiplying the term frequency (TF) by the inverse document frequency (IDF). The resulting value represents the rele

In [90]:
import nltk
nltk.download('stopwords')

# Pre-Processing/Clean the text
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemme = WordNetLemmatizer()

sentence = nltk.sent_tokenize(paragraph)

corpus = []

for i in range(len(sentence)):
    review = re.sub('[^a-zA-Z]', ' ', sentence[i])
    review = review.lower()
    review = review.split()
    review = [lemme.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [91]:
corpus

['term frequency inverse document frequency tf idf numerical statistic used natural language processing information retrieval evaluate importance term within document collection document tf idf combine two factor term frequency tf inverse document frequency idf',
 'term frequency measure frequency term within document inverse document frequency measure rare common term across document collection tf idf value term document increase proportionally number time appears document term frequency offset rarity term entire collection document inverse document frequency',
 'help identify important term frequent within document relatively rare across document tf idf calculation performed multiplying term frequency tf inverse document frequency idf',
 'resulting value represents relevance significance term specific document tf idf commonly used task information retrieval text mining document classification',
 'allows extraction important keywords ranking document based relevance given query topic'

# Apply TF-IDF Model

In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vector = TfidfVectorizer()
x = tfidf_vector.fit_transform(corpus).toarray()

In [93]:
x

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.13790808, 0.17093351, 0.        , 0.        ,
        0.32580293, 0.        , 0.17093351, 0.        , 0.17093351,
        0.45790466, 0.        , 0.        , 0.        , 0.        ,
        0.28890287, 0.17093351, 0.        , 0.        , 0.13790808,
        0.22895233, 0.        , 0.17093351, 0.        , 0.        ,
        0.        , 0.17093351, 0.        , 0.17093351, 0.        ,
        0.        , 0.17093351, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.13790808, 0.        , 0.        , 0.17093351,
        0.        , 0.28890287, 0.        , 0.28890287, 0.        ,
        0.        , 0.17093351, 0.13790808, 0.        , 0.11447616],
       [0.10958915, 0.        , 0.13583292, 0.        , 0.        ,
        0.        , 0.21917829, 0.        , 0.13583292, 0.        ,
        0.45307581, 0.13583292, 0.        , 0. 

In [94]:
import pandas as pd
df = pd.DataFrame(x)

In [95]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,0.0,0.0,0.0,0.0,0.0,0.0,0.137908,0.170934,0.0,0.0,...,0.0,0.288903,0.0,0.288903,0.0,0.0,0.170934,0.137908,0.0,0.114476
1,0.109589,0.0,0.135833,0.0,0.0,0.0,0.219178,0.0,0.135833,0.0,...,0.0,0.459155,0.0,0.076526,0.135833,0.0,0.0,0.0,0.109589,0.090969
2,0.192882,0.0,0.0,0.0,0.239072,0.0,0.0,0.0,0.0,0.0,...,0.0,0.269378,0.0,0.269378,0.0,0.0,0.0,0.0,0.0,0.160109
3,0.0,0.0,0.0,0.0,0.0,0.26617,0.0,0.0,0.0,0.26617,...,0.26617,0.149956,0.26617,0.149956,0.0,0.0,0.0,0.214744,0.214744,0.0
4,0.0,0.323951,0.0,0.323951,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.323951,0.0,0.0,0.0,0.0


