In [None]:
# One-time setup: fetch the 'punkt_tab' tokenizer data (NLTK unpacks it under tokenizers/).
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Tokenization breaks raw text into smaller units (words or sentences) called tokens
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# Word-level tokenization: punctuation marks come out as separate tokens.
text = 'Best quality, Great sound, Good comfort with superb Headphone Design.'
word_tokenize(text)

['Best',
 'quality',
 ',',
 'Great',
 'sound',
 ',',
 'Good',
 'comfort',
 'with',
 'superb',
 'Headphone',
 'Design',
 '.']

In [None]:
# Sentence-level tokenization demo.
# A space must follow each sentence-ending period: the text previously read
# "gold.People" (no space), so the tokenizer returned that pair as one sentence.
text = 'Buying gold is considered auspicious on weddings, religious rituals, and special occasions. This is the reason why, even though the government and the Reserve Bank of India (RBI) introduced schemes like sovereign gold bonds, people remained inclined towards physical gold. People not only like to wear it as jewellery, but also consider it a safe investment that can be sold when needed. This is the reason why tonnes of gold are bought every year in the Indian market, and people’s interest in it never fades.'
sent_tokenize(text)

['Buying gold is considered auspicious on weddings, religious rituals, and special occasions.',
 'This is the reason why, even though the government and the Reserve Bank of India (RBI) introduced schemes like sovereign gold bonds, people remained inclined towards physical gold.People not only like to wear it as jewellery, but also consider it a safe investment that can be sold when needed.',
 'This is the reason why tonnes of gold are bought every year in the Indian market, and people’s interest in it never fades.']

In [None]:
# split(): plain whitespace splitting, shown for comparison with word_tokenize.
# Punctuation stays attached to the neighbouring word ('quality,', 'Design.').
text = 'Best quality, Great sound, Good comfort with superb Headphone Design.'
text.split()

['Best',
 'quality,',
 'Great',
 'sound,',
 'Good',
 'comfort',
 'with',
 'superb',
 'Headphone',
 'Design.']

In [None]:
# Filtering stop words: download NLTK's 'stopwords' corpus first.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Import the corpus reader under an alias so that the `stopwords` list built
# below does not rebind the reader's own name; `stopwords` stays a plain list
# of English stop words, which is what the filtering cells expect.
from nltk.corpus import stopwords as stopwords_corpus
stopwords = stopwords_corpus.words('english')

In [None]:
# Tokenize the sample sentence; `words` is the input of the stop-word filtering cells.
text = 'Best quality, great sound, Good comfort with superb Headphone Design.'
words = word_tokenize(text)

In [None]:
# Keep only the tokens whose case-folded form is not an English stop word.
filtered_list = []
for word in words:
    if word.casefold() in stopwords:
        continue  # stop word: drop it
    filtered_list.append(word)
filtered_list

['Best',
 'quality',
 ',',
 'Great',
 'sound',
 ',',
 'Good',
 'comfort',
 'superb',
 'Headphone',
 'Design',
 '.']

In [None]:
# Same loop-based stop-word filter as the previous cell, run again on `words`.
filtered_list = []
for word in words:
    if word.casefold() not in stopwords:
        filtered_list.append(word)
filtered_list

In [None]:
# Equivalent stop-word filter written as a list comprehension.
filtered_list = [word for word in words if word.casefold() not in stopwords]
filtered_list

['Best',
 'quality',
 ',',
 'great',
 'sound',
 ',',
 'Good',
 'comfort',
 'superb',
 'Headphone',
 'Design',
 '.']

In [None]:
# Stemming - keep only stem part of word and discard anything else
from nltk.stem import PorterStemmer
ps = PorterStemmer()
word = ['playing', 'plays', 'played', 'player']
# Stem every entry and print one result per line.
print('\n'.join(ps.stem(w) for w in word))

play
play
play
player


In [None]:
# Download the WordNet corpus before the WordNet lemmatizer cell runs.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Lemmatization - map each word to its dictionary form (lemma).
# lemmatize() is called with no part-of-speech argument here, and the output
# below leaves verb forms such as 'playing' and 'played' unchanged.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
word = ['playing', 'plays', 'played', 'player']
for w in word:
    print(lemmatizer.lemmatize(w))

playing
play
played
player


In [None]:
# Lemmatization vs. Stemming: Key Differences
# Lemmatization: Converts words to their base or dictionary form (lemma).
# Stemming: Reduces words to their root form (stem), which may not be a valid word.

In [None]:
words = ['Discovery', 'discover', 'discoveries', 'discovering']
# Lemmatize each form and print one lemma per line.
for lemma in map(lemmatizer.lemmatize, words):
    print(lemma)

Discovery
discover
discovery
discovering


In [None]:
words = ['Discovery', 'discover', 'discoveries', 'discovering']
# Run the same word forms through the Porter stemmer for comparison.
print(*[ps.stem(word) for word in words], sep='\n')

discoveri
discov
discoveri
discov


In [None]:
# Part-of-speech (POS) tagging -- mark each word as noun, adjective, etc.
# Download the averaged-perceptron tagger data (both package names) before tagging.
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
# Tokenize the sentence, then tag every token with its part of speech.
text = 'the quick brown fox jumps over the little lazy dog'
words = word_tokenize(text)
nltk.pos_tag(words)

[('the', 'DT'),
 ('quick', 'JJ'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('jumps', 'VBZ'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('little', 'JJ'),
 ('lazy', 'JJ'),
 ('dog', 'NN')]

In [None]:
# TextBlob.correct() fixes spelling mistakes in the text,
# but its accuracy is limited, so do not expect every error to be caught.
from textblob import TextBlob
text = TextBlob('I havv goood speling!')
text.correct()

TextBlob("I have good spelling!")

In [None]:
# Spell-correct a longer sentence and print the corrected text.
test = TextBlob('XYZ is a good compny and alays valule ttheir employees.')
test = test.correct()
print(test)

XYZ is a good company and always value their employees.


In [None]:
!pip install googletrans

Collecting googletrans
  Downloading googletrans-4.0.2-py3-none-any.whl.metadata (10 kB)
Downloading googletrans-4.0.2-py3-none-any.whl (18 kB)
Installing collected packages: googletrans
Successfully installed googletrans-4.0.2


In [None]:
# googletrans.LANGUAGES maps language codes to language names (e.g. 'af' -> 'afrikaans').
import googletrans
googletrans.LANGUAGES

{'abk': 'abkhaz',
 'ace': 'acehnese',
 'ach': 'acholi',
 'aar': 'afar',
 'af': 'afrikaans',
 'sq': 'albanian',
 'alz': 'alur',
 'am': 'amharic',
 'ar': 'arabic',
 'hy': 'armenian',
 'as': 'assamese',
 'ava': 'avar',
 'awa': 'awadhi',
 'ay': 'aymara',
 'az': 'azerbaijani',
 'ban': 'balinese',
 'bal': 'baluchi',
 'bm': 'bambara',
 'bci': 'baoulé',
 'bak': 'bashkir',
 'eu': 'basque',
 'btx': 'batak karo',
 'bts': 'batak simalungun',
 'bbc': 'batak toba',
 'be': 'belarusian',
 'bem': 'bemba',
 'bn': 'bengali',
 'bew': 'betawi',
 'bho': 'bhojpuri',
 'bik': 'bikol',
 'bs': 'bosnian',
 'bre': 'breton',
 'bg': 'bulgarian',
 'bua': 'buryat',
 'yue': 'cantonese',
 'ca': 'catalan',
 'ceb': 'cebuano',
 'cha': 'chamorro',
 'che': 'chechen',
 'zh': 'chinese',
 'zh-cn': 'chinese (simplified)',
 'zh-tw': 'chinese (traditional)',
 'chk': 'chuukese',
 'chv': 'chuvash',
 'co': 'corsican',
 'crh': 'crimean tatar',
 'hr': 'croatian',
 'cs': 'czech',
 'da': 'danish',
 'fa-af': 'dari',
 'dv': 'dhivehi',
 'di

In [None]:
# Single Translator instance, reused by the translation cell.
from googletrans import Translator
translator = Translator()

In [None]:
# Translate a greeting from Thai ('th') to English ('en') and print the result.
# translate() is awaited here, i.e. it is used as a coroutine.
from_lang =  'th'
to_lang = 'en'
text = 'อรุณสวัสดิ์'
res = await translator.translate(text, src=from_lang, dest=to_lang)
print(res.text)

good morning


In [None]:
!pip install langdetect
from langdetect import detect
detect('อรุณสวัสดิ์')



'th'

In [None]:
# detect_langs() lists candidate languages together with their probabilities.
from langdetect import detect_langs
detect_langs('Wie gehts?')

[de:0.9999953575421194]

In [None]:
# SENTIMENT ANALYSIS
# Download the 'vader_lexicon' package before creating the VADER analyzer.
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# Shared VADER analyzer; the scoring cells and sentimentFinder() all read `sid`.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [None]:
# Score a mostly positive review; prints the neg/neu/pos/compound values.
text = "The movie was amazing. I loved i badly"
score = sid.polarity_scores(text)
print(score)

{'neg': 0.225, 'neu': 0.217, 'pos': 0.558, 'compound': 0.6808}


In [None]:
# Score a plain factual statement (the output shows it as fully neutral).
text = 'i am watching a movie'
score = sid.polarity_scores(text)
print(score)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [None]:
# Score a sentence containing a negated "good" (the output shows a negative compound score).
text = 'i am having my dinner. but i is not very good'
score = sid.polarity_scores(text)
print(score)

{'neg': 0.3, 'neu': 0.7, 'pos': 0.0, 'compound': -0.5321}


In [None]:
# Sample inputs for sentimentFinder(): three product reviews plus one plain statement.
texts = [
    "I love this product! It works great and is very affordable.",
    "This product is okay. It gets the job done, but could be better.",
    "I hate this product. It doesn't work at all and is a waste of money.",
    "I am watching a movie"
]

def sentimentFinder(text, threshold=0.05, analyzer=None):
  """Classify ``text`` as 'Positive', 'Negative' or 'Neutral' from its compound score.

  Args:
    text: The string to score.
    threshold: Cut-off applied to the compound score. Scores >= threshold are
      'Positive' and scores <= -threshold are 'Negative'. Defaults to 0.05,
      the value that used to be hard-coded in this function.
    analyzer: Object exposing ``polarity_scores(text)`` that returns a mapping
      with a 'compound' entry. Defaults to the notebook-level ``sid``.

  Returns:
    One of the strings 'Positive', 'Negative' or 'Neutral'.
  """
  if analyzer is None:
    analyzer = sid  # fall back to the analyzer created earlier in the notebook
  compound = analyzer.polarity_scores(text)['compound']
  if compound >= threshold:
    return 'Positive'
  if compound <= -threshold:
    return 'Negative'
  return 'Neutral'

# Label every sample text and print it alongside its sentiment.
for text, sentiment in zip(texts, map(sentimentFinder, texts)):
  print(f"Text: {text}\nSentiment: {sentiment}\n")


Text: I love this product! It works great and is very affordable.
Sentiment: Positive

Text: This product is okay. It gets the job done, but could be better.
Sentiment: Positive

Text: I hate this product. It doesn't work at all and is a waste of money.
Sentiment: Negative

Text: I am watching a movie
Sentiment: Neutral

