In [1]:
import nltk
from nltk.stem import PorterStemmer

In [2]:
stemming = PorterStemmer()


In [3]:
words = ['studying', 'ninez', 'care', 'fairly', 'easily', 'run', 'running', 'runner', 'ran', 'runs', 'going', 'goes', 'gone', 'go', 'history', 'historical']

In [4]:
words

['studying',
 'ninez',
 'care',
 'fairly',
 'easily',
 'run',
 'running',
 'runner',
 'ran',
 'runs',
 'going',
 'goes',
 'gone',
 'go',
 'history',
 'historical']

In [5]:
for word in words:
    print(word + '----->' + stemming.stem(word))

studying----->studi
ninez----->ninez
care----->care
fairly----->fairli
easily----->easili
run----->run
running----->run
runner----->runner
ran----->ran
runs----->run
going----->go
goes----->goe
gone----->gone
go----->go
history----->histori
historical----->histor


# Lemmatization

In [6]:
from nltk.stem import WordNetLemmatizer

In [7]:
lemmatizer = WordNetLemmatizer()

In [8]:
for word in words:
    print(word + '--->' + lemmatizer.lemmatize(word))

studying--->studying
ninez--->ninez
care--->care
fairly--->fairly
easily--->easily
run--->run
running--->running
runner--->runner
ran--->ran
runs--->run
going--->going
goes--->go
gone--->gone
go--->go
history--->history
historical--->historical


In [9]:
editorial = 'IIT Bombay was established in 1958. This year it placed among the top 150 universities in the QS World University Ranking. That’s the kind of headline that will do its students a world of good, as alumni and employers take note across various countries. But then there is also a very different kind of ground being broken here in 2023: Some tables in the mess have been reserved for vegetarians. An email from the mess administration reportedly said this was because “some people” cannot stomach the look or smell of non-vegetarian food. Students who have protested have been fined. Faculty members who support the protesting students credibly argue that what’s taking place here is a dangerous institutionalisation of a puritypollution matrix, which can also be deeply casteist.'

In [10]:
editorial

'IIT Bombay was established in 1958. This year it placed among the top 150 universities in the QS World University Ranking. That’s the kind of headline that will do its students a world of good, as alumni and employers take note across various countries. But then there is also a very different kind of ground being broken here in 2023: Some tables in the mess have been reserved for vegetarians. An email from the mess administration reportedly said this was because “some people” cannot stomach the look or smell of non-vegetarian food. Students who have protested have been fined. Faculty members who support the protesting students credibly argue that what’s taking place here is a dangerous institutionalisation of a puritypollution matrix, which can also be deeply casteist.'

In [11]:
from nltk.corpus import stopwords

In [12]:
sent = nltk.sent_tokenize(editorial)

In [13]:
sent

['IIT Bombay was established in 1958.',
 'This year it placed among the top 150 universities in the QS World University Ranking.',
 'That’s the kind of headline that will do its students a world of good, as alumni and employers take note across various countries.',
 'But then there is also a very different kind of ground being broken here in 2023: Some tables in the mess have been reserved for vegetarians.',
 'An email from the mess administration reportedly said this was because “some people” cannot stomach the look or smell of non-vegetarian food.',
 'Students who have protested have been fined.',
 'Faculty members who support the protesting students credibly argue that what’s taking place here is a dangerous institutionalisation of a puritypollution matrix, which can also be deeply casteist.']

In [15]:
for i in range(len(sent)):
    words = nltk.word_tokenize(sent[i])
    words = [stemming.stem(word) for word in words if word not in set(stopwords.words('english'))]
    sent[i] = ' '.join(words)

In [16]:
sent

['iit bombay establish 1958 .',
 'thi year place among top 150 univers qs world univers rank .',
 'that ’ kind headlin student world good , alumni employ take note across variou countri .',
 'but also differ kind ground broken 2023 : some tabl mess reserv vegetarian .',
 'an email mess administr reportedli said “ peopl ” stomach look smell non-vegetarian food .',
 'student protest fine .',
 'faculti member support protest student credibl argu ’ take place danger institutionalis puritypollut matrix , also deepli casteist .']

In [17]:
sent_1 = nltk.sent_tokenize(editorial)
for i in range(len(sent_1)):
    words = nltk.word_tokenize(sent_1[i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    sent_1[i] = ' '.join(words)
sent_1

['IIT Bombay established 1958 .',
 'This year placed among top 150 university QS World University Ranking .',
 'That ’ kind headline student world good , alumnus employer take note across various country .',
 'But also different kind ground broken 2023 : Some table mess reserved vegetarian .',
 'An email mess administration reportedly said “ people ” stomach look smell non-vegetarian food .',
 'Students protested fined .',
 'Faculty member support protesting student credibly argue ’ taking place dangerous institutionalisation puritypollution matrix , also deeply casteist .']

In [18]:
words

['Faculty',
 'member',
 'support',
 'protesting',
 'student',
 'credibly',
 'argue',
 '’',
 'taking',
 'place',
 'dangerous',
 'institutionalisation',
 'puritypollution',
 'matrix',
 ',',
 'also',
 'deeply',
 'casteist',
 '.']

## Regex

In [23]:
import re

In [25]:
input = 'thhhhhhe film Titanic wassss released in 1988'
result = re.match(r'[a-zA-Z]+', input)
print(input)
print(result)

thhhhhhe film Titanic wassss released in 1988
<re.Match object; span=(0, 8), match='thhhhhhe'>


In [27]:
input = '1988 the film titanic was released'
result = re.match(r'[a-zA-Z]+', input)
print(input)
print(result)

1988 the film titanic was released
None


In [28]:
re.match(r'dog', 'dog cat dog')

<re.Match object; span=(0, 3), match='dog'>

In [33]:
match = re.match(r'dog', 'dog cat dog')
match.group(0)

'dog'

In [30]:
match.group()

'dog'

# Accessing Text Corpora - Mini Project

In [36]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [37]:
from nltk.corpus import movie_reviews, product_reviews_1, product_reviews_2

In [38]:
len(movie_reviews.fileids())

2000

In [39]:
len(product_reviews_1.fileids())

6

In [40]:
len(product_reviews_2.fileids())

10

In [41]:
movie_reviews.fileids()

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [42]:
product_reviews_2.words('norton.txt')

['I', 'have', 'read', 'the', 'installation', ...]

In [43]:
entire_text = ' '.join(product_reviews_2.words('norton.txt'))
entire_text



In [44]:
sentences = nltk.sent_tokenize(entire_text)
len(sentences)

403

## Cleaning / Pre-Processing the data

In [46]:
clean_paragraph = []
for i in range(len(sentences)):
    mysent = re.sub(r'[^a-zA-Z]', ' ', sentences[i])
    mysent = mysent.lower()
    mysent = mysent.split()
    mysent = [lemmatizer.lemmatize(word) for word in mysent if not word in set(stopwords.words('english'))]
    mysent = ' '.join(mysent)
    clean_paragraph.append(mysent)

clean_paragraph

['read installation instruction ni nav prior installation still ended result junk software',
 'install type software installs work properly',
 'installed either one norton product neither work installation',
 'computer owner since purchased mcafee anti virus installs work fine problem',
 'used norton past year last year software gotten disgraceful',
 'glad work norton',
 'used norton product past familiar',
 'bought ni recently try',
 'attempted installs finally given',
 'thankful roxio goback use time',
 'install ni would either freeze reboot properly asked',
 'm scandisk would appear due manual shutdown would scan nd bar hr shut used goback',
 'time install work error used unistall ni icon properly removed',
 'never problem software computer',
 'ni worked like charm last year',
 'past norton customer consider slap face',
 'buying software installed normally put position non working computer',
 'norton downhill slid recently product one fallen cliff',
 'loyal customer norton symantec 

In [49]:
movie_reviews.words('neg/cv010_29063.txt')

['best', 'remembered', 'for', 'his', 'understated', ...]

In [50]:
entire_text = ' '.join(movie_reviews.words('neg/cv010_29063.txt'))

In [51]:
entire_text

"best remembered for his understated performance as dr . hannibal lecter in michael mann ' s forensics thriller , manhunter , scottish character actor brian cox brings something special to every movie he works on . usually playing a bit role in some studio schlock ( he dies halfway through the long kiss goodnight ) , he ' s only occasionally given something meaty and substantial to do . if you want to see some brilliant acting , check out his work as a dogged police inspector opposite frances mcdormand in ken loach ' s hidden agenda . cox plays the role of big john harrigan in the disturbing new indie flick l . i . e . , which lot 47 picked up at sundance when other distributors were scared to budge . big john feels the love that dares not speak its name , but he expresses it through seeking out adolescents and bringing them back to his pad . what bothered some audience members was the presentation of big john in an oddly empathetic light . he ' s an even - tempered , funny , robust ol

In [52]:
sentences = nltk.sent_tokenize(entire_text)
len(sentences)

56

In [53]:
sentences

['best remembered for his understated performance as dr .',
 "hannibal lecter in michael mann ' s forensics thriller , manhunter , scottish character actor brian cox brings something special to every movie he works on .",
 "usually playing a bit role in some studio schlock ( he dies halfway through the long kiss goodnight ) , he ' s only occasionally given something meaty and substantial to do .",
 "if you want to see some brilliant acting , check out his work as a dogged police inspector opposite frances mcdormand in ken loach ' s hidden agenda .",
 'cox plays the role of big john harrigan in the disturbing new indie flick l .',
 'i .',
 'e .',
 ', which lot 47 picked up at sundance when other distributors were scared to budge .',
 'big john feels the love that dares not speak its name , but he expresses it through seeking out adolescents and bringing them back to his pad .',
 'what bothered some audience members was the presentation of big john in an oddly empathetic light .',
 "he '

In [60]:
clean_paragraph = []
for i in range(len(sentences)):
    mysent = sentences[i]
    mysent = re.sub(r'[^a-zA-Z]', ' ', mysent)
    mysent = mysent.lower()
    mysent = mysent.split()
    mysent = [lemmatizer.lemmatize(word) for word in mysent if not word in set(stopwords.words('english'))]
    mysent = ' '.join(mysent)
    clean_paragraph.append(mysent)

clean_paragraph

['best remembered understated performance dr',
 'hannibal lecter michael mann forensics thriller manhunter scottish character actor brian cox brings something special every movie work',
 'usually playing bit role studio schlock dy halfway long kiss goodnight occasionally given something meaty substantial',
 'want see brilliant acting check work dogged police inspector opposite france mcdormand ken loach hidden agenda',
 'cox play role big john harrigan disturbing new indie flick l',
 '',
 'e',
 'lot picked sundance distributor scared budge',
 'big john feel love dare speak name express seeking adolescent bringing back pad',
 'bothered audience member presentation big john oddly empathetic light',
 'even tempered funny robust old man actually listens kid problem opposed parent friend caught high wire act confused life',
 'sex pay elaborate courtship charming temptation grown world',
 'l',
 '',
 'e',
 'stand long island expressway slice strip mall middle class home suburbia',
 'filmmaker

In [54]:
words

['Faculty',
 'member',
 'support',
 'protesting',
 'student',
 'credibly',
 'argue',
 '’',
 'taking',
 'place',
 'dangerous',
 'institutionalisation',
 'puritypollution',
 'matrix',
 ',',
 'also',
 'deeply',
 'casteist',
 '.']

In [55]:
wor = ' '.join(words)

In [61]:
clean_paragraph = []
for i in range(len(sentences)):
    mysent = re.sub(r'[^a-zA-Z]', ' ', sentences[i])
    mysent = mysent.lower()
    mysent = mysent.split()
    mysent = [lemmatizer.lemmatize(word) for word in mysent if not word in set(stopwords.words('english'))]
    mysent = ' '.join(mysent)
    clean_paragraph.append(mysent)

clean_paragraph

['best remembered understated performance dr',
 'hannibal lecter michael mann forensics thriller manhunter scottish character actor brian cox brings something special every movie work',
 'usually playing bit role studio schlock dy halfway long kiss goodnight occasionally given something meaty substantial',
 'want see brilliant acting check work dogged police inspector opposite france mcdormand ken loach hidden agenda',
 'cox play role big john harrigan disturbing new indie flick l',
 '',
 'e',
 'lot picked sundance distributor scared budge',
 'big john feel love dare speak name express seeking adolescent bringing back pad',
 'bothered audience member presentation big john oddly empathetic light',
 'even tempered funny robust old man actually listens kid problem opposed parent friend caught high wire act confused life',
 'sex pay elaborate courtship charming temptation grown world',
 'l',
 '',
 'e',
 'stand long island expressway slice strip mall middle class home suburbia',
 'filmmaker

# Translation and Data Preprocessing Practice

In [3]:
from deep_translator import GoogleTranslator
jap = '若い世代を中心にネットでできる性格診断が流行している。いくつかの質問に答えると16類型に分けた性格から自分に当てはまるものを教えてくれる。SNSで幅広い情報を発信し、性格診断について言及することもあるお笑い芸人の九月さん（32）は「冗談半分だった性格診断が本格志向になり、少し危うい」と話す。私たちは性格診断とどう付き合っていけばいいのか、九月さんに聞いた。'
res = GoogleTranslator(source = 'ja', target = 'bn').translate(jap)
from gtts import gTTS
res_1 = gTTS(res, lang = 'bn')
res_1.save('jp_to_bn_1.mp3')

In [4]:
res

"অনলাইন ব্যক্তিত্ব পরীক্ষা জনপ্রিয় হয়ে উঠছে, বিশেষ করে তরুণ প্রজন্মের মধ্যে। আপনি যখন কয়েকটি প্রশ্নের উত্তর দেবেন, তখন এটি আপনাকে বলবে যে 16 ধরনের ব্যক্তিত্বের উপর ভিত্তি করে কোনটি আপনার জন্য উপযুক্ত। মিস্টার সেপ্টেম্বর (৩২), একজন কৌতুক অভিনেতা যিনি সোশ্যাল মিডিয়ায় বিস্তৃত তথ্য শেয়ার করেন এবং প্রায়শই ব্যক্তিত্ব পরীক্ষার কথা উল্লেখ করেন, বলেছেন, ``ব্যক্তিত্ব পরীক্ষা, যা অর্ধ-তামাশা করা হতো, তা আরও গুরুতর হয়ে উঠেছে এবং এটি কিছুটা বিপজ্জনক। .'' আমরা মিঃ সেপ্টেম্বরকে জিজ্ঞাসা করেছি যে আমাদের ব্যক্তিত্বের পরীক্ষাগুলি কীভাবে মোকাবেলা করা উচিত।"

In [17]:
import speech_recognition as sr
import pyaudio

recog = sr.Recognizer()

with sr.Microphone() as source:
    print('SPeak something.......')
    audio = recog.listen(source)

try:
    text = recog.recognize_google(audio)
    print('You said.... ',text)

except sr.UnknownValueError:
    print('Not Audible')

except:
    print('still no luck')

text_to_bn = GoogleTranslator(source = 'en', target = 'bn').translate(text)
res = gTTS(text_to_bn, lang = 'bn')
audio = 'translated_bn.mp3'
res.save(audio)
playsound(audio)

SPeak something.......
You said....  I live in Ajmer


In [13]:
from playsound import playsound

In [12]:
!pip install playsound

Collecting playsound
  Downloading playsound-1.3.0.tar.gz (7.7 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: playsound
  Building wheel for playsound (setup.py): started
  Building wheel for playsound (setup.py): finished with status 'done'
  Created wheel for playsound: filename=playsound-1.3.0-py3-none-any.whl size=7043 sha256=9aa64c87c032403dfe6a0474d5ac9643272d064be1382d66088f517028d66608
  Stored in directory: c:\users\dell\appdata\local\pip\cache\wheels\cf\42\ff\7c587bae55eec67b909ca316b250d9b4daedbf272a3cbeb907
Successfully built playsound
Installing collected packages: playsound
Successfully installed playsound-1.3.0
