## Library installation

In [None]:
pip install nltk spacy textblob -U

In [1]:
import string

import nltk

In [7]:
# Fetch the NLTK data packages used in this notebook (packages already present are reported as up-to-date).
nltk.download('punkt') ## sentence and word tokenization models
nltk.download('stopwords') ## stop-word lists for stop-word removal
nltk.download('averaged_perceptron_tagger') ## part-of-speech (POS) tagging
nltk.download('wordnet') ## WordNet database, used for lemmatization
nltk.download('omw-1.4') ## Open Multilingual Wordnet data that accompanies WordNet (not a stemmer)
nltk.download('indian') ## POS-tagged corpus of Indian languages
nltk.download('maxent_ne_chunker') ## named-entity chunking

[nltk_data] Downloading package punkt to C:\Users\Administrator.DAI-
[nltk_data]     PC2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator.DAI-
[nltk_data]     PC2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Administrator.DAI-
[nltk_data]     PC2\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Administrator.DAI-
[nltk_data]     PC2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Administrator.DAI-
[nltk_data]     PC2\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package indian to C:\U

True

## Sample example

In [24]:
## Find the average of the ages mentioned in the sentence defined in the next cell.

In [30]:
sent = 'They told that their ages are 25   27 and 31 respectively.'

# Collect every space-separated piece that consists solely of digits.
# Splitting on a single " " leaves empty strings where spaces repeat;
# those fail isdigit() and are skipped together with the ordinary words.
ages = []
for piece in sent.split(" "):
    if not piece.isdigit():
        continue
    ages.append(int(piece))

# Arithmetic mean of the collected ages.
print(sum(ages) / len(ages))

27.666666666666668


In [31]:
# Same average in compact form: keep the all-digit words and convert each to int.
# str.split() without an argument already collapses runs of whitespace.
ages = list(map(int, filter(str.isdigit, sent.split())))
print(sum(ages) / len(ages))

27.666666666666668


In [32]:
import numpy as np
np.mean([int(word) for word in sent.split() if word.isdigit()])

27.666666666666668

## Tokenization

In [33]:
sent = 'Hello friends! How are you? Welcome to Python Programming.'

In [34]:
## import the functions for tokenization
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [35]:
#segmentation
sent_tokenize(sent)

['Hello friends!', 'How are you?', 'Welcome to Python Programming.']

In [36]:
word_tokenize(sent)

['Hello',
 'friends',
 '!',
 'How',
 'are',
 'you',
 '?',
 'Welcome',
 'to',
 'Python',
 'Programming',
 '.']

In [39]:
# Find the percentage of punctuation tokens present in the sentence.

tokens = word_tokenize(sent)  # tokenize once and reuse the result

# A token counts as punctuation only when every character in it is an ASCII
# punctuation mark. Testing `not isalnum()` instead would also count tokens
# such as "n't", "'s", dates or hyphenated words, which are not punctuation.
punct_count = 0

for token in tokens:
    if all(ch in string.punctuation for ch in token):
        punct_count += 1

print(punct_count / len(tokens) * 100)

25.0


In [42]:
# Percentage of punctuation tokens, comprehension form.
# A token is punctuation only if all of its characters are ASCII punctuation marks;
# `not isalnum()` would also count tokens such as "'s" or hyphenated words.
tokens = word_tokenize(sent)  # tokenize once and reuse the result
punct_count = len([word for word in tokens if all(ch in string.punctuation for ch in word)])
punct_count / len(tokens) * 100

25.0

## ASCII code

In [43]:
ord('#') # get ascii value

35

In [44]:
import sys
sys.getsizeof('#')  ## size of the str object in bytes: 50 here for a single ASCII character, most of it fixed object overhead

50

In [46]:
sys.getsizeof('abcdfgh') ## 56 bytes: going by the 50 bytes reported for one character, about 49 bytes of overhead + 7 characters x 1 byte (not 8*7)

56

In [51]:
chr(75) ## returns character associated with number

'K'

## Unicode (universal character code)

In [52]:
char = '\u0935' ## Unicode escape: the character whose code point is U+0935
print(char)

व


In [53]:
char = '\u0935\u0940'
print(char)

वी


In [54]:
chr(2358)

'श'

In [55]:
chr(0x935) # hexadecimal 

'व'

In [56]:
sys.getsizeof('व') ## size in bytes: 76 here, so one non-ASCII character gives a larger str object than the 50 bytes seen for one ASCII character

76

In [57]:
name = 'शुभम शहा'
name.split()

['शुभम', 'शहा']

In [58]:
name.startswith('श')

True

In [59]:
name.replace('श', 'व')

'वुभम वहा'

In [60]:
name.find('म')

3

In [61]:
len(name)

8

In [63]:
name[1]

'ु'

## Text in regional languages.

mytext = '४८ कि. मी. अंतरावर आणि पुणे जिल्ह्यातील वेल्हे तालुक्यात व भोर गावाच्या वायव्येला २४ कि.मी. अंतरावर नीरा-वेळवंडी-कानंदी आणि गुंजवणी या नद्यांच्या खोऱ्यांच्या बेचक्यात मुरुंबदेवाचा डोंगर उभा आहे. मावळ भागामध्ये राज्यविस्तार साध्य करण्यासाठी राजगड आणि तोरणा हे दोन्ही किल्ले मोक्याच्या ठिकाणी होते. तोरणा Archived 2020-09-20 at the Wayback Machine. किल्ल्याचा बालेकिल्ला आकाराने लहान असल्यामुळे राजकीय केंद्र म्हणून हा किल्ला सोयीचा नव्हता. त्यामानाने राजगड दुर्गम असून त्याचा बालेकिल्ला बराच मोठा आहे. शिवाय राजगडाकडे कोणत्याही बाजूने येताना एखादी टेकडी किंवा नदी ओलांडावीच लागते. एवढी सुरक्षितता होती,म्हणून आपले राजकीय केंद्र म्हणून शिवाजी महाराजांनी Archived 2020-03-18 at the Wayback Machine. राजगडाची निवड केली. राजगडाला तीन माच्या व एक बालेकिल्ला आहे. राजगडचा बालेकिल्ला खूप उंच असून त्याची समुद्रसपाटीपासूनची उंची १३९४ मीटर आहे. दुर्गराज राजगड त्यांच्या महत्त्वाकांक्षेची उंची दाखवतो, तर किल्ले रायगड हा शिवाजी महाराजांच्या कर्तृत्वाचा विस्तार दाखवतो. राजगडाच्या मध्यवर्ती ठिकाणी उंच डोंगर तासून तयार केलेला बालेकिल्ला म्हणजे पृथ्वीने स्वर्गावर केलेली स्वारी होय.'
mytext

In [65]:
word_tokenize(mytext)

['४८',
 'कि',
 '.',
 'मी',
 '.',
 'अंतरावर',
 'आणि',
 'पुणे',
 'जिल्ह्यातील',
 'वेल्हे',
 'तालुक्यात',
 'व',
 'भोर',
 'गावाच्या',
 'वायव्येला',
 '२४',
 'कि.मी',
 '.',
 'अंतरावर',
 'नीरा-वेळवंडी-कानंदी',
 'आणि',
 'गुंजवणी',
 'या',
 'नद्यांच्या',
 'खोऱ्यांच्या',
 'बेचक्यात',
 'मुरुंबदेवाचा',
 'डोंगर',
 'उभा',
 'आहे',
 '.',
 'मावळ',
 'भागामध्ये',
 'राज्यविस्तार',
 'साध्य',
 'करण्यासाठी',
 'राजगड',
 'आणि',
 'तोरणा',
 'हे',
 'दोन्ही',
 'किल्ले',
 'मोक्याच्या',
 'ठिकाणी',
 'होते',
 '.',
 'तोरणा',
 'Archived',
 '2020-09-20',
 'at',
 'the',
 'Wayback',
 'Machine',
 '.',
 'किल्ल्याचा',
 'बालेकिल्ला',
 'आकाराने',
 'लहान',
 'असल्यामुळे',
 'राजकीय',
 'केंद्र',
 'म्हणून',
 'हा',
 'किल्ला',
 'सोयीचा',
 'नव्हता',
 '.',
 'त्यामानाने',
 'राजगड',
 'दुर्गम',
 'असून',
 'त्याचा',
 'बालेकिल्ला',
 'बराच',
 'मोठा',
 'आहे',
 '.',
 'शिवाय',
 'राजगडाकडे',
 'कोणत्याही',
 'बाजूने',
 'येताना',
 'एखादी',
 'टेकडी',
 'किंवा',
 'नदी',
 'ओलांडावीच',
 'लागते',
 '.',
 'एवढी',
 'सुरक्षितता',
 'होती',
 ',',
 'म्हणून',
 'आपल

## Read file 

In [75]:
with open("mydata.txt") as myfile:
    print(myfile.readlines())

['Hello Friends! How are you?\n', 'Welcome to the world of Python Programming.']


In [84]:
# Read the whole file into one string. The with-block closes the file handle
# as soon as the read finishes, even if it raises.
with open("mydata.txt",'r') as myfile:
    data = myfile.read()
print(data)

Hello Friends! 	How are you?
Welcome to the world of 	Python Programming.


### Space tokenizer

In [85]:
from nltk.tokenize import SpaceTokenizer ## splits on the space character only

## create the tokenizer object
tk = SpaceTokenizer()

## tokenize the file contents; tabs and newlines are not separators here,
## so they stay inside tokens such as '\tHow' and 'you?\nWelcome'
tk.tokenize(data)

['Hello',
 'Friends!',
 '\tHow',
 'are',
 'you?\nWelcome',
 'to',
 'the',
 'world',
 'of',
 '\tPython',
 'Programming.']

### Tab tokenizer

In [86]:
from nltk.tokenize import TabTokenizer ## splits on the tab character only

## create the tokenizer object
tk = TabTokenizer()

## tokenize the file contents; spaces and newlines stay inside the resulting pieces
tk.tokenize(data)

['Hello Friends! ',
 'How are you?\nWelcome to the world of ',
 'Python Programming.']

### Line tokenizer

In [87]:
from nltk.tokenize import LineTokenizer ## splits the text into its lines (separator is \n)

## create the tokenizer object
tk = LineTokenizer()

## tokenize the file contents; spaces and tabs stay inside each line
tk.tokenize(data)

['Hello Friends! \tHow are you?',
 'Welcome to the world of \tPython Programming.']

### White space tokenizer

In [90]:
from nltk.tokenize import WhitespaceTokenizer ## splits on any run of spaces, tabs or newlines

## create the tokenizer object
tk = WhitespaceTokenizer()

## tokenize the file contents; punctuation stays attached to the words (e.g. 'Friends!')
tk.tokenize(data)

['Hello',
 'Friends!',
 'How',
 'are',
 'you?',
 'Welcome',
 'to',
 'the',
 'world',
 'of',
 'Python',
 'Programming.']

### MWE tokenizer

In [94]:
sent1 = '''The Van Rossum is Python creator, visiting Pune this week. The 
developement community is eager to meet Van Rossum'''

print(sent1)

print(word_tokenize(sent1))

The Van Rossum is Python creator, visiting Pune this week. The 
developement community is eager to meet Van Rossum
['The', 'Van', 'Rossum', 'is', 'Python', 'creator', ',', 'visiting', 'Pune', 'this', 'week', '.', 'The', 'developement', 'community', 'is', 'eager', 'to', 'meet', 'Van', 'Rossum']


In [97]:
from nltk.tokenize import MWETokenizer ## merges registered multi-word expressions into single tokens;
                                        # it takes the token list produced by another tokenizer as input and
                                        # joins the parts with a separator (default '_')

## create an object that joins the parts with a space instead of the default
tk = MWETokenizer(separator=' ')

## add a multi-word expression
tk.add_mwe(('Van', 'Rossum'))

## tokenize the word tokens of sent1
tk.tokenize(word_tokenize(sent1))

['The',
 'Van Rossum',
 'is',
 'Python',
 'creator',
 ',',
 'visiting',
 'Pune',
 'this',
 'week',
 '.',
 'The',
 'developement',
 'community',
 'is',
 'eager',
 'to',
 'meet',
 'Van Rossum']

### Tweet tokenizer

In [104]:
sent = 'Hello Friends :)! How are you? Welcome to the world of Python Programming. :D :|'

from nltk.tokenize import TweetTokenizer ## word tokenizer that keeps emoticons such as ':)' and ':D' as single tokens

## create the tokenizer object
tk = TweetTokenizer()

## tokenize the sentence
tk.tokenize(sent)

['Hello',
 'Friends',
 ':)',
 '!',
 'How',
 'are',
 'you',
 '?',
 'Welcome',
 'to',
 'the',
 'world',
 'of',
 'Python',
 'Programming',
 '.',
 ':D',
 ':|',
 ':',
 'O']

In [111]:
# Read the UTF-8 encoded file into one string. The with-block closes the file
# handle as soon as the read finishes, even if it raises.
with open("mydata1.txt",encoding='utf-8') as f:
    data = f.read()
print(data)

Hello Friends 😀! 	How are you?🤚
Welcome 🙏 to the world🌍 of 	Python 💻Programming.


In [112]:
word_tokenize(data)

['Hello',
 'Friends',
 '😀',
 '!',
 'How',
 'are',
 'you',
 '?',
 '🤚',
 'Welcome',
 '🙏',
 'to',
 'the',
 'world🌍',
 'of',
 'Python',
 '💻Programming',
 '.']

In [113]:
## tokenize the data
tk.tokenize(data)

['Hello',
 'Friends',
 '😀',
 '!',
 'How',
 'are',
 'you',
 '?',
 '🤚',
 'Welcome',
 '🙏',
 'to',
 'the',
 'world',
 '🌍',
 'of',
 'Python',
 '💻',
 'Programming',
 '.']

### Custom tokenizer

In [116]:
import re

def custom_tokenizer(text):
    """Split text into tokens on whitespace and the characters . , : ? !

    A run of consecutive delimiters acts as a single separator. The empty
    strings that re.split yields when the text starts or ends with a
    delimiter (for example a sentence ending in "?") are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the non-empty pieces found between delimiters.
    """
    return [token for token in re.split(r"[.,:?!\s]+", text) if token]

text = "This is some text with punctuation > Let's tokenize it. Is it ok?"

tokens = custom_tokenizer(text)

print("Tokens : ")
for token in tokens:
    print(token)

Tokens : 
Th
s
s
some
text
w
th
punctuat
on
>
Let's
token
ze
t
Is
t
ok



In [None]:
## mitu.co.in/dataset/student3.tsv

In [154]:
# Read the whole TSV file into one string. The with-block closes the file
# handle as soon as the read finishes, even if it raises.
with open('student3.tsv') as f:
    data = f.read()
print(data)

roll	name	class	marks	age
1	anil	TE	56.77	22
2	amit	TE	59.77	21
3	aniket	BE	76.88	19
4	ajinkya	TE	69.66	20
5	asha	TE	63.28	20
6	ayesha	BE	49.55	20
7	amar	BE	65.34	19
8	amita	BE	68.33	23
9	amol	TE	56.75	20
10	anmol	BE	78.66	21



In [158]:
# Parse the tab-separated text into a list of rows, converting numeric fields.
#
# NOTE: the result keeps the name `list` because a later cell reads `list[1:-1]`,
# but this shadows the built-in list type for the rest of the session.
# Splitting on '\n' keeps the header as the first row and, when the text ends
# with a newline, an empty final row; the `[1:-1]` slice appears to rely on both,
# so the row layout is left unchanged here.
def _parse_field(field):
    """Return field as an int or float when it is numeric, otherwise unchanged."""
    try:
        if field.isdigit():
            return int(field)
        if '.' in field:
            return float(field)
    except ValueError:
        # Text that only looks numeric, e.g. an abbreviation such as "a.k."
        # or a digit-like symbol such as "²": keep it as a string.
        pass
    return field


list = []

for line in data.split('\n'):
    list.append([_parse_field(field) for field in line.split('\t')])

In [160]:
list[1:-1]

[[1, 'anil', 'TE', 56.77, 22],
 [2, 'amit', 'TE', 59.77, 21],
 [3, 'aniket', 'BE', 76.88, 19],
 [4, 'ajinkya', 'TE', 69.66, 20],
 [5, 'asha', 'TE', 63.28, 20],
 [6, 'ayesha', 'BE', 49.55, 20],
 [7, 'amar', 'BE', 65.34, 19],
 [8, 'amita', 'BE', 68.33, 23],
 [9, 'amol', 'TE', 56.75, 20],
 [10, 'anmol', 'BE', 78.66, 21]]