## Tokenization
### Tokenization is the process to split the paragraphs into sentences or words.

In [3]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [4]:
corpus = """Hello my name is kuchhadiya punja. i study at ld college of engineering. i am currently enrolled in artificial intelligence and machine learning course of udemy. i am currently learning NLP! in NLP i am currently learning tokenization using NLTK."""

In [5]:
document = sent_tokenize(corpus, language='english')
for i in document:
    print(i)

Hello my name is kuchhadiya punja.
i study at ld college of engineering.
i am currently enrolled in artificial intelligence and machine learning course of udemy.
i am currently learning NLP!
in NLP i am currently learning tokenization using NLTK.


In [6]:
word_tokenize(corpus)

['Hello',
 'my',
 'name',
 'is',
 'kuchhadiya',
 'punja',
 '.',
 'i',
 'study',
 'at',
 'ld',
 'college',
 'of',
 'engineering',
 '.',
 'i',
 'am',
 'currently',
 'enrolled',
 'in',
 'artificial',
 'intelligence',
 'and',
 'machine',
 'learning',
 'course',
 'of',
 'udemy',
 '.',
 'i',
 'am',
 'currently',
 'learning',
 'NLP',
 '!',
 'in',
 'NLP',
 'i',
 'am',
 'currently',
 'learning',
 'tokenization',
 'using',
 'NLTK',
 '.']

## Stemming
### Stemming is the process of removing the prefix and suffix of the words
#### for example moves, moving, moved are basically 'move'. so it converts all of this to their stem

In [7]:
from nltk.stem import PorterStemmer

In [8]:
words = [
    "running", "jumps", "easily", "faster", "happily", "studies", "flies",  
    "driving", "eaten", "creating", "better", "stronger", "worse",  
    "fishing", "thoughtful", "walking", "unhappiness", "nationality",  
    "singing", "organization", "arguing", "simplified", "playing",  
    "working", "swimming", "quickly", "married", "loving", "caring"
]


In [9]:
ps = PorterStemmer()

In [10]:

for i in words:
    print((i)+"-------->"+ps.stem(i))
    

running-------->run
jumps-------->jump
easily-------->easili
faster-------->faster
happily-------->happili
studies-------->studi
flies-------->fli
driving-------->drive
eaten-------->eaten
creating-------->creat
better-------->better
stronger-------->stronger
worse-------->wors
fishing-------->fish
thoughtful-------->thought
walking-------->walk
unhappiness-------->unhappi
nationality-------->nation
singing-------->sing
organization-------->organ
arguing-------->argu
simplified-------->simplifi
playing-------->play
working-------->work
swimming-------->swim
quickly-------->quickli
married-------->marri
loving-------->love
caring-------->care


In [11]:
from nltk.stem import SnowballStemmer

ss = SnowballStemmer('english')

In [12]:
print(ps.stem('goes'))
print(ps.stem("fairly"))
print(ss.stem("goes"))
print(ss.stem("fairly"))

goe
fairli
goe
fair


## Lemmatization:
### Lemmatization is similar to stemming where the output is called 'lemma' which is the root word of the input word
#### as seen above stemming is great but it fails in some cases like goes

In [13]:
from nltk.stem import WordNetLemmatizer

lmtz = WordNetLemmatizer()

In [14]:

for i in words:
    print((i)+"-------->"+lmtz.lemmatize(i,pos='v'))

    ## much better result compared to stemming

running-------->run
jumps-------->jump
easily-------->easily
faster-------->faster
happily-------->happily
studies-------->study
flies-------->fly
driving-------->drive
eaten-------->eat
creating-------->create
better-------->better
stronger-------->stronger
worse-------->worse
fishing-------->fish
thoughtful-------->thoughtful
walking-------->walk
unhappiness-------->unhappiness
nationality-------->nationality
singing-------->sing
organization-------->organization
arguing-------->argue
simplified-------->simplify
playing-------->play
working-------->work
swimming-------->swim
quickly-------->quickly
married-------->marry
loving-------->love
caring-------->care


In [15]:
lmtz.lemmatize('goes')

'go'

## Stemming is fast but sometimes it dont give good results
## Lemmatizer is slow but better

## Stopwords:
###  all the extra words are called stopwords.
#### Eg:The,Their etc

In [16]:
from nltk.corpus import stopwords

In [17]:
para = "In the year 1960, APJ Abdul Kalam’s graduation took place from Madras Institute of Technology. The association of Kalam took place with the Defence Research & Development Service (DRDS). Furthermore, he joined as a scientist at the Aeronautical Development Establishment of the Defence Research and Development Organisation. These were the beginning achievements of his prestigious career as a scientist.Big achievement for Kalam came when he was the project director at ISRO of India‘s first-ever Satellite Launch Vehicle (SLV- III). This satellite was responsible for the deployment of the Rohini satellite in 1980. Moreover, Kalam was highly influential in the development of Polar Satellite Launch Vehicle (PSLV) and SLV projects.Both projects were successful. Bringing enhancement in the reputation of Kalam. Furthermore, the development of ballistic missiles was possible because of the efforts of this man. Most noteworthy, Kalam earned the esteemed title of “The missile Man of India”. The Government of India became aware of the brilliance of this man and made him the Chief Executive of the Integrated Guided Missiles Development Program (IGMDP). Furthermore, this program was responsible for the research and development of Missiles. The achievements of this distinguished man didn't stop there.More success was to come in the form of Agni and Prithvi missiles. Once again, Kalam was influential in the developments of these missiles. It was during his tenure in IGMDP that Kalam played an instrumental role in the developments of missiles like Agni and Prithvi. Moreover, Kamal was a key figure in the Pokhran II nuclear test."

In [18]:
print(para)



## for removing all punctuation
# from nltk.tokenize import RegexpTokenizer
# tokenizer = RegexpTokenizer(r'\w+')
# tokens = tokenizer.tokenize(para)

In the year 1960, APJ Abdul Kalam’s graduation took place from Madras Institute of Technology. The association of Kalam took place with the Defence Research & Development Service (DRDS). Furthermore, he joined as a scientist at the Aeronautical Development Establishment of the Defence Research and Development Organisation. These were the beginning achievements of his prestigious career as a scientist.Big achievement for Kalam came when he was the project director at ISRO of India‘s first-ever Satellite Launch Vehicle (SLV- III). This satellite was responsible for the deployment of the Rohini satellite in 1980. Moreover, Kalam was highly influential in the development of Polar Satellite Launch Vehicle (PSLV) and SLV projects.Both projects were successful. Bringing enhancement in the reputation of Kalam. Furthermore, the development of ballistic missiles was possible because of the efforts of this man. Most noteworthy, Kalam earned the esteemed title of “The missile Man of India”. The Go

In [19]:
para = word_tokenize(para)

In [20]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [21]:
para_stem= [ss.stem(word) for word in para if word not in set(stopwords.words())]

In [22]:
para_lmtz= [lmtz.lemmatize(word,pos='v') for word in para if word not in set(stopwords.words())]

In [23]:
" ".join(para_stem)

"in year 1960 , apj abdul kalam ’ graduat place madra institut technolog . the associ kalam place defenc research & develop servic ( drds ) . furthermor , join scientist aeronaut develop establish defenc research develop organis . these begin achiev prestigi career scientist.big achiev kalam project director isro india ‘ first-ev satellit launch vehicl ( slv- iii ) . this satellit respons deploy rohini satellit 1980 . moreov , kalam high influenti develop polar satellit launch vehicl ( pslv ) slv projects.both project success . bring enhanc reput kalam . furthermor , develop ballist missil possibl effort . most noteworthi , kalam earn esteem titl “ the missil man india ” . the govern india awar brillianc made chief execut integr guid missil develop program ( igmdp ) . furthermor , program respons research develop missil . the achiev distinguish n't stop there.mor success form agni prithvi missil . onc , kalam influenti develop missil . it tenur igmdp kalam play instrument role develop 

In [24]:
para_lmtzz = " ".join(para_lmtz)

In [27]:
lwr_lmtz = []
for word in para_lmtzz:
    lwr_lmtz.append(word.lower())

In [28]:
print(para_stem)
print(lwr_lmtz)

['in', 'year', '1960', ',', 'apj', 'abdul', 'kalam', '’', 'graduat', 'place', 'madra', 'institut', 'technolog', '.', 'the', 'associ', 'kalam', 'place', 'defenc', 'research', '&', 'develop', 'servic', '(', 'drds', ')', '.', 'furthermor', ',', 'join', 'scientist', 'aeronaut', 'develop', 'establish', 'defenc', 'research', 'develop', 'organis', '.', 'these', 'begin', 'achiev', 'prestigi', 'career', 'scientist.big', 'achiev', 'kalam', 'project', 'director', 'isro', 'india', '‘', 'first-ev', 'satellit', 'launch', 'vehicl', '(', 'slv-', 'iii', ')', '.', 'this', 'satellit', 'respons', 'deploy', 'rohini', 'satellit', '1980', '.', 'moreov', ',', 'kalam', 'high', 'influenti', 'develop', 'polar', 'satellit', 'launch', 'vehicl', '(', 'pslv', ')', 'slv', 'projects.both', 'project', 'success', '.', 'bring', 'enhanc', 'reput', 'kalam', '.', 'furthermor', ',', 'develop', 'ballist', 'missil', 'possibl', 'effort', '.', 'most', 'noteworthi', ',', 'kalam', 'earn', 'esteem', 'titl', '“', 'the', 'missil', 'm