In [2]:
paragraph = """
The name 'Nepal' is first recorded in texts from the Vedic period of the Indian subcontinent, the era in ancient Nepal when Hinduism was founded,
the predominant religion of the country. In the middle of the first millennium BC, Gautama Buddha, the founder of Buddhism, was born in Lumbini in 
southern Nepal. Parts of northern Nepal were intertwined with the culture of Tibet. The centrally located Kathmandu Valley is intertwined with the culture 
of Indo-Aryans, and was the seat of the prosperous Newar confederacy known as Nepal Mandala. The Himalayan branch of the ancient Silk Road was dominated by the valley's traders.
The cosmopolitan region developed distinct traditional art and architecture. By the 18th century, the Gorkha Kingdom achieved the unification of Nepal. The Shah dynasty established
the Kingdom of Nepal and later formed an alliance with the British Empire, under its Rana dynasty of premiers. The country was never colonised but served as a buffer state between Imperial 
China and British India. Parliamentary democracy was introduced in 1951 but was twice suspended by Nepalese monarchs, in 1960 and 2005. The Nepalese Civil War in the 1990s and early 2000s 
resulted in the establishment 
of a secular republic in 2008, ending the world's last Hindu monarchy."

"""

In [5]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [28]:
# tokenization- convert paragraph into sentences-words
nltk.download('punkt')
sentences=nltk.sent_tokenize(paragraph)

[nltk_data] Downloading package punkt to /home/nix-code/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# stemming process
stemmer = PorterStemmer()
stemmer.stem('amelioration')

'amelior'

In [9]:
stemmer.stem('history')

'histori'

In [20]:
# Lemmatization
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package wordnet to /home/nix-code/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/nix-code/nltk_data...


In [21]:
lemmatizer.lemmatize('goes')

'go'

In [25]:
lemmatizer.lemmatize('fighting')

'fighting'

In [32]:
## clean the data

import re
corpus = []
for i in range(len(sentences)):
    # other than = ^
    # other than a-z ot A-z replace all puntuations with space
    review = re.sub('[^a-zA-Z]',' ',sentences[i])
    review = review.lower()
    corpus.append(review)

In [34]:
corpus

[' the name  nepal  is first recorded in texts from the vedic period of the indian subcontinent  the era in ancient nepal when hinduism was founded  the predominant religion of the country ',
 'in the middle of the first millennium bc  gautama buddha  the founder of buddhism  was born in lumbini in  southern nepal ',
 'parts of northern nepal were intertwined with the culture of tibet ',
 'the centrally located kathmandu valley is intertwined with the culture  of indo aryans  and was the seat of the prosperous newar confederacy known as nepal mandala ',
 'the himalayan branch of the ancient silk road was dominated by the valley s traders ',
 'the cosmopolitan region developed distinct traditional art and architecture ',
 'by the   th century  the gorkha kingdom achieved the unification of nepal ',
 'the shah dynasty established the kingdom of nepal and later formed an alliance with the british empire  under its rana dynasty of premiers ',
 'the country was never colonised but served as

In [41]:
nltk.download('stopwords')
for i in corpus:
    words = nltk.word_tokenize(i)
    for word in words:
        # print those words that are not in stop words
        if word not in set(stopwords.words('english')):
            print(stemmer.stem(word))
        


name
nepal
first
record
text
vedic
period
indian
subcontin
era
ancient
nepal
hinduism
found
predomin
religion
countri
middl
first
millennium
bc
gautama
buddha
founder
buddhism
born
lumbini
southern
nepal
part
northern
nepal
intertwin
cultur
tibet
central
locat
kathmandu
valley
intertwin
cultur
indo
aryan
seat
prosper
newar
confederaci
known
nepal
mandala
himalayan
branch
ancient
silk
road
domin
valley
trader
cosmopolitan
region
develop
distinct
tradit
art
architectur
th
centuri
gorkha
kingdom
achiev
unif
nepal
shah
dynasti
establish
kingdom
nepal
later
form
allianc
british
empir
rana
dynasti
premier
countri
never
colonis
serv
buffer
state
imperi
china
british
india
parliamentari
democraci
introduc
twice
suspend
nepales
monarch
nepales
civil
war
earli
result
establish
secular
republ
end
world
last
hindu
monarchi


[nltk_data] Downloading package stopwords to /home/nix-
[nltk_data]     code/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
# lemmatization

for i in corpus:
    words = nltk.word_tokenize(i)
    for word in words:
        if word not in set(stopwords.words('english')):
            print(lemmatizer.lemmatize(word))

name
nepal
first
recorded
text
vedic
period
indian
subcontinent
era
ancient
nepal
hinduism
founded
predominant
religion
country
middle
first
millennium
bc
gautama
buddha
founder
buddhism
born
lumbini
southern
nepal
part
northern
nepal
intertwined
culture
tibet
centrally
located
kathmandu
valley
intertwined
culture
indo
aryan
seat
prosperous
newar
confederacy
known
nepal
mandala
himalayan
branch
ancient
silk
road
dominated
valley
trader
cosmopolitan
region
developed
distinct
traditional
art
architecture
th
century
gorkha
kingdom
achieved
unification
nepal
shah
dynasty
established
kingdom
nepal
later
formed
alliance
british
empire
rana
dynasty
premier
country
never
colonised
served
buffer
state
imperial
china
british
india
parliamentary
democracy
introduced
twice
suspended
nepalese
monarch
nepalese
civil
war
early
resulted
establishment
secular
republic
ending
world
last
hindu
monarchy


In [69]:
## clean the data

import re
corpus = []
for i in range(len(sentences)):
    # other than = ^
    # other than a-z ot A-z replace all puntuations with space
    review = re.sub('[^a-zA-Z]',' ',sentences[i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [70]:
## Bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True, ngram_range=(3,3))


In [71]:
X=cv.fit_transform(corpus)
cv.vocabulary_

{'nepalese civil war': 5,
 'civil war early': 0,
 'war early resulted': 9,
 'early resulted establishment': 1,
 'resulted establishment secular': 7,
 'establishment secular republic': 3,
 'secular republic ending': 8,
 'republic ending world': 6,
 'ending world last': 2,
 'world last hindu': 10,
 'last hindu monarchy': 4}

In [72]:
corpus[0]

'nepalese civil war early resulted establishment secular republic ending world last hindu monarchy'

In [75]:
X[0].toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [78]:
# let's define tfidf

from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X_ = cv.fit_transform(corpus)
X_[0].toarray()
# now it will give some weights to words

array([[0.2773501, 0.2773501, 0.2773501, 0.2773501, 0.2773501, 0.2773501,
        0.2773501, 0.2773501, 0.2773501, 0.2773501, 0.2773501, 0.2773501,
        0.2773501]])