# word cloud

### 1. Corpus Preprocessing

In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

corpus = '''India, officially the Republic of India (Hindi: Bhārat Gaṇarājya),[25] is a country in South Asia...'''  # Truncated for brevity

# Clean the corpus
corpus = corpus.replace("[25]", "").replace("[f]", "").replace(")", "")
print(corpus)

  from scipy.stats import fisher_exact


India, officially the Republic of India (Hindi: Bhārat Gaṇarājya, is a country in South Asia...


### 2. Stop Words Removal

In [2]:
words = []
for word in word_tokenize(corpus):
    if (word.lower() not in stopwords.words('english')) and (len(word) >= 2):
        words.append(word.lower())

print("Filtered Words:", words)

Filtered Words: ['india', 'officially', 'republic', 'india', 'hindi', 'bhārat', 'gaṇarājya', 'country', 'south', 'asia', '...']


### 3. Building Vocabulary

In [3]:
vocab = list(set(words))  # Remove duplicates using set
print("Vocabulary Size:", len(vocab))  # Output: 48
print("Sample Vocabulary:", vocab[:5])

Vocabulary Size: 10
Sample Vocabulary: ['officially', '...', 'south', 'asia', 'country']


## Text-Encoding and Decoding


In [7]:
for sent in sent_tokenize(corpus):
    print(sent)

India, officially the Republic of India (Hindi: Bhārat Gaṇarājya, is a country in South Asia...


In [8]:
for sent in sent_tokenize(corpus):
    print(word_tokenize(sent))

['India', ',', 'officially', 'the', 'Republic', 'of', 'India', '(', 'Hindi', ':', 'Bhārat', 'Gaṇarājya', ',', 'is', 'a', 'country', 'in', 'South', 'Asia', '...']


In [9]:
for sent in sent_tokenize(corpus):
    for word in word_tokenize(sent):
        if (word.lower() not in stopwords.words('english')) and (len(word)>=2):
            print(word,end=' ')
    print()

India officially Republic India Hindi Bhārat Gaṇarājya country South Asia ... 


In [10]:
words=[]
for word in word_tokenize(corpus):
    if (word.lower() not in stopwords.words('english')) and (len(word)>=2):
        words.append(word.lower())

vocab=list(set(words))
len(vocab)

num=1
word_to_num={}
num_to_word={}
for word in vocab:
    word_to_num[word]=num
    num_to_word[num]=word
    num+=1

In [11]:
for sent in sent_tokenize(corpus):
    for word in word_tokenize(sent):
        if (word.lower() not in stopwords.words('english')) and (len(word)>=2):
            print(word,end=' ')
            print(word_to_num[word.lower()],end=' ')
    print()

India 8 officially 1 Republic 6 India 8 Hindi 7 Bhārat 10 Gaṇarājya 9 country 5 South 3 Asia 4 ... 2 


### To get the exactly encoded numbers:

In [12]:
data=[]
for sent in sent_tokenize(corpus):
    temp=[]
    for word in word_tokenize(sent):
        if (word.lower() not in stopwords.words('english')) and (len(word)>=2):
            #print(word,end=' ')
            temp.append(word_to_num[word.lower()])
    print(temp)
    data.append(temp)
    print()

[8, 1, 6, 8, 7, 10, 9, 5, 3, 4, 2]



In [13]:
for sent in data:
    print(sent)

[8, 1, 6, 8, 7, 10, 9, 5, 3, 4, 2]


In [14]:
for sent in data:
    for word in sent:
        print(num_to_word[word],end=' ')
    print()

india officially republic india hindi bhārat gaṇarājya country south asia ... 


## Text Encoding - Decoding | Without Stop Words

In [15]:
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords

corpus='''India, officially the Republic of India (Hindi: Bhārat Gaṇarājya),[25] is a country in South Asia. It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west;[f] China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia.'''

corpus = corpus.replace("[25]" , "")    
corpus = corpus.replace("[f]" , "")   
corpus = corpus.replace(")" , "")

In [16]:
words=[]
for word in word_tokenize(corpus):
    if(len(word)==1):
        if((ord(word)>=97 and ord(word)<=122) or (ord(word)>=65 and ord(word)<=90)):
            words.append(word.lower())
    else:
        words.append(word.lower())

In [17]:
vocab=list(set(words))
print(len(vocab))

61


In [18]:
num=1
word_to_num={}
num_to_word={}
for word in vocab:
    word_to_num[word]=num
    num_to_word[num]=word
    num+=1

In [19]:
data=[]
for sent in sent_tokenize(corpus):
    temp=[]
    for word in word_tokenize(sent):
        if(len(word)==1):
            if((ord(word)>=97 and ord(word)<=122) or (ord(word)>=65 and ord(word)<=90)):
                temp.append(word_to_num[word.lower()])
        else:
            temp.append(word_to_num[word.lower()])
    data.append(temp)
print(data)

[[54, 29, 9, 34, 6, 54, 17, 43, 8, 4, 60, 16, 61, 14, 1], [57, 4, 9, 53, 16, 11, 12, 9, 2, 59, 16, 26, 9, 46, 59, 18, 61, 9, 20], [58, 11, 9, 39, 7, 23, 9, 14, 9, 5, 45, 23, 9, 27, 26, 9, 56, 6, 42, 23, 9, 50, 57, 35, 24, 13, 55, 44, 40, 9, 47, 48, 28, 26, 3, 40, 9, 22, 26, 21, 26, 49, 40, 9, 19], [61, 9, 39, 7, 54, 4, 61, 9, 25, 6, 31, 32, 26, 9, 36, 41, 15, 26, 37, 51, 10, 60, 30, 52, 55, 33, 49, 26, 38]]


In [20]:
for sent in data:
    for word in sent:
        print(num_to_word[word],end=' ')
    print()

india officially the republic of india hindi bhārat gaṇarājya is a country in south asia 
it is the seventh-largest country by area the second-most populous country and the most populous democracy in the world 
bounded by the indian ocean on the south the arabian sea on the southwest and the bay of bengal on the southeast it shares land borders with pakistan to the west china nepal and bhutan to the north and bangladesh and myanmar to the east 
in the indian ocean india is in the vicinity of sri lanka and the maldives its andaman and nicobar islands share a maritime border with thailand myanmar and indonesia 
