## Text_analytics/Assignment_1/MDS201803

In [1]:
import unicodedata
import numpy as np
import pickle
import string
import re
from nltk import ngrams
from plotly import express as px
import plotly.graph_objects as go

The code below is used to extract the **bengali** text corpus from xml file. The first 100 articles are extracted only. 

In [1]:
from wiki_dump_reader import Cleaner, iterate

#https://github.com/CyberZHG/wiki-dump-reader
#pip install wiki-dump-reader
#Code adapted from https://github.com/CyberZHG/wiki-dump-reader
def write_corpus():
    corpus_file = '/media/subhasish/Professional/CMI/Sem_3/Text_analysis/CorpusFileName_2.txt'
    page_count = 0
    cleaner = Cleaner()
    with open(corpus_file, 'w', encoding='utf-8') as output:
        for title, text in iterate('/home/subhasish/Downloads/bnwiki-latest-pages-articles.xml'):
            text = cleaner.clean_text(text)
            cleaned_text, links = cleaner.build_links(text)
            output.write(title + '\n' + cleaned_text + '\n')
            page_count += 1
            if page_count % 50000 == 0:
                print('Pages dumped = ', page_count)
                
    output.close()
write_corpus()

Pages dumped =  50000
Pages dumped =  100000
Pages dumped =  150000
Pages dumped =  200000
Pages dumped =  250000


the corpus is read as a single string named **raw**

In [3]:
f = open('/media/subhasish/Professional/CMI/Sem_3/Text_analysis/CorpusFileName_2.txt')
raw = f.read()

### Preprocessing of the data

After examining the raw corpus, it is observed that the data contains punctuations, symbols, english words and digits also. Since we are intereted in the bengali words only we preprocess the data to remove them.

Regular Expressions are used for preprocessing of the data

In [4]:
raw = re.sub("[0-9]","",raw)       # removing digits
raw = re.sub("\n"," ",raw)         # removing newline command
raw = re.sub("="," ",raw)          # removing '=' symbol        
raw = re.sub("→"," ",raw)          # removing '→' symbol        
raw = re.sub("[a-zA-Z]","",raw)    # removing english words
raw = re.sub("–"," ",raw)   
raw = re.sub("।"," ",raw) 
raw = re.sub("[!#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"]","",raw)

The punctuations are removed using the **string** package

In [5]:
raw[:200]

'বাংলা ভাষা বাংলা ভাষা বাঙলা বাঙ্গলা তথা বাঙ্গালা নামগুলোতেও পরিচিত একটি ইন্দোআর্য ভাষা যা দক্ষিণ এশিয়ার বাঙালি জাতির প্রধান কথ্য ও লেখ্য ভাষা  মাতৃভাষীর সংখ্যায় বাংলা ইন্দোইউরোপীয় ভাষা পরিবারের চতু'

Next we split the raw string w.r.t whitespace (' ') and list the terms.

In [6]:
list_words = raw.split(" ")

In [7]:
# we note the number of terms before final preprocessing
len(list_words)

39669496

In [8]:
words = []
for term in list_words:
    if term not in [''] :
        words.append(term)
len(words)        

32208190

To remove the bengali digits we use the following code which uses `unicodedata` package to identify the bengali digits

In [9]:
def is_bengai_digit(word):
    try:
        lang = unicodedata.name(word.strip()[0])
        if 'BENGALI DIGIT' in lang:
            return True
        else:
            return False
    except:
        return False

In [10]:
words_no_digit = []
for i in words:
    if not is_bengai_digit(i):
        words_no_digit.append(i)

len(words_no_digit)

30814262

Now we remove the words which have length less than 2

In [16]:
words_new = []
for i in words_no_digit:
    if len(i) > 2:
        words_new.append(i)

len(words_new)

28968242

The total number unique words :

In [12]:
print("total number of unique words in the vocabulary : ", len(set(words_new)))

total number of unique words in the vocabulary :  920429


Creating a dictionary to store the words along with their frequencies :


In [17]:
word_dict = {}
for i in words_new:
    try:
        word_dict[i] = word_dict[i]+1
    except Exception:
        word_dict[i] = 1

In [None]:
corpus = ''
for i in words_new:
    if word_dict[i]> 20:
        corpus = corpus + " " + i 

In [14]:
for i in list(word_dict.keys()):
    if word_dict[i] <= 20:
        del word_dict[i]
len(word_dict)

54245

Now we sort the word dictionary w.r.t to the word frequency

In [17]:
tokens = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)

dumping the dictionary locally for future use

In [27]:
with open("/media/subhasish/Professional/CMI/Sem_3/Text_analysis/mySavedDict.txt", "wb") as myFile:
    pickle.dump(tokens, myFile)

### Load the preprocessed data using Pickle

In [46]:
with open("/media/subhasish/Professional/CMI/Sem_3/Text_analysis/mySavedDict.txt", 'rb') as handle:
    token_dict = pickle.load(handle)

token_dict

[('এবং', 508252),
 ('করে', 352826),
 ('হয়', 344312),
 ('থেকে', 246750),
 ('ছিল', 221000),
 ('একটি', 219172),
 ('করা', 178727),
 ('তার', 178654),
 ('তিনি', 177766),
 ('করেন', 158633),
 ('জন্য', 155925),
 ('কুয়েতের', 153337),
 ('সালে', 133709),
 ('তাদের', 126249),
 ('কুয়েত', 125655),
 ('যায়', 112771),
 ('সাথে', 103800),
 ('মধ্যে', 82519),
 ('তারা', 80750),
 ('হয়ে', 76528),
 ('প্রথম', 75315),
 ('করার', 73763),
 ('সাবাহ', 71837),
 ('হিসেবে', 71340),
 ('থাকে', 70545),
 ('সালের', 66971),
 ('ছিলেন', 64901),
 ('এটি', 62710),
 ('কুয়েতে', 62510),
 ('তথ্যসূত্র', 61210),
 ('সময়', 60370),
 ('অর্থনৈতিক', 59758),
 ('উপর', 59372),
 ('কুয়েতি', 57863),
 ('প্রধান', 57115),
 ('বেশি', 56839),
 ('অত্তোমান', 55740),
 ('একজন', 55258),
 ('করতে', 54295),
 ('জন্ম', 52982),
 ('পূর্বে', 51860),
 ('পরিবার', 51246),
 ('মাধ্যমে', 50604),
 ('চলচ্চিত্র', 50385),
 ('রয়েছে', 47938),
 ('নিয়ে', 47868),
 ('দেয়', 47793),
 ('শুরু', 47352),
 ('বিভিন্ন', 47306),
 ('কারণে', 45725),
 ('বছর', 45103),
 ('পর্যন্ত', 44323)

#### Verifying Zipf's law

According to Zipf's law the frequency of the i-th most frequent token is proportional to 1/i. In other words if N be the frequency of the most frequent word, the second most frequent word would have frequency N/2 , the third most frequent word would have frequency N/3 and so on. But the frequencies of the top 5 tokens in the given data do not match this criterion.

we have :<br>
\begin{equation}
r = k/f \\ log(r) = log(k) - log(f).....(1)
\end{equation}

<br> where $r$ is the rank of the word, $f$ is the frequency and $k$ is proportionality constant
<br>$(1)$ is an equation of a negatively sloped straight line. we now take the observed word frequencies and plot their $log$ values alonside the $log(r)$ values.


Indexing the words (ranking the words w.r.t their frequencies)

In [138]:
index = 1
token_indexed = []
for i in token_dict:
    i = tuple([i[0],i[1],index])
    token_indexed.append(i)
    index = index + 1

In [139]:
log_rank = []
log_freq = []
for i in token_indexed:
    log_freq.append(np.log(i[1]))
    log_rank.append(np.log(i[2]))
    
y_bar = np.mean(log_freq)
x_bar = np.mean(log_rank)
log_k = y_bar + x_bar    # OLS estimate of the intercept parameter for fixed slope linear regression  
x_sim = np.arange(0,11,0.1)
y_sim = log_k - x_sim

To verify with zipf's law, we fit a straight line of slope $(-1)$ to the given data.

In [140]:
log_k # the OLS estimate of the intercept

14.30342235602643

In [141]:
fig = go.Figure()
fig.add_trace(go.Scatter(y = log_freq, x = log_rank, mode='lines', name='Observed'))
fig.add_trace(go.Scatter(y = y_sim, x = x_sim, mode='lines', name='Expected'))

fig.update_layout(
    title="Observed log(freq)",
    xaxis_title="log(word_rank)",
    yaxis_title="log(word_frequency)")

In [142]:
words_num = 20
k = np.exp(log_k)
expected = list(k/i for i in range(1,words_num + 1))
observed = list(token_dict[i][1] for i in range(words_num))
label = list(token_dict[i][0] for i in range(words_num))

fig = go.Figure()
fig.add_trace(go.Scatter(y = expected, x = label, name='Expected frequency', mode='lines'))
fig.add_trace(go.Scatter(y = observed, x = label, name='Observed frequency', mode='lines'))

## Assignment_2

To identify the middle frequency of the vocabulary we use the empirical equation of Zipf's law. For our corpus the equation is as follows:
\begin{equation}
    log(freq) = 14.3034 - log(rank)
\end{equation}
From this straight line we obtain the `log mid-frequency` as $14.3034/2 = 7.15$ and its corresponding `log(rank)` is also $7.15$. Thus the `mid-frequency` rank of the words is $exp(7.15) = 1276$ (rounded).
<br> in other words , the tokens with frequency 1276 has occured moderate number of time (not high not low). From this frequency level we pick 10 nouns, verbs and adjectives.

In [3]:
len(words)

NameError: name 'words' is not defined

In [None]:
import h5py
f = h5py.File('halmatrix.hdf5','w')
hm = f.create_dataset("HAL_CM",(voc_count,voc_count),dtype=np.int16,compression="gzip")

In [146]:
# dictionary for storing the token and their indices (ranks)
token_index_dict = {}
for i in token_indexed:
    token_index_dict[i[0]] = i[2]

In [147]:
token_index_dict

{'এবং': 1,
 'করে': 2,
 'হয়': 3,
 'থেকে': 4,
 'ছিল': 5,
 'একটি': 6,
 'করা': 7,
 'তার': 8,
 'তিনি': 9,
 'করেন': 10,
 'জন্য': 11,
 'কুয়েতের': 12,
 'সালে': 13,
 'তাদের': 14,
 'কুয়েত': 15,
 'যায়': 16,
 'সাথে': 17,
 'মধ্যে': 18,
 'তারা': 19,
 'হয়ে': 20,
 'প্রথম': 21,
 'করার': 22,
 'সাবাহ': 23,
 'হিসেবে': 24,
 'থাকে': 25,
 'সালের': 26,
 'ছিলেন': 27,
 'এটি': 28,
 'কুয়েতে': 29,
 'তথ্যসূত্র': 30,
 'সময়': 31,
 'অর্থনৈতিক': 32,
 'উপর': 33,
 'কুয়েতি': 34,
 'প্রধান': 35,
 'বেশি': 36,
 'অত্তোমান': 37,
 'একজন': 38,
 'করতে': 39,
 'জন্ম': 40,
 'পূর্বে': 41,
 'পরিবার': 42,
 'মাধ্যমে': 43,
 'চলচ্চিত্র': 44,
 'রয়েছে': 45,
 'নিয়ে': 46,
 'দেয়': 47,
 'শুরু': 48,
 'বিভিন্ন': 49,
 'কারণে': 50,
 'বছর': 51,
 'পর্যন্ত': 52,
 'কিছু': 53,
 'হয়েছিল': 54,
 'সাবাহদের': 55,
 'ব্রিটিশ': 56,
 'ব্যবসায়ী': 57,
 'দ্বারা': 58,
 'আন্তর্জাতিক': 59,
 'পরে': 60,
 'মতো': 61,
 'কিন্তু': 62,
 'যেখানে': 63,
 'হয়েছে': 64,
 'জাতীয়': 65,
 'অন্যান্য': 66,
 'বলে': 67,
 'অত্তোমানরা': 68,
 'ক্ষমতা': 69,
 'ব্যবস্থা': 70,
 'কার

In [193]:
# the unique words (tokens)
tokens = list(token_index_dict.keys())

In [194]:
n = 11
grams_11 = ngrams(tokens, n)

#### initializing the HAL dictioaries

In [218]:
hal_l2r   = {}
hal_r2l   = {}

In [219]:
ramp = [0, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]

In [220]:
for index, grams in enumerate(grams_11):
    reversed_grams = grams[::-1]
    if None not in grams:
        for index, gram in enumerate(grams):  # Left to right
            hal_l2r[token_index_dict[grams[0]], token_index_dict[grams[index]]] =0#h5py dataset for l2r

        for index, gram in enumerate(reversed_grams):  # right to left
            hal_r2l[token_index_dict[reversed_grams[0]], token_index_dict[reversed_grams[index]]] =0 #h5py dataset for r2l

In [221]:
for index, grams in enumerate(grams_11):
    reversed_grams = grams[::-1]
    if None not in grams:
        for index, gram in enumerate(grams):  # Left to right
            hal_l2r[token_index_dict[grams[0]], token_index_dict[grams[index]]]+=ramp[index]#h5py dataset for l2r
        for index, gram in enumerate(reversed_grams):  # right to left
            hal_r2l[token_index_dict[reversed_grams[0]], token_index_dict[reversed_grams[index]]] += ramp[index] #h5py dataset for r2l

In [222]:
hal_l2r

{}