In [1]:
import numpy as np
import json

In [2]:
#vocabulary generate kore pura training corpus er

def generate_vocab(json_file_path, threshold=2):
    with open(json_file_path, "r" , encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    vocab = {'<unk>': 0}
    counter = 0

    # Counting the number of times each word occurs
    for entry in json_data:
        sentence = entry["sentence"]
        counter+=1

        for word in sentence:
            for subword in word.split():
                if subword.strip():
                    if subword not in vocab:
                        vocab[subword] = 1
                    else:
                        vocab[subword] += 1

    # Adding count of rare words to the count of <unk>
    for k in list(vocab.keys())[1:]:
        if vocab[k] < threshold:
            vocab['<unk>'] += vocab[k]
            del vocab[k]

    # Sort vocab by count in descending order
    sorted_vocab = {k: v for k, v in sorted(vocab.items(), key=lambda item: item[1], reverse=True)}

    # Push <unk> to the top
    unk_val = sorted_vocab.pop('<unk>')
    sorted_vocab = {'<unk>': unk_val, **sorted_vocab}

    with open('vocab.txt', 'w', encoding="utf-8") as f:
        format_str = ''
        for index, (key, count) in enumerate(sorted_vocab.items()):
            format_str += key + '\t' + str(index) + '\t' + str(count) + '\n'
        f.write(format_str)

    #print("Threshold =", threshold)

    return sorted_vocab, counter

vocab, counter = generate_vocab("C:/Users/Admin/Desktop/Thesis/POS Tagging/archive/train_bangla.json")
print("Overall size of my vocabulary =", len(vocab))
print("No. of times '<unk>' occurs in my vocabulary =", vocab['<unk>'])
print("No. of sentence =", counter)

Overall size of my vocabulary = 24
No. of times '<unk>' occurs in my vocabulary = 183
No. of sentence = 20


In [3]:
def load_training_data(file_path):
    with open(file_path, 'r', encoding="utf-8") as json_file:
        return json.load(json_file)

#transition holo NNP to VBP kotobar, emission holo "Book" word ta kotobar NNP r kotobar VBP
#unique state holo distinct tag gulo niye banano list
def process_training_data(data, vocab):
    transition_probabilities = {}
    emission_probabilities = {}
    unique_states = []

    for entry in data:
        sentence_tokens = entry['sentence']
        labels = entry['labels']
        prior_state = 'head'  

        for word, tag in zip(sentence_tokens, labels):
            if word not in vocab.keys():
                word = '<unk>'

            # Update unique states
            if tag not in unique_states:
                unique_states.append(tag)

            # Update transition count
            transition_key = (prior_state, tag)
            if transition_key not in transition_probabilities:
                transition_probabilities[transition_key] = 1
            else:
                transition_probabilities[transition_key] += 1

            # Update emission count
            emission_key = (tag, word)
            if emission_key not in emission_probabilities:
                emission_probabilities[emission_key] = 1
            else:
                emission_probabilities[emission_key] += 1

            prior_state = tag

    return transition_probabilities, emission_probabilities, unique_states

def normalize_probabilities(probabilities):
    normalized_probabilities = {}
    for key in probabilities:
        state, value = key
        total = sum(v for k, v in probabilities.items() if k[0] == state)
        normalized_probabilities[key] = probabilities[key] / total
    return normalized_probabilities

#transition r emission gulo k dictionary er moto kore dekhano hoyese
def save_hmm_model(transition_probs, emission_probs, output_file):
    
    # Convert tuple keys to strings
    transition_probs = {str(k): v for k, v in transition_probs.items()}
    emission_probs = {str(k): v for k, v in emission_probs.items()}

    hmm_model = {
        'transition': transition_probs,
        'emission': emission_probs,
    }

    with open(output_file, 'w') as f:
        json.dump(hmm_model, f)

    return transition_probs, emission_probs

training_data = load_training_data('C:/Users/Admin/Desktop/Thesis/POS Tagging/archive/train_bangla.json')
transition_probs, emission_probs, unique_states = process_training_data(training_data, vocab)
normalized_transition_probs = normalize_probabilities(transition_probs)
normalized_emission_probs = normalize_probabilities(emission_probs)
transition_param_dict, emission_param_dict = save_hmm_model(normalized_transition_probs, normalized_emission_probs, 'hmm.json')

print("No. of transition parameters =", len(transition_param_dict))
print("No. of emission parameters =", len(emission_param_dict))

No. of transition parameters = 96
No. of emission parameters = 54


In [4]:
import nltk

In [5]:
#user_text = input("Enter your sentence: ")

In [52]:

from bnlp import BengaliPOS

bn_pos = BengaliPOS()

#text = "আমি ভাত খাই।" 
text = "আমি ভাত খাই। সে বাজারে যায়। তিনি কি সত্যিই ভালো মানুষ? সে ধনী কিন্তু তার ভাই গরীব।"
res_p = bn_pos.tag(text)
#print(len(res_p))
#print(res_p[3][0])

# Example list of tuples
def transfer_and_delete(original_list, specific_character):
    new_list = []
    i = 0
    while i < len(original_list):
        if original_list[i][0] != specific_character:
            new_list.append(original_list[i])
            del original_list[i]
        else:
            new_list.append(original_list[i])
            del original_list[i]
            break 
    return new_list

# Example list of tuples
original_list =res_p
specific_character = "।"

# Transfer elements until specific character is encountered
res = transfer_and_delete(original_list, specific_character)

print("New list:", res)
print("Original list:", original_list)


#print(res_p)


New list: [('আমি', 'PPR'), ('ভাত', 'NC'), ('খাই', 'VM'), ('।', 'VAUX')]
Original list: [('সে', 'PPR'), ('বাজারে', 'NC'), ('যায়', 'VM'), ('।', 'VAUX'), ('তিনি', 'PPR'), ('কি', 'PWH'), ('সত্যিই', 'AMN'), ('ভালো', 'JJ'), ('মানুষ', 'NC'), ('?', 'PU'), ('সে', 'PPR'), ('ধনী', 'NC'), ('কিন্তু', 'CSB'), ('তার', 'PPR'), ('ভাই', 'NC'), ('গরীব', 'NC'), ('।', 'PU')]


In [7]:
'''
from bnlp import NLTKTokenizer

bnltk = NLTKTokenizer()

#text = user_text
text = "আমি ভাত খাই। সে বাজারে যায়। তিনি কি সত্যিই ভালো মানুষ? সে ধনী কিন্তু তার ভাই গরীব।"
sentence_tokens = bnltk.sentence_tokenize(text)
print(len(sentence_tokens))
word_tokens = bnltk.word_tokenize(sentence_tokens[0])
print(word_tokens)
print(sentence_tokens)
'''

'\nfrom bnlp import NLTKTokenizer\n\nbnltk = NLTKTokenizer()\n\n#text = user_text\ntext = "আমি ভাত খাই। সে বাজারে যায়। তিনি কি সত্যিই ভালো মানুষ? সে ধনী কিন্তু তার ভাই গরীব।"\nsentence_tokens = bnltk.sentence_tokenize(text)\nprint(len(sentence_tokens))\nword_tokens = bnltk.word_tokenize(sentence_tokens[0])\nprint(word_tokens)\nprint(sentence_tokens)\n'

In [8]:
'''
from bnlp import BengaliPOS

bn_pos = BengaliPOS()
pos = word_tokens 
res = []
x = 0
for i in pos:
    res.insert(x, bn_pos.tag(i))
    x = x+1
print(len(res))
print(res[0])
'''

'\nfrom bnlp import BengaliPOS\n\nbn_pos = BengaliPOS()\npos = word_tokens \nres = []\nx = 0\nfor i in pos:\n    res.insert(x, bn_pos.tag(i))\n    x = x+1\nprint(len(res))\nprint(res[0])\n'

In [44]:
#POS tag kore each word er r accuracy dey
def greedy_decoding(formatted_data, vocab, transition_param_dict, emission_param_dict, state_track):
    accuracy = []
    transition_prob ={}
    emission_prob = {}
    max_stt = {}

    prior_st = 'head'
    counter = 0
    for i in formatted_data:
        print(i)
        counter = counter+1
        transition_prob[counter]=transition_param_dict.get(str((prior_st, i[1])), 1e-7)
        emission_prob[counter]=emission_param_dict.get(str((i[1], i[0])), 0)
        prior_st = i[1]

    return transition_prob, emission_prob


transition_prob, emission_prob = greedy_decoding(res, vocab, transition_param_dict, emission_param_dict, unique_states)
print('Transition_prob = ', transition_prob)
print('Emission_prob = ', emission_prob)

('আমি', 'PPR')
('ভাত', 'NC')
('খাই', 'VM')
('।', 'VAUX')
Transition_prob =  {1: 0.15, 2: 0.5333333333333333, 3: 0.17073170731707318, 4: 0.20833333333333334}
Emission_prob =  {1: 0, 2: 0, 3: 0, 4: 0}


In [45]:
## eita nibo
def reverse_graph(G):
    '''Return the reversed graph g[dst][src]=G[src][dst]'''
    g = {}
    for src in G.keys():
        for dst in G[src].keys():
            if dst not in g.keys():
                g[dst] = {}
            g[dst][src] = G[src][dst]
    return g


def build_max(rg, root):
    '''Find the max in-edge for every node except for the root.'''
    mg = {}
    for dst in rg.keys():
        if dst == root:
            continue
        max_ind = -100
        max_value = -100
        for src in rg[dst].keys():
            if rg[dst][src] >= max_value:
                max_ind = src
                max_value = rg[dst][src]
        mg[dst] = {max_ind: max_value}
    return mg


def find_circle(mg):
    '''Return the first circle if find, otherwise return None'''

    for start in mg.keys():
        visited = []
        stack = [start]
        while stack:
            n = stack.pop()
            if n in visited:
                C = []
                while n not in C:
                    C.append(n)
                    n = list(mg[n].keys())[0]
                return C
            visited.append(n)
            if n in mg.keys():
                stack.extend(list(mg[n].keys()))
    return None


def chu_liu_edmond(G, root):
    ''' G: dict of dict of weights
            G[i][j] = w means the edge from node i to node j has weight w.
        root: the root node, has outgoing edges only.
    '''
    # reversed graph rg[dst][src] = G[src][dst]
    rg = reverse_graph(G)
    # root er only out edge
    rg[root] = {}
    # the maximum edge select korlam for each node other than root
    mg = build_max(rg, root)

    # check if mg is a tree (contains a circle)
    C = find_circle(mg)
    # circle na thakle, mg tai max_spanning_tree
    if not C:
        return reverse_graph(mg)

    # jesob node circle kore tader k niye compact node korlm
    all_nodes = G.keys()
    vc = max(all_nodes) + 1

    # new graph holo G_prime
    V_prime = list(set(all_nodes) - set(C)) + [vc]
    G_prime = {}
    vc_in_idx = {}
    vc_out_idx = {}
    # Now add the edges to G_prime
    for u in all_nodes:
        for v in G[u].keys():
            # incoming edge er weight calculation
            if (u not in C) and (v in C):
                if u not in G_prime.keys():
                    G_prime[u] = {}
                w = G[u][v] - list(mg[v].values())[0]
                if (vc not in G_prime[u]) or (vc in G_prime[u] and w > G_prime[u][vc]):
                    G_prime[u][vc] = w
                    vc_in_idx[u] = v

            # outgoing edge er weight calculation
            elif (u in C) and (v not in C):
                if vc not in G_prime.keys():
                    G_prime[vc] = {}
                w = G[u][v]
                if (v not in G_prime[vc]) or (v in G_prime[vc] and w > G_prime[vc][v]):
                    G_prime[vc][v] = w
                    vc_out_idx[v] = u

            # Third case: if the source and dest are all not in the circle, then just add the edge to the new graph.
            elif (u not in C) and (v not in C):
                if u not in G_prime.keys():
                    G_prime[u] = {}
                G_prime[u][v] = G[u][v]

    # Recursively run the algorihtm on the new graph G_prime
    A = chu_liu_edmond(G_prime, root)
    # print(A)

    # compacted node k vangbo, erpor incoming r outgoing edge gulo identify krbo
    # always max ta choose krbo r bakigulo delete krbo
    all_nodes_A = list(A.keys())
    for src in all_nodes_A:
        # The number of out-edges varies, could be 0 or any number <=|V\C|
        if src == vc:
            for node_in in A[src].keys():
                orig_out = vc_out_idx[node_in]
                if orig_out not in A.keys():
                    A[orig_out] = {}
                A[orig_out][node_in] = G[orig_out][node_in]
        else:
            #for dst in A[src]:
            for dst in list(A[src].keys()):
                # There must be only one in-edge to vc.
                if dst == vc:
                    orig_in = vc_in_idx[src]
                    A[src][orig_in] = G[src][orig_in]
                    del A[src][dst]
    del A[vc]


    for node in C:
        if node != orig_in:
            src = list(mg[node].keys())[0]
            if src not in A.keys():
                A[src] = {}
            A[src][node] = mg[node][src]

    return A


In [46]:
res = ['root']+res
print(res)

['root', ('আমি', 'PPR'), ('ভাত', 'NC'), ('খাই', 'VM'), ('।', 'VAUX')]


In [47]:
def get_edges(graph):
    edges = []
    for node in graph:
        for neighbor in graph[node]:
            edges.append((node, neighbor))
    return edges

# eita nibo
#num_vertices = len(res)+1
#print(num_vertices)

G = {}
dp = {}
print(len(res))
for i in range(len(res)):
    G[i] = {}
    if i==0:
        p=100
        q=200
        for j in range(1,len(res)):
            print(res[j])
            weight = transition_prob[j] + emission_prob[j] 
            #if p ==0:
            if res[j][1] == 'VM' or res[j][1] == 'VAUX':
                #print('#')
                weight = p*(transition_prob[j] + emission_prob[j])
                p = p/3
            if res[j][1] == 'CCD' or res[j][1] == 'CSB':
                weight = p*(transition_prob[j] + emission_prob[j])
                p = p*3
                
            G[i][j] = weight
        continue
                
    for j in range(len(res)):
        if (j == 0):
            continue
        if (i == j):
            continue
        weight = transition_prob[j] + emission_prob[j] 
        
        G[i][j] = weight
        
print("G =", G)

dp = chu_liu_edmond(G, 0)
print('DP =',dp)
edges = get_edges(dp)
print("Edges of the graph:", edges)
for i in edges:
    #print(i)
    x = i[0]
    y = i[1]
    if x == 0:
        print('root', '-->', res[y][0])
    else:
        print(res[x][0],'-->', res[y][0])
'''
G3 = {}
dp = {}
G3= {0: {1: 0.13175059355956434, 2: 0.18211908429298618*10, 3: 1.0444370537508816, 4: 0.03227769819785418},
    1: {2: 0.18211908429298618, 3: 1.0444370537508816, 4: 0.03227769819785418},
    2: {1: 0.13175059355956434*3, 3: 1.0444370537508816*5, 4: 0.03227769819785418*2.5},
    3: {1: 0.13175059355956434, 2: 0.18211908429298618, 4: 0.03227769819785418*3},
    4: {1: 0.13175059355956434*2, 2: 0.18211908429298618*5, 3: 1.0444370537508816}}
#print("G3 :", G3[1][2])
dp = chu_liu_edmond(G3, 0)
print(dp)
edges = get_edges(dp)
print("Edges of the graph:", edges)
for i in edges:
    x = i[0]-1
    y = i[1]-1
    if x == -1:
        print('root', '-->', formatted_dev_data[y])
    else:
        print(formatted_dev_data[x],'-->', formatted_dev_data[y])
'''

5
('আমি', 'PPR')
('ভাত', 'NC')
('খাই', 'VM')
('।', 'VAUX')
G = {0: {1: 0.15, 2: 0.5333333333333333, 3: 17.073170731707318, 4: 6.9444444444444455}, 1: {2: 0.5333333333333333, 3: 0.17073170731707318, 4: 0.20833333333333334}, 2: {1: 0.15, 3: 0.17073170731707318, 4: 0.20833333333333334}, 3: {1: 0.15, 2: 0.5333333333333333, 4: 0.20833333333333334}, 4: {1: 0.15, 2: 0.5333333333333333, 3: 0.17073170731707318}}
DP = {4: {1: 0.15, 2: 0.5333333333333333}, 0: {3: 17.073170731707318, 4: 6.9444444444444455}}
Edges of the graph: [(4, 1), (4, 2), (0, 3), (0, 4)]
। --> আমি
। --> ভাত
root --> খাই
root --> ।


'\nG3 = {}\ndp = {}\nG3= {0: {1: 0.13175059355956434, 2: 0.18211908429298618*10, 3: 1.0444370537508816, 4: 0.03227769819785418},\n    1: {2: 0.18211908429298618, 3: 1.0444370537508816, 4: 0.03227769819785418},\n    2: {1: 0.13175059355956434*3, 3: 1.0444370537508816*5, 4: 0.03227769819785418*2.5},\n    3: {1: 0.13175059355956434, 2: 0.18211908429298618, 4: 0.03227769819785418*3},\n    4: {1: 0.13175059355956434*2, 2: 0.18211908429298618*5, 3: 1.0444370537508816}}\n#print("G3 :", G3[1][2])\ndp = chu_liu_edmond(G3, 0)\nprint(dp)\nedges = get_edges(dp)\nprint("Edges of the graph:", edges)\nfor i in edges:\n    x = i[0]-1\n    y = i[1]-1\n    if x == -1:\n        print(\'root\', \'-->\', formatted_dev_data[y])\n    else:\n        print(formatted_dev_data[x],\'-->\', formatted_dev_data[y])\n'

In [13]:
print(res[0][0])

r


In [14]:
print(transition_param_dict)

{"('head', 'RDF')": 0.1, "('RDF', 'NC')": 1.0, "('NC', 'NC')": 0.25609756097560976, "('NC', 'JJ')": 0.08536585365853659, "('JJ', 'NC')": 0.8, "('NC', 'CCD')": 0.024390243902439025, "('CCD', 'NC')": 0.4444444444444444, "('NC', 'NV')": 0.04878048780487805, "('NV', 'PU')": 0.2, "('NC', 'NP')": 0.024390243902439025, "('NP', 'JJ')": 0.1, "('NC', 'VM')": 0.17073170731707318, "('VM', 'JQ')": 0.041666666666666664, "('JQ', 'JJ')": 0.3076923076923077, "('NC', 'PP')": 0.012195121951219513, "('PP', 'JJ')": 0.3333333333333333, "('VM', 'PU')": 0.5, "('head', 'JQ')": 0.05, "('NC', 'PPR')": 0.06097560975609756, "('PPR', 'NC')": 0.5333333333333333, "('VM', 'VAUX')": 0.20833333333333334, "('VAUX', 'CCD')": 0.4, "('CCD', 'PPR')": 0.3333333333333333, "('NC', 'NST')": 0.036585365853658534, "('NST', 'JJ')": 0.3333333333333333, "('VAUX', 'PU')": 0.6, "('head', 'JJ')": 0.1, "('NV', 'CCD')": 0.2, "('NC', 'PU')": 0.18292682926829268, "('PU', 'NC')": 0.5, "('NV', 'JJ')": 0.2, "('JJ', 'VM')": 0.15, "('VM', 'PRL')

In [15]:
print(emission_param_dict)

{"('RDF', '<unk>')": 1.0, "('NC', '<unk>')": 0.9390243902439024, "('JJ', '<unk>')": 0.8, "('CCD', 'এবং')": 0.6666666666666666, "('NV', '<unk>')": 0.6, "('PU', '।')": 0.35555555555555557, "('NP', '<unk>')": 0.7, "('JJ', 'রাজনৈতিক')": 0.1, "('VM', '<unk>')": 0.8333333333333334, "('JQ', '<unk>')": 1.0, "('PP', '<unk>')": 1.0, "('VM', 'করে')": 0.125, "('NC', 'স্থান')": 0.024390243902439025, "('PPR', '<unk>')": 0.6, "('VAUX', '<unk>')": 0.8, "('NST', 'মধ্যে')": 0.6666666666666666, "('NV', 'চালানো')": 0.4, "('CCD', 'ও')": 0.2222222222222222, "('PU', '-')": 0.08888888888888889, "('PRL', '<unk>')": 1.0, "('AMN', '<unk>')": 1.0, "('NP', 'আম')": 0.1, "('PU', ',')": 0.4, "('PPR', 'তারা')": 0.13333333333333333, "('CCL', '<unk>')": 1.0, "('CSB', 'কিন্তু')": 0.6666666666666666, "('DAB', 'এ')": 0.75, "('NP', 'সি')": 0.2, "('CX', '<unk>')": 1.0, "('EX', '<unk>')": 1.0, "('MD', '<unk>')": 1.0, "('VB', '<unk>')": 1.0, "('NNS', 'ও')": 1.0, "('VBG', '<unk>')": 1.0, "('WP', '<unk>')": 1.0, "('PRP', '<unk>'

In [16]:
print(unique_states)

['RDF', 'NC', 'JJ', 'CCD', 'NV', 'PU', 'NP', 'VM', 'JQ', 'PP', 'PPR', 'VAUX', 'NST', 'PRL', 'AMN', 'CCL', 'CSB', 'DAB', 'CX', 'EX', 'MD', 'VB', 'NNS', 'VBG', 'WP', 'PRP', 'VBD', '.', "''", 'ALC']


In [17]:
#!pip install Flask flask-ngrok

In [1]:
from flask import Flask, render_template, request

app = Flask(__name__)

# Example NLP function
def analyze_text(text):
    # Replace this with your actual NLP processing logic
    # For demonstration, let's just return the input text
    return {"input": text, "analysis": "Placeholder analysis"}

@app.route("/", methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        user_input = request.form['user_input']
        analysis = analyze_text(user_input)
        return render_template('new_result.html', analysis=analysis)
    return render_template('new.html')

if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [16/Feb/2024 15:24:37] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [16/Feb/2024 15:24:41] "POST / HTTP/1.1" 200 -
