In [1]:
import numpy as np
import json

In [2]:
#vocabulary generate kore pura training corpus er

def generate_vocab(json_file_path, threshold=2):
    with open(json_file_path, "r" , encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    vocab = {'<unk>': 0}
    counter = 0

    # Counting the number of times each word occurs
    for entry in json_data:
        sentence = entry["sentence"]
        counter+=1

        for word in sentence:
            for subword in word.split():
                if subword.strip():
                    if subword not in vocab:
                        vocab[subword] = 1
                    else:
                        vocab[subword] += 1

    # Adding count of rare words to the count of <unk>
    for k in list(vocab.keys())[1:]:
        if vocab[k] < threshold:
            vocab['<unk>'] += vocab[k]
            del vocab[k]

    # Sort vocab by count in descending order
    sorted_vocab = {k: v for k, v in sorted(vocab.items(), key=lambda item: item[1], reverse=True)}

    # Push <unk> to the top
    unk_val = sorted_vocab.pop('<unk>')
    sorted_vocab = {'<unk>': unk_val, **sorted_vocab}

    with open('vocab.txt', 'w', encoding="utf-8") as f:
        format_str = ''
        for index, (key, count) in enumerate(sorted_vocab.items()):
            format_str += key + '\t' + str(index) + '\t' + str(count) + '\n'
        f.write(format_str)

    #print("Threshold =", threshold)

    return sorted_vocab, counter

vocab, counter = generate_vocab("C:/Users/Admin/Desktop/Thesis/POS Tagging/archive/train_bangla.json")
print("Overall size of my vocabulary =", len(vocab))
print("No. of times '<unk>' occurs in my vocabulary =", vocab['<unk>'])
print("No. of sentence =", counter)

Overall size of my vocabulary = 61
No. of times '<unk>' occurs in my vocabulary = 347
No. of sentence = 51


In [3]:
def load_training_data(file_path):
    with open(file_path, 'r', encoding="utf-8") as json_file:
        return json.load(json_file)

#transition holo NNP to VBP kotobar, emission holo "Book" word ta kotobar NNP r kotobar VBP
#unique state holo distinct tag gulo niye banano list
def process_training_data(data, vocab):
    transition_probabilities = {}
    emission_probabilities = {}
    unique_states = []

    for entry in data:
        sentence_tokens = entry['sentence']
        labels = entry['labels']
        prior_state = 'root'  

        for word, tag in zip(sentence_tokens, labels):
            if word not in vocab.keys():
                word = '<unk>'

            # Update unique states
            if tag not in unique_states:
                unique_states.append(tag)

            # Update transition count
            transition_key = (prior_state, tag)
            if transition_key not in transition_probabilities:
                transition_probabilities[transition_key] = 1
            else:
                transition_probabilities[transition_key] += 1

            # Update emission count
            emission_key = (tag, word)
            if emission_key not in emission_probabilities:
                emission_probabilities[emission_key] = 1
            else:
                emission_probabilities[emission_key] += 1

            prior_state = tag

    return transition_probabilities, emission_probabilities, unique_states

def normalize_probabilities(probabilities):
    normalized_probabilities = {}
    for key in probabilities:
        state, value = key
        total = sum(v for k, v in probabilities.items() if k[0] == state)
        normalized_probabilities[key] = probabilities[key] / total
    return normalized_probabilities

#transition r emission gulo k dictionary er moto kore dekhano hoyese
def save_hmm_model(transition_probs, emission_probs, output_file):
    
    # Convert tuple keys to strings
    transition_probs = {str(k): v for k, v in transition_probs.items()}
    emission_probs = {str(k): v for k, v in emission_probs.items()}

    hmm_model = {
        'transition': transition_probs,
        'emission': emission_probs,
    }

    with open(output_file, 'w') as f:
        json.dump(hmm_model, f)

    return transition_probs, emission_probs

training_data = load_training_data('C:/Users/Admin/Desktop/Thesis/POS Tagging/archive/train_bangla.json')
transition_probs, emission_probs, unique_states = process_training_data(training_data, vocab)
normalized_transition_probs = normalize_probabilities(transition_probs)
normalized_emission_probs = normalize_probabilities(emission_probs)
transition_param_dict, emission_param_dict = save_hmm_model(normalized_transition_probs, normalized_emission_probs, 'hmm.json')

print("No. of transition parameters =", len(transition_param_dict))
print("No. of emission parameters =", len(emission_param_dict))

No. of transition parameters = 144
No. of emission parameters = 98


In [4]:
import nltk

In [5]:
#user_text = input("Enter your sentence: ")

In [6]:
#POS tag kore each word er r accuracy dey
def greedy_decoding(formatted_data, vocab, transition_param_dict, emission_param_dict, state_track):
    accuracy = []
    transition_prob ={}
    emission_prob = {}
    max_stt = {}

    prior_st = 'root'
    counter = 0
    for i in formatted_data:
        print(i)
        counter = counter+1
        transition_prob[counter]=transition_param_dict.get(str((prior_st, i[1])), 1e-7)
        emission_prob[counter]=emission_param_dict.get(str((i[1], i[0])), 0)
        prior_st = i[1]

    return transition_prob, emission_prob

In [7]:
## eita nibo

def reverse_graph(G):
    '''Return the reversed graph g[dst][src]=G[src][dst]'''
    g = {}
    for src in G.keys():
        for dst in G[src].keys():
            if dst not in g.keys():
                g[dst] = {}
            g[dst][src] = G[src][dst]
    return g


def build_max(rg, root):
    '''Find the max in-edge for every node except for the root.'''
    mg = {}
    for dst in rg.keys():
        if dst == root:
            continue
        max_ind = -100
        max_value = -100
        for src in rg[dst].keys():
            if rg[dst][src] >= max_value:
                max_ind = src
                max_value = rg[dst][src]
        mg[dst] = {max_ind: max_value}
    return mg


def find_circle(mg):
    '''Return the first circle if find, otherwise return None'''

    for start in mg.keys():
        visited = []
        stack = [start]
        while stack:
            n = stack.pop()
            if n in visited:
                C = []
                while n not in C:
                    C.append(n)
                    n = list(mg[n].keys())[0]
                return C
            visited.append(n)
            if n in mg.keys():
                stack.extend(list(mg[n].keys()))
    return None


def chu_liu_edmond(G, root):
    ''' G: dict of dict of weights
            G[i][j] = w means the edge from node i to node j has weight w.
        root: the root node, has outgoing edges only.
    '''
    # reversed graph rg[dst][src] = G[src][dst]
    rg = reverse_graph(G)
    # root er only out edge
    rg[root] = {}
    # the maximum edge select korlam for each node other than root
    mg = build_max(rg, root)

    # check if mg is a tree (contains a circle)
    C = find_circle(mg)
    # circle na thakle, mg tai max_spanning_tree
    if not C:
        return reverse_graph(mg)

    # jesob node circle kore tader k niye compact node korlm
    all_nodes = G.keys()
    vc = max(all_nodes) + 1

    # new graph holo G_prime
    V_prime = list(set(all_nodes) - set(C)) + [vc]
    G_prime = {}
    vc_in_idx = {}
    vc_out_idx = {}
    # Now add the edges to G_prime
    for u in all_nodes:
        for v in G[u].keys():
            # incoming edge er weight calculation
            if (u not in C) and (v in C):
                if u not in G_prime.keys():
                    G_prime[u] = {}
                w = G[u][v] - list(mg[v].values())[0] 
                if (vc not in G_prime[u]) or (vc in G_prime[u] and w > G_prime[u][vc]):
                    G_prime[u][vc] = w
                    vc_in_idx[u] = v

            # outgoing edge er weight calculation
            elif (u in C) and (v not in C):
                if vc not in G_prime.keys():
                    G_prime[vc] = {}
                w = G[u][v]
                if (v not in G_prime[vc]) or (v in G_prime[vc] and w > G_prime[vc][v]):
                    G_prime[vc][v] = w
                    vc_out_idx[v] = u

            # Third case: if the source and dest are all not in the circle, then just add the edge to the new graph.
            elif (u not in C) and (v not in C):
                if u not in G_prime.keys():
                    G_prime[u] = {}
                G_prime[u][v] = G[u][v]

    # Recursively run the algorihtm on the new graph G_prime
    A = chu_liu_edmond(G_prime, root)
    # print(A)

    # compacted node k vangbo, erpor incoming r outgoing edge gulo identify krbo
    # always max ta choose krbo r bakigulo delete krbo
    all_nodes_A = list(A.keys())
    for src in all_nodes_A:
        # The number of out-edges varies, could be 0 or any number <=|V\C|
        if src == vc:
            for node_in in A[src].keys():
                orig_out = vc_out_idx[node_in]
                if orig_out not in A.keys():
                    A[orig_out] = {}
                A[orig_out][node_in] = G[orig_out][node_in]
        else:
            #for dst in A[src]:
            for dst in list(A[src].keys()):
                # There must be only one in-edge to vc.
                if dst == vc:
                    orig_in = vc_in_idx[src]
                    A[src][orig_in] = G[src][orig_in]
                    del A[src][dst]
    #del A[vc]
    if vc in A:
        del A[vc]
    '''
    try:
        del A[vc]
    except KeyError:
        pass  # Do nothing if the key doesn't exist
    '''


    for node in C:
        if node != orig_in:
            src = list(mg[node].keys())[0]
            if src not in A.keys():
                A[src] = {}
            A[src][node] = mg[node][src]

    return A


In [8]:
def separate_bangla_sentences(text):
    # Initialize a list to store the separated sentences
    sentences = []
    current_sentence = ""
    end_sentence_marks = ['।', '?', '!']
    for char in text:
        current_sentence += char
        if char in end_sentence_marks:
            sentences.append(current_sentence.strip())
            current_sentence = ""
    if current_sentence:
        sentences.append(current_sentence.strip())
    
    return sentences

In [25]:

from bnlp import BengaliPOS

bn_pos = BengaliPOS()

#bangla_text = "আমি বাংলায় গান গাই। তুমি কি গান শুনবে? আমি ভাত খাই। সে বাজারে যায়। তিনি কি সত্যিই ভালো মানুষ? সে ধনী কিন্তু তার ভাই গরীব।"
#bangla_text = "আমি ভাত খাই। সে বাজারে যায়। তিনি কি সত্যিই ভালো মানুষ? সে ধনী কিন্তু তার ভাই গরীব।"
def tree_generation(text):
    bangla_text = text
    dependency = []
    dependency_2 = []
    res_s = []
    dp_edges = []
    separated_sentences = separate_bangla_sentences(bangla_text)
    print("Separated Sentences:")

    for sentence in separated_sentences:
        print(sentence)
        res = bn_pos.tag(sentence)
        print(res)

        transition_prob, emission_prob = greedy_decoding(res, vocab, transition_param_dict, emission_param_dict, unique_states)
        print('Transition_prob = ', transition_prob)
        print('Emission_prob = ', emission_prob)

        res = ['root']+res
        res_s.append(res)
        print(res)

        def get_edges(graph):
            edges = []
            for node in graph:
                for neighbor in graph[node]:
                    edges.append((node, neighbor))
            return edges

        # eita nibo
        #num_vertices = len(res)+1
        #print(num_vertices)

        G = {}
        dp = {}
        print(len(res))
        for i in range(len(res)):
            G[i] = {}
            if i==0:
                p=500
                q=100
                r=50
                s=1
                for j in range(1, len(res)):
                    print(res[j])
                    weight = transition_prob[j] + emission_prob[j]
                    if res[j][1] == 'CCD' or res[j][1] == 'CSB':
                        weight = p*(transition_prob[j] + emission_prob[j])
                        p = p*3
                    if res[j][1] == 'VM' or res[j][1] == 'VAUX':
                        weight = q*(transition_prob[j] + emission_prob[j])
                        q = q/3
                    if res[j][1] == 'PPR':
                        weight = r*(transition_prob[j] + emission_prob[j])
                        r = r/3
                        
                    else:
                        weight = weight
                    G[i][j] = weight # done. r kono root add korbo na
                continue

            for j in range(len(res)):
                if (j == 0):
                    continue
                if (i == j):
                    continue
                weight = transition_prob[j] + emission_prob[j]
                if res[j][1] == 'NC':
                    weight = p*(transition_prob[j] + emission_prob[j])
                    p = p/3
                if res[j][1] == 'DAB':
                    weight = p*(transition_prob[j] + emission_prob[j])
                    p = p/3
                if res[j][1] == 'NP':
                    weight = p*(transition_prob[j] + emission_prob[j])
                    p = p/3

                G[i][j] = weight
                
        print("G =", G)

        dp = chu_liu_edmond(G, 0)
        #dp = max_spanning_tree(G)
        print('DP =',dp)
        edges_of_dp = get_edges(dp)
        dp_edges.append(get_edges(dp)) 

        # Define the list of tuples
        list_of_tuples = edges_of_dp
        word_tags = res
        edges = []
        for edge in list_of_tuples:
            src = word_tags[edge[0]][0] if edge[0] != 0 else 'root'
            dst = word_tags[edge[1]][0]
            edges.append((src, dst))
        print("List of Edges:", edges)
        dependency.append(edges)
        
        ########
        list_of_tuples_2 = edges_of_dp
        word_tags_2 = res
        edges_2 = []
        for edge in list_of_tuples_2:
            src = word_tags[edge[0]][1] if edge[0] != 0 else 'root'
            dst = word_tags[edge[1]][1]
            edges_2.append((src, dst))
        print("List of Edges_2:", edges_2)
        dependency_2.append(edges_2)
        

    return dependency, dependency_2, res_s, dp_edges

#sentence = "আমি ভাত খাই।"
#sentence = "আমি ভাত খাই। আমি বাংলায় গান গাই।  সে বাজারে যায়। তুমি কি গান শুনবে?"
#sentence = "প্রাকৃতিক রূপবৈচিত্র্যে ভরা আমাদের এই বাংলাদেশ। এই দেশে পরিচিত অপরিচিত অনেক পর্যটক-আকর্ষক স্থান আছে। এর মধ্যে প্রত্নতাত্ত্বিক নিদর্শন, ঐতিহাসিক মসজিদ এবং মিনার, পৃথিবীর দীর্ঘতম প্রাকৃতিক সমুদ্র সৈকত, পাহাড়, অরণ্য ইত্যাদি অন্যতম। এদেশের প্রাকৃতিক সৌন্দর্য পর্যটকদের মুগ্ধ করে। বাংলাদেশের প্রত্যেকটি এলাকা বিভিন্ন স্বতন্ত্র্র বৈশিষ্ট্যে বিশেষায়িত ।"
#sentence = "পদ্মা বাংলাদেশের একটি প্রধান নদী। এটি হিমালয়ে উৎপন্ন গঙ্গানদীর প্রধান শাখা এবং বাংলাদেশের ২য় দীর্ঘতম নদী। বাংলাদেশের গুরুত্বপূর্ণ শহর রাজশাহী এই পদ্মার উত্তর তীরে অবস্থিত। পদ্মার সর্বোচ্চ গভীরতা ১,৫৭১ ফুট(৪৭৯ মিটার) এবং গড় গভীরতা ৯৬৮ফুট(২৯৫ মিটার)।"
sentence = "সে ধনী কিন্তু তার ভাই গরীব।"
dependencies, dependencies_2, res_sies, dp_edgesies = tree_generation(sentence)

Separated Sentences:
সে ধনী কিন্তু তার ভাই গরীব।
[('সে', 'PPR'), ('ধনী', 'NC'), ('কিন্তু', 'CSB'), ('তার', 'PPR'), ('ভাই', 'NC'), ('গরীব', 'NC'), ('।', 'PU')]
('সে', 'PPR')
('ধনী', 'NC')
('কিন্তু', 'CSB')
('তার', 'PPR')
('ভাই', 'NC')
('গরীব', 'NC')
('।', 'PU')
Transition_prob =  {1: 0.1568627450980392, 2: 0.45, 3: 1e-07, 4: 0.25, 5: 0.45, 6: 0.31794871794871793, 7: 0.15384615384615385}
Emission_prob =  {1: 0, 2: 0, 3: 0.25, 4: 0.05, 5: 0, 6: 0, 7: 0.4838709677419355}
['root', ('সে', 'PPR'), ('ধনী', 'NC'), ('কিন্তু', 'CSB'), ('তার', 'PPR'), ('ভাই', 'NC'), ('গরীব', 'NC'), ('।', 'PU')]
8
('সে', 'PPR')
('ধনী', 'NC')
('কিন্তু', 'CSB')
('তার', 'PPR')
('ভাই', 'NC')
('গরীব', 'NC')
('।', 'PU')
G = {0: {1: 7.8431372549019605, 2: 0.45, 3: 125.00005, 4: 5.0, 5: 0.45, 6: 0.31794871794871793, 7: 0.6377171215880894}, 1: {2: 675.0, 3: 0.2500001, 4: 0.3, 5: 225.0, 6: 52.99145299145299, 7: 0.6377171215880894}, 2: {1: 0.1568627450980392, 3: 0.2500001, 4: 0.3, 5: 24.999999999999996, 6: 5.887939221272553, 

In [28]:
print(dependencies)

[[('root', 'সে'), ('root', 'কিন্তু'), ('root', 'তার'), ('সে', 'ধনী'), ('সে', 'ভাই'), ('সে', 'গরীব'), ('গরীব', '।')]]


In [29]:
print(dependencies_2)

[[('root', 'PPR'), ('root', 'CSB'), ('root', 'PPR'), ('PPR', 'NC'), ('PPR', 'NC'), ('PPR', 'NC'), ('NC', 'PU')]]


In [30]:
#print(res_sies)

In [31]:
print(dp_edgesies)

[[(0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (1, 6), (6, 7)]]


In [32]:
print(transition_param_dict)

{"('root', 'RDF')": 0.0784313725490196, "('RDF', 'NC')": 0.8333333333333334, "('NC', 'NC')": 0.31794871794871793, "('NC', 'JJ')": 0.06153846153846154, "('JJ', 'NC')": 0.7209302325581395, "('NC', 'CCD')": 0.03076923076923077, "('CCD', 'NC')": 0.47368421052631576, "('NC', 'NV')": 0.05128205128205128, "('NV', 'PU')": 0.07692307692307693, "('NC', 'NP')": 0.010256410256410256, "('NP', 'JJ')": 0.06666666666666667, "('NC', 'VM')": 0.16923076923076924, "('VM', 'JQ')": 0.017857142857142856, "('JQ', 'JJ')": 0.28, "('NC', 'PP')": 0.02564102564102564, "('PP', 'JJ')": 0.2, "('VM', 'PU')": 0.5357142857142857, "('root', 'JQ')": 0.0392156862745098, "('NC', 'PPR')": 0.05128205128205128, "('PPR', 'NC')": 0.45, "('VM', 'VAUX')": 0.19642857142857142, "('VAUX', 'CCD')": 0.18181818181818182, "('CCD', 'PPR')": 0.21052631578947367, "('NC', 'NST')": 0.02564102564102564, "('NST', 'JJ')": 0.16666666666666666, "('VAUX', 'PU')": 0.6363636363636364, "('root', 'JJ')": 0.0784313725490196, "('NV', 'CCD')": 0.076923076

In [33]:
'''
def get_values_with_string_at_position_0(dictionary, string):
    values = []
    for key, value in dictionary.items():
        if key.startswith(f"('{string}'"):
            values.append(value)
    return values

string_values = get_values_with_string_at_position_0(transition_param_dict, 'root')
print(string_values)
'''

'\ndef get_values_with_string_at_position_0(dictionary, string):\n    values = []\n    for key, value in dictionary.items():\n        if key.startswith(f"(\'{string}\'"):\n            values.append(value)\n    return values\n\nstring_values = get_values_with_string_at_position_0(transition_param_dict, \'root\')\nprint(string_values)\n'

In [34]:
def get_keys_and_values_with_string_at_position_0(dictionary, string):
    key_value_tuples = []
    for key, value in dictionary.items():
        if key.startswith(f"('{string}'"):
            key_value_tuples.append(((string, key.split(", ")[1][1:-2]), value))
    return key_value_tuples

In [35]:
def accuracy_measure(dependencies_2):
    tuple_sort_accuracy_list = []
    for sent in dependencies_2:
        tuple_sort_accuracy = 0
        print(len(sent))
        for tuples in sent:
            #print(tuples)
            result = get_keys_and_values_with_string_at_position_0(transition_param_dict, tuples[0])
            sorted_result = sorted(result, key=lambda x: x[1], reverse=True)
            #print(sorted_result)
            #print(len(sorted_result))
            x = 0
            p = 0
            for tuple_sort in sorted_result:
                #print(tuple_sort[0])
                x = x+1
                if(tuples == tuple_sort[0]):
                    print("tuple_sort:", tuple_sort)
                    tuple_sort_accuracy = tuple_sort_accuracy + (tuple_sort[1])/len(sent)*x
                    print("tuple_sort accuracy:", tuple_sort_accuracy)
                    p = 1
            if (p == 0):
                print("sorted result:", sorted_result)
                tuple_sort_accuracy = tuple_sort_accuracy + len(sorted_result)* .02
                print("tuple_sort accuracyyyy:", tuple_sort_accuracy)
        tuple_sort_accuracy_list.append(tuple_sort_accuracy)
        print("tuple_sort accuracy for a sentence:", tuple_sort_accuracy)
    #print("tuple_sort accuracy for a sentence:", tuple_sort_accuracy_list)
    return tuple_sort_accuracy_list
print("tuple_sort accuracy for a sentence:", accuracy_measure(dependencies_2))

7
tuple_sort: (('root', 'PPR'), 0.1568627450980392)
tuple_sort accuracy: 0.04481792717086835
tuple_sort: (('root', 'CSB'), 0.0784313725490196)
tuple_sort accuracy: 0.10084033613445378
tuple_sort: (('root', 'PPR'), 0.1568627450980392)
tuple_sort accuracy: 0.14565826330532214
tuple_sort: (('PPR', 'NC'), 0.45)
tuple_sort accuracy: 0.20994397759103645
tuple_sort: (('PPR', 'NC'), 0.45)
tuple_sort accuracy: 0.2742296918767507
tuple_sort: (('PPR', 'NC'), 0.45)
tuple_sort accuracy: 0.338515406162465
tuple_sort: (('NC', 'PU'), 0.15384615384615385)
tuple_sort accuracy: 0.4044494720965309
tuple_sort accuracy for a sentence: 0.4044494720965309
tuple_sort accuracy for a sentence: [0.4044494720965309]


In [18]:
def accuracy_measure_2(dependencies_2):
    tuple_sort_accuracy_list = []
    for sent in dependencies_2:
        tuple_sort_accuracy = 0
        len_sen = len(sent)
        print(len(sent))
        tuple_sort_accuracy = 0
        for tuples in sent:
            #print(tuples)
            result = get_keys_and_values_with_string_at_position_0(transition_param_dict, tuples[0])
            sorted_result = sorted(result, key=lambda x: x[1], reverse=True)
            #print(sorted_result)
            #print(len(sorted_result))
            x = 0
            for tuple_sort in sorted_result:
                #print(tuple_sort[0])
                x = x+1
                if(tuples == tuple_sort[0]):
                    print("tuple_sort:", tuple_sort)
                    tuple_sort_accuracy = tuple_sort_accuracy + 1
                    print("tuple_sort accuracy:", tuple_sort_accuracy)
                    
        tuple_sort_accuracy = tuple_sort_accuracy/len_sen
        tuple_sort_accuracy_list.append(tuple_sort_accuracy)
        print("tuple_sort accuracy for a sentence:", tuple_sort_accuracy)
    #print("tuple_sort accuracy for a sentence:", tuple_sort_accuracy_list)
    return tuple_sort_accuracy_list
print("tuple_sort accuracy for a sentence:", accuracy_measure_2(dependencies_2))

6
tuple_sort: (('root', 'JQ'), 0.0392156862745098)
tuple_sort accuracy: 1
tuple_sort: (('NC', 'PU'), 0.15384615384615385)
tuple_sort accuracy: 2
tuple_sort: (('NP', 'JJ'), 0.06666666666666667)
tuple_sort accuracy: 3
tuple_sort: (('NP', 'NC'), 0.13333333333333333)
tuple_sort accuracy: 4
tuple_sort accuracy for a sentence: 0.6666666666666666
12
tuple_sort: (('root', 'PPR'), 0.1568627450980392)
tuple_sort accuracy: 1
tuple_sort: (('root', 'VM'), 0.0392156862745098)
tuple_sort accuracy: 2
tuple_sort: (('root', 'CCD'), 0.0196078431372549)
tuple_sort accuracy: 3
tuple_sort: (('PU', 'JJ'), 0.045454545454545456)
tuple_sort accuracy: 4
tuple_sort: (('PU', 'JJ'), 0.045454545454545456)
tuple_sort accuracy: 5
tuple_sort: (('PU', 'JJ'), 0.045454545454545456)
tuple_sort accuracy: 6
tuple_sort: (('PPR', 'NC'), 0.45)
tuple_sort accuracy: 7
tuple_sort: (('PPR', 'NC'), 0.45)
tuple_sort accuracy: 8
tuple_sort: (('PPR', 'NC'), 0.45)
tuple_sort accuracy: 9
tuple_sort: (('NC', 'PU'), 0.15384615384615385)
tu

In [19]:
print(emission_param_dict)

{"('RDF', '<unk>')": 1.0, "('NC', '<unk>')": 0.9076923076923077, "('JJ', 'বিরোধী')": 0.046511627906976744, "('CCD', 'এবং')": 0.631578947368421, "('NV', 'করা')": 0.23076923076923078, "('PU', '।')": 0.4838709677419355, "('NC', 'সালে')": 0.010256410256410256, "('NP', '<unk>')": 0.8, "('JJ', 'রাজনৈতিক')": 0.046511627906976744, "('JJ', '<unk>')": 0.6511627906976745, "('VM', '<unk>')": 0.7142857142857143, "('JQ', '<unk>')": 0.48, "('PP', '<unk>')": 0.6, "('VM', 'করে')": 0.08928571428571429, "('JQ', 'একটি')": 0.24, "('NC', 'স্থান')": 0.010256410256410256, "('PPR', 'আমাদের')": 0.1, "('NC', 'জীবনের')": 0.010256410256410256, "('VAUX', '<unk>')": 0.8181818181818182, "('PPR', 'আমরা')": 0.075, "('NST', 'মধ্যে')": 0.3333333333333333, "('JJ', 'বিভিন্ন')": 0.06976744186046512, "('NV', 'চালানো')": 0.15384615384615385, "('CCD', 'ও')": 0.3157894736842105, "('PU', '-')": 0.06451612903225806, "('PRL', '<unk>')": 0.5, "('AMN', '<unk>')": 1.0, "('NP', 'আম')": 0.06666666666666667, "('PU', ',')": 0.33333333333

In [20]:
print(unique_states)

['RDF', 'NC', 'JJ', 'CCD', 'NV', 'PU', 'NP', 'VM', 'JQ', 'PP', 'PPR', 'VAUX', 'NST', 'PRL', 'AMN', 'CCL', 'CSB', 'DAB', 'CX', 'EX', 'MD', 'VB', 'NNS', 'VBG', 'WP', 'PRP', 'VBD', '.', "''", 'ALC', 'PWH', 'PRF', 'NNP']


In [21]:
#!pip install Flask flask-ngrok

In [26]:
from flask import Flask, render_template, request

app = Flask(__name__)

# Example NLP function
def analyze_text(text):
    # Replace this with your actual NLP processing logic
    # For demonstration, let's just return the input text
    dependencies, dependencies_2, res_sies, dp_edgesies = tree_generation(text)
    accuracy = accuracy_measure(dependencies_2)
    return {"title": "For Bangla", "sentence": text, "pos": res_sies, "input": dependencies, "num": dp_edgesies, "accu": accuracy}

@app.route("/", methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        user_input = request.form['user_input']
        analysis = analyze_text(user_input)
        return render_template('new_result.html', analysis=analysis)
    return render_template('new.html')

if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


In [27]:
def format_data(json_file_path):
    with open(json_file_path, 'r', encoding="utf-8") as json_file:
        data = json.load(json_file)

    formatted_data = []

    for entry in data:
        sentence_tokens = entry['sentence']
        labels = entry['labels']

        tokens_with_index = []

        for index, (word, tag) in enumerate(zip(sentence_tokens, labels)):
            tokens_with_index.append(f"{index}\t{word}\t{tag}")

        sentence = '\n'.join(tokens_with_index)
        formatted_data.append(sentence)

    return '\n\n'.join(formatted_data)

In [24]:
def greedy_decoding_accuracy(formatted_data, vocab, transition_param_dict, emission_param_dict, state_track):
    accuracy = []

    for line in formatted_data.split('\n\n'):
        prior_st = 'root'
        for ent in line.split('\n'):
            index, word, st = ent.split('\t')
            if word not in vocab.keys():
                word = '<unk>'

            max_prob = 0
            max_st = None

            for st_ in state_track:
                tr = transition_param_dict.get(str((prior_st, st_)), 1e-7)
                em = emission_param_dict.get(str((st_, word)), 0)
                prob = tr * em

                if prob > max_prob:
                    max_prob = prob
                    max_st = st_

            accuracy.append(st == max_st)
            prior_st = max_st
            #print(prior_st)

    return sum(accuracy) / len(accuracy)

formatted_dev_data = format_data('C:/Users/Admin/Desktop/Thesis/POS Tagging/dev_bangla.json')
accuracy = greedy_decoding_accuracy(formatted_dev_data, vocab, transition_param_dict, emission_param_dict, unique_states)
print("Accuracy on dev data with greedy decoding =", accuracy)

Accuracy on dev data with greedy decoding = 0.7692307692307693
