In [1]:
import ast
import json

import numpy as np

In [2]:
#vocabulary generate kore pura training corpus er

def generate_vocab(json_file_path, threshold=2):
    with open(json_file_path, "r") as json_file:
        json_data = json.load(json_file)

    vocab = {'<unk>': 0}
    counter = 0

    # Counting the number of times each word occurs
    for entry in json_data:
        sentence = entry["sentence"]
        counter+=1

        for word in sentence:
            for subword in word.split():
                if subword.strip():
                    if subword not in vocab:
                        vocab[subword] = 1
                    else:
                        vocab[subword] += 1

    # Adding count of rare words to the count of <unk>
    for k in list(vocab.keys())[1:]:
        if vocab[k] < threshold:
            vocab['<unk>'] += vocab[k]
            del vocab[k]

    # Sort vocab by count in descending order
    sorted_vocab = {k: v for k, v in sorted(vocab.items(), key=lambda item: item[1], reverse=True)}

    # Push <unk> to the top
    unk_val = sorted_vocab.pop('<unk>')
    sorted_vocab = {'<unk>': unk_val, **sorted_vocab}

    with open('vocab.txt', 'w') as f:
        format_str = ''
        for index, (key, count) in enumerate(sorted_vocab.items()):
            format_str += key + '\t' + str(index) + '\t' + str(count) + '\n'
        f.write(format_str)

    #print("Threshold =", threshold)

    return sorted_vocab, counter

vocab, counter = generate_vocab("C:/Users/Admin/Desktop/Thesis/POS Tagging/archive/train2.json")
print("Overall size of my vocabulary =", len(vocab))
print("No. of times '<unk>' occurs in my vocabulary =", vocab['<unk>'])
print("No. of sentence =", counter)

Overall size of my vocabulary = 318
No. of times '<unk>' occurs in my vocabulary = 619
No. of sentence = 100


In [3]:
def load_training_data(file_path):
    with open(file_path, 'r') as json_file:
        return json.load(json_file)

#transition holo NNP to VBP kotobar, emission holo "Book" word ta kotobar NNP r kotobar VBP
#unique state holo distinct tag gulo niye banano list
def process_training_data(data, vocab):
    transition_probabilities = {}
    emission_probabilities = {}
    unique_states = []

    for entry in data:
        sentence_tokens = entry['sentence']
        labels = entry['labels']
        prior_state = 'head'  

        for word, tag in zip(sentence_tokens, labels):
            if word not in vocab.keys():
                word = '<unk>'

            # Update unique states
            if tag not in unique_states:
                unique_states.append(tag)

            # Update transition count
            transition_key = (prior_state, tag)
            if transition_key not in transition_probabilities:
                transition_probabilities[transition_key] = 1
            else:
                transition_probabilities[transition_key] += 1

            # Update emission count
            emission_key = (tag, word)
            if emission_key not in emission_probabilities:
                emission_probabilities[emission_key] = 1
            else:
                emission_probabilities[emission_key] += 1

            prior_state = tag

    return transition_probabilities, emission_probabilities, unique_states

def normalize_probabilities(probabilities):
    normalized_probabilities = {}
    for key in probabilities:
        state, value = key
        total = sum(v for k, v in probabilities.items() if k[0] == state)
        normalized_probabilities[key] = probabilities[key] / total
    return normalized_probabilities

#transition r emission gulo k dictionary er moto kore dekhano hoyese
def save_hmm_model(transition_probs, emission_probs, output_file):
    
    # Convert tuple keys to strings
    transition_probs = {str(k): v for k, v in transition_probs.items()}
    emission_probs = {str(k): v for k, v in emission_probs.items()}

    hmm_model = {
        'transition': transition_probs,
        'emission': emission_probs,
    }

    with open(output_file, 'w') as f:
        json.dump(hmm_model, f)

    return transition_probs, emission_probs

training_data = load_training_data('C:/Users/Admin/Desktop/Thesis/POS Tagging/archive/train2.json')
transition_probs, emission_probs, unique_states = process_training_data(training_data, vocab)
normalized_transition_probs = normalize_probabilities(transition_probs)
normalized_emission_probs = normalize_probabilities(emission_probs)
transition_param_dict, emission_param_dict = save_hmm_model(normalized_transition_probs, normalized_emission_probs, 'hmm.json')

print("No. of transition parameters =", len(transition_param_dict))
print("No. of emission parameters =", len(emission_param_dict))

No. of transition parameters = 390
No. of emission parameters = 379


In [4]:
import nltk

In [5]:
#POS tag kore each word er r accuracy dey
def greedy_decoding(formatted_data, vocab, transition_param_dict, emission_param_dict, state_track):
    accuracy = []
    transition_prob ={}
    emission_prob = {}
    max_stt = {}

    prior_st = 'head'
    counter = 0
    for i in formatted_data:
        print(i)
        counter = counter+1
        transition_prob[counter]=transition_param_dict.get(str((prior_st, i[1])), 1e-7)
        emission_prob[counter]=emission_param_dict.get(str((i[1], i[0])), 0)
        prior_st = i[1]

    return transition_prob, emission_prob

In [6]:
## eita nibo
def reverse_graph(G):
    '''Return the reversed graph g[dst][src]=G[src][dst]'''
    g = {}
    for src in G.keys():
        for dst in G[src].keys():
            if dst not in g.keys():
                g[dst] = {}
            g[dst][src] = G[src][dst]
    return g


def build_max(rg, root):
    '''Find the max in-edge for every node except for the root.'''
    mg = {}
    for dst in rg.keys():
        if dst == root:
            continue
        max_ind = -100
        max_value = -100
        for src in rg[dst].keys():
            if rg[dst][src] >= max_value:
                max_ind = src
                max_value = rg[dst][src]
        mg[dst] = {max_ind: max_value}
    return mg


def find_circle(mg):
    '''Return the first circle if find, otherwise return None'''

    for start in mg.keys():
        visited = []
        stack = [start]
        while stack:
            n = stack.pop()
            if n in visited:
                C = []
                while n not in C:
                    C.append(n)
                    n = list(mg[n].keys())[0]
                return C
            visited.append(n)
            if n in mg.keys():
                stack.extend(list(mg[n].keys()))
    return None


def chu_liu_edmond(G, root):
    ''' G: dict of dict of weights
            G[i][j] = w means the edge from node i to node j has weight w.
        root: the root node, has outgoing edges only.
    '''
    # reversed graph rg[dst][src] = G[src][dst]
    rg = reverse_graph(G)
    # root er only out edge
    rg[root] = {}
    # the maximum edge select korlam for each node other than root
    mg = build_max(rg, root)

    # check if mg is a tree (contains a circle)
    C = find_circle(mg)
    # circle na thakle, mg tai max_spanning_tree
    if not C:
        return reverse_graph(mg)

    # jesob node circle kore tader k niye compact node korlm
    all_nodes = G.keys()
    vc = max(all_nodes) + 1

    # new graph holo G_prime
    V_prime = list(set(all_nodes) - set(C)) + [vc]
    G_prime = {}
    vc_in_idx = {}
    vc_out_idx = {}
    # Now add the edges to G_prime
    for u in all_nodes:
        for v in G[u].keys():
            # incoming edge er weight calculation
            if (u not in C) and (v in C):
                if u not in G_prime.keys():
                    G_prime[u] = {}
                w = G[u][v] - list(mg[v].values())[0]
                if (vc not in G_prime[u]) or (vc in G_prime[u] and w > G_prime[u][vc]):
                    G_prime[u][vc] = w
                    vc_in_idx[u] = v

            # outgoing edge er weight calculation
            elif (u in C) and (v not in C):
                if vc not in G_prime.keys():
                    G_prime[vc] = {}
                w = G[u][v]
                if (v not in G_prime[vc]) or (v in G_prime[vc] and w > G_prime[vc][v]):
                    G_prime[vc][v] = w
                    vc_out_idx[v] = u

            # Third case: if the source and dest are all not in the circle, then just add the edge to the new graph.
            elif (u not in C) and (v not in C):
                if u not in G_prime.keys():
                    G_prime[u] = {}
                G_prime[u][v] = G[u][v]

    # Recursively run the algorihtm on the new graph G_prime
    A = chu_liu_edmond(G_prime, root)
    # print(A)

    # compacted node k vangbo, erpor incoming r outgoing edge gulo identify krbo
    # always max ta choose krbo r bakigulo delete krbo
    all_nodes_A = list(A.keys())
    for src in all_nodes_A:
        # The number of out-edges varies, could be 0 or any number <=|V\C|
        if src == vc:
            for node_in in A[src].keys():
                orig_out = vc_out_idx[node_in]
                if orig_out not in A.keys():
                    A[orig_out] = {}
                A[orig_out][node_in] = G[orig_out][node_in]
        else:
            #for dst in A[src]:
            for dst in list(A[src].keys()):
                # There must be only one in-edge to vc.
                if dst == vc:
                    orig_in = vc_in_idx[src]
                    A[src][orig_in] = G[src][orig_in]
                    del A[src][dst]
    #del A[vc]
    #print("A: ",A)
    #print("vc: ",vc)
    if vc in A:
        del A[vc]
    '''
    try:
        del A[vc]
    except KeyError:
        #print("####", vc)
        pass  # Do nothing if the key doesn't exist
    '''

    for node in C:
        if node != orig_in:
            src = list(mg[node].keys())[0]
            if src not in A.keys():
                A[src] = {}
            A[src][node] = mg[node][src]

    return A


In [7]:
def separate_English_sentences(text):
    # Initialize a list to store the separated sentences
    sentences = []
    current_sentence = ""
    end_sentence_marks = ['.', '?', '!']
    for char in text:
        current_sentence += char
        if char in end_sentence_marks:
            sentences.append(current_sentence.strip())
            current_sentence = ""
    if current_sentence:
        sentences.append(current_sentence.strip())
    
    return sentences

In [8]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [9]:
def tree_generation(text):
    bangla_text = text
    dependency = []
    dependency_2 = []
    res_s = []
    dp_edges = []
    separated_sentences = separate_English_sentences(bangla_text)
    print("Separated Sentences:")

    for sentence in separated_sentences:
        print(sentence)
        #res = bn_pos.tag(sentence)
        tokens = word_tokenize(sentence)
        res = pos_tag(tokens)
        print(res)

        transition_prob, emission_prob = greedy_decoding(res, vocab, transition_param_dict, emission_param_dict, unique_states)
        print('Transition_prob = ', transition_prob)
        print('Emission_prob = ', emission_prob)

        res = ['root']+res
        res_s.append(res)
        print(res)

        def get_edges(graph):
            edges = []
            for node in graph:
                for neighbor in graph[node]:
                    edges.append((node, neighbor))
            return edges

        # eita nibo
        #num_vertices = len(res)+1
        #print(num_vertices)

        G = {}
        dp = {}
        print(len(res))
        for i in range(len(res)):
            G[i] = {}
            if i==0:
                p=200
                q=300
                for j in range(1,len(res)):
                    print(res[j])
                    weight = transition_prob[j] + emission_prob[j] 
                    #if p ==0:
                    if res[j][1] == 'CC':
                        weight = q*(transition_prob[j] + emission_prob[j])
                        q = q/3
                    if res[j][1] == 'VB' or res[j][1] == 'VBD' or res[j][1] == 'VBP' or res[j][1] == 'VBN' or res[j][1] == 'VBZ' or res[j][1] == 'VBG':
                        #print('#')
                        weight = p*(transition_prob[j] + emission_prob[j])
                        p = p*3

                    G[i][j] = weight
                continue

            for j in range(len(res)):
                if (j == 0):
                    continue
                if (i == j):
                    continue
                if res[j][1] == 'DT':
                    weight = p*(transition_prob[j] + emission_prob[j])
                    p = p/3
                if res[j][1] == 'NNP':
                    weight = p*(transition_prob[j] + emission_prob[j])
                    p = p/3
                if res[j][1] == 'JJ':
                    weight = p*(transition_prob[j] + emission_prob[j])
                    p = p/3
                if res[j][1] == 'IN':
                    weight = p*(transition_prob[j] + emission_prob[j])
                    p = p/3
                if res[j][1] == 'NN':
                    weight = p*(transition_prob[j] + emission_prob[j])
                    p = p/3
                weight = transition_prob[j] + emission_prob[j] 

                G[i][j] = weight

        print("G =", G)

        dp = chu_liu_edmond(G, 0)
        print('DP =',dp)
        edges_of_dp = get_edges(dp)
        dp_edges.append(get_edges(dp)) 
        
        list_of_tuples = edges_of_dp

        # Define the list of tuples containing words and tags
        word_tags = res

        # Convert the list of tuples to a list of edges
        edges = []
        for edge in list_of_tuples:
            src = word_tags[edge[0]][0] if edge[0] != 0 else 'root'
            dst = word_tags[edge[1]][0]
            edges.append((src, dst))

        # Print the list of edges
        print("List of Edges:", edges)
        dependency.append(edges)
        
        ########
        list_of_tuples_2 = edges_of_dp
        word_tags_2 = res
        edges_2 = []
        for edge in list_of_tuples_2:
            src = word_tags[edge[0]][1] if edge[0] != 0 else 'root'
            dst = word_tags[edge[1]][1]
            edges_2.append((src, dst))
        print("List of Edges_2:", edges_2)
        dependency_2.append(edges_2)

    return dependency, dependency_2, res_s, dp_edges

#sentence = "John saw marry. Bangladesh, to the east of India on the Bay of Bengal, is a South Asian country marked by lush greenery and many waterways. Its Padma (Ganges), Meghna and Jamuna rivers create fertile plains, and travel by boat is common. On the southern coast, the Sundarbans, an enormous mangrove forest shared with Eastern India, is home to the royal Bengal tiger."
sentence = "The Padma is a major river in Bangladesh. It is the main distributary of the Ganges, flowing generally southeast for 356 kilometres to its confluence with the Meghna River near the Bay of Bengal. The city of Rajshahi is situated on the banks of the river."
#sentence = "English is a West Germanic language in the Indo-European language family, whose speakers, called Anglophones, originated in early medieval England. The namesake of the language is the Angles, one of the ancient Germanic peoples that migrated to the island of Great Britain."
dependencies, dependencies_2, res_sies, dp_edgesies = tree_generation(sentence)

Separated Sentences:
The Padma is a major river in Bangladesh.
[('The', 'DT'), ('Padma', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('major', 'JJ'), ('river', 'NN'), ('in', 'IN'), ('Bangladesh', 'NNP'), ('.', '.')]
('The', 'DT')
('Padma', 'NNP')
('is', 'VBZ')
('a', 'DT')
('major', 'JJ')
('river', 'NN')
('in', 'IN')
('Bangladesh', 'NNP')
('.', '.')
Transition_prob =  {1: 0.23, 2: 0.1518324607329843, 3: 0.03308823529411765, 4: 0.06779661016949153, 5: 0.1518324607329843, 6: 0.4319526627218935, 7: 0.29194630872483224, 8: 0.16666666666666666, 9: 0.05514705882352941}
Emission_prob =  {1: 0.1099476439790576, 2: 0, 3: 0.2033898305084746, 4: 0.20418848167539266, 5: 0, 6: 0, 7: 0.1124031007751938, 8: 0, 9: 0.98}
['root', ('The', 'DT'), ('Padma', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('major', 'JJ'), ('river', 'NN'), ('in', 'IN'), ('Bangladesh', 'NNP'), ('.', '.')]
10
('The', 'DT')
('Padma', 'NNP')
('is', 'VBZ')
('a', 'DT')
('major', 'JJ')
('river', 'NN')
('in', 'IN')
('Bangladesh', 'NNP')
('.', '.')
G = 

In [10]:
print(dependencies)

[[('root', 'is'), ('is', 'a'), ('a', 'The'), ('a', 'Padma'), ('a', 'major'), ('major', 'river'), ('river', 'in'), ('in', 'Bangladesh'), ('Bangladesh', '.')], [('root', 'is'), ('root', 'flowing'), ('root', 'southeast'), ('southeast', 'for'), ('for', 'It'), ('for', 'the'), ('for', 'main'), ('for', 'distributary'), ('for', 'of'), ('for', 'the'), ('for', 'Ganges'), ('for', ','), ('for', 'generally'), ('for', '356'), ('356', 'kilometres'), ('kilometres', 'to'), ('to', 'its'), ('its', 'confluence'), ('confluence', 'with'), ('with', 'the'), ('the', 'Meghna'), ('Meghna', 'River'), ('River', 'near'), ('near', 'the'), ('the', 'Bay'), ('Bay', 'of'), ('of', 'Bengal'), ('Bengal', '.')], [('root', 'is'), ('root', 'situated'), ('situated', 'on'), ('on', 'The'), ('on', 'city'), ('on', 'of'), ('on', 'Rajshahi'), ('on', 'the'), ('the', 'banks'), ('banks', 'of'), ('of', 'the'), ('the', 'river'), ('river', '.')]]


In [11]:
print(dependencies_2[0])

[('root', 'VBZ'), ('VBZ', 'DT'), ('DT', 'DT'), ('DT', 'NNP'), ('DT', 'JJ'), ('JJ', 'NN'), ('NN', 'IN'), ('IN', 'NNP'), ('NNP', '.')]


In [12]:
print(res_sies)

[['root', ('The', 'DT'), ('Padma', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('major', 'JJ'), ('river', 'NN'), ('in', 'IN'), ('Bangladesh', 'NNP'), ('.', '.')], ['root', ('It', 'PRP'), ('is', 'VBZ'), ('the', 'DT'), ('main', 'JJ'), ('distributary', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Ganges', 'NNP'), (',', ','), ('flowing', 'VBG'), ('generally', 'RB'), ('southeast', 'VBN'), ('for', 'IN'), ('356', 'CD'), ('kilometres', 'NNS'), ('to', 'TO'), ('its', 'PRP$'), ('confluence', 'NN'), ('with', 'IN'), ('the', 'DT'), ('Meghna', 'NNP'), ('River', 'NNP'), ('near', 'IN'), ('the', 'DT'), ('Bay', 'NNP'), ('of', 'IN'), ('Bengal', 'NNP'), ('.', '.')], ['root', ('The', 'DT'), ('city', 'NN'), ('of', 'IN'), ('Rajshahi', 'NNP'), ('is', 'VBZ'), ('situated', 'VBN'), ('on', 'IN'), ('the', 'DT'), ('banks', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('river', 'NN'), ('.', '.')]]


In [13]:
print(dp_edgesies)

[[(0, 3), (3, 4), (4, 1), (4, 2), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9)], [(0, 2), (0, 10), (0, 12), (12, 13), (13, 1), (13, 3), (13, 4), (13, 5), (13, 6), (13, 7), (13, 8), (13, 9), (13, 11), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22), (22, 23), (23, 24), (24, 25), (25, 26), (26, 27), (27, 28)], [(0, 5), (0, 6), (6, 7), (7, 1), (7, 2), (7, 3), (7, 4), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (12, 13)]]


In [14]:
print(transition_param_dict)

{"('head', 'IN')": 0.13, "('IN', 'DT')": 0.29069767441860467, "('DT', 'NN')": 0.4973821989528796, "('NN', ',')": 0.12751677852348994, "(',', 'DT')": 0.12598425196850394, "('DT', 'NNS')": 0.09424083769633508, "('NNS', 'VBD')": 0.05555555555555555, "('VBD', ',')": 0.02631578947368421, "(',', '``')": 0.015748031496062992, "('``', 'PRP')": 0.25, "('PRP', 'VBP')": 0.2, "('VBP', 'PDT')": 0.02, "('PDT', 'DT')": 1.0, "('NNS', '.')": 0.18888888888888888, '(\'.\', "\'\'")': 1.0, "('head', 'JJ')": 0.06, "('JJ', 'NNS')": 0.3136094674556213, "('NNS', 'VBP')": 0.14444444444444443, "('VBP', 'JJ')": 0.22, "('JJ', 'IN')": 0.0650887573964497, "('IN', 'NNP')": 0.16666666666666666, "('NNP', 'NNP')": 0.35294117647058826, "('NNP', '.')": 0.05514705882352941, "('head', '``')": 0.09, "('PRP', 'VBD')": 0.4222222222222222, "('VBD', 'DT')": 0.17105263157894737, "('NN', 'IN')": 0.29194630872483224, "('IN', 'NN')": 0.08139534883720931, '(\',\', "\'\'")': 0.05511811023622047, '("\'\'", \'VBZ\')': 0.3333333333333333

In [15]:
def get_keys_and_values_with_string_at_position_0(dictionary, string):
    key_value_tuples = []
    for key, value in dictionary.items():
        if key.startswith(f"('{string}'"):
            key_value_tuples.append(((string, key.split(", ")[1][1:-2]), value))
    return key_value_tuples

In [23]:
def accuracy_measure(dependencies_2):
    tuple_sort_accuracy_list = []
    for sent in dependencies_2:
        tuple_sort_accuracy = 0
        print(len(sent))
        for tuples in sent:
            #print(tuples)
            result = get_keys_and_values_with_string_at_position_0(transition_param_dict, tuples[0])
            sorted_result = sorted(result, key=lambda x: x[1], reverse=True)
            #print(sorted_result)
            #print(len(sorted_result))
            x = 0
            p = 0
            for tuple_sort in sorted_result:
                #print(tuple_sort[0])
                x = x+1
                if(tuples == tuple_sort[0]):
                    #print("tuple_sort:", tuple_sort)
                    tuple_sort_accuracy = tuple_sort_accuracy + (tuple_sort[1])/len(sent)*x
                    #print("tuple_sort accuracy:", tuple_sort_accuracy)
                    p = 1
            if (p == 0):
                tuple_sort_accuracy = tuple_sort_accuracy + len(sorted_result)* .02
                #print("tuple_sort accuracyyyy:", tuple_sort_accuracy)
        tuple_sort_accuracy_list.append(tuple_sort_accuracy)
        print("tuple_sort accuracy for a sentence:", tuple_sort_accuracy)
    #print("tuple_sort accuracy for a sentence:", tuple_sort_accuracy_list)
    return tuple_sort_accuracy_list
print("tuple_sort accuracy for a sentence:", accuracy_measure(dependencies_2))

9
tuple_sort accuracy for a sentence: 0.5915291671951165
28
tuple_sort accuracy for a sentence: 0.6384512702148489
13
tuple_sort accuracy for a sentence: 0.27069518973579243
tuple_sort accuracy for a sentence: [0.5915291671951165, 0.6384512702148489, 0.27069518973579243]


In [17]:
print(emission_param_dict)

{"('IN', '<unk>')": 0.050387596899224806, "('DT', 'the')": 0.4973821989528796, "('NN', '<unk>')": 0.4697986577181208, "(',', ',')": 1.0, "('NNS', '<unk>')": 0.37222222222222223, "('VBD', '<unk>')": 0.3815789473684211, "('``', '``')": 1.0, "('PRP', 'We')": 0.044444444444444446, "('VBP', 'have')": 0.2, "('PDT', 'all')": 1.0, "('.', '.')": 0.98, '("\'\'", "\'\'")': 1.0, "('JJ', '<unk>')": 0.4911242603550296, "('VBP', 'are')": 0.3, "('IN', 'at')": 0.023255813953488372, "('NNP', 'Mrs.')": 0.01838235294117647, "('NNP', 'Yeargin')": 0.011029411764705883, "('PRP', '<unk>')": 0.1111111111111111, "('VBD', 'did')": 0.039473684210526314, "('DT', 'a')": 0.20418848167539266, "('NN', 'lot')": 0.006711409395973154, "('IN', 'of')": 0.32558139534883723, "('VBZ', 'says')": 0.1694915254237288, "('NNP', '<unk>')": 0.33088235294117646, "('WP', 'who')": 0.3, "('VBD', 'had')": 0.02631578947368421, "('VBN', '<unk>')": 0.68, "('VBP', '<unk>')": 0.16, "('RB', 'damn')": 0.029411764705882353, "('RB', 'hard')": 0.0

In [18]:
print(unique_states)

['IN', 'DT', 'NN', ',', 'NNS', 'VBD', '``', 'PRP', 'VBP', 'PDT', '.', "''", 'JJ', 'NNP', 'VBZ', 'WP', 'VBN', 'RB', 'CC', 'VBG', 'RP', 'EX', 'MD', 'VB', 'RBR', 'TO', 'CD', 'PRP$', ':', 'NNPS', 'JJR', 'POS', '-LRB-', '-RRB-', 'WDT', 'WRB', 'RBS', 'JJS', '$', 'WP$']


In [24]:
from flask import Flask, render_template, request

app = Flask(__name__)

# Example NLP function
def analyze_text(text):
    # Replace this with your actual NLP processing logic
    # For demonstration, let's just return the input text
    dependencies, dependencies_2, res_sies, dp_edgesies = tree_generation(text)
    accuracy = accuracy_measure(dependencies_2)
    return {"title": "For English","sentence": text, "pos": res_sies, "input": dependencies, "num": dp_edgesies, "accu": accuracy}

@app.route("/", methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        user_input = request.form['user_input']
        analysis = analyze_text(user_input)
        return render_template('new_result.html', analysis=analysis)
    return render_template('new.html')

if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [20/Feb/2024 09:54:57] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [20/Feb/2024 09:55:05] "POST / HTTP/1.1" 200 -


Separated Sentences:
The Padma is a major river in Bangladesh.
[('The', 'DT'), ('Padma', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('major', 'JJ'), ('river', 'NN'), ('in', 'IN'), ('Bangladesh', 'NNP'), ('.', '.')]
('The', 'DT')
('Padma', 'NNP')
('is', 'VBZ')
('a', 'DT')
('major', 'JJ')
('river', 'NN')
('in', 'IN')
('Bangladesh', 'NNP')
('.', '.')
Transition_prob =  {1: 0.23, 2: 0.1518324607329843, 3: 0.03308823529411765, 4: 0.06779661016949153, 5: 0.1518324607329843, 6: 0.4319526627218935, 7: 0.29194630872483224, 8: 0.16666666666666666, 9: 0.05514705882352941}
Emission_prob =  {1: 0.1099476439790576, 2: 0, 3: 0.2033898305084746, 4: 0.20418848167539266, 5: 0, 6: 0, 7: 0.1124031007751938, 8: 0, 9: 0.98}
['root', ('The', 'DT'), ('Padma', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('major', 'JJ'), ('river', 'NN'), ('in', 'IN'), ('Bangladesh', 'NNP'), ('.', '.')]
10
('The', 'DT')
('Padma', 'NNP')
('is', 'VBZ')
('a', 'DT')
('major', 'JJ')
('river', 'NN')
('in', 'IN')
('Bangladesh', 'NNP')
('.', '.')
G = 

In [20]:
def format_data(json_file_path):
    with open(json_file_path, 'r') as json_file:
        data = json.load(json_file)

    formatted_data = []

    for entry in data:
        sentence_tokens = entry['sentence']
        labels = entry['labels']

        tokens_with_index = []

        for index, (word, tag) in enumerate(zip(sentence_tokens, labels)):
            tokens_with_index.append(f"{index}\t{word}\t{tag}")

        sentence = '\n'.join(tokens_with_index)
        formatted_data.append(sentence)

    return '\n\n'.join(formatted_data)

In [21]:
def greedy_decoding_accuracy(formatted_data, vocab, transition_param_dict, emission_param_dict, state_track):
    accuracy = []

    for line in formatted_data.split('\n\n'):
        prior_st = 'head'
        for ent in line.split('\n'):
            index, word, st = ent.split('\t')
            if word not in vocab.keys():
                word = '<unk>'

            max_prob = 0
            max_st = None

            for st_ in state_track:
                tr = transition_param_dict.get(str((prior_st, st_)), 1e-7)
                em = emission_param_dict.get(str((st_, word)), 0)
                prob = tr * em

                if prob > max_prob:
                    max_prob = prob
                    max_st = st_

            accuracy.append(st == max_st)
            prior_st = max_st
            #print(prior_st)

    return sum(accuracy) / len(accuracy), accuracy

formatted_dev_data = format_data('C:/Users/Admin/Desktop/Thesis/POS Tagging/dev.json')
accuracy, value = greedy_decoding_accuracy(formatted_dev_data, vocab, transition_param_dict, emission_param_dict, unique_states)
print("Accuracy on dev data with greedy decoding =", accuracy)
#print(value)

Accuracy on dev data with greedy decoding = 0.7027027027027027
