In [None]:
import os

# Load the Malayalam dataset
dataset_path = r"C:\Users\mvy48\Downloads\ml_combined_anoop-cc-gokul_07Dec19.txt"

# Read the dataset and limit to the first 200,000 sentences
with open(dataset_path, "r", encoding="utf-8") as f:
    dataset = f.read().splitlines()[:200000]

# Define the split ratio
train_ratio = 0.7
split_index = int(train_ratio * len(dataset))

# Split the dataset into training and testing sets
train_data = dataset[:split_index]
test_data = dataset[split_index:]


In [None]:
import re

# Regular expression to detect Malayalam characters
regex = re.compile("[^\u0D00-\u0D7F]")

def preprocess(data):
    hidden_state_lst = []
    data_lst = []

    for sentence in data:
        hidden_state = ""
        words = []

        for token in sentence.split():
            token = regex.sub("", token)
            # Label the token according to its length
            if len(token) == 1:
                hidden_state += "S"
            elif len(token) == 2:
                hidden_state += "BE"
            elif len(token) > 2:
                hidden_state += "B" + (len(token) - 2) * "I" + "E"

        # Remove the spaces between characters
        sentence = sentence.replace(" ", "")
        for word in sentence:
            # Verify if the word is a Malayalam character
            word = regex.sub("", word)
            words.append(word)

        if len(words) > 0:
            data_lst.append(words)
            states = [s for s in hidden_state]
            hidden_state_lst.append(states)

    return data_lst, hidden_state_lst

train_data, train_hs = preprocess(train_data)
test_data, test_hs = preprocess(test_data)


'B': Beginning character
'I': Intermediate (internal) character
'E': Ending character
'S': Single character

In [None]:
import numpy as np

# Initialize a dictionary to store the state count
state_count = {}
states = ["B", "I", "E", "S"]

for state in states:
    state_count[state] = 0

for i in range(len(train_hs)):
    length = len(train_hs[i])
    if length > 0:
        for j in range(length - 1):
            # Update the state count
            state_count[train_hs[i][j]] += 1
        state_count[train_hs[i][length - 1]] += 1

total_states = sum(state_count.values())  # Get the total number of states in the sample
start_prob = {}

for state in states:
    # Normalize the state count with the total number of states
    start_prob[state] = state_count[state] / total_states

# Initialize the transition probabilities
trans_prob = {}

for state in states:
    trans_prob[state] = {}
    for state_i in states:
        trans_prob[state][state_i] = 0

for i in range(len(train_hs)):
    length = len(train_hs[i])
    if length > 0:
        for j in range(length - 1):
            # Update the transition probabilities
            s_from = train_hs[i][j]
            s_to = train_hs[i][j + 1]
            trans_prob[s_from][s_to] += 1

for i in states:
    for j in states:
        # Normalize the frequency of the transition with the state counts
        trans_prob[i][j] /= float(state_count[i])

# Initialize the emission probabilities
emission_prob = {}

# Get all the vocabulary in the corpus (train and test sets)
vocab = list(set([word for sentence in train_data for word in sentence] +
                 [word for sentence in test_data for word in sentence]))

# Initialize the emission probabilities
for state in states:
    emission_prob[state] = {}
    for word in vocab:
        emission_prob[state][word] = 1

for i in range(len(train_hs)):
    length = len(train_hs[i])
    for j in range(length):
        # Update the emission probabilities
        obs = train_data[i][j]
        hidden = train_hs[i][j]
        emission_prob[hidden][obs] += 1

for state in states:
    for word in vocab:
        if emission_prob[state][word] == 0:
            continue
        else:
            # Normalize the emission probabilities
            emission_prob[state][word] /= float(state_count[state])


In [None]:
def viterbi_decoding(obs):
    obs = [i for i in obs if i]
    if len(obs) > 0:
        # Initialize a list of dictionary to store the probabilites
        V = [{}]
        for st in states:
            # Append the initial probabilites
            V[0][st] = {"prob": start_prob[st] * emission_prob[st].get(obs[0], 0), "prev": None}
        for t in range(1, len(obs)):
            # Append a dictionary to store the probabilities at time/step t
            V.append({})
            for st in states:
                max_tr_prob = V[t - 1][states[0]]["prob"] * trans_prob[states[0]][st]
                prev_st_selected = states[0]
                for prev_st in states[1:]:
                    # Calculate the probabilities of each state
                    tr_prob = V[t - 1][prev_st]["prob"] * trans_prob[prev_st][st]
                    if tr_prob > max_tr_prob:
                        max_tr_prob = tr_prob
                        prev_st_selected = prev_st

                # Get the max probability at time/step t
                max_prob = max_tr_prob * emission_prob[st].get(obs[t], 0)
                V[t][st] = {"prob": max_prob, "prev": prev_st_selected}

        path = []
        max_prob = -float("inf")
        best_st = None
        # Get most probable state and its backtrack
        for st, data in V[-1].items():
            if data["prob"] > max_prob:
                max_prob = data["prob"]
                best_st = st
        path.append(best_st)
        previous = best_st

        # Follow the backtrack till the first observation
        for t in range(len(V) - 2, -1, -1):
            path.insert(0, V[t + 1][previous]["prev"])
            previous = V[t + 1][previous]["prev"]

        # Return the path
        return path
    else:
        return []

test_pred = []
for obs in test_data:
    # Label the sequence using Viterbi algorithm
    test_pred.append(viterbi_decoding(obs))


In [None]:
def word_segmentation(test_data, pred):
    segmented = ""
    i = 0  # Counter for the test data index
    j = 0  # Counter for the prediction index
    while i < len(test_data):
        segmented += test_data[i]
        # Check for Malayalam character
        if test_data[i] > u"\u0D00" and test_data[i] < u"\u0D7F":
            # Add space after the character if label
            # is either "E" or "S"
            if pred[j] in ["E", "S"]:
                segmented += " "
            j += 1
        i += 1
    return segmented

segmented = []
for i in range(len(test_data)):
    # Convert BIES to word segmentation
    segmented.append(word_segmentation(test_data[i], test_pred[i]))

# Save the segmented sentences to my_prediction.txt file
with open("my_prediction.txt", "w", encoding="utf-8") as fout:
    for item in segmented:
        fout.write("%s\n" % item)

In [None]:
import numpy as np

# Look at random segmented sentence
i = np.random.randint(0, len(test_data))  # Random index within the test data size
print("Original Sentence:")
print(''.join(test_data[i]))  # Original sentence
print("Gold Label:")
print(test_hs[i])  # True label
print("Predicted Label:")
print(test_pred[i])  # Predicted label
print("Segmented Sentence:")
print(segmented[i])  # Segmented sentence

Original Sentence:
കീസ്റ്റോൺഎക്സെൽഡകോട്ടആക്സസ്എന്നീപദ്ധതികൾക്കാണ്പ്രസിഡന്റ്അനുമതിനൽകിയത്
Gold Label:
['B', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'E', 'B', 'I', 'I', 'I', 'I', 'E', 'B', 'I', 'I', 'I', 'I', 'E', 'B', 'I', 'I', 'I', 'I', 'E', 'B', 'I', 'I', 'I', 'E', 'B', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'E', 'B', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'E', 'B', 'I', 'I', 'I', 'I', 'E', 'B', 'I', 'I', 'I', 'I', 'I', 'E']
Predicted Label:
['I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'E', 'B', 'I', 'E', 'B', 'I', 'E', 'B', 'I', 'I', 'I', 'I', 'E', 'B', 'I', 'E', 'B', 'I', 'E', 'B', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'E', 'B', 'I', 'I', 'I', 'I', 'E', 'B', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'E', 'B', 'I', 'I', 'I', 'I', 'I', 'I', 'E', 'B', 'I', 'I', 'I', 'I']
Segmented Sentence:
കീസ്റ്റോൺ എക് സെൽ ഡകോട്ട ആക് സസ് എന്നീപദ്ധതികൾ ക്കാണ് പ്രസിഡന്റ് അനുമതിനൽ കിയത്


In [None]:
def predict_segmentation(input_sequence):
    # Preprocess the input sequence
    input_data = [list(input_sequence)]
    
    # Predict the segmentation
    pred_segmentation = viterbi_decoding(input_data[0])
    
    # Convert BIES to word segmentation
    segmented_sentence = word_segmentation(input_data[0], pred_segmentation)
    
    # Print the predicted segmentation
    print("Input Sequence:", input_sequence)
    print("Predicted Segmentation:", segmented_sentence)

# Example usage
input_sequence = "ഞാൻഇന്നുനിങ്ങളെഎങ്ങനെസഹായിക്കാനാകും"
predict_segmentation(input_sequence)


Input Sequence: ഞാൻഇന്നുനിങ്ങളെഎങ്ങനെസഹായിക്കാനാകും
Predicted Segmentation: ഞാൻ ഇന്നുനിങ്ങളെ എങ്ങനെ സഹായിക്കാനാകും 


In [None]:
# Evaluation
map_dict = {
    "B": 0,
    "I": 1,
    "E": 2,
    "S": 3
}

true = list(np.concatenate(test_hs).flat)
pred = list(np.concatenate(test_pred).flat)

k = len(np.unique(true))  # Number of classes
result = np.zeros((k, k))  # Initialize the confusion matrix

for i in range(len(true)):
    # Calculate the confusion matrix
    result[map_dict[true[i]]][map_dict[pred[i]]] += 1

precision_B = result[0][0] / (result[0][0] + result[1][0] + result[2][0] + result[3][0])
precision_I = result[1][1] / (result[0][1] + result[1][1] + result[2][1] + result[3][1])
precision_E = result[2][2] / (result[0][2] + result[1][2] + result[2][2] + result[3][2])
precision_S = result[3][3] / (result[0][3] + result[1][3] + result[2][3] + result[3][3])

recall_B = result[0][0] / (result[0][0] + result[0][1] + result[0][2] + result[0][3])
recall_I = result[1][1] / (result[1][0] + result[1][1] + result[1][2] + result[1][3])
recall_E = result[2][2] / (result[2][0] + result[2][1] + result[2][2] + result[2][3])
recall_S = result[3][3] / (result[3][0] + result[3][1] + result[3][2] + result[3][3])

f1_B = 2 * (precision_B * recall_B) / (precision_B + recall_B)
f1_I = 2 * (precision_I * recall_I) / (precision_I + recall_I)
f1_E = 2 * (precision_E * recall_E) / (precision_E + recall_E)
f1_S = 2 * (precision_S * recall_S) / (precision_S + recall_S)

macro_f1 = (f1_B + f1_I + f1_E + f1_S) / k

print(f"F1 score for state B: {f1_B}")
print(f"F1 score for state I: {f1_I}")
print(f"F1 score for state E: {f1_E}")
print(f"F1 score for state S: {f1_S}")
print(f"Macro-F1 score: {macro_f1}")
# Calculate accuracy
correct_predictions = sum(1 for true, pred in zip(true, pred) if true == pred)
total_predictions = len(true)
accuracy = correct_predictions / total_predictions

# Print accuracy
print(f"Accuracy: {accuracy:.2%}")

F1 score for state B: 0.6279667579279399
F1 score for state I: 0.9103146014683459
F1 score for state E: 0.6284848255479902
F1 score for state S: 0.6457575459640152
Macro-F1 score: 0.7031309327270727
Accuracy: 85.46%
