In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense, Dropout
from sklearn.metrics import classification_report




In [2]:
# Load the Data
dataset_path = "C:\\Users\\mvy48\\Downloads\\ml_combined_anoop-cc-gokul_07Dec19.txt"
# Upper bound on how many corpus lines are used by the rest of the notebook.
MAX_LINES = 200000
with open(dataset_path, 'r', encoding='utf-8') as file:
    # Iterate the file lazily instead of calling readlines(): zip() stops as
    # soon as range(MAX_LINES) is exhausted, so only the lines that are
    # actually kept are ever read into memory.
    lines = [line for _, line in zip(range(MAX_LINES), file)]

def label_sentences(sentences):
    """Tag every character of every sentence with a B/I/E/S position label.

    Each sentence is stripped and split on whitespace into words. A
    one-character word yields ``(word, 'S')``; a longer word yields ``'B'`` for
    its first character, ``'I'`` for each interior character and ``'E'`` for
    its last character. Sentences that are empty after stripping are skipped.

    Args:
        sentences: Iterable of raw sentence strings.

    Returns:
        A list with one entry per non-empty sentence; each entry is a flat list
        of ``(character, label)`` tuples in reading order.
    """
    def tag_word(word):
        # A lone character is a complete word on its own.
        if len(word) == 1:
            return [(word, 'S')]
        interior = [(ch, 'I') for ch in word[1:-1]]
        return [(word[0], 'B'), *interior, (word[-1], 'E')]

    labeled_data = []
    for raw_sentence in sentences:
        words = raw_sentence.strip().split()
        # A sentence that is blank after stripping produces no words.
        if not words:
            continue
        labeled_data.append([pair for word in words for pair in tag_word(word)])
    return labeled_data

# Tag every character of every loaded line with its B/I/E/S label.
labeled_lines = label_sentences(lines)

# Character-only view of the labelled corpus (tags dropped), one list per sentence.
sentences = [[pair[0] for pair in tagged_sentence] for tagged_sentence in labeled_lines]

In [None]:
# Sanity check: show the first labelled sentence as space-separated "char/tag" tokens.
sample_sentence = labeled_lines[0]
sample_sentence_processed = [f"{char}/{tag}" for char, tag in sample_sentence]
print(" ".join(sample_sentence_processed))

In [3]:
# Train character-level Word2Vec vectors on the character sequences and
# persist the trained model to disk.
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
word2vec_model.save("word2vec.model")

# Export the learned vectors as a CSV table indexed by character.
vocab = list(word2vec_model.wv.index_to_key)
embeddings = [word2vec_model.wv[token] for token in vocab]
embedding_df = pd.DataFrame(data=embeddings, index=vocab)
embedding_df.to_csv('word_embeddings.csv')

In [4]:
# Convert Characters to Indices and Prepare Labels
# Characters are numbered from 1 in Word2Vec vocabulary order; index 0 is
# reserved for the padding token.
char_to_index = dict(
    zip(
        word2vec_model.wv.index_to_key,
        range(1, len(word2vec_model.wv.index_to_key) + 1),
    )
)
char_to_index['PAD'] = 0

# Tag ids follow the order B, I, E, S -> 0, 1, 2, 3.
label_to_index = {tag: position for position, tag in enumerate('BIES')}

def prepare_data(labeled_sentences, char_to_index, label_to_index):
    """Translate labelled sentences into parallel lists of integer ids.

    Args:
        labeled_sentences: Iterable of sentences, each a sequence of
            ``(character, label)`` pairs.
        char_to_index: Mapping from character to integer id.
        label_to_index: Mapping from label to integer id.

    Returns:
        A tuple ``(X, y)`` where ``X[i]`` holds the character ids and ``y[i]``
        the label ids of the i-th sentence.

    Raises:
        KeyError: If a character or label is missing from its mapping.
    """
    encoded_chars, encoded_tags = [], []
    for tagged_sentence in labeled_sentences:
        encoded_chars.append([char_to_index[ch] for ch, _ in tagged_sentence])
        encoded_tags.append([label_to_index[tag] for _, tag in tagged_sentence])
    return encoded_chars, encoded_tags

# Encode the whole labelled corpus into character-id and label-id sequences.
X, y = prepare_data(
    labeled_sentences=labeled_lines,
    char_to_index=char_to_index,
    label_to_index=label_to_index,
)

In [5]:
# Fixed sequence length fed to the network.
MAX_LEN = 100
# Pad inputs and labels at the end to MAX_LEN: inputs are filled with the PAD
# index, labels with the id of 'S'.
X_padded, y_padded = (
    pad_sequences(sequences, maxlen=MAX_LEN, padding='post', value=fill_value)
    for sequences, fill_value in (
        (X, char_to_index['PAD']),
        (y, label_to_index['S']),
    )
)

In [6]:
# Hold out 30% of the padded sequences for testing; the fixed random_state
# keeps the split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_padded,
    test_size=0.3,
    random_state=42,
)
# Give the label arrays a trailing axis of length 1.
y_train = y_train[..., np.newaxis]
y_test = y_test[..., np.newaxis]

In [7]:
# Sizes are derived from the lookup tables so they track the vocabulary and tag set.
vocab_size = len(char_to_index)
output_size = len(label_to_index)
embedding_dim = 100

# NOTE(review): the Word2Vec vectors trained earlier in the notebook are not
# passed to this Embedding layer, so it is trained from its default
# initialisation -- confirm whether pretrained initialisation was intended.
model = Sequential()
# mask_zero=True makes index 0 (the PAD index) a masked timestep.
model.add(
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=MAX_LEN,
        mask_zero=True,
    )
)
model.add(Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.1)))
# One softmax over the tag set per timestep.
model.add(TimeDistributed(Dense(output_size, activation='softmax')))

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          10100     
                                                                 
 bidirectional (Bidirection  (None, 100, 128)          84480     
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 100, 4)            516       
 ributed)                                                        
                                                                 
Total params: 95096 (371.47 KB)
Trainable params: 95096 (371.47 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [8]:
# Train for 10 epochs in batches of 64, holding back 10% of the training data
# for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.1,
)

Epoch 1/10



In [None]:
# Evaluate the Model on Test Data
# (classification_report is already imported in the notebook's first cell.)
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')

# Predictions
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_true = np.squeeze(y_test, axis=-1)

# Only score real characters. Padded positions were filled with the 'S' label
# when the labels were padded, so including them would distort the per-class
# numbers (most of all for 'S'). Real characters are indexed from 1, so the PAD
# index appears in X_test only where padding was added.
real_positions = X_test != char_to_index['PAD']

# Classification Report
label_names = ['B', 'I', 'E', 'S']
print(classification_report(
    y_true[real_positions],
    y_pred[real_positions],
    # Explicit label ids keep target_names aligned even if a class is absent.
    labels=[label_to_index[name] for name in label_names],
    target_names=label_names,
))


In [None]:
def decode_predictions(preds, index_to_label):
    """Turn the first sequence of a batch of class scores into label names.

    Args:
        preds: Array-like of per-class scores whose last axis is the class
            axis; only the first entry along the leading axis is decoded.
        index_to_label: Mapping from class index to label name.

    Returns:
        A list with the highest-scoring label for each position of the first
        sequence.
    """
    first_sequence_ids = np.argmax(preds, axis=-1)[0]
    return list(map(index_to_label.__getitem__, first_sequence_ids))

# Inverse of label_to_index: class id -> tag name.
index_to_label = dict(zip(label_to_index.values(), label_to_index.keys()))

def reconstruct_sentence(chars, labels):
    """Group characters into words according to their B/I/E/S labels.

    ``'B'`` closes any word in progress and starts a new one, ``'I'`` extends
    the word in progress, ``'E'`` extends it and closes it, and ``'S'`` closes
    any word in progress and emits the character as a word of its own. A word
    still in progress when the input ends is emitted as-is. Characters with any
    other label are ignored. Pairing stops at the shorter of the two inputs.

    Args:
        chars: Sequence of characters.
        labels: Sequence of label strings aligned with ``chars``.

    Returns:
        The list of reconstructed words.
    """
    segments = []
    pending = ''
    for ch, tag in zip(chars, labels):
        if tag in ('B', 'S'):
            # Both tags start something new, so flush the unfinished word first.
            if pending:
                segments.append(pending)
            if tag == 'B':
                pending = ch
            else:
                segments.append(ch)
                pending = ''
        elif tag in ('I', 'E'):
            pending += ch
            if tag == 'E':
                segments.append(pending)
                pending = ''
    if pending:
        segments.append(pending)
    return segments

def predict_sentence(sentence, model, char_to_index, index_to_label):
    """Segment an unspaced string into words with the trained tagger.

    The input is processed in consecutive windows of at most ``MAX_LEN``
    characters. ``pad_sequences`` truncates from the front by default, so
    feeding a longer sequence in one go would drop its leading characters and
    leave the predicted labels misaligned with the text. Only the labels of
    real characters are kept; predictions for padded positions are discarded.

    Note: a word that straddles a window boundary is tagged without context
    from the neighbouring window.

    Args:
        sentence: String (or sequence of characters) to segment.
        model: Trained model exposing ``predict`` on ``(1, MAX_LEN)`` id arrays.
        char_to_index: Mapping from character to id; must contain ``'PAD'``.
            Characters missing from the mapping are encoded as 0.
        index_to_label: Mapping from class id to B/I/E/S tag.

    Returns:
        A tuple ``(segmented_words, segmented_labels)``: the reconstructed
        words and one predicted tag per input character.
    """
    chars = list(sentence)
    segmented_labels = []
    for start in range(0, len(chars), MAX_LEN):
        window = chars[start:start + MAX_LEN]
        window_indices = [char_to_index.get(char, 0) for char in window]
        window_padded = pad_sequences(
            [window_indices],
            maxlen=MAX_LEN,
            padding='post',
            value=char_to_index['PAD'],
        )
        preds = model.predict(window_padded)
        # Keep one label per real character; the rest belong to padding.
        segmented_labels.extend(decode_predictions(preds, index_to_label)[:len(window)])
    segmented_words = reconstruct_sentence(chars, segmented_labels)
    return segmented_words, segmented_labels

# Define the sentence to predict
sentence_to_predict = "ആഗ്രഹങ്ങൾസാക്ഷാത്കരിക്കാന്പഠിക്കണം"

# Make Prediction
predicted_words, predicted_labels = predict_sentence(
    sentence=sentence_to_predict,
    model=model,
    char_to_index=char_to_index,
    index_to_label=index_to_label,
)

# Print the Segmented Sentence and Labels
print("Segmented Sentence:", ' '.join(predicted_words))

