# **1. Data Analysis and Preprocessing**

Load and Clean Data


In [None]:
import pandas as pd

data = pd.read_csv('/content/data.tsv', sep='\t', names=['token', 'pos_tag', 'ner_tag'])

data.dropna(inplace=True)

print(data.head())


        token pos_tag ner_tag
1      শনিবার     NNP   B-D&T
2         (২৭   PUNCT   B-OTH
3      আগস্ট)     NNP   B-D&T
4        রাতে     NNC   B-D&T
5  পটুয়াখালী     NNP   B-GPE


# **Tokenization and Padding**
Tokenize and pad sequences using TensorFlow and Keras utilities.

In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
data = pd.read_csv('/content/data.tsv', sep='\t', names=['token', 'pos_tag', 'ner_tag'])

data.dropna(inplace=True)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['token'])
sequences = tokenizer.texts_to_sequences(data['token'])
padded_sequences = pad_sequences(sequences, padding='post')


# **2. Label Encoding and One-Hot Encoding**
This section is about encoding categorical labels for POS and NER tagging into a numeric format that can be processed by the neural network, including one-hot encoding to transform these labels into a binary class matrix.

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Label encoding
label_encoder_pos = LabelEncoder()
label_encoder_ner = LabelEncoder()

data['pos_encoded'] = label_encoder_pos.fit_transform(data['pos_tag'])
data['ner_encoded'] = label_encoder_ner.fit_transform(data['ner_tag'])

# One-hot encoding
pos_onehot = to_categorical(data['pos_encoded'], num_classes=len(label_encoder_pos.classes_))
ner_onehot = to_categorical(data['ner_encoded'], num_classes=len(label_encoder_ner.classes_))


# **3. Padding Labels**
To ensure the labels match the length of input sequences, this part includes padding the one-hot encoded labels to create uniform input for training.

In [None]:
import numpy as np

def pad_labels_to_match_sequences(onehot_labels, seq_length):
    padded_labels = np.zeros((len(onehot_labels), seq_length, onehot_labels.shape[1]))
    for i, seq in enumerate(onehot_labels):
        padded_labels[i, :len(seq), :] = seq
    return padded_labels

pos_labels_padded = pad_labels_to_match_sequences(pos_onehot, padded_sequences.shape[1])
ner_labels_padded = pad_labels_to_match_sequences(ner_onehot, padded_sequences.shape[1])
combined_labels = np.concatenate([pos_labels_padded, ner_labels_padded], axis=-1)


# **4. Model Definition and Compilation**
This section involves setting up the neural network architecture, including embedding layers, LSTM layers, and the output layer, followed by compiling the model with appropriate loss function and metrics.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional

model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dense(32, activation='relu'),
    Dense(combined_labels.shape[2], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


# **5. Model Training and Evaluation**
This final step involves training the model on the training data and evaluating its performance on the test set. This helps understand the model’s accuracy and other performance metrics.

In [None]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, combined_labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=42)

# Train model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=32)

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc}")


Epoch 1/5
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 64ms/step - accuracy: 0.1018 - loss: 10.7672 - val_accuracy: 0.1009 - val_loss: 42.3872
Epoch 2/5
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 56ms/step - accuracy: 0.1124 - loss: 61.9704 - val_accuracy: 0.1010 - val_loss: 107.3848
Epoch 3/5
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 56ms/step - accuracy: 0.1171 - loss: 128.8853 - val_accuracy: 0.1010 - val_loss: 174.9969
Epoch 4/5
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 57ms/step - accuracy: 0.1184 - loss: 197.2652 - val_accuracy: 0.1010 - val_loss: 252.4154
Epoch 5/5
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 57ms/step - accuracy: 0.1199 - loss: 278.5403 - val_accuracy: 0.2269 - val_loss: 330.6854
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.2254 - loss: 312.7773
Test Accuracy: 0.2240431010723114


# **Performance Metrics:**

In [None]:
from sklearn.metrics import classification_report


y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=-1)

y_pred_flat = []
y_true_flat = []

for i in range(len(X_test)):
    length = np.sum(X_test[i] != 0)
    y_pred_flat.extend(y_pred_classes[i][:length])
    y_true_flat.extend(y_test[i][:length])

y_pred_flat = np.array(y_pred_flat)
y_true_flat = np.array(y_true_flat)

from sklearn.metrics import classification_report
print(classification_report(y_true_flat, y_pred_flat, zero_division=0))



[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00    8182.0
         1.0       0.00      0.00      0.00     948.0
         5.0       0.00      0.00      0.00       0.0
        22.0       0.00      0.00      0.00       0.0

    accuracy                           0.00    9130.0
   macro avg       0.00      0.00      0.00    9130.0
weighted avg       0.00      0.00      0.00    9130.0



In [None]:

model.save('ner_pos_model.h5')

from tensorflow.keras.models import load_model
deployed_model = load_model('ner_pos_model.h5')

def make_prediction(text):
    tokenized_sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(tokenized_sequence, maxlen=padded_sequences.shape[1], padding='post')
    prediction = deployed_model.predict(padded_sequence)
    predicted_tags = np.argmax(prediction, axis=-1)[0]
    return predicted_tags

# Example usage
text_example = "example text for testing"
print(make_prediction(text_example))




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 698ms/step
[ 5  5  5  5  5  5  5  5  5 22 22 22  5  5  5  5 22 22 22 22  9  9 22 22
 22 10 18 18 18 18 18 18 18 18 18 18 18 18 18 18  9]
