In [16]:
import numpy as np
import pandas as pd
import json

def read_json_to_df(file_path):
    """Load a JSON-lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line; columns come from the keys of the
        decoded JSON objects.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank / whitespace-only lines (e.g. a trailing empty line) are
        # skipped; json.loads would raise on them.
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

# Load the three dataset splits (train, test, valid — in that order),
# one DataFrame per JSON-lines file
df_train, df_test, df_valid = map(read_json_to_df, (
    'E:/Desktop/Text/Task2/train.json',
    'E:/Desktop/Text/Task2/test.json',
    'E:/Desktop/Text/Task2/valid.json',
))

# Cast every element of the token and tag lists to str, split by split
for _frame in (df_train, df_valid, df_test):
    for _col in ('tokens', 'tags'):
        _frame[_col] = _frame[_col].apply(lambda items: list(map(str, items)))



In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Pull the token lists and tag lists out of each split
train_sentences, train_tags = df_train['tokens'].tolist(), df_train['tags'].tolist()
valid_sentences, valid_tags = df_valid['tokens'].tolist(), df_valid['tags'].tolist()
test_sentences, test_tags = df_test['tokens'].tolist(), df_test['tags'].tolist()

# Word tokenizer, fitted on the training sentences only
tokenizer = Tokenizer(num_words=5120, oov_token='UNK')
tokenizer.fit_on_texts(train_sentences)

# Map each split's sentences to integer id sequences
train_seq, valid_seq, test_seq = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (train_sentences, valid_sentences, test_sentences)
)

# Post-pad the id sequences to a common length
max_len = 46
train_seq_padded, valid_seq_padded, test_seq_padded = (
    pad_sequences(seq, maxlen=max_len, padding='post')
    for seq in (train_seq, valid_seq, test_seq)
)

# Tag tokenizer, likewise fitted on the training tags only
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(train_tags)

# Map each split's tags to integer id sequences
train_tag_seq, valid_tag_seq, test_tag_seq = (
    tag_tokenizer.texts_to_sequences(tags)
    for tags in (train_tags, valid_tags, test_tags)
)

# Post-pad the tag sequences with the same max_len as the token sequences
train_tag_seq_padded, valid_tag_seq_padded, test_tag_seq_padded = (
    pad_sequences(tag_seq, maxlen=max_len, padding='post')
    for tag_seq in (train_tag_seq, valid_tag_seq, test_tag_seq)
)

# One-hot encode the padded tags; the class count is the number of tags
# seen in training plus one extra slot for index 0
num_tags = len(tag_tokenizer.word_index) + 1
train_tags_encoded, valid_tags_encoded, test_tags_encoded = (
    to_categorical(padded_tags, num_classes=num_tags)
    for padded_tags in (train_tag_seq_padded, valid_tag_seq_padded, test_tag_seq_padded)
)



In [18]:
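# NOTE: superseded baseline (accuracy metric only), kept commented out for reference;
# the cell below builds the same model and additionally tracks an F1 score.
# # Build a BiLSTM model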
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Dropout

# model = Sequential()
# model.add(Embedding(input_dim=5120, output_dim=128, input_length=max_len))
# model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
# model.add(Dropout(0.5))
# model.add(TimeDistributed(Dense(num_tags, activation='softmax')))

# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# # model.summary()

# # Train and evaluate the model
# history = model.fit(train_seq_padded, train_tags_encoded, batch_size=32, epochs=10, validation_data=(valid_seq_padded, valid_tags_encoded))

# loss, accuracy = model.evaluate(test_seq_padded, test_tags_encoded)
# print("Test loss:", loss)
# print("Test accuracy:", accuracy)

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Dropout
import tensorflow as tf

# Define a custom F1 Score metric
class F1Score(tf.keras.metrics.Metric):
    """Streaming F1 score built from Keras' Precision and Recall metrics.

    Predictions are rounded to 0/1 and fed, together with the targets, into
    one ``Precision`` and one ``Recall`` instance; ``result()`` returns their
    harmonic mean.

    NOTE(review): both sub-metrics pool every element they are given, so with
    one-hot targets this is a single pooled (micro-style) score over all
    classes and all time steps — padded positions included. It is not a
    per-class or entity-level F1; confirm that this is the intended measure.

    Parameters
    ----------
    name : str
        Name under which the metric is reported (default ``'f1_score'``).
    **kwargs
        Forwarded unchanged to ``tf.keras.metrics.Metric``.
    """

    def __init__(self, name='f1_score', **kwargs):
        super().__init__(name=name, **kwargs)
        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate one batch of targets and predicted probabilities."""
        y_pred = tf.round(y_pred)  # Apply threshold to convert probabilities to binary values
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def result(self):
        """Return 2 * p * r / (p + r + epsilon) for the accumulated state."""
        p = self.precision.result()
        r = self.recall.result()
        # epsilon keeps the division finite when precision and recall are both 0
        return 2 * ((p * r) / (p + r + tf.keras.backend.epsilon()))

    def reset_state(self):
        """Clear the accumulated precision/recall state.

        ``reset_state`` (singular) is the reset hook of the Keras metric API;
        the original override used the legacy plural spelling.
        """
        self.precision.reset_state()
        self.recall.reset_state()

    def reset_states(self):
        """Legacy alias of ``reset_state`` kept for existing callers."""
        self.reset_state()

# BiLSTM tagger: embedding -> bidirectional LSTM -> dropout -> per-timestep softmax
model = Sequential([
    Embedding(input_dim=5120, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(units=64, return_sequences=True)),
    Dropout(0.5),
    TimeDistributed(Dense(num_tags, activation='softmax')),
])

# Track accuracy and the custom F1 metric alongside the cross-entropy loss
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', F1Score()],
)

# Display model summary
# model.summary()

# Fit on the training split, validating on the validation split each epoch
history = model.fit(
    train_seq_padded,
    train_tags_encoded,
    batch_size=32,
    epochs=10,
    validation_data=(valid_seq_padded, valid_tags_encoded),
)

# Score the held-out test split; values come back as loss first, then the
# metrics in the order given to compile()
loss, accuracy, f1_score = model.evaluate(test_seq_padded, test_tags_encoded)
print("Test loss:", loss)
print("Test accuracy:", accuracy)
print("Test F1 Score:", f1_score)


Epoch 1/10
164/164 ━━━━━━━━━━━━━━━━━━━━ 13s 47ms/step - accuracy: 0.8797 - f1_score: 0.8097 - loss: 0.5120 - val_accuracy: 0.9521 - val_f1_score: 0.9576 - val_loss: 0.1389
Epoch 2/10
164/164 ━━━━━━━━━━━━━━━━━━━━ 7s 41ms/step - accuracy: 0.9601 - f1_score: 0.9644 - loss: 0.1160 - val_accuracy: 0.9720 - val_f1_score: 0.9716 - val_loss: 0.0946
Epoch 3/10
164/164 ━━━━━━━━━━━━━━━━━━━━ 7s 42ms/step - accuracy: 0.9819 - f1_score: 0.9812 - loss: 0.0637 - val_accuracy: 0.9755 - val_f1_score: 0.9757 - val_loss: 0.0750
Epoch 4/10
164/164 ━━━━━━━━━━━━━━━━━━━━ 7s 41ms/step - accuracy: 0.9878 - f1_score: 0.9875 - loss: 0.0400 - val_accuracy: 0.9747 - val_f1_score: 0.9750 - val_loss: 0.0762
Epoch 5/10
164/164 ━━━━━━━━━━━━━━━━━━━━ 7s 42ms/step - accuracy: 0.9902 - f1_score: 0.9900 - loss: 0.0316 - val_accuracy: 0.9771 - val_f1_score: 

In [21]:
# Find the optimal sequence length and vocabulary size
from collections import Counter

def analyze_data(df):
    """Summarise sentence lengths and word frequencies of a token DataFrame.

    Adds a ``sentence_length`` column to ``df`` in place (the length of each
    entry in ``df['tokens']``) and tallies every word across all token lists.

    Returns a tuple ``(length_stats, word_counts)``: the ``describe()``
    summary of the new column and a ``Counter`` mapping word -> occurrences.
    """
    # Per-row token count, stored on the frame itself so callers can reuse it
    lengths = df['tokens'].apply(len)
    df['sentence_length'] = lengths

    # Tally words one sentence at a time
    word_counts = Counter()
    for tokens_list in df['tokens']:
        word_counts.update(tokens_list)

    length_stats = df['sentence_length'].describe()
    return length_stats, word_counts

# Run the analysis on the training split (this also adds the
# 'sentence_length' column to df_train, which is used below)
sentence_length_desc, word_counts = analyze_data(df_train)

# Show the sentence-length summary
print("Sentence length statistics:\n", sentence_length_desc)

# Sequence length: the 95th percentile of the training sentence lengths
max_length = int(np.percentile(df_train['sentence_length'], 95))
print("Recommended max sequence length (95 percentile):", max_length)

# Vocabulary size: walk the words from most to least frequent and stop at the
# first rank where the accumulated relative frequency reaches 0.95
total_frequency = sum(word_counts.values())
sorted_words = word_counts.most_common()
cumulative_coverage = 0
vocab_size = 0
for rank, (word, freq) in enumerate(sorted_words, start=1):
    cumulative_coverage += freq / total_frequency
    if cumulative_coverage < 0.95:
        continue
    vocab_size = rank
    break

print(f"Recommended vocabulary size to cover 95% of all word occurrences: {vocab_size}")


Sentence length statistics:
 count    5228.000000
mean       20.910865
std        14.487604
min         1.000000
25%        11.000000
50%        18.000000
75%        27.000000
max       133.000000
Name: sentence_length, dtype: float64
Recommended max sequence length (95 percentile): 46
Recommended vocabulary size to cover 95% of all word occurrences: 5120
