In [8]:
import numpy as np
import pandas as pd
import json

def read_json_to_df(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return pd.DataFrame(data)

# 读取各个文件到独立的DataFrame
df_train = read_json_to_df('E:/Desktop/Text/Task2/train.json')
df_test = read_json_to_df('E:/Desktop/Text/Task2/test.json')
df_valid = read_json_to_df('E:/Desktop/Text/Task2/valid.json')

# 将数字转换为字符串（对tokens和tags进行处理）
df_train['tokens'] = df_train['tokens'].apply(lambda x: [str(i) for i in x])
df_train['tags'] = df_train['tags'].apply(lambda x: [str(i) for i in x])
df_valid['tokens'] = df_valid['tokens'].apply(lambda x: [str(i) for i in x])
df_valid['tags'] = df_valid['tags'].apply(lambda x: [str(i) for i in x])
df_test['tokens'] = df_test['tokens'].apply(lambda x: [str(i) for i in x])
df_test['tags'] = df_test['tags'].apply(lambda x: [str(i) for i in x])

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# 提取tokens和tags
train_sentences = df_train['tokens'].tolist()
train_tags = df_train['tags'].tolist()
valid_sentences = df_valid['tokens'].tolist()
valid_tags = df_valid['tags'].tolist()
test_sentences = df_test['tokens'].tolist()
test_tags = df_test['tags'].tolist()

# 创建tokenizer
tokenizer = Tokenizer(num_words=5120, oov_token='UNK')
tokenizer.fit_on_texts(train_sentences)

# 文本转序列
train_seq = tokenizer.texts_to_sequences(train_sentences)
valid_seq = tokenizer.texts_to_sequences(valid_sentences)
test_seq = tokenizer.texts_to_sequences(test_sentences)

# 序列填充
max_len = 46
train_seq_padded = pad_sequences(train_seq, maxlen=max_len, padding='post')
valid_seq_padded = pad_sequences(valid_seq, maxlen=max_len, padding='post')
test_seq_padded = pad_sequences(test_seq, maxlen=max_len, padding='post')

# 标签编码
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(train_tags)

# 标签转序列
train_tag_seq = tag_tokenizer.texts_to_sequences(train_tags)
valid_tag_seq = tag_tokenizer.texts_to_sequences(valid_tags)
test_tag_seq = tag_tokenizer.texts_to_sequences(test_tags)

# 标签填充
train_tag_seq_padded = pad_sequences(train_tag_seq, maxlen=max_len, padding='post')
valid_tag_seq_padded = pad_sequences(valid_tag_seq, maxlen=max_len, padding='post')
test_tag_seq_padded = pad_sequences(test_tag_seq, maxlen=max_len, padding='post')

# 标签one-hot编码
num_tags = len(tag_tokenizer.word_index) + 1
train_tags_encoded = to_categorical(train_tag_seq_padded, num_classes=num_tags)
valid_tags_encoded = to_categorical(valid_tag_seq_padded, num_classes=num_tags)
test_tags_encoded = to_categorical(test_tag_seq_padded, num_classes=num_tags)

In [10]:
# 构建BiLSTM模型
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Dropout

model = Sequential()
model.add(Embedding(input_dim=5120, output_dim=128, input_length=max_len))
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(num_tags, activation='softmax')))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# model.summary()

# 训练和评估模型
history = model.fit(train_seq_padded, train_tags_encoded, batch_size=32, epochs=10, validation_data=(valid_seq_padded, valid_tags_encoded))

loss, accuracy = model.evaluate(test_seq_padded, test_tags_encoded)
print("Test loss:", loss)
print("Test accuracy:", accuracy)

Epoch 1/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 42ms/step - accuracy: 0.8747 - loss: 0.4986 - val_accuracy: 0.9527 - val_loss: 0.1307
Epoch 2/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step - accuracy: 0.9600 - loss: 0.1145 - val_accuracy: 0.9719 - val_loss: 0.0912
Epoch 3/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 37ms/step - accuracy: 0.9818 - loss: 0.0642 - val_accuracy: 0.9769 - val_loss: 0.0710
Epoch 4/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step - accuracy: 0.9885 - loss: 0.0377 - val_accuracy: 0.9772 - val_loss: 0.0756
Epoch 5/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step - accuracy: 0.9904 - loss: 0.0294 - val_accuracy: 0.9779 - val_loss: 0.0778
Epoch 6/10
[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step - accuracy: 0.9931 - loss: 0.0220 - val_accuracy: 0.9772 - val_loss: 0.0756
Epoch 7/10
[1m164/16

In [12]:
# # 寻找最优序列长度和词汇表
# from collections import Counter

# def analyze_data(df):
#     # 计算每个句子的长度
#     df['sentence_length'] = df['tokens'].apply(len)

#     # 统计所有词的出现次数
#     word_counts = Counter(word for tokens_list in df['tokens'] for word in tokens_list)

#     return df['sentence_length'].describe(), word_counts

# # 分析训练数据
# sentence_length_desc, word_counts = analyze_data(df_train)

# # 打印句子长度的描述性统计信息
# print("Sentence length statistics:\n", sentence_length_desc)

# # 确定合适的序列长度：选择覆盖95%数据的长度
# max_length = int(np.percentile(df_train['sentence_length'], 95))
# print("Recommended max sequence length (95 percentile):", max_length)

# # 确定词汇表大小：选择覆盖95%词频的词汇表大小
# cumulative_coverage = 0
# total_frequency = sum(word_counts.values())
# sorted_words = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
# vocab_size = 0
# for i, (word, freq) in enumerate(sorted_words):
#     cumulative_coverage += freq / total_frequency
#     if cumulative_coverage >= 0.95:
#         vocab_size = i + 1
#         break

# print(f"Recommended vocabulary size to cover 95% of all word occurrences: {vocab_size}")