In [None]:
# 1_text_preprocessing.ipynb
# Purpose: Load Chinese hotel review data, perform preprocessing, tokenization, and word index mapping
import ast
import bz2
import os
import re

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences



In [None]:
# Load positive and negative samples (local, not to be published)
def _load_samples(path):
    """Read one sample file and return (texts, labels).

    Every non-blank line is parsed as a Python dict literal that must
    provide the keys "text" and "label".

    ast.literal_eval is used instead of eval: it accepts only literals,
    so a line in the data file can never execute code in the kernel.

    Raises:
        ValueError: if a line is not a valid Python literal; the message
            names the file and line number.
    """
    texts, labels = [], []
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank / trailing empty lines carry no sample.
                continue
            try:
                record = ast.literal_eval(line)
            except (ValueError, SyntaxError) as exc:
                raise ValueError(f"{path}:{line_no}: cannot parse sample line") from exc
            texts.append(record["text"])
            labels.append(record["label"])
    return texts, labels


train_texts = []
train_labels = []

# Positive samples first, then negative ones (same order as before).
for sample_path in ("positive_samples.txt", "negative_samples.txt"):
    sample_texts, sample_labels = _load_samples(sample_path)
    train_texts.extend(sample_texts)
    train_labels.extend(sample_labels)

# Display number of samples
print("Total samples:", len(train_texts))


In [None]:

# Load pre-trained word vectors (Zhihu bigram)
embedding_path = "embeddings/sgns.zhihu.bigram"
if not os.path.exists(embedding_path):
    # Decompress into a temporary ".part" file and rename it only after the
    # whole archive has been read. If the archive is missing or decompression
    # is interrupted, no file is left at embedding_path, so the exists() check
    # above will retry on the next run instead of loading a truncated file.
    partial_path = embedding_path + ".part"
    try:
        # The archive is opened first so a missing .bz2 fails before any
        # output file is created. bz2.open also reads multi-stream archives.
        with bz2.open(embedding_path + ".bz2", "rb") as archive, open(partial_path, "wb") as new_file:
            for data in iter(lambda: archive.read(100 * 1024), b""):
                new_file.write(data)
        os.replace(partial_path, embedding_path)
    finally:
        # After a successful rename this path no longer exists; after a
        # failure it removes the incomplete output.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Load the word vector model
cn_model = KeyedVectors.load_word2vec_format(embedding_path, binary=False, unicode_errors="ignore")
embedding_dim = cn_model.vector_size
print(f"Embedding dimension: {embedding_dim}")


In [None]:
# Turn every review into the list of vocabulary indexes of its jieba tokens
train_tokens = []
for raw_text in train_texts:
    # Strip whitespace plus ASCII and full-width punctuation before segmenting
    cleaned = re.sub(r"[\s+\.!/_,$%^*(+\"']+|[+\-\-！，。？、~@#￥%……&*（）]+", "", raw_text)
    token_ids = []
    for token in jieba.cut(cleaned):
        # Words missing from the embedding vocabulary map to 0 ...
        token_id = cn_model.key_to_index.get(token, 0)
        # ... and so do words whose index is 50000 or above.
        token_ids.append(token_id if token_id < 50000 else 0)
    train_tokens.append(token_ids)

# Token count per review, used for the length statistics below
num_tokens = np.array(list(map(len, train_tokens)))
print("Average token length:", num_tokens.mean())
print("Max token length:", num_tokens.max())


In [None]:
# Sequence length = mean + 2 standard deviations of the token counts,
# truncated to an int (chosen by the author to cover ~95% of samples).
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())
print("Max tokens to pad:", max_tokens)

# Bring every sequence to max_tokens; both padding and truncation are
# requested at the 'pre' side of the sequence.
train_pad = pad_sequences(train_tokens, maxlen=max_tokens, padding='pre', truncating='pre')

# Reset any index of 50000 or above to 0.
out_of_vocab = train_pad >= 50000
train_pad[out_of_vocab] = 0

# Labels as a NumPy array.
train_labels = np.array(train_labels)


In [None]:
# Token length histogram
# A review that tokenizes to zero tokens would give log(0) = -inf, and
# plt.hist cannot bin a non-finite range, so such samples are left out of
# the plot (and reported) instead of crashing the cell.
nonzero_lengths = num_tokens[num_tokens > 0]
skipped_count = len(num_tokens) - len(nonzero_lengths)
if skipped_count:
    print(f"Histogram skips {skipped_count} sample(s) with zero tokens")

plt.hist(np.log(nonzero_lengths), bins=10)
plt.xlabel('Log(Token Length)')
plt.ylabel('Sample Count')
plt.title('Token Length Distribution')
plt.show()

# Reverse lookup (index -> word) for debugging
index_to_word = {v: k for k, v in cn_model.key_to_index.items()}


In [None]:
def reverse_tokens(tokens):
    """Rebuild a text string from a sequence of word indexes.

    Indexes equal to 0 are skipped. Every other index is looked up in the
    module-level ``index_to_word`` mapping; an index that is not in the
    mapping contributes a single space. The pieces are concatenated
    without a separator.
    """
    pieces = []
    for token_id in tokens:
        if token_id == 0:
            continue
        pieces.append(index_to_word.get(token_id, ' '))
    return ''.join(pieces)

# Show one review next to the text rebuilt from its padded index sequence
sample_index = 0
print("Original text:", train_texts[sample_index])
print("Tokenized (reconstructed):", reverse_tokens(train_pad[sample_index]))
