In [None]:
# 2_embedding_visualization.ipynb
# Purpose: Explore and visualize pretrained Chinese word embeddings with full English output

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
import os
import bz2

In [None]:
# Load pretrained word vectors, decompressing the .bz2 archive on first use.
EMBEDDING_PATH = "embeddings/sgns.zhihu.bigram"
EMBEDDING_ARCHIVE = EMBEDDING_PATH + ".bz2"

if not os.path.exists(EMBEDDING_PATH):
    # Decompress into a temporary file and rename it only once it is complete.
    # Writing straight to EMBEDDING_PATH would leave an empty or truncated file
    # behind if the archive is missing or the run is interrupted, and the
    # existence check above would then skip decompression on every later run.
    partial_path = EMBEDDING_PATH + ".part"
    try:
        # The archive is opened first so a missing archive fails before any
        # output file is created. bz2.open also reads multi-stream archives,
        # which a single BZ2Decompressor cannot.
        with bz2.open(EMBEDDING_ARCHIVE, "rb") as archive, open(partial_path, "wb") as out_file:
            for chunk in iter(lambda: archive.read(100 * 1024), b""):
                out_file.write(chunk)
        os.replace(partial_path, EMBEDDING_PATH)
    finally:
        # Only present here if decompression failed part-way through.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Load model (text word2vec format; undecodable bytes in tokens are ignored)
cn_model = KeyedVectors.load_word2vec_format(EMBEDDING_PATH, binary=False, unicode_errors="ignore")


In [None]:
# Define a utility function to visualize the leading dimensions of an embedding

def plot_word_embedding(word, label, n_dims=50):
    """Draw a one-row heatmap of the first ``n_dims`` values of a word's embedding.

    Args:
        word: Key to look up in the module-level ``cn_model``.
        label: Text shown in the figure title in place of ``word``.
        n_dims: Positive number of leading dimensions to draw (default 50).

    Returns:
        None. If ``word`` is not in the model's vocabulary a message is
        printed and nothing is plotted.
    """
    if word not in cn_model.key_to_index:
        print(f"Word '{word}' not in vocabulary.")
        return
    # Assumes cn_model[word] is a 1-D vector; the slice is clipped to its length.
    leading_values = cn_model[word][:n_dims]
    # A single column transposed gives one row with one cell per dimension.
    row_frame = pd.DataFrame(leading_values).T
    fig, ax = plt.subplots(figsize=(14, 1.5))
    sns.heatmap(row_frame, cmap="YlGnBu", cbar=False, square=True, ax=ax)
    # Report the number of dimensions actually drawn, not the number requested.
    ax.set_title(f"Embedding (first {len(leading_values)} dims): {label}")
    ax.set_yticks([])
    ax.set_xlabel("Dimensions")
    plt.show()


In [None]:
# Plot the embedding of each selected word; only the English label is printed,
# while the Chinese word is what gets looked up in the model.
words_to_plot = dict(
    hotel="\u9152\u5e97",
    black_tea="\u7ea2\u8336",
    green_tea="\u7eff\u8336",
    price="\u4ef7\u683c",
)

for english_label in words_to_plot:
    chinese_word = words_to_plot[english_label]
    print(f"Visualizing embedding for: {english_label}")
    plot_word_embedding(chinese_word, english_label)

In [None]:
# Odd-one-out test (doesnt_match) reported with English labels only

def find_odd_one_out(english_labels, chinese_words):
    """Find the word ``cn_model.doesnt_match`` singles out and its English label.

    Args:
        english_labels: Labels aligned by position with ``chinese_words``.
        chinese_words: Words passed to ``cn_model.doesnt_match``.

    Returns:
        Tuple ``(odd_word, odd_label)``: the word returned by the model and
        the English label at the same position.

    Raises:
        ValueError: If the two lists differ in length and so cannot be aligned.
    """
    if len(english_labels) != len(chinese_words):
        raise ValueError("english_labels and chinese_words must have the same length")
    odd_word = cn_model.doesnt_match(chinese_words)
    return odd_word, english_labels[chinese_words.index(odd_word)]


occupation_words = ["teacher", "accountant", "programmer", "lawyer", "doctor", "elderly"]
occupation_chinese = ["\u8001\u5e08", "\u4f1a\u8ba1\u5e08", "\u7a0b\u5e8f\u5458", "\u5f8b\u5e08", "\u533b\u751f", "\u8001\u4eba"]
# Note: despite the name, odd_index_1 holds the odd word itself, not a position.
odd_index_1, odd_label_1 = find_odd_one_out(occupation_words, occupation_chinese)
print(f"In the occupation list {occupation_words}, the unrelated word is: {odd_label_1}")

tea_words = ["tea_leaf", "black_tea", "dark_tea", "white_tea", "green_tea", "yellow_tea", "travel"]
tea_chinese = ["\u8336\u53f6", "\u7ea2\u8336", "\u9ed1\u8336", "\u767d\u8336", "\u7eff\u8336", "\u9ec4\u8336", "\u65c5\u6e38"]
# Note: despite the name, odd_index_2 holds the odd word itself, not a position.
odd_index_2, odd_label_2 = find_odd_one_out(tea_words, tea_chinese)
print(f"In the tea-related list {tea_words}, the unrelated word is: {odd_label_2}")



In [None]:
# Cosine similarity of black_tea with green_tea and with price.
# The model is queried with the Chinese words; the output uses English labels.
black_tea_word = "\u7ea2\u8336"
green_tea_word = "\u7eff\u8336"
price_word = "\u4ef7\u683c"

similarity_1 = cn_model.similarity(black_tea_word, green_tea_word)
similarity_2 = cn_model.similarity(black_tea_word, price_word)

print("Cosine similarity between black_tea and green_tea:", similarity_1)
print("Cosine similarity between black_tea and price:", similarity_2)