In [1]:
# Section 1: 加载必要的库
import json
import os

import numpy as np
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Section 2: load the model and tokenizer
device = "cpu"  # switch to "cuda" when a GPU is available
model_path = "Alibaba-NLP/gte-modernbert-base"
local_model_path = "./模型"  # replace with the path of a local model copy

# Use the local copy when that path exists; otherwise load from `model_path`.
model_source = local_model_path if os.path.exists(local_model_path) else model_path
tokenizer = AutoTokenizer.from_pretrained(model_source)
model = AutoModel.from_pretrained(model_source).to(device)

In [3]:
# Section 3: 定义嵌入函数
def embedding(model, tokenizer, input_texts, max_length=5000):
    """
    Convert input texts into embedding vectors.

    The embedding of each text is the hidden state at sequence position 0 of
    the model's ``last_hidden_state`` output.

    :param model: loaded model; it is called with the tokenizer's tensors as
        keyword arguments and must return an object exposing ``last_hidden_state``
    :param tokenizer: loaded tokenizer used to encode ``input_texts``
    :param input_texts: List[str], the texts to embed
    :param max_length: token limit passed to the tokenizer; longer inputs are
        truncated (defaults to 5000, the value previously hard-coded)
    :return: numpy array of embedding vectors, indexed by input text on axis 0
    """
    # Send the inputs to wherever the model's weights live instead of relying
    # on a notebook-level `device` variable that may be stale or undefined.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    batch_dict = tokenizer(
        input_texts,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    batch_dict = {k: v.to(target_device) for k, v in batch_dict.items()}

    # Inference only: disabling autograd avoids building a graph on every call.
    with torch.no_grad():
        outputs = model(**batch_dict)

    embeddings = outputs.last_hidden_state[:, 0]
    return embeddings.detach().cpu().numpy()

In [4]:
# Section 4: load and prepare the data
# Read the CSV data file.
csv_file_path = "c:\\Users\\Vow\\.vscode\\信存检\\小组作业\\autoSteam\\DBScripts\\steam_games_translated_final.csv"
data = pd.read_csv(filepath_or_buffer=csv_file_path, encoding="utf-8")

# Keep the game id plus the `About the game` and `Reviews` text fields,
# discarding every row where any of the three is missing.
data_subset = data.loc[:, ["游戏应用ID", "About the game", "Reviews"]].dropna(how="any")

# Inspect the result: report its shape, then display the first rows.
print(f"数据集大小: {data_subset.shape}")
data_subset.head(5)

数据集大小: (1072, 3)


Unnamed: 0,游戏应用ID,About the game,Reviews
0,396680,"A solo indie title from Chris Parsons, Sol Tra...",“Sol Trader is great fun and is a must-play.” ...
1,1287530,Experience a highly complex simulation of mark...,“You never know what new oddities will await y...
2,985830,TO BE CONTINUED IN... / UNLEASH YOUR RAGE Beco...,"“It's insanely violent, it's reprehensible, an..."
3,660160,Field of Glory II?is a turn-based tactical gam...,“Turn-based strategy wargame experience that b...
4,457340,"A first-person, narrative-driven puzzle game D...",“Recommendation: Solid and challenging grab fo...


In [5]:
# Section 5: 生成嵌入向量
# 定义一个函数来处理每一行数据
def process_row(row):
    """
    Build the embedding record for one row of the dataset.

    Embeds the row's "About the game" and "Reviews" texts separately (each as
    a one-element batch) using the notebook-level ``model`` and ``tokenizer``.

    :param row: mapping-like row providing "游戏应用ID", "About the game" and "Reviews"
    :return: dict with ``game_id``, ``about_embedding`` and ``review_embedding``
        (embeddings as plain lists), or None if any step raised an exception
    """
    try:
        about_vector = embedding(model, tokenizer, [row["About the game"]])[0]
        review_vector = embedding(model, tokenizer, [row["Reviews"]])[0]
        record = dict(
            game_id=row["游戏应用ID"],
            about_embedding=about_vector.tolist(),
            review_embedding=review_vector.tolist(),
        )
    except Exception as err:
        # Best-effort: report the failing game id and let the caller skip this row.
        print(f"处理游戏 ID {row['游戏应用ID']} 时出错: {err}")
        record = None
    return record

# Run every row of the dataset through process_row and keep only the truthy
# results (process_row returns None for a row it could not process).
row_outputs = (process_row(record) for _, record in data_subset.iterrows())
embedding_results = [output for output in row_outputs if output]

print(f"成功生成嵌入向量的记录数: {len(embedding_results)}")

成功生成嵌入向量的记录数: 1072


In [6]:
# Section 6: save the embedding results
# Destination of the JSON file.
output_file_path = "c:\\Users\\Vow\\.vscode\\信存检\\小组作业\\autoSteam\\embeddings.json"

# Write the records as UTF-8 JSON, leaving non-ASCII characters unescaped
# and indenting nested levels by 4 spaces.
with open(output_file_path, mode="w", encoding="utf-8") as json_file:
    json.dump(obj=embedding_results, fp=json_file, ensure_ascii=False, indent=4)

print(f"嵌入结果已保存到: {output_file_path}")

嵌入结果已保存到: c:\Users\Vow\.vscode\信存检\小组作业\autoSteam\embeddings.json


In [7]:
# Convert the embedding records into a DataFrame.
# - Naming the columns explicitly keeps `game_id` present even when no record
#   was produced, so the merge below does not fail on a missing key.
# - A left merge emits one output row per matching right-hand row, so a game id
#   that appears more than once here would multiply rows of the original data;
#   keep a single embedding record (the first) per game id.
embedding_df = pd.DataFrame(
    embedding_results,
    columns=["game_id", "about_embedding", "review_embedding"],
).drop_duplicates(subset="game_id", keep="first")

# Attach the embeddings to the original data; rows without an embedding keep
# missing values in the embedding columns.
merged_df = pd.merge(data, embedding_df, left_on="游戏应用ID", right_on="game_id", how="left")

# Save the merged table as a new CSV file.
merged_csv_path = "c:\\Users\\Vow\\.vscode\\信存检\\小组作业\\autoSteam\\merged_steam_games_with_embeddings.csv"
merged_df.to_csv(merged_csv_path, index=False, encoding="utf-8-sig")

print(f"合并后的数据已保存到: {merged_csv_path}")

合并后的数据已保存到: c:\Users\Vow\.vscode\信存检\小组作业\autoSteam\merged_steam_games_with_embeddings.csv
