依赖部分

In [1]:
# import sys

# # 显示 Python 可执行文件路径
# python_path = sys.executable
# print(f"当前 Python 路径: {python_path}")


In [2]:
# !C:\Users\陈衍鑫\AppData\Local\Programs\Python\Python310\python.exe -m pip install transformers 

In [19]:
import re
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import string
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
import requests
from io import BytesIO

读入数据

In [4]:

# 按数据集提示解析 JSON 数据
def parse(path):
    with open(path, "r") as f:
        for line in f:
            yield json.loads(line)


# 将数据转换为 PD
def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient="index")


# 读取和展示 meta_Sports_and_Outdoors.json 数据
meta_file_path = "Data/meta_Sports_and_Outdoors.json"
meta_df = getDF(meta_file_path)

# 读取和展示 Sports_and_Outdoors_5.json 数据
reviews_file_path = "Data/Sports_and_Outdoors_5.json"
review_df = getDF(reviews_file_path)

数据展示

In [5]:
print("前5个meta数据:")
print(meta_df.head(5))

print("\n前5个review数据:")
print(review_df.head(5))

# 提取属性名列表
meta_columns = meta_df.columns.tolist()
review_columns = review_df.columns.tolist()

print("meta_Sports_and_Outdoors.json 的属性名列表:")
print(meta_columns)

print("\nSports_and_Outdoors_5.json 的属性名列表:")
print(review_columns)

前5个meta数据:
                                            category tech1  \
0  [Sports & Outdoors, Sports & Fitness, Other Sp...         
1  [Sports & Outdoors, Sports & Fitness, Other Sp...         
2  [Sports & Outdoors, Sports & Fitness, Other Sp...         
3  [Sports & Outdoors, Sports & Fitness, Other Sp...         
4  [Sports & Outdoors, Sports & Fitness, Other Sp...         

                                         description fit  \
0  [3 layers of super-soft polyester tulle can be...       
1  [3 layers of super-soft polyester tulle can be...       
2  [3 layers of super-soft polyester tulle can be...       
3                                             [TUtu]       
4  [Dance tutu for girls ages 2-8 years. Perfect ...       

                                     title  \
0   Adult Tutu Assorted Colors (Turquoise)   
1  Bububibi Adult Ballet Tutu Cheetah Pink   
2            Girls Ballet Tutu Neon Orange   
3         Girls Ballet Tutu Zebra Hot Pink   
4              Girls Ball

多模态数据选取与数据清理

In [6]:
import numpy as np
# is_empty 函数，确保正确处理各种类型的空值
def is_empty(value):
    if value is None:
        return True
    if isinstance(value, float) and np.isnan(value):
        return True
    if isinstance(value, str) and value.strip() == "":
        return True
    if isinstance(value, list) and len(value) == 0:
        return True
    return False

# 选取 meta_Sports_and_Outdoors.json 数据集的指定特征
meta_selected_features = [
    'asin',            # 产品ID
    'title',           # 产品名称
    'description',     # 产品描述
    'feature',         # 产品特征
    'category',      # 产品类别
    'brand',           # 品牌
    'price',           # 价格
    'imageURL',        # 产品图片URL
    'also_buy',        # 购买该商品的用户还购买了
    'also_view',       # 浏览该商品的用户还浏览了
    'rank'             # 销售排名
]

# 从 meta_df 中选取指定的特征属性
meta_df = meta_df[meta_selected_features]

# 显示选取后的前5行数据
print("选取后的 meta 数据集：")
print(meta_df.head())

# 选取 Sports_and_Outdoors_5.json 数据集的指定特征
review_selected_features = [
    'reviewerID',   # 评论者ID
    'asin',         # 产品ID
    'overall',      # 评分
    'summary',   # 评论内容
    'reviewTime',   # 评论时间
    'image'         # 用户上传的图片
]

# 从 review_df 中选取指定的特征属性
review_df = review_df[review_selected_features]

# 显示选取后的前5行数据
print("\n选取后的 review 数据集：")
print(review_df.head())


# 统计 meta_Sports_and_Outdoors.json 的空缺值和总数
meta_null_counts = meta_df.applymap(is_empty).sum()
meta_total_counts = len(meta_df)

print("meta_Sports_and_Outdoors.json 的空缺值统计:\n", meta_null_counts)
print("\nmeta_Sports_and_Outdoors.json 的总记录数:\n", meta_total_counts)

# 统计 Sports_and_Outdoors_5.json 的空缺值和总数
review_null_counts = review_df.applymap(is_empty).sum()
review_total_counts = len(review_df)

print("\nSports_and_Outdoors_5.json 的空缺值统计:\n", review_null_counts)
print("\nSports_and_Outdoors_5.json 的总记录数:\n", review_total_counts)

选取后的 meta 数据集：
         asin                                    title  \
0  0000032042   Adult Tutu Assorted Colors (Turquoise)   
1  0000032069  Bububibi Adult Ballet Tutu Cheetah Pink   
2  0000031860            Girls Ballet Tutu Neon Orange   
3  0000031852         Girls Ballet Tutu Zebra Hot Pink   
4  0000031895              Girls Ballet Tutu Neon Blue   

                                         description  \
0  [3 layers of super-soft polyester tulle can be...   
1  [3 layers of super-soft polyester tulle can be...   
2  [3 layers of super-soft polyester tulle can be...   
3                                             [TUtu]   
4  [Dance tutu for girls ages 2-8 years. Perfect ...   

                                             feature  \
0  [3 Layers - 100% Polyester Tulle, Hand Wash La...   
1  [3 Layers - 100% Polyester Tulle, Hand Wash La...   
2  [3 Layers - 100% Polyester Tulle, Hand Wash La...   
3                                             [Tutu]   
4  [3 Layers - 100%

  meta_null_counts = meta_df.applymap(is_empty).sum()


meta_Sports_and_Outdoors.json 的空缺值统计:
 asin                0
title              15
description    150771
feature        158207
category        68321
brand           96973
price          556560
imageURL       485817
also_buy       696170
also_view      605315
rank            19604
dtype: int64

meta_Sports_and_Outdoors.json 的总记录数:
 962300


  review_null_counts = review_df.applymap(is_empty).sum()



Sports_and_Outdoors_5.json 的空缺值统计:
 reviewerID          0
asin                0
overall             0
summary           617
reviewTime          0
image         2775405
dtype: int64

Sports_and_Outdoors_5.json 的总记录数:
 2839940


特征提取

In [7]:
# 定义 is_empty 函数，确保正确处理各种类型的空值
def is_empty(value):
    if value is None:
        return True
    if isinstance(value, float) and np.isnan(value):
        return True
    if isinstance(value, str) and value.strip() == "":
        return True
    if isinstance(value, list) and len(value) == 0:
        return True
    return False

# 查看缺少 'rank' 的数据项数量
missing_rank_count = meta_df['rank'].apply(is_empty).sum()
total_meta_entries = len(meta_df)
print(f"缺少 'rank' 的数据项数量：{missing_rank_count}")
print(f"meta_df 总记录数：{total_meta_entries}")
print(f"需要删除的比例：{(missing_rank_count / total_meta_entries) * 100:.2f}%")

# 删除 'rank' 列为空的数据项
meta_df = meta_df[~meta_df['rank'].apply(is_empty)].reset_index(drop=True)

# 查看缺少 'summary' 的数据项数量
missing_summary_count = review_df['summary'].apply(is_empty).sum()
total_review_entries = len(review_df)
print(f"缺少 'summary' 的数据项数量：{missing_summary_count}")
print(f"review_df 总记录数：{total_review_entries}")
print(f"需要删除的比例：{(missing_summary_count / total_review_entries) * 100:.2f}%")

# 删除 'summary' 列为空的数据项
review_df = review_df[~review_df['summary'].apply(is_empty)].reset_index(drop=True)

# 删除后的数据集情况
print(f"删除后 meta_df 的记录数：{len(meta_df)}")
print(f"删除后 review_df 的记录数：{len(review_df)}")

# 重新统计缺失值
meta_null_counts = meta_df.applymap(is_empty).sum()
print("meta_df 的空缺值统计:\n", meta_null_counts)

review_null_counts = review_df.applymap(is_empty).sum()
print("\nreview_df 的空缺值统计:\n", review_null_counts)

# 合并数据集
merged_df = pd.merge(review_df, meta_df, on='asin', how='inner')
print("合并后的数据集大小：", len(merged_df))


缺少 'rank' 的数据项数量：19604
meta_df 总记录数：962300
需要删除的比例：2.04%
缺少 'summary' 的数据项数量：617
review_df 总记录数：2839940
需要删除的比例：0.02%
删除后 meta_df 的记录数：942696
删除后 review_df 的记录数：2839323


  meta_null_counts = meta_df.applymap(is_empty).sum()


meta_df 的空缺值统计:
 asin                0
title              15
description    146940
feature        151839
category        66238
brand           94200
price          541676
imageURL       475694
also_buy       679676
also_view      590800
rank                0
dtype: int64


  review_null_counts = review_df.applymap(is_empty).sum()



review_df 的空缺值统计:
 reviewerID          0
asin                0
overall             0
summary             0
reviewTime          0
image         2774812
dtype: int64
合并后的数据集大小： 2828532


In [8]:
# 显示合并后数据集的属性列表
print("合并后数据集的属性列表:")
print(merged_df.columns.tolist())

# 显示合并后数据集的第一行数据
print("\n合并后数据集的第一行数据:")
print(merged_df.iloc[0])

# 统计合并后数据集的缺失值
merged_null_counts = merged_df.isnull().sum()
print("\n合并后数据集的缺失值统计:")
print(merged_null_counts)

合并后数据集的属性列表:
['reviewerID', 'asin', 'overall', 'summary', 'reviewTime', 'image', 'title', 'description', 'feature', 'category', 'brand', 'price', 'imageURL', 'also_buy', 'also_view', 'rank']

合并后数据集的第一行数据:
reviewerID                                        A180LQZBUWVOLF
asin                                                  0000032034
overall                                                      5.0
summary                                               Five Stars
reviewTime                                            06 3, 2015
image                                                        NaN
title                                   Adult Ballet Tutu Yellow
description    [3 layers of super-soft polyester tulle can be...
feature        [3 Layers - 100% Polyester Tulle, Hand Wash La...
category       [Sports & Outdoors, Sports & Fitness, Other Sp...
brand                                                   BubuBibi
price                                                     $12.50
imageURL      

数据预处理

In [10]:
# 处理 'price' 字段

# 去除美元符号和逗号，创建新的数值型字段 'price_num'
merged_df['price_num'] = merged_df['price'].replace('[\$,]', '', regex=True)

# 转换为浮点数
merged_df['price_num'] = pd.to_numeric(merged_df['price_num'], errors='coerce')

# 检查是否有转换失败的值
price_missing = merged_df['price_num'].isnull().sum()
print(f"'price_num' 字段转换后缺失值数量：{price_missing}")

# 如果存在缺失值，用中位数填充
median_price_num = merged_df['price_num'].median()
merged_df['price_num'] = merged_df['price_num'].fillna(median_price_num)
print(f"已用中位数 {median_price_num} 填充 'price_num' 字段的缺失值。")

# 处理 'rank' 字段

# 检查 'rank' 字段的数据类型
print("'rank' 字段的数据类型：", merged_df['rank'].dtype)

# 定义一个函数来提取 'rank' 中的数字部分
def extract_rank_number(rank_str):
    if isinstance(rank_str, str):
        # 使用正则表达式提取数字部分
        match = re.search(r'([\d,]+)', rank_str)
        if match:
            # 去除逗号，转换为整数
            rank_num = int(match.group(1).replace(',', ''))
            return rank_num
    return None  # 如果无法提取数字，返回 None

# 应用提取函数，创建新的数值型 'rank_num' 字段
merged_df['rank_num'] = merged_df['rank'].apply(extract_rank_number)

# 检查提取后的 'rank_num' 字段的缺失值数量
rank_num_missing = merged_df['rank_num'].isnull().sum()
print(f"'rank_num' 字段缺失值数量：{rank_num_missing}")

if rank_num_missing > 0:
    # 用排名的最大值（表示最低排名）来填充缺失值
    max_rank_num = merged_df['rank_num'].max()
    merged_df['rank_num'] = merged_df['rank_num'].fillna(max_rank_num)
    print(f"已用最大值 {max_rank_num} 填充 'rank_num' 字段的缺失值。")
else:
    print("'rank_num' 字段无缺失值。")

# 转换 'reviewTime' 字段为日期类型
merged_df['reviewTime'] = pd.to_datetime(merged_df['reviewTime'], errors='coerce')

# 检查是否有转换失败的值
reviewTime_missing = merged_df['reviewTime'].isnull().sum()
print(f"'reviewTime' 转换为日期类型后缺失值数量：{reviewTime_missing}")

# 如果存在缺失值，删除对应行
merged_df = merged_df.dropna(subset=['reviewTime']).reset_index(drop=True)

# 标准化 'price_num' 和 'rank_num' 字段，创建新的标准化字段 'price_norm' 和 'rank_norm'
scaler = MinMaxScaler()
merged_df[['price_norm', 'rank_norm']] = scaler.fit_transform(merged_df[['price_num', 'rank_num']])

# 检查标准化后的值
print("标准化后的 'price_norm' 和 'rank_norm'：")
print(merged_df[['price_norm', 'rank_norm']].describe())


# 保留原始的 'price' 和 'rank' 字段，删除中间的数值型字段
merged_df = merged_df.drop(columns=['price_num', 'rank_num'])


'price_num' 字段转换后缺失值数量：0
已用中位数 15.36 填充 'price_num' 字段的缺失值。
'rank' 字段的数据类型： object
'rank_num' 字段缺失值数量：198429
已用最大值 26324490.0 填充 'rank_num' 字段的缺失值。
'reviewTime' 转换为日期类型后缺失值数量：0
标准化后的 'price_norm' 和 'rank_norm'：
         price_norm     rank_norm
count  2.828532e+06  2.828532e+06
mean   5.148706e-04  7.667761e-02
std    2.364790e-03  2.540335e-01
min    0.000000e+00  0.000000e+00
25%    2.792338e-04  5.120328e-04
50%    3.312394e-04  2.711886e-03
75%    4.076295e-04  1.197322e-02
max    1.000000e+00  1.000000e+00


类别数据处理

In [11]:

# 填充缺失值
merged_df['brand'] = merged_df['brand'].fillna('unknown')

# 进行标签编码
le_brand = LabelEncoder()
merged_df['brand_encoded'] = le_brand.fit_transform(merged_df['brand'])

# 如果 'category' 是列表，取第一个类别
merged_df['category_first'] = merged_df['category'].apply(lambda x: x[0][-1] if isinstance(x, list) and len(x) > 0 else 'unknown')

# 进行标签编码
le_category = LabelEncoder()
merged_df['category_encoded'] = le_category.fit_transform(merged_df['category_first'])


文本数据处理

In [16]:
import re
import string

def clean_text(text):
    if isinstance(text, list):
        # 如果输入是列表，将列表元素连接成一个字符串，元素之间用空格分隔
        text = ' '.join(text)
    if isinstance(text, str):
        text = text.lower()  # 转为小写
        text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # 去除标点符号
        text = re.sub(r'\s+', ' ', text)  # 合并多个空格
        text = text.strip()  # 去除首尾空格
        return text
    else:
        return ''

# 清洗文本字段
merged_df['title_clean'] = merged_df['title'].apply(clean_text)
merged_df['description_clean'] = merged_df['description'].apply(clean_text)
merged_df['summary_clean'] = merged_df['summary'].apply(clean_text)


In [18]:
# 查看每个字段的最大长度和平均长度
max_title_length = merged_df['title_length'].max()
max_description_length = merged_df['description_length'].max()
max_summary_length = merged_df['summary_length'].max()

mean_title_length = merged_df['title_length'].mean()
mean_description_length = merged_df['description_length'].mean()
mean_summary_length = merged_df['summary_length'].mean()

print(f"标题最大长度: {max_title_length}")
print(f"描述最大长度: {max_description_length}")
print(f"摘要最大长度: {max_summary_length}")

print(f"标题平均长度: {mean_title_length:.2f}")
print(f"描述平均长度: {mean_description_length:.2f}")
print(f"摘要平均长度: {mean_summary_length:.2f}")

# 合并文本字段
merged_df['combined_text'] = merged_df['title_clean'] + ' ' + merged_df['description_clean'] + ' ' + merged_df['summary_clean']

# 计算合并后文本的长度
merged_df['combined_length'] = merged_df['combined_text'].apply(len)

# 查看合并后文本的最大长度和平均长度
max_combined_length = merged_df['combined_length'].max()
mean_combined_length = merged_df['combined_length'].mean()

print(f"合并后文本的最大长度: {max_combined_length}")
print(f"合并后文本的平均长度: {mean_combined_length:.2f}")

标题最大长度: 536
描述最大长度: 50212
摘要最大长度: 277
标题平均长度: 65.40
描述平均长度: 580.16
摘要平均长度: 23.06
合并后文本的最大长度: 50318
合并后文本的平均长度: 670.62


特征工程

文本工程提取

In [23]:

# 检查 GPU 是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")

# 加载预训练的 BERT 模型和分词器，并将模型移动到 GPU
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.to(device)  # 移动模型到 GPU
model.eval()  # 设置为评估模式

# 创建自定义数据集
class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

# 定义批处理函数
def collate_fn(batch):
    # tokenizer 会自动处理批量文本的 padding 和编码
    inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # 将输入张量移动到 GPU
    return inputs

# 创建 DataLoader
texts = merged_df['combined_text'].tolist()
dataset = TextDataset(texts)
data_loader = DataLoader(dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)  

# 进行批量嵌入计算
embeddings = []
with torch.no_grad():
    for batch_inputs in data_loader:
        outputs = model(**batch_inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1)  # 取最后一层的平均池化结果
        embeddings.append(batch_embeddings.cpu())

# 将所有批量的嵌入拼接起来
embeddings = torch.cat(embeddings, dim=0).numpy()

# 将嵌入结果添加到 DataFrame 中
merged_df['text_embedding'] = list(embeddings)

print(merged_df[['combined_text', 'text_embedding']].head())
merged_df.to_pickle('processed_merged_df.pkl')

使用设备: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

文本嵌入处理中:   0%|                                                           | 448/353567 [00:34<7:29:04, 13.11it/s]


KeyboardInterrupt: 

类别和品牌嵌入

In [None]:
import torch.nn as nn

# 获取类别和品牌的总数
num_categories = merged_df['category_encoded'].nunique()
num_brands = merged_df['brand_encoded'].nunique()

# 定义嵌入层，设定嵌入维度为 32
category_embedding_layer = nn.Embedding(num_categories, 32)
brand_embedding_layer = nn.Embedding(num_brands, 32)

# 将编码映射为嵌入向量
merged_df['category_embedding'] = merged_df['category_encoded'].apply(lambda x: category_embedding_layer(torch.tensor(x)).detach().numpy())
merged_df['brand_embedding'] = merged_df['brand_encoded'].apply(lambda x: brand_embedding_layer(torch.tensor(x)).detach().numpy())


数值特征处理

In [None]:
# 将数值特征转换为 numpy 数组
merged_df['price_feature'] = merged_df['price'].values
merged_df['rank_feature'] = merged_df['rank'].values


图像特征提取

In [None]:


# 加载预训练的 ResNet50 模型
resnet50 = models.resnet50(pretrained=True)
resnet50.eval()

# 定义图像预处理步骤
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # 可添加归一化步骤
])

# 定义图像特征提取函数
def get_image_embedding(image_urls):
    if not image_urls:  # 如果没有图片，返回零向量
        return np.zeros(2048)
    image_embeddings = []
    for url in image_urls:
        try:
            response = requests.get(url, timeout=5)
            img = Image.open(BytesIO(response.content)).convert('RGB')
            img = image_transform(img).unsqueeze(0)
            with torch.no_grad():
                features = resnet50(img)
            image_embeddings.append(features.squeeze().numpy())
        except:
            continue
    if image_embeddings:
        return np.mean(image_embeddings, axis=0)
    else:
        return np.zeros(2048)

# 提取图像特征
merged_df['image_embedding'] = merged_df['imageURL'].apply(get_image_embedding)


关联商品特征处理

In [None]:
# 计算关联商品的数量
merged_df['also_buy_count'] = merged_df['also_buy'].apply(lambda x: len(x) if isinstance(x, list) else 0)
merged_df['also_view_count'] = merged_df['also_view'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# 可选择进一步处理，如归一化
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
merged_df[['also_buy_count_norm', 'also_view_count_norm']] = scaler.fit_transform(merged_df[['also_buy_count', 'also_view_count']])


特征融合

In [None]:
def combine_features(row):
    features = np.concatenate([
        row['title_embedding'],
        row['description_embedding'],
        row['summary_embedding'],
        row['category_embedding'],
        row['brand_embedding'],
        np.array([row['price_feature'], row['rank_feature']]),
        row['image_embedding'],
        np.array([row['also_buy_count_norm'], row['also_view_count_norm']])
    ])
    return features

# 生成最终的特征向量
merged_df['combined_features'] = merged_df.apply(combine_features, axis=1)


PCA降维

In [None]:
from sklearn.decomposition import PCA

# 对文本特征进行降维
pca_text = PCA(n_components=256)
text_features = np.stack(merged_df['title_embedding'] + merged_df['description_embedding'] + merged_df['summary_embedding'])
text_features_reduced = pca_text.fit_transform(text_features)
merged_df['text_features_reduced'] = list(text_features_reduced)

# 对图像特征进行降维
pca_image = PCA(n_components=256)
image_features = np.stack(merged_df['image_embedding'])
image_features_reduced = pca_image.fit_transform(image_features)
merged_df['image_features_reduced'] = list(image_features_reduced)


重新组合特征

In [None]:
def combine_reduced_features(row):
    features = np.concatenate([
        row['text_features_reduced'],
        row['category_embedding'],
        row['brand_embedding'],
        np.array([row['price_feature'], row['rank_feature']]),
        row['image_features_reduced'],
        np.array([row['also_buy_count_norm'], row['also_view_count_norm']])
    ])
    return features

# 生成降维后的特征向量
merged_df['combined_features_reduced'] = merged_df.apply(combine_reduced_features, axis=1)


保存处理后的特征

In [None]:
# 提取模型输入
X = np.stack(merged_df['combined_features_reduced'])
y = merged_df['overall'].values  # 或者根据任务定义的标签

# 保存到文件
np.save('features.npy', X)
np.save('labels.npy', y)
