In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from transformers import RobertaTokenizer, RobertaModel, ViTFeatureExtractor, ViTModel
from PIL import Image
import joblib
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from torch.nn import ReLU, Sigmoid, LeakyReLU

# 定义设备 # Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 读取CSV文件 # Read CSV file
df = pd.read_csv("最终的记录.csv", encoding='utf-8', encoding_errors='ignore')

# 重命名列（确保与之前一致）# Rename columns (ensure consistency with previous ones)
df.columns = [
    '商品名称', '一级种类', '二级种类', '图片地址', 'sku', '文本描述', '折扣率',
    '折扣价', '价格', '星级', '销量', '收入', '评论', 'cc-1', 'cc-2',
    'cc-3', 'DRC-1', 'DRC-2', 'DRC-3', 'RCV-1', 'RCV-2',
    'RCV-3', 'RSV-1', 'RSV-2', 'RSV-3'
]
# 重置索引
df = df.reset_index(drop=True)

# 转换数据类型 # Data type conversion
numeric_columns = ['折扣率', '折扣价', '价格', '销量', '收入']
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 重新计算收入（如果需要） # Recalculate income (if necessary)
df['收入'] = df['折扣价'] * df['销量']
df['收入'] = np.log10(df['收入'].replace(0, 1))  # 避免 log10(0) 错误
df['折扣率'] = 1 - df['折扣价'] / df['价格']

# 数值特征  # Numerical Features
numerical_features = ['折扣价', '折扣率', '价格']

# 标准化数值特征 # Standardized numerical features
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# 保存 Scaler 以备后续使用  # Save the Scaler for future use
joblib.dump(scaler, 'scaler.pkl')

import torchvision.models as models
import torchvision
from torchvision import transforms

# 图像特征提取  # Image feature extraction
def vit_encoding(image_dir, num_images=2194, device='cpu'):
    vit_list = []
    model_name = "/root/.cache/huggingface/hub/models--google--vit-base-patch16-224/snapshots/3f49326eb077187dfe1c2a2bb15fbd74e6ab91e3"
    feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
    model = ViTModel.from_pretrained(model_name).to(device).eval()

    def process_image(image_path):
        try:
            image = Image.open(image_path).convert('RGB')
            return feature_extractor(images=image, return_tensors="pt")
        except Exception as e:
            print(f"无法打开图片 {image_path}: {e}")
            # 返回全零张量以保持尺寸一致  # Return a tensor of all zeros to maintain consistent dimensions
            return feature_extractor(images=Image.new('RGB', (224, 224), (0, 0, 0)), return_tensors="pt")

    for i in tqdm(range(num_images), desc="提取图像特征"):
        image_path = os.path.join(image_dir, f"{i}.jpg")
        inputs = process_image(image_path)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            cls_token = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # [1, hidden_size]
            vit_list.append(cls_token.squeeze())
    
    vit_np = np.array(vit_list)  # [num_images, hidden_size]
    
    # PCA降维  # PCA dimensionality reduction
    vit_pca = PCA(n_components=50)
    vit_features_pca = vit_pca.fit_transform(vit_np)
    
    # 保存PCA模型  # Save PCA model
    joblib.dump(vit_pca, 'vit_pca.pkl')
    
    return vit_features_pca

# 使用您的图像路径定义 # Define using your image path
image_directory = r"/root/img"
vit_features = vit_encoding(image_directory, num_images=len(df))
df['vit_features'] = list(vit_features)

from transformers import AutoTokenizer, AutoModel, ViTFeatureExtractor, ViTModel
# 设置设备
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 初始化 BERT 模型和分词器 # Initialize BERT model and tokenizer
bert_model_name = r"/root/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594"
tokenizer_bert = AutoTokenizer.from_pretrained(bert_model_name)
model_bert = AutoModel.from_pretrained(bert_model_name, from_tf=False)  # 根据实际情况设置 from_tf # 根据实际情况设置 from_tf # Set `from_tf` according to the actual situation
model_bert.to(device)
model_bert.to(device)
model_bert.eval()  # 设置为评估模式  # Set to evaluation mode

def extract_text_features(texts, model, tokenizer, device, batch_size=32, max_length=128):
    all_features = []
    num_samples = len(texts)
    num_batches = int(np.ceil(num_samples / batch_size))
    
    with torch.no_grad():  # 禁用梯度计算 # Disable gradient computation
        for batch_idx in tqdm(range(num_batches), desc="提取 BERT 文本特征"):
            start_idx = batch_idx * batch_size
            end_idx = min(start_idx + batch_size, num_samples)
            batch_texts = texts[start_idx:end_idx]
            
            # Tokenize with fixed max_length and padding to max_length
            inputs = tokenizer(
                batch_texts,
                return_tensors='pt',
                padding='max_length',  # 固定填充到 max_length
                truncation=True,
                max_length=max_length,
            )
            inputs = {key: val.to(device) for key, val in inputs.items()}
            
            # 获取模型输出  # Obtain model output
            outputs = model(**inputs)
            
            # 对最后一个隐藏状态进行平均池化，得到每个样本的固定维度特征
            # Perform average pooling on the last hidden state to obtain fixed-dimensional features for each sample
            encoded_text = outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # [batch_size, hidden_size]
            
            # 添加到特征列表
            # Add to feature list
            all_features.append(encoded_text)
    
    # 将所有批次的特征堆叠起来，得到 [num_samples, hidden_size]
    # Stack the features of all batches to obtain [num_samples, hidden_size]
    all_features = np.vstack(all_features)
    return all_features

# 示例 DataFrame（请根据您的数据来源调整）
# df = pd.read_csv('your_data.csv')  # 根据您的数据来源加载 DataFrame

# Sample DataFrame (Please adjust according to your data source)
# df = pd.read_csv('your_data.csv')  # Load DataFrame based on your data source
# 确保所有文本字段为字符串类型
# Ensure that all text fields are of string type
text_columns = ['商品名称', '文本描述', '一级种类', '二级种类', 'cc-2']
for col in text_columns:
    df[col] = df[col].astype(str)

# 构建“其他文本”特征 # Constructing "other text" features
other_texts = (
    df['商品名称'] + " " +
    df['一级种类'] + " " +
    df['二级种类'] + " " +
    df['cc-1']
).tolist()

# 构建“文本描述”特征  # Constructing "Text Description" Features
text_descs = df['文本描述'].tolist()

# 提取 BERT 的“其他文本”特征  # Extracting the "Other Text" features of BERT
bert_other_text_features = extract_text_features(
    other_texts,
    model_bert,
    tokenizer_bert,
    device,
    batch_size=32,
    max_length=128
)
df['other_text_features'] = list(bert_other_text_features)

# 提取 BERT 的“文本描述”特征# Extracting the "text description" feature of BERT
bert_text_desc_features = extract_text_features(
    text_descs,
    model_bert,
    tokenizer_bert,
    device,
    batch_size=32,
    max_length=128
)
df['text_desc_features'] = list(bert_text_desc_features)

print("BERT 文本特征提取完成。")



Some weights of ViTModel were not initialized from the model checkpoint at /root/.cache/huggingface/hub/models--google--vit-base-patch16-224/snapshots/3f49326eb077187dfe1c2a2bb15fbd74e6ab91e3 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
提取图像特征: 100%|██████████| 2194/2194 [02:52<00:00, 12.74it/s]
提取 BERT 文本特征: 100%|██████████| 69/69 [00:03<00:00, 19.02it/s]
提取 BERT 文本特征: 100%|██████████| 69/69 [00:03<00:00, 20.23it/s]

BERT 文本特征提取完成。





In [2]:
# 合并所有特征 # Merge all features
all_features = np.concatenate([
    df[numerical_features].values,
    np.stack(df['text_desc_features'].values),
    np.stack(df['vit_features'].values),
    np.stack(df['other_text_features'].values)
], axis=1)

# 定义目标变量 # Define target variable
targets = df['收入'].to_numpy()

# 定义评估指标函数 # Define evaluation metric function
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)
    return mae, rmse, r2

# 设置随机种子列表  # Set random seed list
SEEDS = [42, 23, 15, 34, 18, 32, 47, 27, 8, 52]

# 定义 CustomDataset 类 # Define the CustomDataset class
class CustomDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return {
            'features': torch.tensor(self.features[idx], dtype=torch.float32),
            'targets': torch.tensor(self.targets[idx], dtype=torch.float32),
        }

# 定义 ANN 回归模型  #Define ANN regression model
class ANNRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim, activation_function, dropout):
        super(ANNRegressor, self).__init__()
        
        layers = []
        previous_dim = input_dim
        
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(previous_dim, hidden_dim))
            layers.append(activation_function)
            layers.append(nn.Dropout(dropout))
            previous_dim = hidden_dim
        
        layers.append(nn.Linear(previous_dim, output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

# 定义训练和评估函数 #Define training and evaluation functions
def train_and_evaluate_model(learning_rate, epochs, activation_function, random_seed, hidden_dims, dropout, all_features, targets):
    # 设置随机种子  #Set random seeds
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)
    
    # 数据分割  #Data segmentation
    X_train, X_test, y_train, y_test = train_test_split(all_features, targets, test_size=0.01, random_state=random_seed)
    
    # 创建Dataset和DataLoader  #Create Dataset and DataLoader
    train_dataset = CustomDataset(X_train, y_train)
    test_dataset = CustomDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    
    input_dim = all_features.shape[1]
    output_dim = 1  # 回归任务  #Return task
    
    # 初始化模型 #Initialize the model
    model = ANNRegressor(input_dim, hidden_dims, output_dim, activation_function, dropout).to(device)
    
    # 定义损失函数和优化器 #Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    # 训练模型 #Training model
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for batch in train_loader:
            features = batch['features'].to(device)
            targets_batch = batch['targets'].to(device).unsqueeze(1)  # [batch_size, 1]
            
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, targets_batch)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item() * features.size(0)
        epoch_loss /= len(train_loader.dataset)
        if (epoch + 1) % 100 == 0 or epoch == 0:
            print(f"Seed {random_seed} Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}")
    
    # 测试模型 # 测试模型
    model.eval()
    all_true = []
    all_predicted = []
    with torch.no_grad():
        for batch in test_loader:
            features = batch['features'].to(device)
            targets_batch = batch['targets'].to(device).unsqueeze(1)
            outputs = model(features)
            all_true.extend(targets_batch.cpu().numpy())
            all_predicted.extend(outputs.cpu().numpy())
    
    # 转换回原始收入值  #Convert back to the original income value
    all_true = 10 ** np.array(all_true).flatten()
    all_predicted = 10 ** np.array(all_predicted).flatten()
    
    # 计算评估指标 #Calculate evaluation indicators
    mae, rmse, r2 = evaluate_model(all_true, all_predicted)
    return mae, rmse, r2

# 网格搜索超参数 #Grid search hyperparameters
best_params = None
best_r2 = float('-inf')  # 使用 R2 最大化作为选择标准
best_metrics = {}
param_grid = {
    'learning_rate': [0.001],
    'epochs': [500],
    'activation_function': [ Sigmoid()],
}
results = []  # 保存所有组合和种子实验结果  #Save all combinations and seed experiment results

# 固定的 hidden_dims 和 dropout
fixed_hidden_dims = [256, 128, 64]  # 根据需求调整 #Adjust according to demand
fixed_dropout = 0.1  # 根据需求调整 #Adjust according to demand
# 存储实验结果  #Store experimental results
experiment_results = {
    'seed': [],
    'RMSE': [],
    'MAE': [],
    'R2': []
}

for params in tqdm(ParameterGrid(param_grid), desc="网格搜索超参数"):
    learning_rate = params['learning_rate']
    epochs = params['epochs']
    activation_function = params['activation_function']
    seed_metrics = []  # 保存当前超参数组合下所有随机种子的结果 #Save the results of all random seeds under the current hyperparameter combination
    print(f"正在训练模型: {params}")
    
    for seed in SEEDS:
        mae, rmse, r2 = train_and_evaluate_model(
            learning_rate=learning_rate,
            epochs=epochs,
            activation_function=activation_function,
            random_seed=seed,
            hidden_dims=fixed_hidden_dims,
            dropout=fixed_dropout,
            all_features=all_features,
            targets=targets
        )
        # 记录最终的训练和验证损失以及评估指标   #Record the final training and validation losses, as well as evaluation metrics
        experiment_results['seed'].append(seed)
        experiment_results['RMSE'].append(rmse)
        experiment_results['MAE'].append(mae)
        experiment_results['R2'].append(r2)
        
        results.append({'seed': seed, 'MAE': mae, 'RMSE': rmse, 'R2': r2})
        print(f"Seed {seed} -> MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")

# 将实验结果转换为DataFrame  #Convert the experimental results into a DataFrame
results_df = pd.DataFrame(experiment_results)
print("\n=== 实验结果汇总 ===")
print(results_df)

# 可选：保存实验结果到CSV文件   #Optional: Save experimental results to CSV file
results_df.to_csv("ANN_best_results.csv", index=False, encoding='utf-8-sig')
print("实验结果已保存到 'experiment_results.csv'")


网格搜索超参数:   0%|          | 0/1 [00:00<?, ?it/s]

正在训练模型: {'activation_function': Sigmoid(), 'epochs': 500, 'learning_rate': 0.001}
Seed 42 Epoch [1/500], Loss: 1.5528
Seed 42 Epoch [100/500], Loss: 0.0342
Seed 42 Epoch [200/500], Loss: 0.0165
Seed 42 Epoch [300/500], Loss: 0.0090
Seed 42 Epoch [400/500], Loss: 0.0061
Seed 42 Epoch [500/500], Loss: 0.0052
Seed 42 -> MAE: 595.6324, RMSE: 1239.9548, R2: -0.1923
Seed 23 Epoch [1/500], Loss: 1.7147
Seed 23 Epoch [100/500], Loss: 0.0327
Seed 23 Epoch [200/500], Loss: 0.0172
Seed 23 Epoch [300/500], Loss: 0.0095
Seed 23 Epoch [400/500], Loss: 0.0061
Seed 23 Epoch [500/500], Loss: 0.0050
Seed 23 -> MAE: 334.5562, RMSE: 537.5737, R2: 0.6173
Seed 15 Epoch [1/500], Loss: 1.8482
Seed 15 Epoch [100/500], Loss: 0.0381
Seed 15 Epoch [200/500], Loss: 0.0185
Seed 15 Epoch [300/500], Loss: 0.0102
Seed 15 Epoch [400/500], Loss: 0.0059
Seed 15 Epoch [500/500], Loss: 0.0053
Seed 15 -> MAE: 495.7730, RMSE: 772.4233, R2: 0.6342
Seed 34 Epoch [1/500], Loss: 2.2519
Seed 34 Epoch [100/500], Loss: 0.0309
Seed 

网格搜索超参数: 100%|██████████| 1/1 [19:49<00:00, 1189.24s/it]

Seed 52 Epoch [500/500], Loss: 0.0048
Seed 52 -> MAE: 843.5557, RMSE: 1530.9725, R2: -0.0122

=== 实验结果汇总 ===
   seed         RMSE         MAE        R2
0    42  1239.954834  595.632385 -0.192331
1    23   537.573730  334.556152  0.617346
2    15   772.423279  495.772980  0.634166
3    34  1649.968994  831.792236  0.285757
4    18  1645.705566  915.271240  0.299583
5    32  1334.098389  733.171204  0.438771
6    47  1928.791382  780.542419  0.339567
7    27  2182.605957  852.798218  0.299138
8     8  1316.023560  644.689697  0.556439
9    52  1530.972534  843.555725 -0.012160
实验结果已保存到 'experiment_results.csv'



