In [1]:
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler

# Pick the compute device: CUDA when torch can see a GPU, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the raw records. Bytes that are not valid UTF-8 are silently dropped
# (encoding_errors='ignore') instead of raising a decode error.
df = pd.read_csv("最终的记录.csv", encoding='utf-8', encoding_errors='ignore')

# Replace the header with the column names the rest of the notebook relies on.
# The assignment is positional, so the CSV must contain exactly these 25
# columns in this order.
leading_columns = [
    '商品名称', '一级种类', '二级种类', '图片地址', 'sku', '文本描述', '折扣率',
    '折扣价', '价格', '星级', '销量', '收入', '评论',
]
# Expands to cc-1..cc-3, DRC-1..DRC-3, RCV-1..RCV-3, RSV-1..RSV-3.
grouped_columns = [
    f'{prefix}-{index}'
    for prefix in ('cc', 'DRC', 'RCV', 'RSV')
    for index in (1, 2, 3)
]
df.columns = leading_columns + grouped_columns

# Force the numeric columns to numeric dtype; values that cannot be parsed
# become NaN (errors='coerce').
numeric_columns = ['星级', '折扣率', '折扣价', '价格', '销量', '收入']
for column_name in numeric_columns:
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')

# Recompute revenue as discounted price x units sold. This must happen before
# the standardization below, which overwrites '折扣价' with z-scores.
df['收入'] = df['销量'] * df['折扣价']

# Feature columns that are standardized in place (zero mean, unit variance).
numerical_features = ['星级', '折扣价', '折扣率', '价格']

# NOTE(review): the scaler is fit on every row, before the later cells split
# the data into train/test, so test-set statistics leak into the features.
scaler = StandardScaler().fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt


# Ridge regression baseline with a single, fixed alpha.
# Requires `df` to exist already with the columns '折扣价', '折扣率', '价格'
# (features) and '收入' (target).

# Features and target.
feature_columns = ['折扣价', '折扣率', '价格']
X = df[feature_columns]
y = df['收入']

# One train/test split is drawn per seed.
random_states = [42, 23, 15, 34, 18, 32, 47, 27, 8, 52]

# Regularization strength passed to Ridge; adjust as needed.
alpha_value = 1.0

# Column-oriented accumulator: one list per reported quantity.
results = {
    column: []
    for column in ('random_state', 'MAE', 'RMSE', 'R²', 'Coefficients', 'Intercept')
}

for state in random_states:
    print(f"\n=== 使用 random_state = {state} ===")

    # 80/20 split controlled by the current seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=state
    )

    # Fit on the training part and predict the held-out part.
    ridge_model = Ridge(alpha=alpha_value).fit(X_train, y_train)
    y_pred_ridge = ridge_model.predict(X_test)

    # Held-out error metrics.
    mae = mean_absolute_error(y_test, y_pred_ridge)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
    r2 = r2_score(y_test, y_pred_ridge)

    print(f"Ridge Regression MAE: {mae:.4f}")
    print(f"Ridge Regression RMSE: {rmse:.4f}")
    print(f"Ridge Regression R²: {r2:.4f}")

    # Fitted linear model parameters.
    print(f"Coefficients: {ridge_model.coef_}")
    print(f"Intercept: {ridge_model.intercept_}")

    # Append this run to every result column.
    run_record = {
        'random_state': state,
        'MAE': mae,
        'RMSE': rmse,
        'R²': r2,
        'Coefficients': ridge_model.coef_.tolist(),
        'Intercept': ridge_model.intercept_,
    }
    for column, value in run_record.items():
        results[column].append(value)

# One row per seed.
results_df = pd.DataFrame(results)

# Show the per-seed table.
print("\n=== 所有随机种子的实验结果汇总 ===")
print(results_df)

# Persist the per-seed table (UTF-8 with BOM, no index column).
results_df.to_csv("ridge_regression_results-1.0.csv", index=False, encoding='utf-8-sig')
print("\n实验结果已保存到 'ridge_regression_results-1.0.csv'")

# Descriptive statistics of the metrics across seeds.
print("\n=== 评估指标统计 ===")
metric_columns = ['MAE', 'RMSE', 'R²']
print(results_df[metric_columns].describe())



=== 使用 random_state = 42 ===
Ridge Regression MAE: 1466.9679
Ridge Regression RMSE: 2108.6795
Ridge Regression R²: 0.0630
Coefficients: [ 297.91160884 -139.01464485  291.90578831]
Intercept: 1829.684559318428

=== 使用 random_state = 23 ===
Ridge Regression MAE: 1740.9243
Ridge Regression RMSE: 6259.5828
Ridge Regression R²: 0.0001
Coefficients: [ 540.54977229 -118.08368048  111.19314624]
Intercept: 1736.2655803164591

=== 使用 random_state = 15 ===
Ridge Regression MAE: 1499.5470
Ridge Regression RMSE: 2527.9148
Ridge Regression R²: 0.0379
Coefficients: [ 439.50171795 -107.28951673  176.85022869]
Intercept: 1804.2088704354865

=== 使用 random_state = 34 ===
Ridge Regression MAE: 1725.2351
Ridge Regression RMSE: 5360.4321
Ridge Regression R²: 0.0134
Coefficients: [ 152.85489277 -212.78772125  421.93042616]
Intercept: 1718.0023685805315

=== 使用 random_state = 18 ===
Ridge Regression MAE: 1745.1030
Ridge Regression RMSE: 5157.1650
Ridge Regression R²: 0.0033
Coefficients: [ 380.93224792 -184.

In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Ridge regression alpha sweep.
# Requires `df` to exist already with the columns '折扣价', '折扣率', '价格'
# (features) and '收入' (target).
#
# For every alpha and every seed, an 80/20 split is drawn, a Ridge model is
# fitted and MAE / RMSE / R² are measured on the held-out part. The cell then
# writes the full result table to CSV and reports:
#   * the single best (alpha, random_state) run by held-out R²,
#   * the metrics averaged over seeds for each alpha,
#   * descriptive statistics of the metrics over all runs.

# Extract features and target variable.
X = df[['折扣价', '折扣率', '价格']]
y = df['收入']

# Seeds controlling the train/test split.
random_states = [42, 23, 15, 34, 18, 32, 47, 27, 8, 52]

# Candidate regularization strengths for Ridge.
alpha_values = [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]

# Column-oriented result storage: one list per reported quantity.
results = {
    'alpha': [],
    'random_state': [],
    'MAE': [],
    'RMSE': [],
    'R²': [],
    'Coefficients': [],
    'Intercept': []
}

# Track the run with the highest held-out R².
best_r2 = -np.inf
best_params = {}

for alpha in alpha_values:
    for state in random_states:
        print(f"\n=== alpha = {alpha}, random_state = {state} ===")

        # Split into training and test sets.
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=0.2,
            random_state=state
        )

        # Create and train the Ridge model.
        ridge_model = Ridge(alpha=alpha)
        ridge_model.fit(X_train, y_train)

        # Predict the held-out part.
        y_pred_ridge = ridge_model.predict(X_test)

        # Evaluation metrics.
        mae = mean_absolute_error(y_test, y_pred_ridge)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
        r2 = r2_score(y_test, y_pred_ridge)

        print(f"Ridge Regression MAE: {mae:.4f}")
        print(f"Ridge Regression RMSE: {rmse:.4f}")
        print(f"Ridge Regression R²: {r2:.4f}")
        print(f"Coefficients: {ridge_model.coef_}")
        print(f"Intercept: {ridge_model.intercept_}")

        # Record this run.
        results['alpha'].append(alpha)
        results['random_state'].append(state)
        results['MAE'].append(mae)
        results['RMSE'].append(rmse)
        results['R²'].append(r2)
        results['Coefficients'].append(ridge_model.coef_.tolist())
        results['Intercept'].append(ridge_model.intercept_)

        # Keep the first run that reaches the highest R² so far.
        if r2 > best_r2:
            best_r2 = r2
            best_params = {
                'alpha': alpha,
                'random_state': state,
                'MAE': mae,
                'RMSE': rmse,
                'R²': r2,
                'Coefficients': ridge_model.coef_.tolist(),
                'Intercept': ridge_model.intercept_
            }

# One row per (alpha, random_state) run.
results_df = pd.DataFrame(results)

# Show the full result table.
print("\n=== 所有实验的结果汇总 ===")
print(results_df)

# Persist the full result table (UTF-8 with BOM, no index column).
results_df.to_csv("ridge_regression_results_tuned.csv", index=False, encoding='utf-8-sig')
print("\n实验结果已保存到 'ridge_regression_results_tuned.csv'")

# Report the best run. Previously `best_params` was tracked but never shown.
# The "best" run is picked by held-out R² of one split, so its random_state
# reflects which split happened to be easiest, not a tunable setting.
if best_params:
    print("\n=== 最佳参数组合 ===")
    print(f"最佳 alpha: {best_params['alpha']}")
    print(f"最佳 random_state: {best_params['random_state']}")
    print(f"MAE: {best_params['MAE']:.4f}")
    print(f"RMSE: {best_params['RMSE']:.4f}")
    print(f"R²: {best_params['R²']:.4f}")
    print(f"Coefficients: {best_params['Coefficients']}")
    print(f"Intercept: {best_params['Intercept']}")

# Metrics averaged over all seeds for each alpha: a fairer basis for choosing
# alpha than the single best run above.
print("\n=== 各 alpha 在所有随机种子上的平均指标 ===")
print(results_df.groupby('alpha')[['MAE', 'RMSE', 'R²']].mean())

# Descriptive statistics of the metrics over all runs.
print("\n=== 评估指标统计 ===")
print(results_df[['MAE', 'RMSE', 'R²']].describe())





=== alpha = 0.1, random_state = 42 ===
Ridge Regression MAE: 1466.9542
Ridge Regression RMSE: 2108.6737
Ridge Regression R²: 0.0630
Coefficients: [ 297.39503268 -139.33372026  292.59874417]
Intercept: 1829.684865929386

=== alpha = 0.1, random_state = 23 ===
Ridge Regression MAE: 1740.9107
Ridge Regression RMSE: 6259.6197
Ridge Regression R²: 0.0001
Coefficients: [ 545.80842593 -116.17010382  105.92077073]
Intercept: 1736.2612154279768

=== alpha = 0.1, random_state = 15 ===
Ridge Regression MAE: 1499.5512
Ridge Regression RMSE: 2527.9380
Ridge Regression R²: 0.0379
Coefficients: [ 442.0736935  -106.39636532  174.39392346]
Intercept: 1804.216566401205

=== alpha = 0.1, random_state = 34 ===
Ridge Regression MAE: 1725.2979
Ridge Regression RMSE: 5360.4598
Ridge Regression R²: 0.0134
Coefficients: [ 148.61584667 -214.55713358  426.43874287]
Intercept: 1718.0029920006614

=== alpha = 0.1, random_state = 18 ===
Ridge Regression MAE: 1745.0948
Ridge Regression RMSE: 5157.1748
Ridge Regress

In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt



# Lasso regression alpha sweep.
# Requires `df` to exist already with the columns '折扣价', '折扣率', '价格'
# (features) and '收入' (target).

# Features and target.
feature_columns = ['折扣价', '折扣率', '价格']
X = df[feature_columns]
y = df['收入']

# Seeds controlling the train/test split (also passed to Lasso).
random_states = [42, 23, 15, 34, 18, 32, 47, 27, 8, 52]

# Candidate regularization strengths for Lasso.
alpha_values = [0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]

# Column-oriented accumulator: one list per reported quantity.
results = {
    column: []
    for column in ('alpha', 'random_state', 'MAE', 'RMSE', 'R²', 'Coefficients', 'Intercept')
}

# Best run so far, judged by held-out R².
best_r2 = -np.inf
best_params = {}

for alpha in alpha_values:
    for state in random_states:
        print(f"\n=== alpha = {alpha}, random_state = {state} ===")

        # 80/20 split controlled by the current seed.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=state
        )

        # Fit on the training part and predict the held-out part.
        lasso_model = Lasso(alpha=alpha, random_state=state, max_iter=10000).fit(X_train, y_train)
        y_pred_lasso = lasso_model.predict(X_test)

        # Held-out error metrics.
        mae = mean_absolute_error(y_test, y_pred_lasso)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
        r2 = r2_score(y_test, y_pred_lasso)

        print(f"Lasso Regression MAE: {mae:.4f}")
        print(f"Lasso Regression RMSE: {rmse:.4f}")
        print(f"Lasso Regression R²: {r2:.4f}")
        print(f"Coefficients: {lasso_model.coef_}")
        print(f"Intercept: {lasso_model.intercept_}")

        # Append this run to every result column.
        run_record = {
            'alpha': alpha,
            'random_state': state,
            'MAE': mae,
            'RMSE': rmse,
            'R²': r2,
            'Coefficients': lasso_model.coef_.tolist(),
            'Intercept': lasso_model.intercept_,
        }
        for column, value in run_record.items():
            results[column].append(value)

        # Strict '>' keeps the first run that reaches the highest R².
        if r2 > best_r2:
            best_r2 = r2
            # Same fields as the run record, with its own coefficient list.
            best_params = {**run_record, 'Coefficients': lasso_model.coef_.tolist()}

# One row per (alpha, random_state) run.
results_df = pd.DataFrame(results)

# Show the full result table.
print("\n=== 所有实验的结果汇总 ===")
print(results_df)

# Persist the full result table (UTF-8 with BOM, no index column).
results_df.to_csv("lasso_regression_results.csv", index=False, encoding='utf-8-sig')
print("\n实验结果已保存到 'lasso_regression_results.csv'")

# Report the best single run.
print("\n=== 最佳参数组合 ===")
print(f"最佳 alpha: {best_params['alpha']}")
print(f"最佳 random_state: {best_params['random_state']}")
print(f"MAE: {best_params['MAE']:.4f}")
print(f"RMSE: {best_params['RMSE']:.4f}")
print(f"R²: {best_params['R²']:.4f}")
print(f"Coefficients: {best_params['Coefficients']}")
print(f"Intercept: {best_params['Intercept']}")

# Persist the best run as a one-row CSV.
best_results_df = pd.DataFrame([best_params])
best_results_df.to_csv("lasso_regression_best_parameters.csv", index=False, encoding='utf-8-sig')
print("最佳参数已保存到 'lasso_regression_best_parameters.csv'")

# Descriptive statistics of the metrics over all runs.
print("\n=== 评估指标统计 ===")
metric_columns = ['MAE', 'RMSE', 'R²']
print(results_df[metric_columns].describe())



=== alpha = 0.01, random_state = 42 ===
Lasso Regression MAE: 1466.9533
Lasso Regression RMSE: 2108.6735
Lasso Regression R²: 0.0630
Coefficients: [ 297.43020228 -139.31969531  292.56999575]
Intercept: 1829.684965662263

=== alpha = 0.01, random_state = 23 ===
Lasso Regression MAE: 1740.9089
Lasso Regression RMSE: 6259.6240
Lasso Regression R²: 0.0001
Coefficients: [ 546.53460304 -115.88998353  105.17629372]
Intercept: 1736.2602604592732

=== alpha = 0.01, random_state = 15 ===
Lasso Regression MAE: 1499.5525
Lasso Regression RMSE: 2527.9412
Lasso Regression R²: 0.0379
Coefficients: [ 442.46542327 -106.24313958  174.00138643]
Intercept: 1804.2172538738812

=== alpha = 0.01, random_state = 34 ===
Lasso Regression MAE: 1725.3050
Lasso Regression RMSE: 5360.4625
Lasso Regression R²: 0.0134
Coefficients: [ 148.23414239 -214.70695324  426.83718949]
Intercept: 1718.0028708178888

=== alpha = 0.01, random_state = 18 ===
Lasso Regression MAE: 1745.0917
Lasso Regression RMSE: 5157.1753
Lasso R

In [5]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# XGBoost grid search on log10(revenue).
# Requires `df` to exist already with the columns '折扣价', '折扣率', '价格'
# (features) and '收入' (target).

# Features and target. Zero revenue is replaced by 1 so that log10 is defined
# (log10(1) == 0); the model is trained on the log10 scale.
X = df[['折扣价', '折扣率', '价格']]
y = np.log10(df['收入'].replace(0, 1))

# Hyper-parameter candidates.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],           # candidate maximum tree depths
    'learning_rate': [0.01, 0.1],  # candidate learning rates
    'subsample': [0.8],
    'colsample_bytree': [0.8]
}

# Seeds controlling both the train/test split and the XGBoost model.
random_states = [42, 23, 15, 34, 18, 32, 47, 27, 8, 52]

# One summary dict per parameter combination.
results = []

# Every combination of the grid, in the same order as nested loops over
# n_estimators -> max_depth -> learning_rate -> subsample -> colsample_bytree.
param_combinations = [
    (n_trees, depth, rate, row_fraction, column_fraction)
    for n_trees in param_grid['n_estimators']
    for depth in param_grid['max_depth']
    for rate in param_grid['learning_rate']
    for row_fraction in param_grid['subsample']
    for column_fraction in param_grid['colsample_bytree']
]

for n_estimators, max_depth, learning_rate, subsample, colsample_bytree in param_combinations:
    # Per-combination record; the per-seed metrics are appended below.
    temp_results = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'random_state_metrics': []
    }

    for state in random_states:
        # NOTE(review): this cell holds out 10% (test_size=0.1) while the
        # Ridge/Lasso cells hold out 20%, so the metrics are not computed on
        # the same splits.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=state
        )

        # Build and train the model for this combination and seed.
        xgb_model = XGBRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            objective='reg:squarederror',
            random_state=state
        )
        xgb_model.fit(X_train, y_train)

        # Predict on the log10 scale.
        y_pred = xgb_model.predict(X_test)

        # Undo the log10 transform so the metrics are in revenue units.
        all_predictions = 10 ** np.asarray(y_pred).ravel()
        all_targets = 10 ** np.asarray(y_test).ravel()

        # Held-out error metrics.
        mae = mean_absolute_error(all_targets, all_predictions)
        rmse = np.sqrt(mean_squared_error(all_targets, all_predictions))
        r2 = r2_score(all_targets, all_predictions)

        # Record the metrics of this seed.
        seed_metrics = {
            'random_state': state,
            'MAE': mae,
            'RMSE': rmse,
            'R²': r2
        }
        temp_results['random_state_metrics'].append(seed_metrics)

    # Average the metrics over all seeds for this combination.
    metrics = pd.DataFrame(temp_results['random_state_metrics'])
    temp_results['avg_MAE'] = metrics['MAE'].mean()
    temp_results['avg_RMSE'] = metrics['RMSE'].mean()
    temp_results['avg_R²'] = metrics['R²'].mean()

    results.append(temp_results)

# One row per parameter combination.
results_df = pd.DataFrame(results)

# Persist all combinations (UTF-8 with BOM, no index column).
results_df.to_csv("xgboost_grid_search_results.csv", index=False, encoding='utf-8-sig')

# The best combination is the one with the highest seed-averaged R².
best_row_label = results_df['avg_R²'].idxmax()
best_result = results_df.loc[best_row_label]
print("\n=== 最佳参数组合 ===")
print(best_result)

# Persist the best combination as a one-row CSV.
best_result.to_frame().T.to_csv("xgboost_best_parameters.csv", index=False, encoding='utf-8-sig')

# Per-seed metrics of the best combination.
print("\n=== 最佳参数的详细随机种子结果 ===")
best_random_state_metrics = pd.DataFrame(best_result['random_state_metrics'])
print(best_random_state_metrics)

# Persist the per-seed metrics of the best combination.
best_random_state_metrics.to_csv("xgboost_best_random_state_metrics.csv", index=False, encoding='utf-8-sig')



=== 最佳参数组合 ===
n_estimators                                                          100
max_depth                                                               5
learning_rate                                                         0.1
subsample                                                             0.8
colsample_bytree                                                      0.8
random_state_metrics    [{'random_state': 42, 'MAE': 821.2740143654564...
avg_MAE                                                       1063.323401
avg_RMSE                                                      2724.186336
avg_R²                                                           0.155493
Name: 3, dtype: object

=== 最佳参数的详细随机种子结果 ===
   random_state          MAE         RMSE        R²
0            42   821.274014  1456.123240  0.290664
1            23   988.503596  1686.379713  0.173006
2            15  1140.877899  2784.626620  0.060796
3            34   922.804796  1639.637918  0.188992
4           