In [4]:
import pandas as pd

# 读取CSV文件
futures_gold_df= pd.read_csv('../data_files/ODS/all_commodities_data.csv')
futures_gold_df1=futures_gold_df.drop("ticker", axis=1)
futures_fuels_df= pd.read_csv('../data_files/ODS/all_fuels_data.csv')
futures_fuels_df1=futures_fuels_df.drop("ticker", axis=1)
# 将两个DataFrame在垂直方向上拼接
concatenated_df = pd.concat([futures_gold_df1, futures_fuels_df1])
concatenated_df["commodity"].value_counts()

commodity
Crude Oil          5866
Natural Gas        5862
Copper             5861
Heating Oil        5859
Silver             5858
Gold               5857
RBOB Gasoline      5820
Palladium          5565
Platinum           5325
Brent Crude Oil    4078
Name: count, dtype: int64

### 对非数值型特征的序列编码处理
用from sklearn.preprocessing的LabelEncoder对象对非数值型特征"commodity"进行序列编码处理，即值映射处理。(注意：这种方法会添加来自于自然数的额外顺序信息)


In [6]:
from sklearn.preprocessing import LabelEncoder
# 创建LabelEncoder对象
label_encoder = LabelEncoder()
concatenated_df["commodity_encoded"] = label_encoder.fit_transform(concatenated_df["commodity"])
concatenated_df

Unnamed: 0,commodity,date,open,high,low,close,volume,commodity_encoded
0,Gold,2000-08-30,273.899994,273.899994,273.899994,273.899994,0,3
1,Gold,2000-08-31,274.799988,278.299988,274.799988,278.299988,0,3
2,Gold,2000-09-01,277.000000,277.000000,277.000000,277.000000,0,3
3,Gold,2000-09-05,275.799988,275.799988,275.799988,275.799988,2,3
4,Gold,2000-09-06,274.200012,274.200012,274.200012,274.200012,0,3
...,...,...,...,...,...,...,...,...
27480,Brent Crude Oil,2023-12-27,80.739998,81.320000,79.489998,79.650002,8282,0
27481,Brent Crude Oil,2023-12-28,79.839996,79.959999,78.339996,78.389999,24301,0
27482,Brent Crude Oil,2023-12-29,77.419998,77.970001,76.750000,77.040001,20115,0
27483,Brent Crude Oil,2024-01-02,77.209999,79.040001,75.599998,75.889999,28591,0


In [7]:
table_futures_df=concatenated_df[["commodity","commodity_encoded"]]
duplicates=table_futures_df.duplicated()
df_no_duplicates = table_futures_df[~duplicates]
df_no_duplicates

Unnamed: 0,commodity,commodity_encoded
0,Gold,3
5857,Silver,9
11715,Platinum,7
17040,Copper,1
22901,Palladium,6
0,Crude Oil,2
5866,Heating Oil,4
11725,Natural Gas,5
17587,RBOB Gasoline,8
23407,Brent Crude Oil,0


### 对时间日期数据的时间特征提取
将为字符串类型时间日期特征date拆分为year, month,day三个数值型特，提取出时间信息；

In [9]:
# 使用str.split()函数按斜杠分割特征，并将结果存储在新的列中
concatenated_df[['year', 'month','day']] = concatenated_df['date'].str.split('-', expand=True)
concatenated_df

Unnamed: 0,commodity,date,open,high,low,close,volume,commodity_encoded,year,month,day
0,Gold,2000-08-30,273.899994,273.899994,273.899994,273.899994,0,3,2000,08,30
1,Gold,2000-08-31,274.799988,278.299988,274.799988,278.299988,0,3,2000,08,31
2,Gold,2000-09-01,277.000000,277.000000,277.000000,277.000000,0,3,2000,09,01
3,Gold,2000-09-05,275.799988,275.799988,275.799988,275.799988,2,3,2000,09,05
4,Gold,2000-09-06,274.200012,274.200012,274.200012,274.200012,0,3,2000,09,06
...,...,...,...,...,...,...,...,...,...,...,...
27480,Brent Crude Oil,2023-12-27,80.739998,81.320000,79.489998,79.650002,8282,0,2023,12,27
27481,Brent Crude Oil,2023-12-28,79.839996,79.959999,78.339996,78.389999,24301,0,2023,12,28
27482,Brent Crude Oil,2023-12-29,77.419998,77.970001,76.750000,77.040001,20115,0,2023,12,29
27483,Brent Crude Oil,2024-01-02,77.209999,79.040001,75.599998,75.889999,28591,0,2024,01,02


In [10]:
futures_gold_df2=concatenated_df

### 对时间序列数据的滑动窗口处理
对相邻7个对象的部分相同特征进行标准差，平均值，步长为1，得到扩展特征，提取出时间信息；

In [11]:
# 定义滚动窗口大小
window_size = 7

# 计算滚动窗口内的均值特征
futures_gold_df2['open_rolling_mean'] = futures_gold_df2['open'].rolling(window_size).mean()

# 计算滚动窗口内的标准差特征
futures_gold_df2['open_rolling_std'] = futures_gold_df2['open'].rolling(window_size).std()

# 计算滚动窗口内的均值特征
futures_gold_df2['close_rolling_mean'] = futures_gold_df2['close'].rolling(window_size).mean()

# 计算滚动窗口内的标准差特征
futures_gold_df2['close_rolling_std'] = futures_gold_df2['close'].rolling(window_size).std()

# 计算滚动窗口内的均值特征
futures_gold_df2['high_rolling_mean'] = futures_gold_df2['high'].rolling(window_size).mean()

# 计算滚动窗口内的标准差特征
futures_gold_df2['high_rolling_std'] = futures_gold_df2['high'].rolling(window_size).std()

# 计算滚动窗口内的均值特征
futures_gold_df2['low_rolling_mean'] = futures_gold_df2['low'].rolling(window_size).mean()

# 计算滚动窗口内的标准差特征
futures_gold_df2['low_rolling_std'] = futures_gold_df2['low'].rolling(window_size).std()

# 输出结果
futures_gold_df2

Unnamed: 0,commodity,date,open,high,low,close,volume,commodity_encoded,year,month,day,open_rolling_mean,open_rolling_std,close_rolling_mean,close_rolling_std,high_rolling_mean,high_rolling_std,low_rolling_mean,low_rolling_std
0,Gold,2000-08-30,273.899994,273.899994,273.899994,273.899994,0,3,2000,08,30,,,,,,,,
1,Gold,2000-08-31,274.799988,278.299988,274.799988,278.299988,0,3,2000,08,31,,,,,,,,
2,Gold,2000-09-01,277.000000,277.000000,277.000000,277.000000,0,3,2000,09,01,,,,,,,,
3,Gold,2000-09-05,275.799988,275.799988,275.799988,275.799988,2,3,2000,09,05,,,,,,,,
4,Gold,2000-09-06,274.200012,274.200012,274.200012,274.200012,0,3,2000,09,06,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27480,Brent Crude Oil,2023-12-27,80.739998,81.320000,79.489998,79.650002,8282,0,2023,12,27,78.968572,1.180104,79.437143,0.928453,80.471430,0.815462,78.181428,1.276511
27481,Brent Crude Oil,2023-12-28,79.839996,79.959999,78.339996,78.389999,24301,0,2023,12,28,79.378572,0.810789,79.500000,0.819492,80.535715,0.741413,78.547142,0.718673
27482,Brent Crude Oil,2023-12-29,77.419998,77.970001,76.750000,77.040001,20115,0,2023,12,29,79.288571,0.996534,79.187143,1.246524,80.294286,1.204586,78.449999,0.914203
27483,Brent Crude Oil,2024-01-02,77.209999,79.040001,75.599998,75.889999,28591,0,2024,01,02,78.997142,1.270364,78.642857,1.725174,80.070001,1.279817,77.952856,1.354630


In [12]:
tomorrow_close=0
list_tomorrow_close=[]
# 使用itertuples()方法遍历DataFrame的数据
for row in futures_gold_df2.itertuples():
    list_tomorrow_close.append(tomorrow_close)
    tomorrow_close=row.close
data = {'tomorrow_close':list_tomorrow_close}
df = pd.DataFrame(data)
futures_gold_df2['tomorrow_close'] = df['tomorrow_close']
futures_gold_df2

Unnamed: 0,commodity,date,open,high,low,close,volume,commodity_encoded,year,month,day,open_rolling_mean,open_rolling_std,close_rolling_mean,close_rolling_std,high_rolling_mean,high_rolling_std,low_rolling_mean,low_rolling_std,tomorrow_close
0,Gold,2000-08-30,273.899994,273.899994,273.899994,273.899994,0,3,2000,08,30,,,,,,,,,0.000000
1,Gold,2000-08-31,274.799988,278.299988,274.799988,278.299988,0,3,2000,08,31,,,,,,,,,273.899994
2,Gold,2000-09-01,277.000000,277.000000,277.000000,277.000000,0,3,2000,09,01,,,,,,,,,278.299988
3,Gold,2000-09-05,275.799988,275.799988,275.799988,275.799988,2,3,2000,09,05,,,,,,,,,277.000000
4,Gold,2000-09-06,274.200012,274.200012,274.200012,274.200012,0,3,2000,09,06,,,,,,,,,275.799988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27480,Brent Crude Oil,2023-12-27,80.739998,81.320000,79.489998,79.650002,8282,0,2023,12,27,78.968572,1.180104,79.437143,0.928453,80.471430,0.815462,78.181428,1.276511,2237.899902
27481,Brent Crude Oil,2023-12-28,79.839996,79.959999,78.339996,78.389999,24301,0,2023,12,28,79.378572,0.810789,79.500000,0.819492,80.535715,0.741413,78.547142,0.718673,2245.699951
27482,Brent Crude Oil,2023-12-29,77.419998,77.970001,76.750000,77.040001,20115,0,2023,12,29,79.288571,0.996534,79.187143,1.246524,80.294286,1.204586,78.449999,0.914203,2348.600098
27483,Brent Crude Oil,2024-01-02,77.209999,79.040001,75.599998,75.889999,28591,0,2024,01,02,78.997142,1.270364,78.642857,1.725174,80.070001,1.279817,77.952856,1.354630,2342.000000


In [13]:
# 将NaN值替换为0
futures_gold_df2.fillna(0, inplace=True)

### 对时间序列数据的训练集与测试集的划分
对时间序列数据的训练集与测试集的划分,按年份划分，而非比例；

In [14]:
import numpy as np
from deepforest import CascadeForestRegressor
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import pickle
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
def drop_split_future(data,drop_future,target):#删除无关特征，提取标签
    target_future=data[target]
    # 删除指定的列特征
    data = data.drop(drop_future, axis=1)
    data = data.round(2)
    return data,target_future

df=pd.read_csv("../data_files/ADS/futures_data3.csv")
X,y=drop_split_future(df,["commodity","date"],"tomorrow_close")
 # 将数据集分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### 获取模型的最优参数的随机搜索

使用了RandomizedSearchCV函数来执行随机搜索，
并指定了参数空间param_dist（部分参数在参数空间中，进行评估）。
n_iter参数指定了随机搜索中要尝试的参数组合数量，
cv参数指定了交叉验证的折数，
scoring参数则指定了评估指标。执行完随机搜索后，
我们可以从best_params_属性中获取最佳参数组合，
并使用best_estimator_属性获取使用这些参数训练的模型。


In [15]:
param_dist = {
        "n_estimators": randint(low=10, high=100),#树的数量
        "max_depth": randint(low=1, high=3),#每个树的最大深度
        "max_layers":randint(low=5, high=20),#级联森林的最大层数
        "n_trees": randint(low=5, high=100),#在级联森林的每一层中，每个随机森林包含的树的数量
        "n_tolerant_rounds": randint(low=1, high=10),#容忍连续多少个回合的层生长性能不提升，然后停止增加更多层
    }
# 创建深度森林对象
forest = CascadeForestRegressor()
# 创建Random Search对象
random_search = RandomizedSearchCV( forest,#deepforest模型对象
                                        param_distributions=param_dist,#超参数的取值空间
                                        n_iter=100,#随机搜索中要尝试的参数组合数量
                                        cv=5,#交叉验证的折数
                                        scoring='accuracy')#scoring参数则指定了评估指标

# 输出最佳参数组合
random_search.fit(X, y)
print("Best parameters set found on development set:")
print()
print(random_search.best_params_)
print()
print("Grid scores on development set:")
print()
means = random_search.cv_results_['mean_test_score']
stds = random_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, random_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
print()

# 使用最佳参数重新训练模型

best_clf = random_search.best_estimator_

best_clf.fit(X_train, y_train)

# 使用训练好的模型进行预测
y_pred = best_clf.predict(X_test)
# 保存模型
best_clf.save()

[2024-05-19 08:34:05.029] Start to fit the model:
[2024-05-19 08:34:05.029] Fitting cascade layer = 0 
[2024-05-19 08:34:05.170] Start to fit the model:
[2024-05-19 08:34:05.170] Fitting cascade layer = 0 
[2024-05-19 08:34:05.342] Start to fit the model:
[2024-05-19 08:34:05.342] Fitting cascade layer = 0 
[2024-05-19 08:34:05.495] Start to fit the model:
[2024-05-19 08:34:05.495] Fitting cascade layer = 0 
[2024-05-19 08:34:05.734] Start to fit the model:
[2024-05-19 08:34:05.734] Fitting cascade layer = 0 
[2024-05-19 08:34:05.969] Start to fit the model:
[2024-05-19 08:34:05.969] Fitting cascade layer = 0 
[2024-05-19 08:34:06.182] Start to fit the model:
[2024-05-19 08:34:06.182] Fitting cascade layer = 0 
[2024-05-19 08:34:06.406] Start to fit the model:
[2024-05-19 08:34:06.406] Fitting cascade layer = 0 
[2024-05-19 08:34:06.618] Start to fit the model:
[2024-05-19 08:34:06.618] Fitting cascade layer = 0 
[2024-05-19 08:34:06.825] Start to fit the model:
[2024-05-19 08:34:06.82

KeyboardInterrupt: 