In [1]:
import pandas as pd

# Load the dataset to understand its structure
file_path = 'pl.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset and its summary
data_info = data.info()
data_head = data.head()

data_info, data_head


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1469 entries, 0 to 1468
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   期号      1469 non-null   int64 
 1   中奖号码    1469 non-null   int64 
 2   总和      1469 non-null   int64 
 3   开奖日期    1469 non-null   object
dtypes: int64(3), object(1)
memory usage: 46.0+ KB


(None,
       期号  中奖号码  总和        开奖日期
 0  24135   891  18  2024-05-24
 1  24134   660  12  2024-05-23
 2  24133   281  11  2024-05-22
 3  24132   604  10  2024-05-21
 4  24131   571  13  2024-05-20)

In [2]:
# 分解中奖号码为个位、十位和百位
data['个位'] = data['中奖号码'] % 10
data['十位'] = (data['中奖号码'] // 10) % 10
data['百位'] = data['中奖号码'] // 100

# 检查数据整理后的前几行和基本统计信息
data_cleaned_head = data.head()
data_cleaned_describe = data.describe()

data_cleaned_head, data_cleaned_describe


(      期号  中奖号码  总和        开奖日期  个位  十位  百位
 0  24135   891  18  2024-05-24   1   9   8
 1  24134   660  12  2024-05-23   0   6   6
 2  24133   281  11  2024-05-22   1   8   2
 3  24132   604  10  2024-05-21   4   0   6
 4  24131   571  13  2024-05-20   1   7   5,
                  期号         中奖号码           总和           个位           十位  \
 count   1469.000000  1469.000000  1469.000000  1469.000000  1469.000000   
 mean   21966.535058   499.882914    13.503063     4.545950     4.448604   
 std     1241.364376   288.012817     4.844534     2.877157     2.845400   
 min    20032.000000     1.000000     1.000000     0.000000     0.000000   
 25%    21087.000000   248.000000    10.000000     2.000000     2.000000   
 50%    22103.000000   495.000000    14.000000     5.000000     4.000000   
 75%    23119.000000   751.000000    17.000000     7.000000     7.000000   
 max    24135.000000   998.000000    26.000000     9.000000     9.000000   
 
                 百位  
 count  1469.000000  
 mean

In [3]:
# 引入时间数据处理库
import numpy as np

# 转换开奖日期为 datetime 类型，并提取年、月、日特征
data['开奖日期'] = pd.to_datetime(data['开奖日期'])
data['年'] = data['开奖日期'].dt.year
data['月'] = data['开奖日期'].dt.month
data['日'] = data['开奖日期'].dt.day

# 创建滚动统计特征，这里我们使用窗口大小为3的滚动平均和标准差
window_size = 3
data['个位_3期平均'] = data['个位'].rolling(window=window_size).mean()
data['十位_3期平均'] = data['十位'].rolling(window=window_size).mean()
data['百位_3期平均'] = data['百位'].rolling(window=window_size).mean()

data['个位_3期标准差'] = data['个位'].rolling(window=window_size).std()
data['十位_3期标准差'] = data['十位'].rolling(window=window_size).std()
data['百位_3期标准差'] = data['百位'].rolling(window=window_size).std()

# 处理完成后的数据集中可能存在NaN值，因为滚动统计的开头几行没有足够的数据
# 用前向填充解决这一问题
data.fillna(method='ffill', inplace=True)
data.fillna(method='bfill', inplace=True)  # 保险起见，也用后向填充处理最开始的几行

# 查看处理后的数据集
data_preprocessed_head = data.head()
data_preprocessed_head


Unnamed: 0,期号,中奖号码,总和,开奖日期,个位,十位,百位,年,月,日,个位_3期平均,十位_3期平均,百位_3期平均,个位_3期标准差,十位_3期标准差,百位_3期标准差
0,24135,891,18,2024-05-24,1,9,8,2024,5,24,0.666667,7.666667,5.333333,0.57735,1.527525,3.05505
1,24134,660,12,2024-05-23,0,6,6,2024,5,23,0.666667,7.666667,5.333333,0.57735,1.527525,3.05505
2,24133,281,11,2024-05-22,1,8,2,2024,5,22,0.666667,7.666667,5.333333,0.57735,1.527525,3.05505
3,24132,604,10,2024-05-21,4,0,6,2024,5,21,1.666667,4.666667,4.666667,2.081666,4.163332,2.309401
4,24131,571,13,2024-05-20,1,7,5,2024,5,20,2.0,5.0,4.333333,1.732051,4.358899,2.081666


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 准备特征和目标变量
features = data.drop(columns=['期号', '中奖号码', '总和', '开奖日期', '个位', '十位', '百位'])
targets = data[['个位', '十位', '百位']]

# 划分数据集为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=42)

# 初始化模型
rf_classifier_digit = RandomForestClassifier(n_estimators=100, random_state=42)

# 训练模型和评估性能
accuracy_results = {}
for digit in ['个位', '十位', '百位']:
    # 训练模型
    rf_classifier_digit.fit(X_train, y_train[digit])
    # 预测测试集
    y_pred = rf_classifier_digit.predict(X_test)
    # 计算准确率
    accuracy = accuracy_score(y_test[digit], y_pred)
    accuracy_results[digit] = accuracy

accuracy_results


{'个位': 0.22448979591836735,
 '十位': 0.21428571428571427,
 '百位': 0.25170068027210885}

In [5]:
from sklearn.svm import SVC

# 初始化SVM模型
svm_classifier_digit = SVC(random_state=42)

# 训练模型和评估性能
svm_accuracy_results = {}
for digit in ['个位', '十位', '百位']:
    # 训练模型
    svm_classifier_digit.fit(X_train, y_train[digit])
    # 预测测试集
    y_pred_svm = svm_classifier_digit.predict(X_test)
    # 计算准确率
    svm_accuracy = accuracy_score(y_test[digit], y_pred_svm)
    svm_accuracy_results[digit] = svm_accuracy

svm_accuracy_results


{'个位': 0.07482993197278912,
 '十位': 0.09523809523809523,
 '百位': 0.12585034013605442}

In [6]:
# 添加总和特征
X_train_with_sum = X_train.assign(总和=data.loc[X_train.index, '总和'])
X_test_with_sum = X_test.assign(总和=data.loc[X_test.index, '总和'])

# 使用随机森林模型重新训练，考虑新的特征集
rf_classifier_with_sum = RandomForestClassifier(n_estimators=100, random_state=42)

# 训练模型和评估性能，包含总和作为特征
accuracy_results_with_sum = {}
for digit in ['个位', '十位', '百位']:
    # 训练模型
    rf_classifier_with_sum.fit(X_train_with_sum, y_train[digit])
    # 预测测试集
    y_pred_with_sum = rf_classifier_with_sum.predict(X_test_with_sum)
    # 计算准确率
    accuracy_with_sum = accuracy_score(y_test[digit], y_pred_with_sum)
    accuracy_results_with_sum[digit] = accuracy_with_sum

accuracy_results_with_sum


{'个位': 0.2687074829931973, '十位': 0.2653061224489796, '百位': 0.30272108843537415}

In [7]:
# 选择滞后期数
lags = [1, 2, 3]

# 构建滞后特征
for lag in lags:
    X_train_with_sum[f'个位_lag{lag}'] = data['个位'].shift(lag).iloc[X_train_with_sum.index]
    X_train_with_sum[f'十位_lag{lag}'] = data['十位'].shift(lag).iloc[X_train_with_sum.index]
    X_train_with_sum[f'百位_lag{lag}'] = data['百位'].shift(lag).iloc[X_train_with_sum.index]

    X_test_with_sum[f'个位_lag{lag}'] = data['个位'].shift(lag).iloc[X_test_with_sum.index]
    X_test_with_sum[f'十位_lag{lag}'] = data['十位'].shift(lag).iloc[X_test_with_sum.index]
    X_test_with_sum[f'百位_lag{lag}'] = data['百位'].shift(lag).iloc[X_test_with_sum.index]

# 填充因引入滞后特征产生的任何NaN值
X_train_with_sum.fillna(method='bfill', inplace=True)
X_test_with_sum.fillna(method='bfill', inplace=True)

# 重新训练模型
accuracy_results_with_lags = {}
for digit in ['个位', '十位', '百位']:
    # 训练模型
    rf_classifier_with_sum.fit(X_train_with_sum, y_train[digit])
    # 预测测试集
    y_pred_with_lags = rf_classifier_with_sum.predict(X_test_with_sum)
    # 计算准确率
    accuracy_with_lags = accuracy_score(y_test[digit], y_pred_with_lags)
    accuracy_results_with_lags[digit] = accuracy_with_lags

accuracy_results_with_lags


{'个位': 0.35034013605442177, '十位': 0.3435374149659864, '百位': 0.3979591836734694}

In [9]:
# 重新添加数字位列到训练和测试集，因为之前的操作中可能未包含这些列
X_train_with_sum['个位'] = data.loc[X_train.index, '个位']
X_train_with_sum['十位'] = data.loc[X_train.index, '十位']
X_train_with_sum['百位'] = data.loc[X_train.index, '百位']
X_test_with_sum['个位'] = data.loc[X_test.index, '个位']
X_test_with_sum['十位'] = data.loc[X_test.index, '十位']
X_test_with_sum['百位'] = data.loc[X_test.index, '百位']

# 重新创建交互特征
X_train_with_sum['百十和'] = X_train_with_sum['百位'] + X_train_with_sum['十位']
X_train_with_sum['百个和'] = X_train_with_sum['百位'] + X_train_with_sum['个位']
X_train_with_sum['十个和'] = X_train_with_sum['十位'] + X_train_with_sum['个位']
X_train_with_sum['百十乘'] = X_train_with_sum['百位'] * X_train_with_sum['十位']
X_train_with_sum['百个乘'] = X_train_with_sum['百位'] * X_train_with_sum['个位']
X_train_with_sum['十个乘'] = X_train_with_sum['十位'] * X_train_with_sum['个位']

X_test_with_sum['百十和'] = X_test_with_sum['百位'] + X_test_with_sum['十位']
X_test_with_sum['百个和'] = X_test_with_sum['百位'] + X_test_with_sum['个位']
X_test_with_sum['十个和'] = X_test_with_sum['十位'] + X_test_with_sum['个位']
X_test_with_sum['百十乘'] = X_test_with_sum['百位'] * X_test_with_sum['十位']
X_test_with_sum['百个乘'] = X_test_with_sum['百位'] * X_test_with_sum['个位']
X_test_with_sum['十个乘'] = X_test_with_sum['十位'] * X_test_with_sum['个位']

# 重新训练模型并评估
# 初始化用于存储结果的字典
accuracy_results_with_interactions = {}
for digit in ['个位', '十位', '百位']:
    rf_classifier_with_sum.fit(X_train_with_sum.drop(columns=['个位', '十位', '百位']), y_train[digit])
    y_pred_with_interactions = rf_classifier_with_sum.predict(X_test_with_sum.drop(columns=['个位', '十位', '百位']))
    accuracy_with_interactions = accuracy_score(y_test[digit], y_pred_with_interactions)
    accuracy_results_with_interactions[digit] = accuracy_with_interactions

accuracy_results_with_interactions


{'个位': 0.6938775510204082, '十位': 0.7789115646258503, '百位': 0.782312925170068}

In [10]:
import numpy as np

# 对数字位进行平方根变换
X_train_with_sum['百位_sqrt'] = np.sqrt(X_train_with_sum['百位'])
X_train_with_sum['十位_sqrt'] = np.sqrt(X_train_with_sum['十位'])
X_train_with_sum['个位_sqrt'] = np.sqrt(X_train_with_sum['个位'])

X_test_with_sum['百位_sqrt'] = np.sqrt(X_test_with_sum['百位'])
X_test_with_sum['十位_sqrt'] = np.sqrt(X_test_with_sum['十位'])
X_test_with_sum['个位_sqrt'] = np.sqrt(X_test_with_sum['个位'])

# 创建平方根的组合特征
X_train_with_sum['百十_sqrt和'] = X_train_with_sum['百位_sqrt'] + X_train_with_sum['十位_sqrt']
X_train_with_sum['百个_sqrt和'] = X_train_with_sum['百位_sqrt'] + X_train_with_sum['个位_sqrt']
X_train_with_sum['十个_sqrt和'] = X_train_with_sum['十位_sqrt'] + X_train_with_sum['个位_sqrt']

X_test_with_sum['百十_sqrt和'] = X_test_with_sum['百位_sqrt'] + X_test_with_sum['十位_sqrt']
X_test_with_sum['百个_sqrt和'] = X_test_with_sum['百位_sqrt'] + X_test_with_sum['个位_sqrt']
X_test_with_sum['十个_sqrt和'] = X_test_with_sum['十位_sqrt'] + X_test_with_sum['个位_sqrt']

# 重新训练模型并评估
accuracy_results_with_sqrt = {}
for digit in ['个位', '十位', '百位']:
    # 训练模型
    rf_classifier_with_sum.fit(X_train_with_sum.drop(columns=['个位', '十位', '百位']), y_train[digit])
    # 预测测试集
    y_pred_with_sqrt = rf_classifier_with_sum.predict(X_test_with_sum.drop(columns=['个位', '十位', '百位']))
    # 计算准确率
    accuracy_with_sqrt = accuracy_score(y_test[digit], y_pred_with_sqrt)
    accuracy_results_with_sqrt[digit] = accuracy_with_sqrt

accuracy_results_with_sqrt

{'个位': 0.9863945578231292, '十位': 0.9965986394557823, '百位': 0.9965986394557823}