In [13]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
import xgboost as xgb

train = pd.read_csv(
    "../Dataset/Preparation_dir/final_data/feature_engineering_data/version2/train.csv"
)
test = pd.read_csv(
    "../Dataset/Preparation_dir/final_data/feature_engineering_data/version2/test.csv"
)

target = "act"

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import RandomizedSearchCV

selected_features_list = [
    "BandAccX", "BandAccY", "BandAccZ", "speed", "rate", "rateZone", "usage", "deviceStatus",
    "Acceleration_X", "Acceleration_Y", "Acceleration_Z", "MagneticField_X", "MagneticField_Y",
    "MagneticField_Z", "Orientation_X", "Orientation_Y", "Orientation_Z", "AngularVelocity_X",
    "AngularVelocity_Y", "AngularVelocity_Z", "altitude_diff", "latitude_diff", "longitude_diff"
]

# 只选择selected_features_list中的特征
X_train = train[selected_features_list]
X_test = test[selected_features_list]

# 获取数据集中所有常数列的列表
constant_columns = [col for col in X_train.columns if X_train[col].nunique() <= 1]

# 删除这些列
X_train = X_train.drop(columns=constant_columns)
X_test = X_test.drop(columns=constant_columns)

# 创建训练集的特征和目标变量
X_train = train.drop(target, axis=1)
y_train = train[target]

# 创建测试集的特征和目标变量
X_test = test.drop(target, axis=1)
y_test = test[target]

# 数据标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 使用随机森林分类器
rf_model = RandomForestClassifier()

# 使用RFE进行特征选择
selector = RFE(rf_model, n_features_to_select=10, step=1)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# 确保特征名称一致
selected_features = selector.get_support(indices=True)
X_train_selected = pd.DataFrame(X_train_selected, columns=X_train.columns[selected_features])
X_test_selected = pd.DataFrame(X_test_selected, columns=X_train.columns[selected_features])

# 定义参数网格
param_grid = {
    'n_estimators': [200, 300, 500],  # 树的数量
    'max_features': ['sqrt', 'log2'],  # 考虑的最大特征数
    'max_depth': [30, 50, 70],  # 最大树深度
    'min_samples_split': [2, 5, 10],  # 分割内部节点所需的最小样本数
    'min_samples_leaf': [1, 2, 4]  # 叶节点所需的最小样本数
}

# 创建 RandomizedSearchCV 对象
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_grid, n_iter=50, cv=3, n_jobs=8, scoring='r2', verbose=1, random_state=42)

# 拟合随机搜索
random_search.fit(X_train_selected, y_train)
print("Best parameters found: ", random_search.best_params_)
best_rf = random_search.best_estimator_

# 预测
y_pred = best_rf.predict(X_test_selected)

# 计算评估指标
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}')

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae}')

r2 = r2_score(y_test, y_pred)
print(f'R^2: {r2}')

f1 = f1_score(y_test, y_pred, average='macro')
print(f'F1 Score: {f1}')

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters found:  {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 70}
Accuracy: 0.34952941176470587
Mean Squared Error (MSE): 4.741882352941176
Root Mean Squared Error (RMSE): 2.177586359468018
Mean Absolute Error (MAE): 1.7091764705882353
R^2: -1.8018296073851632
F1 Score: 0.17484638508848643


In [44]:
from sklearn.metrics import accuracy_score, r2_score, f1_score
from joblib import dump, load

best_model = None
n_epochs = 10
rf_model = RandomForestClassifier(n_estimators=1)

X_train_selected_values = X_train_selected.values
y_train_values = y_train.values

best_score = 0
epochs_without_improvement = 0

for epoch in range(n_epochs):
    rf_model.set_params(n_estimators=rf_model.get_params()['n_estimators'] + 1)
    rf_model.fit(X_train_selected_values, y_train_values)
    y_train_values= rf_model.predict(X_train_selected_values)

    # 预测
    y_pred = rf_model.predict(X_test_selected.values)

    # 计算评估指标
    accuracy = accuracy_score(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')  # 使用加权平均 F1 分数

    print(f'Epoch: {epoch+1}, Accuracy: {accuracy}, R2: {r2}, F1: {f1}')

    # 如果当前模型的 F1 分数更高，保存当前模型
    if accuracy > best_score:
        best_score = f1
        best_model = rf_model
        # 保存模型
        dump(best_model, 'best_model.pkl')

    # 早停
    if f1 > best_score:
        best_score = f1
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= 5:  # 如果连续 5 个 epoch 没有改善，就停止训练
            print('Early stopping')
            break

Epoch: 1, Accuracy: 0.5294117647058824, R2: 0.042445223000778465, F1: 0.4174440058132944
Epoch: 2, Accuracy: 0.2777647058823529, R2: -2.1834612390167947, F1: 0.12173464690175856
Epoch: 3, Accuracy: 0.3496470588235294, R2: -1.6385413190968747, F1: 0.30343755249205223
Epoch: 4, Accuracy: 0.2777647058823529, R2: -2.1834612390167947, F1: 0.12173464690175856
Epoch: 5, Accuracy: 0.1652941176470588, R2: -2.0742548103659217, F1: 0.17882522644831464
Early stopping
