In [10]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
import xgboost as xgb
class RF_Classifier(BaseEstimator, TransformerMixin):
    def __init__(self, n_estimators=100, max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state

    def fit(self, X, y):
        self.rf = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            random_state=self.random_state
        )
        self.rf.fit(X, y)
        self.feature_importances_ = self.rf.feature_importances_
        return self

    def transform(self, X):
        try:
            getattr(self, "rf")
        except AttributeError:
            raise NotFittedError("This RandomForest CLassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.")
        return self.rf.predict(X)

train = pd.read_csv(
    "../Dataset/Preparation_dir/final_data/feature_engineering_data/version2/train.csv"
)
test = pd.read_csv(
    "../Dataset/Preparation_dir/final_data/feature_engineering_data/version2/test.csv"
)

target = "act"

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import RandomizedSearchCV

selected_features_list = [
    "BandAccX", "BandAccY", "BandAccZ", "speed", "rate", "rateZone", "usage", "deviceStatus",
    "Acceleration_X", "Acceleration_Y", "Acceleration_Z", "MagneticField_X", "MagneticField_Y",
    "MagneticField_Z", "Orientation_X", "Orientation_Y", "Orientation_Z", "AngularVelocity_X",
    "AngularVelocity_Y", "AngularVelocity_Z", "altitude_diff", "latitude_diff", "longitude_diff"
]

# 只选择selected_features_list中的特征
X_train = train[selected_features_list]
X_test = test[selected_features_list]

# 获取数据集中所有常数列的列表
constant_columns = [col for col in X_train.columns if X_train[col].nunique() <= 1]

# 删除这些列
X_train = X_train.drop(columns=constant_columns)
X_test = X_test.drop(columns=constant_columns)

# 创建训练集的特征和目标变量
X_train = train.drop(target, axis=1)
y_train = train[target]

# 创建测试集的特征和目标变量
X_test = test.drop(target, axis=1)
y_test = test[target]

# 数据标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 使用随机森林分类器
rf_model = RandomForestClassifier()

# 使用RFE进行特征选择
selector = RFE(rf_model, n_features_to_select=10, step=1)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# 确保特征名称一致
selected_features = selector.get_support(indices=True)
X_train_selected = pd.DataFrame(X_train_selected, columns=X_train.columns[selected_features])
X_test_selected = pd.DataFrame(X_test_selected, columns=X_train.columns[selected_features])

# 定义参数网格
param_grid = {
    'n_estimators': [50, 100, 200],  # 树的数量
    'max_features': ['sqrt', 'log2'],  # 考虑的最大特征数
    'max_depth': [10, 20, 30],  # 最大树深度
    'min_samples_split': [2, 5, 10],  # 分割内部节点所需的最小样本数
    'min_samples_leaf': [1, 2, 4]  # 叶节点所需的最小样本数
}

# 创建 RandomizedSearchCV 对象
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_grid, n_iter=50, cv=3, n_jobs=8, scoring='r2', verbose=1, random_state=42)

# 拟合随机搜索
random_search.fit(X_train_selected, y_train)
print("Best parameters found: ", random_search.best_params_)
best_rf = random_search.best_estimator_

# 预测
y_pred = best_rf.predict(X_test_selected)

# 计算评估指标
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}')

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae}')

r2 = r2_score(y_test, y_pred)
print(f'R^2: {r2}')

f1 = f1_score(y_test, y_pred, average='macro')
print(f'F1 Score: {f1}')

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters found:  {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 30}
Accuracy: 0.34952941176470587
Mean Squared Error (MSE): 4.741882352941176
Root Mean Squared Error (RMSE): 2.177586359468018
Mean Absolute Error (MAE): 1.7091764705882353
R^2: -1.8018296073851632
F1 Score: 0.17484638508848643
