In [1]:
import pandas as pd
import numpy as np
from math import pi
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost.sklearn import XGBClassifier as XGBC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,roc_curve,roc_auc_score
from sklearn.metrics import f1_score, recall_score

In [21]:
raw_df = pd.read_csv('./Data/full_name_with_price_label.csv')

In [22]:
def get_feature(raw_df):
    features = []
    labels = []
    for i in range(len(raw_df['name'])):
        feature = []
        feature.append(raw_df['maker'][i])
        feature.append(raw_df['deliver_time'][i])
        feature.append(raw_df['deliver_way'][i])
        feature.append(raw_df['discount'][i])
        feature.append(raw_df['category'][i])
        feature.append(raw_df['sub_category'][i])
        feature.append(raw_df['price'][i])
        
        features.append(feature)
        labels.append(raw_df['label'][i])
    
    return features,labels

In [23]:
features,labels = get_feature(raw_df)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 使用 SMOTE 进行过采样
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

In [24]:
def train_xgboost(x_train,x_test,y_train,y_test):
    xgb_model = XGBC()
    xgb_model.fit(x_train,y_train)

    y_pred = xgb_model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("XGBC ACC Score:", accuracy)
    y_pred_prob = xgb_model.predict_proba(x_test)[:, 1] 
    auc_score = roc_auc_score(y_test, y_pred_prob)
    print("XGBC AUC Score:", auc_score)
    f1 = f1_score(y_test, y_pred)
    print("XGBC F1 Score:", f1)
    
    return xgb_model
    
    #print(xgb_model.feature_importances_)

In [26]:
print("original:")
train_xgboost(X_train,X_test,y_train,y_test)

print("\nstand:")
train_xgboost(X_train_scaled,X_test_scaled,y_train,y_test)

print("\nsmote:")
xgb = train_xgboost(X_resampled,X_test_scaled,y_resampled,y_test)

original:
XGBC ACC Score: 0.7647058823529411
XGBC AUC Score: 0.8335516875662862
XGBC F1 Score: 0.8408304498269895

stand:
XGBC ACC Score: 0.7647058823529411
XGBC AUC Score: 0.8335516875662862
XGBC F1 Score: 0.8408304498269895

smote:
XGBC ACC Score: 0.7851662404092071
XGBC AUC Score: 0.8392288976230582
XGBC F1 Score: 0.8510638297872339


In [9]:
# 设置XGBoost的参数
param_grid = {
    'max_depth': [5, 7, 9],
    'learning_rate': [0.01,0.1],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'n_estimators': [50, 100, 200]
}

# 创建XGBoost分类器
xgb_model = XGBC()

# 使用GridSearchCV进行网格搜索
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='roc_auc', cv=2)
grid_search.fit(X_resampled, y_resampled)

# 输出最佳参数
print("Best Parameters:", grid_search.best_params_)

# 在测试集上评估最佳模型
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on the test set:", accuracy)
y_pred_prob = best_model.predict_proba(X_test_scaled)[:, 1]  # 获取正类别的概率
roc_auc = roc_auc_score(y_test, y_pred_prob)
print("AUC on the test set:", roc_auc)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 200, 'subsample': 0.7}
Accuracy on the test set: 0.7723785166240409
AUC on the test set: 0.8408821511011292
