

## Content:
### 1. EDA

https://www.sciencedirect.com/science/article/pii/S2352340918315191

Setup, data inspection and cleanup are hidden for easier reading. Click the Code/Output buttons if you are curious.

In [None]:
pip install eli5


In [None]:
!pip install scikit-learn

In [None]:
!pip install shap --no-cache-dir


In [None]:
!pip install boruta


In [None]:
!pip install Optuna

In [None]:
pip install pytorch-tabnet


In [None]:
# Setup

# common:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import folium

# for ML:
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
#import eli5 # Feature importance evaluation    cannot use, ignore



# Display configuration: let pandas print up to 36 columns and give every
# seaborn/matplotlib figure the "whitegrid" look.
pd.set_option("display.max_columns", 36)
sns.set(style="whitegrid")


Load the data from Google Drive

In [None]:
# Mount Google Drive at /content/drive so the cells below can read the
# dataset stored there (presumably only works when run on Google Colab).
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import os
import zipfile

# Folder the archive is unpacked into (may be the archive's own folder or a
# new one) and the path of the .zip file itself. Adjust both to your Drive.
extract_to = '/content/drive/MyDrive/Colab Notebooks/Hotel cancellation prediction'
zip_path = '/content/drive/MyDrive/Colab Notebooks/Hotel cancellation prediction/Dataset.zip'

# Unpack every member of the archive into the target folder.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_to)

# List the folder contents to confirm the extraction.
os.listdir(extract_to)


In [None]:

import os

# Path of the bookings CSV on the mounted Drive. It is kept in a variable so
# the "file not found" message below can report it; previously that message
# referenced an undefined `data_path` and raised NameError instead of printing.
data_path = '/content/drive/MyDrive/Colab Notebooks/Hotel cancellation prediction/hotel_bookings.csv'

# Read the data file.
try:
    data = pd.read_csv(data_path)
except FileNotFoundError:
    print(f'文件未找到：{data_path}')
except Exception as e:
    print(f'读取数据时发生错误：{e}')
else:
    print(data.head())  # show the first rows to confirm the load succeeded


In [None]:
# Show the dtype pandas assigned to each column of the raw data.
data_types = data.dtypes
print("\nData Types：", data_types, sep="\n")

# Missing value

In [None]:
# Number of missing entries in each column of the raw data.
data.isna().sum()

In [None]:
# Replace missing values:
# agent: If no agency is given, the booking was most likely made without one.
# company: If none is given, it was most likely private.
# children / country: a missing count becomes 0, a missing country "Unknown".
# NOTE: the key has to be exactly "children". The earlier "children:" (stray
# colon) matched no column, so NaNs in "children" were silently kept.
nan_replacements = {"children": 0.0, "country": "Unknown", "agent": 0, "company": 0}
data_cln = data.fillna(nan_replacements)

# "meal" contains values "Undefined", which is equal to SC.
# Assign the result back instead of calling replace(inplace=True) on a column
# selection: pandas deprecates that form because it may act on a temporary copy.
data_cln["meal"] = data_cln["meal"].replace("Undefined", "SC")

# Some rows contain entries with 0 adults, 0 children and 0 babies.
# These bookings without any guest are dropped, addressed by index label
# (labels were previously reused as positions, which is only correct while
# the index is the default 0..n-1 range).
zero_guests = list(data_cln.loc[data_cln["adults"]
                   + data_cln["children"]
                   + data_cln["babies"] == 0].index)
data_cln.drop(index=zero_guests, inplace=True)

In [None]:

# Remaining missing entries per column after the clean-up.
print(data_cln.isna().sum())


In [None]:
# Fill missing child counts with 0. The result is assigned back to the column:
# fillna(inplace=True) on a column selection may modify a temporary copy
# (and does nothing under pandas copy-on-write), leaving data_cln unchanged.
data_cln['children'] = data_cln['children'].fillna(0.0)


In [None]:
# Number of missing values left in the "children" column.
print(data_cln['children'].isna().sum())


In [None]:


# Double check: per-column count of missing values in the cleaned data.
print(data_cln.isna().sum())


In [None]:
# How much data is left? Shows (rows, columns) of the cleaned frame.
data_cln.shape

# 1.EDA




In [None]:
# After cleaning, separate the Resort and City hotel bookings.
# To reflect the actual visitor numbers, only bookings that were not
# canceled are kept in both frames.
not_canceled = data_cln["is_canceled"] == 0
rh = data_cln.loc[(data_cln["hotel"] == "Resort Hotel") & not_canceled]
ch = data_cln.loc[(data_cln["hotel"] == "City Hotel") & not_canceled]

### Bookings by market segment

In [None]:
# Number of bookings in each market segment (canceled bookings included).
segments = data_cln["market_segment"].value_counts()

# Pie chart of the segment shares, labelled with segment name and percentage.
fig = px.pie(
    segments,
    names=segments.index,
    values=segments.values,
    template="seaborn",
    title="Bookings per market segment",
)
fig.update_traces(textinfo="percent+label", rotation=-90)
fig.show()

In [None]:
# NOTE(review): this cell repeats the market-segment pie chart of the cell
# directly above it (same data, same options).
segments = data_cln["market_segment"].value_counts()

# Options of the pie chart, collected in one place.
pie_options = {
    "values": segments.values,
    "names": segments.index,
    "title": "Bookings per market segment",
    "template": "seaborn",
}
fig = px.pie(segments, **pie_options)
fig.update_traces(rotation=-90, textinfo="percent+label")
fig.show()

In [None]:

# Market-segment counts per hotel type, canceled bookings included.
is_city = data_cln["hotel"] == "City Hotel"
is_resort = data_cln["hotel"] == "Resort Hotel"
city_hotel_segments = data_cln.loc[is_city, "market_segment"].value_counts()
resort_hotel_segments = data_cln.loc[is_resort, "market_segment"].value_counts()

# City Hotel pie chart.
fig_city = px.pie(
    names=city_hotel_segments.index,
    values=city_hotel_segments.values,
    template="seaborn",
    title="City Hotel - Bookings per Market Segment",
)
fig_city.update_traces(textinfo="percent+label", rotation=-90)
fig_city.show()

# Resort Hotel pie chart.
fig_resort = px.pie(
    names=resort_hotel_segments.index,
    values=resort_hotel_segments.values,
    template="seaborn",
    title="Resort Hotel - Bookings per Market Segment",
)
fig_resort.update_traces(textinfo="percent+label", rotation=-90)
fig_resort.show()


### How many bookings were canceled?

In [None]:
# Select each hotel's bookings once; counts and rates below reuse them.
resort_bookings = data_cln.loc[data_cln["hotel"] == "Resort Hotel"]
city_bookings = data_cln.loc[data_cln["hotel"] == "City Hotel"]

# Absolute number of cancelations (sum of the is_canceled column).
total_cancelations = data_cln["is_canceled"].sum()
rh_cancelations = resort_bookings["is_canceled"].sum()
ch_cancelations = city_bookings["is_canceled"].sum()

# Cancelations as a percentage of the bookings in each group.
rel_cancel = total_cancelations / len(data_cln) * 100
rh_rel_cancel = rh_cancelations / len(resort_bookings) * 100
ch_rel_cancel = ch_cancelations / len(city_bookings) * 100

print(f"Total bookings canceled: {total_cancelations:,} ({rel_cancel:.0f} %)")
print(f"Resort hotel bookings canceled: {rh_cancelations:,} ({rh_rel_cancel:.0f} %)")
print(f"City hotel bookings canceled: {ch_cancelations:,} ({ch_rel_cancel:.0f} %)")

# 2. Data Cleaning & Preprocessing

# correlation matrix

LabelEncoder

In [None]:

# Rebind data_cln to a copy of itself before its columns are overwritten.
data_cln = data_cln.copy()

# Columns stored with dtype "object".
object_columns = data_cln.select_dtypes(include=['object']).columns

# Replace every object column by integer codes. The fitted encoder of each
# column is kept in label_encoders, keyed by column name.
label_encoders = {}
for col in object_columns:
    le = LabelEncoder().fit(data_cln[col])
    data_cln[col] = le.transform(data_cln[col])
    label_encoders[col] = le

# Show the resulting dtypes and the first rows.
print(data_cln.dtypes)

print(data_cln.head())

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import fisher_exact
import warnings
warnings.filterwarnings('ignore')

def analyze_categorical_relationships(data, target_col='is_canceled'):
    """Rank the columns of *data* by their association with *target_col*.

    Every column other than *target_col* is cross-tabulated against the
    target. A chi-square test of independence is run on that table and
    Cramér's V is derived from the chi-square statistic.

    Args:
        data: DataFrame containing the feature columns and the target column.
        target_col: Name of the target column.

    Returns:
        DataFrame with the columns ``Feature``, ``Chi2``, ``P_value`` and
        ``Cramer_V``, sorted by ``Cramer_V`` in descending order. It is empty
        (but has these columns) when *data* holds no feature column.

    Note:
        Each distinct value of a column is treated as its own category.
    """
    result_columns = ['Feature', 'Chi2', 'P_value', 'Cramer_V']
    results = []

    for column in data.columns:
        if column == target_col:
            continue

        contingency_table = pd.crosstab(data[column], data[target_col])
        chi2, p_value, _, _ = chi2_contingency(contingency_table)

        # Cramer's V
        n = contingency_table.sum().sum()
        min_dim = min(contingency_table.shape) - 1
        # A table with a single row or column (constant feature or target)
        # would give 0 / 0 = NaN; report "no association" instead.
        cramer_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else 0.0

        results.append({
            'Feature': column,
            'Chi2': chi2,
            'P_value': p_value,
            'Cramer_V': cramer_v,
        })

    # Naming the columns explicitly keeps the sort valid for an empty result.
    results_df = pd.DataFrame(results, columns=result_columns)
    return results_df.sort_values('Cramer_V', ascending=False)

# Score every column against the cancellation flag, then list the features
# whose chi-square p-value is below 0.05.
results = analyze_categorical_relationships(data_cln)
significant_features = results[results['P_value'] < 0.05]

print("Significance level set to 0.05")
print("\nSignificantly correlated features:")
print(significant_features.to_string(index=False))

In [None]:
def find_significant_pairs(data, p_value_threshold=0.05):
    """Return the column pairs of *data* whose chi-square test is significant.

    Each unordered pair of columns is cross-tabulated and tested for
    independence. Pairs with a p-value strictly below *p_value_threshold*
    are returned as ``(column_a, column_b, p_value)`` tuples, ordered by
    ascending p-value.
    """
    features = data.columns
    found = []

    for position, first in enumerate(features):
        # Only columns to the right of `first`, so each pair is tested once.
        for second in features[position + 1:]:
            table = pd.crosstab(data[first], data[second])
            p_value = chi2_contingency(table)[1]
            if p_value < p_value_threshold:
                found.append((first, second, p_value))

    return sorted(found, key=lambda entry: entry[2])


# Test every pair of columns and print the significant ones (p < 0.05),
# most significant first.
sig_pairs = find_significant_pairs(data_cln)

print("显著相关的特征对（p < 0.05）：")
for first, second, p_value in sig_pairs:
    print(f"({first}, {second}) - p值: {p_value:.10f}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def calculate_cramers_v_matrix(data):
    """Return the pairwise Cramér's V matrix of all columns of *data*.

    Each pair of columns is cross-tabulated, a chi-square test is run on the
    table and V = sqrt(chi2 / (n * (min(rows, cols) - 1))) is stored.

    Args:
        data: DataFrame whose columns are compared. Every distinct value of
            a column is treated as a category.

    Returns:
        Square DataFrame indexed and labelled by ``data.columns``.

    Note:
        Pairs whose contingency table has a single row or column (a constant
        column) get 0.0 instead of the NaN that 0 / 0 would produce.
    """
    columns = list(data.columns)
    n_features = len(columns)
    v_matrix = np.zeros((n_features, n_features))

    for i, col1 in enumerate(columns):
        # The measure is symmetric: compute the upper triangle and mirror it.
        for j in range(i, n_features):
            contingency = pd.crosstab(data[col1], data[columns[j]])
            min_dim = min(contingency.shape) - 1
            if min_dim < 1:
                v = 0.0
            else:
                chi2, _, _, _ = chi2_contingency(contingency)
                n = contingency.sum().sum()
                v = np.sqrt(chi2 / (n * min_dim))
            v_matrix[i, j] = v_matrix[j, i] = v

    return pd.DataFrame(v_matrix, index=data.columns, columns=data.columns)


# Pairwise Cramér's V over all columns of the cleaned data, drawn as a
# heatmap on a fixed 0..1 colour scale and written to a PDF file.
v_matrix = calculate_cramers_v_matrix(data_cln)

plt.figure(figsize=(12, 10))
sns.heatmap(v_matrix, vmin=0, vmax=1, cmap='YlOrRd', annot=False)
plt.title("Cramer's V Correlation Matrix")
plt.tight_layout()
plt.savefig('cramers_v_heatmap.pdf')

In [None]:
# Columns used from here on: the target first, followed by the features.
selected_columns = [
    # target
    'is_canceled',
    # booking and stay
    'hotel',
    'lead_time',
    'arrival_date_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    # guests
    'adults',
    'children',
    'babies',
    'meal',
    'country',
    # sales channel and history
    'market_segment',
    'distribution_channel',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    # reservation details
    'reserved_room_type',
    'booking_changes',
    'deposit_type',
    'agent',
    'company',
    'days_in_waiting_list',
    'customer_type',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency



def calculate_cramers_v_matrix(data, selected_columns=None):
    """Return the pairwise Cramér's V matrix for *selected_columns* of *data*.

    Each pair of columns is cross-tabulated, a chi-square test is run on the
    table and V = sqrt(chi2 / (n * (min(rows, cols) - 1))) is stored.

    Args:
        data: DataFrame holding the columns to compare. Every distinct value
            of a column is treated as a category.
        selected_columns: Column names to include, in output order. Defaults
            to all columns of *data*, so the function can also be called
            with the data frame alone.

    Returns:
        Square DataFrame indexed and labelled by the selected columns.

    Note:
        Pairs whose contingency table has a single row or column (a constant
        column) get 0.0 instead of the NaN that 0 / 0 would produce.
    """
    if selected_columns is None:
        selected_columns = list(data.columns)

    n_features = len(selected_columns)
    v_matrix = np.zeros((n_features, n_features))

    for i, col1 in enumerate(selected_columns):
        # The measure is symmetric: compute the upper triangle and mirror it.
        for j in range(i, n_features):
            contingency = pd.crosstab(data[col1], data[selected_columns[j]])
            min_dim = min(contingency.shape) - 1
            if min_dim < 1:
                v = 0.0
            else:
                chi2, _, _, _ = chi2_contingency(contingency)
                n = contingency.sum().sum()
                v = np.sqrt(chi2 / (n * min_dim))
            v_matrix[i, j] = v_matrix[j, i] = v

    return pd.DataFrame(v_matrix, index=selected_columns, columns=selected_columns)

# Compute the Cramér's V matrix of the selected columns and draw it as an
# annotated heatmap (two decimals per cell), then save it as a PDF.
plt.figure(figsize=(15, 12))
v_matrix = calculate_cramers_v_matrix(data_cln[selected_columns], selected_columns)
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='YlOrRd',
    vmin=0,
    vmax=1,
    xticklabels=selected_columns,
    yticklabels=selected_columns,
)
sns.heatmap(v_matrix, **heatmap_options)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.title("Cramer's V Correlation Matrix ")
plt.tight_layout()
plt.savefig('cramers_v_selected_features.pdf', bbox_inches='tight', dpi=300)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

# Same heatmap of the selected columns, this time without cell annotations.
# NOTE(review): this writes to 'cramers_v_selected_features.pdf', the same
# file name the annotated heatmap cell uses - confirm overwriting is intended.
plt.figure(figsize=(15, 12))
v_matrix = calculate_cramers_v_matrix(data_cln[selected_columns], selected_columns)
sns.heatmap(
    v_matrix,
    vmin=0,
    vmax=1,
    cmap='YlOrRd',
    annot=False,
    yticklabels=selected_columns,
    xticklabels=selected_columns,
)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.title("Cramer's V Correlation Matrix")
plt.tight_layout()
plt.savefig('cramers_v_selected_features.pdf', bbox_inches='tight', dpi=300)

In [None]:
# Show the feature list currently held in selected_columns.
print(selected_columns)

Note: this shows that `selected_columns` must have been changed somewhere between these two cells — `assigned_room_type` was added to it.

In [None]:
# Print selected_columns again to compare it with the output above.
print(selected_columns)

# Stratified Split

In [None]:

# Distinct values present in the 'hotel' column
# (presumably the integer codes produced by the label-encoding cell).
print(data_cln['hotel'].unique())


In [None]:

# Mean of is_canceled for each 'hotel' value, i.e. the share of canceled
# bookings per hotel.
print(data_cln.groupby('hotel')['is_canceled'].mean())


In [None]:
# Settings shared by every split: 80/20 with a fixed seed.
split_options = dict(test_size=0.2, random_state=42)

# H1 + H2: all bookings together, stratified on the cancellation flag.
train_data, test_data = train_test_split(
    data_cln, stratify=data_cln['is_canceled'], **split_options)

# City Hotel (H1): rows whose 'hotel' value is 0
# (presumably the label-encoded "City Hotel" - verify with label_encoders).
H1_data = data_cln[data_cln['hotel'] == 0]
H1_train, H1_test = train_test_split(
    H1_data, stratify=H1_data['is_canceled'], **split_options)

# Resort Hotel (H2): rows whose 'hotel' value is 1.
H2_data = data_cln[data_cln['hotel'] == 1]
H2_train, H2_test = train_test_split(
    H2_data, stratify=H2_data['is_canceled'], **split_options)

# Report the size of every split.
print(f"Total Train: {train_data.shape[0]}, Total Test: {test_data.shape[0]}")
print(f"City Hotel Train (H1): {H1_train.shape[0]}, City Hotel Test (H1): {H1_test.shape[0]}")
print(f"Resort Hotel Train (H2): {H2_train.shape[0]}, Resort Hotel Test (H2): {H2_test.shape[0]}")

In [None]:

# Per-hotel splits, stratified on the cancellation flag.
H1_train, H1_test = train_test_split(
    H1_data, test_size=0.2, random_state=42, stratify=H1_data['is_canceled'])
H2_train, H2_test = train_test_split(
    H2_data, test_size=0.2, random_state=42, stratify=H2_data['is_canceled'])

# Combined data set: stratify on hotel AND cancellation flag at once by
# joining both into one key such as "0_1". The key is stored as 'strat_var'.
combined_data = data_cln.copy()
strat_key = combined_data['hotel'].astype(str) + '_' + combined_data['is_canceled'].astype(str)
combined_data['strat_var'] = strat_key
combined_train, combined_test = train_test_split(
    combined_data, test_size=0.2, random_state=42, stratify=strat_key)

print("H1 Distribution:")
print(f"City Hotel Train (H1): {H1_train.shape[0]}, City Hotel Test (H1): {H1_test.shape[0]}")
print("\nH2 Distribution:")
print(f"Resort Hotel Train (H2): {H2_train.shape[0]}, Resort Hotel Test (H2): {H2_test.shape[0]}")
print("\nCombined Dataset Distribution:")
# Rows per (hotel, is_canceled) group, first for the train then the test part.
for part in (combined_train, combined_test):
    print(part.groupby(['hotel', 'is_canceled']).size())


In [None]:
# Number of rows in the combined train and test parts.
print("Train set size:", combined_train.shape[0])
print("Test set size:", combined_test.shape[0])

In [None]:
def check_missing_values(datasets_dict):
    """Print the number of NaN cells of every DataFrame in *datasets_dict*.

    Args:
        datasets_dict: Mapping of a display name to a DataFrame.

    For each entry the total NaN count is printed. When it is above zero,
    the per-column counts of the affected columns follow. A blank line
    separates the entries. Nothing is returned.
    """
    for name, df in datasets_dict.items():
        nan_per_column = df.isna().sum()
        missing = nan_per_column.sum()
        print(f"Number of NaN values in {name}: {missing}")

        if missing > 0:
            print(f"Detailed missing values in {name}:")
            print(nan_per_column[nan_per_column > 0])
        print()

# Every split produced above, keyed by the name used in the report.
datasets = dict(
    H1_train=H1_train,
    H1_test=H1_test,
    H2_train=H2_train,
    H2_test=H2_test,
    Combined_train=combined_train,
    Combined_test=combined_test,
)

# Report the NaN counts of each split.
check_missing_values(datasets)

#H1

 ## 1.Logistic Regression





In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Features (selected columns without the target) and target of the H1
# training and test splits.
feature_columns = [col for col in selected_columns if col != 'is_canceled']
X_train, y_train = H1_train[feature_columns], H1_train['is_canceled']
X_test, y_test = H1_test[feature_columns], H1_test['is_canceled']

# Standardise both splits with the statistics of the training split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression fitted on the scaled training split.
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model.fit(X_train_scaled, y_train)

# Class predictions and probability of class 1 (used for ROC-AUC).
y_pred = log_reg_model.predict(X_test_scaled)
y_pred_proba = log_reg_model.predict_proba(X_test_scaled)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# ROC-AUC on the test split.
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nROC-AUC Score: {roc_auc:.2f}")

# ROC curve with the diagonal of a random classifier for reference.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label=f'Logistic Regression (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()



In [None]:

# One row per feature with its fitted coefficient, ordered by the absolute
# value of the coefficient (largest first).
coefficients = pd.DataFrame(
    {'Feature': X_train.columns, 'Coefficient': log_reg_model.coef_[0]}
)
coefficients = coefficients.sort_values(by='Coefficient', key=abs, ascending=False)

print("\nModel Coefficients:")
print(coefficients)


In [None]:
import shap
import matplotlib.pyplot as plt
import pandas as pd

# Wrap the scaled training matrix in a DataFrame so the SHAP plots can label
# the features with the original column names.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)

# SHAP explainer for the fitted logistic regression, using the scaled
# training data as background; then explain every training row.
explainer = shap.Explainer(log_reg_model, X_train_scaled_df)
shap_values = explainer(X_train_scaled_df)

# SHAP summary plot, bar variant, limited to 15 features.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_train_scaled_df, plot_type='bar', max_display=15)  # max_display 15

# SHAP summary plot with the default plot type, limited to 15 features.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_train_scaled_df, max_display=15)  # max_display 15




### baseline model

In [None]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Features (selected columns without the target) and target of H1_train.
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])
y_train = H1_train['is_canceled']

# Baseline logistic regression.
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)

# For cross-validation the scaler is part of the estimator, so every fold is
# standardised with the statistics of its own training part only. Scaling the
# whole training set up front would leak held-out rows into each fold.
cv_model = make_pipeline(StandardScaler(), clone(log_reg_model))

# 3-fold stratified CV: out-of-fold class predictions and probabilities.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(cv_model, X_train, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(cv_model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]

# Confusion matrix as counts and as percentage of each true class (row).
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Write the row percentage into each cell, next to the count annotation.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# ROC curve of every fold. Each fold gets its own clone, so log_reg_model is
# not left behind fitted on the last fold's subset.
plt.figure(figsize=(12, 8))
for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train), start=1):
    fold_model = clone(cv_model)
    fold_model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = fold_model.predict_proba(X_train.iloc[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Final model on the complete training split. The coefficient and SHAP cells
# that follow read log_reg_model / X_train_scaled, so they must describe the
# model trained on all training rows rather than on two thirds of them.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
log_reg_model.fit(X_train_scaled, y_train)


coefficients

In [None]:

# Coefficient of every feature of log_reg_model, strongest (by absolute
# value) first.
coefficient_table = pd.DataFrame(
    {'Feature': X_train.columns, 'Coefficient': log_reg_model.coef_[0]}
)
coefficients = coefficient_table.sort_values(
    by='Coefficient', key=lambda column: column.abs(), ascending=False)

print("\nModel Coefficients:")
print(coefficients)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Coefficient per feature plus its absolute value, which is used for sorting.
coefficients = pd.DataFrame(
    {'Feature': X_train.columns, 'Coefficient': log_reg_model.coef_[0]}
).assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
coefficients_sorted = coefficients.sort_values(by='Abs_Coefficient', ascending=False)

# Horizontal bars of the signed coefficients. The y axis is inverted so the
# feature with the largest absolute coefficient is drawn at the top.
plt.figure(figsize=(10, 8))
plt.barh(y=coefficients_sorted['Feature'], width=coefficients_sorted['Coefficient'])
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.title('Feature Importance (Coefficients)')
plt.gca().invert_yaxis()
plt.show()


SHAP

In [None]:
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Linear SHAP explainer for log_reg_model with the scaled training matrix as
# background data; SHAP values are computed for every training row.
# NOTE(review): log_reg_model and X_train_scaled come from an earlier cell.
explainer = shap.LinearExplainer(log_reg_model, X_train_scaled, feature_names=X_train.columns)
shap_values = explainer.shap_values(X_train_scaled)

# Importance of a feature = median of its absolute SHAP values over all rows.
shap_median_importance = np.median(np.abs(shap_values), axis=0)
shap_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Median SHAP Value': shap_median_importance
}).sort_values(by='Median SHAP Value', ascending=False)

# Horizontal bar chart, most important feature at the top (inverted y axis).
plt.figure(figsize=(10, 8))
plt.barh(shap_importance_df['Feature'], shap_importance_df['Median SHAP Value'])
plt.xlabel('Median SHAP Value')
plt.ylabel('Feature')
plt.title('Feature Importance (Median SHAP Values)')
plt.gca().invert_yaxis()
plt.show()

# SHAP summary (dot) plot. The unscaled X_train is passed as feature values
# while the SHAP values were computed on the scaled matrix.
shap.summary_plot(shap_values, X_train, plot_type='dot', show=True)


In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Features (selected columns without the target) and target of H1_train.
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])
y_train = H1_train['is_canceled']


# Standardise the features (fits a new scaler on the training split).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)


# Boruta feature selection wrapped around a random forest.
# n_estimators='auto' is passed to BorutaPy (presumably it then chooses the
# number of trees itself, overriding the forest's n_estimators - TODO confirm).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
boruta_selector = BorutaPy(rf_model, n_estimators='auto', random_state=42)
boruta_selector.fit(X_train_scaled, y_train)


# Column names where the selector's support_ mask is True.
selected_features = X_train.columns[boruta_selector.support_]
print("Selected Features:", selected_features)

# Ranking of every feature as reported by the selector, best rank first.
feature_ranking = pd.DataFrame({
    'Feature': X_train.columns,
    'Ranking': boruta_selector.ranking_
}).sort_values(by='Ranking')


# Features with ranking 1 (computed here but not displayed in this cell).
selected_feature_ranking = feature_ranking[feature_ranking['Ranking'] == 1]


### different feature combinations

TOP 10

coefficient

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Features (selected columns without the target) and target of H1_train.
feature_columns = [col for col in selected_columns if col != 'is_canceled']
X_train = H1_train[feature_columns]
y_train = H1_train['is_canceled']

# Standardise all features and fit the full logistic regression.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model.fit(X_train_scaled, y_train)

# The ten features with the largest absolute coefficient.
coefficients = pd.DataFrame(
    {'Feature': X_train.columns, 'Coefficient': log_reg_model.coef_[0]}
)
by_magnitude = coefficients['Coefficient'].abs().sort_values(ascending=False).index
top_10_features = coefficients.loc[by_magnitude, 'Feature'].head(10)
print("Top 10 Features:", top_10_features.values)

# Reduced feature matrix; the same scaler object is refitted on it.
X_train_top10 = H1_train[top_10_features]
X_train_top10_scaled = scaler.fit_transform(X_train_top10)

# Logistic regression on the ten features only.
log_reg_model_top10 = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model_top10.fit(X_train_top10_scaled, y_train)

# 3-fold stratified CV: out-of-fold class predictions and probabilities.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(
    log_reg_model_top10, X_train_top10_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(
    log_reg_model_top10, X_train_top10_scaled, y_train, cv=cv, method='predict_proba')[:, 1]

# Confusion matrix as counts and as percentage of each true class (row).
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1, keepdims=True) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Write the row percentage into each cell, next to the count annotation.
for (i, j), share in np.ndenumerate(conf_matrix_percentage):
    percentage_text = f"{share:.1f}%"
    plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# ROC curve of every CV fold (the model is refitted on each fold).
plt.figure(figsize=(12, 8))
fold_count = 1
for train_idx, test_idx in cv.split(X_train_top10_scaled, y_train):
    y_fold_train, y_fold_test = y_train.iloc[train_idx], y_train.iloc[test_idx]
    log_reg_model_top10.fit(X_train_top10_scaled[train_idx], y_fold_train)
    y_pred_proba_fold = log_reg_model_top10.predict_proba(X_train_top10_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_fold_test, y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_fold_test, y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (3-fold CV)')
plt.legend(loc='lower right')
plt.show()


SHAP

In [None]:
import shap
import numpy as np
import pandas as pd


# Linear SHAP explainer for log_reg_model on the scaled training matrix.
# NOTE(review): relies on log_reg_model, X_train_scaled, X_train, y_train and
# scaler left behind by an earlier cell - verify they match before running.
explainer = shap.LinearExplainer(log_reg_model, X_train_scaled, feature_names=X_train.columns)
shap_values = explainer.shap_values(X_train_scaled)


# Importance of a feature = median of its absolute SHAP values over all rows.
shap_median_importance = np.median(np.abs(shap_values), axis=0)


shap_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Median SHAP Value': shap_median_importance
}).sort_values(by='Median SHAP Value', ascending=False)


# The ten features with the highest median |SHAP| value.
top_10_features = shap_importance_df['Feature'].head(10).values
print("Top 10 SHAP Features:", top_10_features)


# Reduced feature matrix; the shared scaler object is refitted on it.
X_train_top10 = X_train[top_10_features]

X_train_top10_scaled = scaler.fit_transform(X_train_top10)


# Logistic regression on the ten features only.
log_reg_model_top10 = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model_top10.fit(X_train_top10_scaled, y_train)


# 3-fold stratified CV: out-of-fold class predictions and probabilities.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv_top10 = cross_val_predict(log_reg_model_top10, X_train_top10_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv_top10 = cross_val_predict(log_reg_model_top10, X_train_top10_scaled, y_train, cv=cv, method='predict_proba')[:, 1]


# Confusion matrix as counts and as percentage of each true class (row).
conf_matrix_top10 = confusion_matrix(y_train, y_pred_cv_top10)
conf_matrix_percentage_top10 = conf_matrix_top10 / conf_matrix_top10.sum(axis=1).reshape(-1, 1) * 100


plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_top10, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Write the row percentage into each cell, next to the count annotation.
for i in range(conf_matrix_top10.shape[0]):
    for j in range(conf_matrix_top10.shape[1]):
        percentage_text = f"{conf_matrix_percentage_top10[i, j]:.1f}%"
        plt.text(j+0.2, i+0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Top 10 SHAP Features)')
plt.show()


print("\nClassification Report (Top 10 SHAP Features):")
print(classification_report(y_train, y_pred_cv_top10))

# ROC curve of every CV fold. The model is refitted inside the loop, so after
# the loop log_reg_model_top10 holds the fit of the last fold.
plt.figure(figsize=(12, 8))
fold_count = 1
for train_idx, test_idx in cv.split(X_train_top10_scaled, y_train):
    log_reg_model_top10.fit(X_train_top10_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold_top10 = log_reg_model_top10.predict_proba(X_train_top10_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold_top10)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold_top10)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (3-fold CV, Top 10 SHAP Features)')
plt.legend(loc='lower right')
plt.show()


Boruta

In [None]:

# Feature subset for the Boruta model.
# NOTE(review): the list is hard-coded - presumably copied from the
# `selected_features` output of the Boruta cell; verify it is still current.
X_train_boruta = X_train[['lead_time', 'country', 'deposit_type', 'agent', 'adr', 'total_of_special_requests']]


# Standardise the subset (refits the shared scaler object on these columns).
X_train_boruta_scaled = scaler.fit_transform(X_train_boruta)


# Logistic regression on the subset.
log_reg_model_boruta = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model_boruta.fit(X_train_boruta_scaled, y_train)


# 3-fold stratified CV: out-of-fold class predictions and probabilities.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv_boruta = cross_val_predict(log_reg_model_boruta, X_train_boruta_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv_boruta = cross_val_predict(log_reg_model_boruta, X_train_boruta_scaled, y_train, cv=cv, method='predict_proba')[:, 1]


# Confusion matrix as counts and as percentage of each true class (row).
conf_matrix_boruta = confusion_matrix(y_train, y_pred_cv_boruta)
conf_matrix_percentage_boruta = conf_matrix_boruta / conf_matrix_boruta.sum(axis=1).reshape(-1, 1) * 100


plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_boruta, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Write the row percentage into each cell, next to the count annotation.
for i in range(conf_matrix_boruta.shape[0]):
    for j in range(conf_matrix_boruta.shape[1]):
        percentage_text = f"{conf_matrix_percentage_boruta[i, j]:.1f}%"
        plt.text(j+0.2, i+0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Boruta Selected Features)')
plt.show()


print("\nClassification Report (Boruta Selected Features):")
print(classification_report(y_train, y_pred_cv_boruta))

# ROC curve of every CV fold. The model is refitted inside the loop, so after
# the loop log_reg_model_boruta holds the fit of the last fold.
plt.figure(figsize=(12, 8))
fold_count = 1
for train_idx, test_idx in cv.split(X_train_boruta_scaled, y_train):
    log_reg_model_boruta.fit(X_train_boruta_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold_boruta = log_reg_model_boruta.predict_proba(X_train_boruta_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold_boruta)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold_boruta)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (3-fold CV, Boruta Selected Features)')
plt.legend(loc='lower right')
plt.show()


### hyperparameter tuning

optuna

all features


In [None]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


# Features (selected columns without the target) and target of H1_train.
feature_columns = [col for col in selected_columns if col != 'is_canceled']
X_train = H1_train[feature_columns]
y_train = H1_train['is_canceled']

# Standardised feature matrix used by the Optuna objective.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)


def objective(trial):
    """Optuna objective: 3-fold cross-validated ROC AUC of a logistic regression.

    Draws ``C`` (log scale, 1e-4 to 1e2), ``penalty`` (l1/l2) and ``solver``
    (liblinear/saga) from the trial and scores the candidate on the
    notebook-level ``X_train_scaled`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC computed from the out-of-fold predicted probabilities.
    """
    # log=True samples C on a log scale (suggest_loguniform is deprecated).
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
    # No pruning guard is needed: both candidate solvers accept l1 and l2.

    candidate = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=2000, random_state=42)

    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_proba = cross_val_predict(candidate, X_train_scaled, y_train, cv=folds, method='predict_proba')[:, 1]
    return roc_auc_score(y_train, oof_proba)


# Seed the sampler so the search is repeatable, matching the random_state=42
# already used for the model and the CV splitter.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)


print("Best Parameters (Optuna):", study.best_params)


# Unfitted estimator carrying the winning hyperparameters.
best_log_reg_model = LogisticRegression(**study.best_params, max_iter=2000, random_state=42)


# One splitter for every evaluation below; the fixed seed gives the same folds on each use.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_log_reg_model, X_train_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(best_log_reg_model, X_train_scaled, y_train, cv=cv, method='predict_proba')[:, 1]


# Counts, plus each count as a percentage of its true-label row.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100


plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Overlay the percentage next to every count.
for i, j in np.ndindex(conf_matrix.shape):
    plt.text(j+0.2, i+0.2, f"{conf_matrix_percentage[i, j]:.1f}%",
             ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages')
plt.show()


print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))


plt.figure(figsize=(12, 8))
# Refit on each training split and draw the ROC curve of the held-out split.
for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_scaled, y_train), start=1):
    best_log_reg_model.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = best_log_reg_model.predict_proba(X_train_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (3-fold CV with Best Parameters)')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Re-display the hyperparameters chosen by the most recent study.
print(f"Best Parameters (Optuna): {study.best_params}")


top10 coefficient features

In [None]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd


# Predictors are every selected column except the target; the target is the label.
X_train = H1_train[selected_columns].drop('is_canceled', axis=1)
y_train = H1_train['is_canceled']


scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)


# Untuned lbfgs model on the standardised predictors; its coefficients rank the features.
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model.fit(X_train_scaled, y_train)

coefficients = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': log_reg_model.coef_[0]
})
# Row labels ordered by descending absolute coefficient; keep the first ten feature names.
abs_rank = coefficients['Coefficient'].abs().sort_values(ascending=False).index
top_10_features = coefficients.loc[abs_rank[:10], 'Feature']
print("Top 10 Features:", top_10_features.values)


# The same scaler object is refitted here, so from now on it describes only the ten kept columns.
X_train_top10 = H1_train[top_10_features]
X_train_top10_scaled = scaler.fit_transform(X_train_top10)


def objective(trial):
    """Optuna objective: 3-fold cross-validated ROC AUC on the top-10 features.

    Draws ``C`` (log scale, 1e-4 to 1e2), ``penalty`` (l1/l2) and ``solver``
    (liblinear/saga) from the trial and scores the candidate on the
    notebook-level ``X_train_top10_scaled`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC computed from the out-of-fold predicted probabilities.
    """
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
    # No pruning guard is needed: both candidate solvers accept l1 and l2.

    log_reg_model_optuna = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=2000, random_state=42)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_proba = cross_val_predict(log_reg_model_optuna, X_train_top10_scaled, y_train, cv=folds, method='predict_proba')[:, 1]
    return roc_auc_score(y_train, oof_proba)


# Seed the sampler so the search is repeatable, matching the random_state=42
# already used for the model and the CV splitter.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)


print("Best Parameters (Optuna):", study.best_params)


# Unfitted estimator carrying the winning hyperparameters.
best_log_reg_model = LogisticRegression(**study.best_params, max_iter=2000, random_state=42)


cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_log_reg_model, X_train_top10_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(best_log_reg_model, X_train_top10_scaled, y_train, cv=cv, method='predict_proba')[:, 1]


# Counts, plus each count as a percentage of its true-label row.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100


plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the percentage next to every count.
for i, j in np.ndindex(conf_matrix.shape):
    plt.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage[i, j]:.1f}%",
             ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages')
plt.show()


print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

plt.figure(figsize=(12, 8))
# Refit on each training split and draw the ROC curve of the held-out split.
for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_top10_scaled, y_train), start=1):
    best_log_reg_model.fit(X_train_top10_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = best_log_reg_model.predict_proba(X_train_top10_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (3-fold CV with Best Parameters)')
plt.legend(loc='lower right')
plt.show()


TOP10 SHAP

In [None]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd


# The ten columns used for the SHAP-based feature subset.
shap_top10_columns = [
    'deposit_type', 'market_segment', 'total_of_special_requests', 'country',
    'lead_time', 'previous_cancellations', 'required_car_parking_spaces', 'adr',
    'stays_in_week_nights', 'stays_in_weekend_nights',
]
X_train = H1_train[shap_top10_columns]
y_train = H1_train['is_canceled']

# Standardise the subset; the fitted scaler stays available as `scaler`.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)


def objective(trial):
    """Optuna objective: 3-fold cross-validated ROC AUC of a logistic regression.

    Draws ``C`` (log scale, 1e-4 to 1e2), ``penalty`` (l1/l2) and ``solver``
    (liblinear/saga) from the trial and scores the candidate on the
    notebook-level ``X_train_scaled`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC computed from the out-of-fold predicted probabilities.
    """
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
    # No pruning guard is needed: both candidate solvers accept l1 and l2.

    log_reg_model_optuna = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=2000, random_state=42)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_proba = cross_val_predict(log_reg_model_optuna, X_train_scaled, y_train, cv=folds, method='predict_proba')[:, 1]
    return roc_auc_score(y_train, oof_proba)


# Seed the sampler so the search is repeatable, matching the random_state=42
# already used for the model and the CV splitter.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("Best Parameters (Optuna):", study.best_params)

# Unfitted estimator carrying the winning hyperparameters.
best_log_reg_model = LogisticRegression(**study.best_params, max_iter=2000, random_state=42)


cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_log_reg_model, X_train_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(best_log_reg_model, X_train_scaled, y_train, cv=cv, method='predict_proba')[:, 1]


# Counts, plus each count as a percentage of its true-label row.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100


plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the percentage next to every count.
for i, j in np.ndindex(conf_matrix.shape):
    plt.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage[i, j]:.1f}%",
             ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Top 10 SHAP Features)')
plt.show()


print("\nClassification Report (Top 10 SHAP Features):")
print(classification_report(y_train, y_pred_cv))


plt.figure(figsize=(12, 8))
# Refit on each training split and draw the ROC curve of the held-out split.
for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_scaled, y_train), start=1):
    best_log_reg_model.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = best_log_reg_model.predict_proba(X_train_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (3-fold CV, Top 10 SHAP Features with Best Parameters)')
plt.legend(loc='lower right')
plt.show()


BORUTA

In [None]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd


# Take the Boruta-selected columns straight from H1_train: the notebook-level
# X_train may hold a different feature subset that lacks some of them
# (e.g. 'agent'), which would raise a KeyError.
X_train_boruta = H1_train[['lead_time', 'country', 'deposit_type', 'agent', 'adr', 'total_of_special_requests']]
y_train = H1_train['is_canceled']


# Standardise the subset; `scaler` now describes only these six columns.
scaler = StandardScaler()
X_train_boruta_scaled = scaler.fit_transform(X_train_boruta)


def objective(trial):
    """Optuna objective: 3-fold cross-validated ROC AUC on the Boruta features.

    Draws ``C`` (log scale, 1e-4 to 1e2), ``penalty`` (l1/l2) and ``solver``
    (liblinear/saga) from the trial and scores the candidate on the
    notebook-level ``X_train_boruta_scaled`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC computed from the out-of-fold predicted probabilities.
    """
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
    # No pruning guard is needed: both candidate solvers accept l1 and l2.

    log_reg_model_optuna = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=2000, random_state=42)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_proba = cross_val_predict(log_reg_model_optuna, X_train_boruta_scaled, y_train, cv=folds, method='predict_proba')[:, 1]
    return roc_auc_score(y_train, oof_proba)


# Seed the sampler so the search is repeatable, matching the random_state=42
# already used for the model and the CV splitter.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)


print("Best Parameters (Optuna):", study.best_params)


# Unfitted estimator carrying the winning hyperparameters.
best_log_reg_model = LogisticRegression(**study.best_params, max_iter=2000, random_state=42)


cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_log_reg_model, X_train_boruta_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(best_log_reg_model, X_train_boruta_scaled, y_train, cv=cv, method='predict_proba')[:, 1]


# Counts, plus each count as a percentage of its true-label row.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100


plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the percentage next to every count.
for i, j in np.ndindex(conf_matrix.shape):
    plt.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage[i, j]:.1f}%",
             ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Boruta Selected Features)')
plt.show()


print("\nClassification Report (Boruta Selected Features):")
print(classification_report(y_train, y_pred_cv))


plt.figure(figsize=(12, 8))
# Refit on each training split and draw the ROC curve of the held-out split.
for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_boruta_scaled, y_train), start=1):
    best_log_reg_model.fit(X_train_boruta_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = best_log_reg_model.predict_proba(X_train_boruta_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (3-fold CV, Boruta Selected Features with Best Parameters)')
plt.legend(loc='lower right')
plt.show()


### test set

baseline model

In [None]:

# Refit the scaler and the untuned baseline on the full H1 feature set here.
# The notebook-level `scaler` is refitted by every feature-subset cell, so
# reusing whatever it last saw can fail on (or mis-scale) the full-feature test matrix.
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])
y_train = H1_train['is_canceled']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Same untuned lbfgs configuration the notebook uses for its coefficient ranking.
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model.fit(X_train_scaled, y_train)

X_test = H1_test[selected_columns].drop(columns=['is_canceled'])
y_test = H1_test['is_canceled']

# Apply the training-set scaling to the test set (no refit on test data).
X_test_scaled = scaler.transform(X_test)


y_pred_test = log_reg_model.predict(X_test_scaled)
y_pred_proba_test = log_reg_model.predict_proba(X_test_scaled)[:, 1]


# Counts, plus each count as a percentage of its true-label row.
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
conf_matrix_percentage_test = conf_matrix_test / conf_matrix_test.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_test, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Overlay the percentage next to every count.
for i, j in np.ndindex(conf_matrix_test.shape):
    plt.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage_test[i, j]:.1f}%",
             ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Test Set)')
plt.show()


print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))


fpr_test, tpr_test, _ = roc_curve(y_test, y_pred_proba_test)
roc_auc_test = roc_auc_score(y_test, y_pred_proba_test)

plt.figure(figsize=(12, 8))
plt.plot(fpr_test, tpr_test, label=f'Logistic Regression (AUC = {roc_auc_test:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Test Set)')
plt.legend(loc='lower right')
plt.show()


baseline with Optuna

In [None]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


# Predictors are every selected column except the target; the target is the label.
X_train = H1_train[selected_columns].drop('is_canceled', axis=1)
y_train = H1_train['is_canceled']


# Standardise the predictors; the fitted scaler is reused later for the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)


def objective(trial):
    """Optuna objective: 3-fold cross-validated ROC AUC of a logistic regression.

    Draws ``C`` (log scale, 1e-4 to 1e2), ``penalty`` (l1/l2) and ``solver``
    (liblinear/saga) from the trial and scores the candidate on the
    notebook-level ``X_train_scaled`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC computed from the out-of-fold predicted probabilities.
    """
    # log=True samples C on a log scale (suggest_loguniform is deprecated).
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
    # No pruning guard is needed: both candidate solvers accept l1 and l2.

    candidate = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=2000, random_state=42)

    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_proba = cross_val_predict(candidate, X_train_scaled, y_train, cv=folds, method='predict_proba')[:, 1]
    return roc_auc_score(y_train, oof_proba)


# Seed the sampler so the search is repeatable, matching the random_state=42
# already used for the model and the CV splitter.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)


print("Best Parameters (Optuna):", study.best_params)


# Final model: winning hyperparameters, fitted on the whole training set.
best_log_reg_model = LogisticRegression(**study.best_params, max_iter=2000, random_state=42)
best_log_reg_model.fit(X_train_scaled, y_train)


X_test = H1_test[selected_columns].drop(columns=['is_canceled'])
y_test = H1_test['is_canceled']


# Apply the training-set scaling to the test set (no refit on test data).
X_test_scaled = scaler.transform(X_test)


y_pred_test = best_log_reg_model.predict(X_test_scaled)
y_pred_proba_test = best_log_reg_model.predict_proba(X_test_scaled)[:, 1]


# Counts, plus each count as a percentage of its true-label row.
conf_matrix = confusion_matrix(y_test, y_pred_test)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100


plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Overlay the percentage next to every count.
for i, j in np.ndindex(conf_matrix.shape):
    plt.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage[i, j]:.1f}%",
             ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Test Set)')
plt.show()


print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))


fpr, tpr, _ = roc_curve(y_test, y_pred_proba_test)
roc_auc = roc_auc_score(y_test, y_pred_proba_test)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Test Set)')
plt.legend(loc='lower right')
plt.show()


top10 coefficients with optuna tuning

In [None]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

# Predictors are every selected column except the target; the target is the label.
X_train = H1_train[selected_columns].drop('is_canceled', axis=1)
y_train = H1_train['is_canceled']


scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)


# Untuned lbfgs model on the standardised predictors; its coefficients rank the features.
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model.fit(X_train_scaled, y_train)

coefficients = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': log_reg_model.coef_[0]
})
# Row labels ordered by descending absolute coefficient; keep the first ten feature names.
abs_rank = coefficients['Coefficient'].abs().sort_values(ascending=False).index
top_10_features = coefficients.loc[abs_rank[:10], 'Feature']
print("Top 10 Features:", top_10_features.values)

# The same scaler object is refitted here, so from now on it describes only the ten kept columns.
X_train_top10 = H1_train[top_10_features]
X_train_top10_scaled = scaler.fit_transform(X_train_top10)


def objective(trial):
    """Optuna objective: 3-fold cross-validated ROC AUC on the top-10 features.

    Draws ``C`` (log scale, 1e-4 to 1e2), ``penalty`` (l1/l2) and ``solver``
    (liblinear/saga) from the trial and scores the candidate on the
    notebook-level ``X_train_top10_scaled`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC computed from the out-of-fold predicted probabilities.
    """
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
    # No pruning guard is needed: both candidate solvers accept l1 and l2.

    log_reg_model_optuna = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=2000, random_state=42)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_proba = cross_val_predict(log_reg_model_optuna, X_train_top10_scaled, y_train, cv=folds, method='predict_proba')[:, 1]
    return roc_auc_score(y_train, oof_proba)


# Seed the sampler so the search is repeatable, matching the random_state=42
# already used for the model and the CV splitter.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)


print("Best Parameters (Optuna):", study.best_params)


# Final model: winning hyperparameters, fitted on the whole top-10 training matrix.
best_log_reg_model = LogisticRegression(**study.best_params, max_iter=2000, random_state=42)
best_log_reg_model.fit(X_train_top10_scaled, y_train)


# Same ten columns from the test set, scaled with the training-set statistics.
X_test_top10 = H1_test[top_10_features]
X_test_top10_scaled = scaler.transform(X_test_top10)
y_test = H1_test['is_canceled']


y_pred_test = best_log_reg_model.predict(X_test_top10_scaled)
y_pred_proba_test = best_log_reg_model.predict_proba(X_test_top10_scaled)[:, 1]


# Counts, plus each count as a percentage of its true-label row.
conf_matrix = confusion_matrix(y_test, y_pred_test)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100


plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the percentage next to every count.
for i, j in np.ndindex(conf_matrix.shape):
    plt.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage[i, j]:.1f}%",
             ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Test Set)')
plt.show()


print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))


fpr, tpr, _ = roc_curve(y_test, y_pred_proba_test)
roc_auc = roc_auc_score(y_test, y_pred_proba_test)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Test Set, Top 10 Features with Best Parameters)')
plt.legend(loc='lower right')
plt.show()


top10 SHAP with optuna tuning

In [None]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd


# The ten columns used for the SHAP-based feature subset.
shap_top10_columns = [
    'deposit_type', 'market_segment', 'total_of_special_requests', 'country',
    'lead_time', 'previous_cancellations', 'required_car_parking_spaces', 'adr',
    'stays_in_week_nights', 'stays_in_weekend_nights',
]
X_train = H1_train[shap_top10_columns]
y_train = H1_train['is_canceled']


# Standardise the subset; the fitted scaler is reused later for the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)


def objective(trial):
    """Optuna objective: 3-fold cross-validated ROC AUC of a logistic regression.

    Draws ``C`` (log scale, 1e-4 to 1e2), ``penalty`` (l1/l2) and ``solver``
    (liblinear/saga) from the trial and scores the candidate on the
    notebook-level ``X_train_scaled`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC computed from the out-of-fold predicted probabilities.
    """
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
    # No pruning guard is needed: both candidate solvers accept l1 and l2.

    log_reg_model_optuna = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=2000, random_state=42)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_proba = cross_val_predict(log_reg_model_optuna, X_train_scaled, y_train, cv=folds, method='predict_proba')[:, 1]
    return roc_auc_score(y_train, oof_proba)


# Seed the sampler so the search is repeatable, matching the random_state=42
# already used for the model and the CV splitter.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)


print("Best Parameters (Optuna):", study.best_params)

# Final model: winning hyperparameters, fitted on the whole training matrix.
best_log_reg_model = LogisticRegression(**study.best_params, max_iter=2000, random_state=42)
best_log_reg_model.fit(X_train_scaled, y_train)

# Same ten columns from the test set, scaled with the training-set statistics.
X_test = H1_test[['deposit_type', 'market_segment', 'total_of_special_requests', 'country',
                  'lead_time', 'previous_cancellations', 'required_car_parking_spaces', 'adr',
                  'stays_in_week_nights', 'stays_in_weekend_nights']]
X_test_scaled = scaler.transform(X_test)
y_test = H1_test['is_canceled']


y_pred_test = best_log_reg_model.predict(X_test_scaled)
y_pred_proba_test = best_log_reg_model.predict_proba(X_test_scaled)[:, 1]


# Counts, plus each count as a percentage of its true-label row.
conf_matrix = confusion_matrix(y_test, y_pred_test)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100


plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the percentage next to every count.
for i, j in np.ndindex(conf_matrix.shape):
    plt.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage[i, j]:.1f}%",
             ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Test Set, Top 10 SHAP Features)')
plt.show()


print("\nClassification Report (Test Set, Top 10 SHAP Features):")
print(classification_report(y_test, y_pred_test))


fpr, tpr, _ = roc_curve(y_test, y_pred_proba_test)
roc_auc = roc_auc_score(y_test, y_pred_proba_test)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Test Set, Top 10 SHAP Features with Best Parameters)')
plt.legend(loc='lower right')
plt.show()


Boruta with optuna tuning

In [None]:

# Self-contained Boruta evaluation. The scaler and the hyperparameter search are
# rebuilt on the Boruta columns here: the notebook-level `scaler` / `study` are
# overwritten by every feature-subset cell and may belong to a different subset.
boruta_columns = ['lead_time', 'country', 'deposit_type', 'agent', 'adr', 'total_of_special_requests']

X_train_boruta = H1_train[boruta_columns]
y_train = H1_train['is_canceled']
X_test_boruta = H1_test[boruta_columns]
y_test = H1_test['is_canceled']

# Fit the scaling on the training rows only, then apply it to the test rows.
scaler = StandardScaler()
X_train_boruta_scaled = scaler.fit_transform(X_train_boruta)
X_test_boruta_scaled = scaler.transform(X_test_boruta)


def objective(trial):
    """Return the 3-fold cross-validated ROC AUC for one sampled configuration.

    Scores a logistic regression on ``X_train_boruta_scaled`` / ``y_train``.
    """
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    candidate = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=2000, random_state=42)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_proba = cross_val_predict(candidate, X_train_boruta_scaled, y_train, cv=folds, method='predict_proba')[:, 1]
    return roc_auc_score(y_train, oof_proba)


# Seeded sampler keeps the search repeatable.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)
print("Best Parameters (Optuna):", study.best_params)

best_log_reg_model = LogisticRegression(**study.best_params, max_iter=2000, random_state=42)
best_log_reg_model.fit(X_train_boruta_scaled, y_train)


y_pred_test = best_log_reg_model.predict(X_test_boruta_scaled)
y_pred_proba_test = best_log_reg_model.predict_proba(X_test_boruta_scaled)[:, 1]


# Counts, plus each count as a percentage of its true-label row.
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
conf_matrix_percentage_test = conf_matrix_test / conf_matrix_test.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_test, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the percentage next to every count.
for i, j in np.ndindex(conf_matrix_test.shape):
    plt.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage_test[i, j]:.1f}%",
             ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Boruta Selected Features on Test Set)')
plt.show()


print("\nClassification Report (Boruta Selected Features on Test Set):")
print(classification_report(y_test, y_pred_test))


fpr, tpr, _ = roc_curve(y_test, y_pred_proba_test)
roc_auc = roc_auc_score(y_test, y_pred_proba_test)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Test Set AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Test Set, Boruta Selected Features with Best Parameters)')
plt.legend(loc='lower right')
plt.show()


### cross-dataset evaluation: H2 test set

baseline model

In [None]:

# Cross-dataset check: fit the untuned baseline on H1 and score it on H2.
# Scaler and model are (re)fitted here on the full H1 feature set so the cell
# does not depend on whichever feature subset an earlier cell last fitted.
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])
y_train = H1_train['is_canceled']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Same untuned lbfgs configuration the notebook uses for its coefficient ranking.
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model.fit(X_train_scaled, y_train)

X_test_H2 = H2_test[selected_columns].drop(columns=['is_canceled'])
y_test_H2 = H2_test['is_canceled']


# Apply the H1 training-set scaling to H2 (no refit on H2 data).
X_test_H2_scaled = scaler.transform(X_test_H2)


y_pred_test_H2 = log_reg_model.predict(X_test_H2_scaled)
y_pred_proba_test_H2 = log_reg_model.predict_proba(X_test_H2_scaled)[:, 1]


# Counts, plus each count as a percentage of its true-label row.
conf_matrix_test_H2 = confusion_matrix(y_test_H2, y_pred_test_H2)
conf_matrix_percentage_test_H2 = conf_matrix_test_H2 / conf_matrix_test_H2.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_test_H2, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})


# Overlay the percentage next to every count.
for i, j in np.ndindex(conf_matrix_test_H2.shape):
    plt.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage_test_H2[i, j]:.1f}%",
             ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (H2 Test Set)')
plt.show()


print("\nClassification Report (H2 Test Set):")
print(classification_report(y_test_H2, y_pred_test_H2))


fpr_test_H2, tpr_test_H2, _ = roc_curve(y_test_H2, y_pred_proba_test_H2)
roc_auc_test_H2 = roc_auc_score(y_test_H2, y_pred_proba_test_H2)

plt.figure(figsize=(12, 8))
plt.plot(fpr_test_H2, tpr_test_H2, label=f'Logistic Regression (AUC = {roc_auc_test_H2:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (H2 Test Set)')
plt.legend(loc='lower right')
plt.show()


baseline with optuna tuning

In [None]:

# Self-contained H2 evaluation of the tuned all-features model. Scaler, search
# and final model are rebuilt on the full H1 feature set: the notebook-level
# `scaler` / `best_log_reg_model` are overwritten by every feature-subset cell
# and may have been fitted on different columns.
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])
y_train = H1_train['is_canceled']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)


def objective(trial):
    """Return the 3-fold cross-validated ROC AUC for one sampled configuration.

    Scores a logistic regression on ``X_train_scaled`` / ``y_train``.
    """
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    candidate = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=2000, random_state=42)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_proba = cross_val_predict(candidate, X_train_scaled, y_train, cv=folds, method='predict_proba')[:, 1]
    return roc_auc_score(y_train, oof_proba)


# Seeded sampler keeps the search repeatable.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)
print("Best Parameters (Optuna):", study.best_params)

best_log_reg_model = LogisticRegression(**study.best_params, max_iter=2000, random_state=42)
best_log_reg_model.fit(X_train_scaled, y_train)

X_test_H2 = H2_test[selected_columns].drop(columns=['is_canceled'])
y_test_H2 = H2_test['is_canceled']

# Apply the H1 training-set scaling to H2 (no refit on H2 data).
X_test_H2_scaled = scaler.transform(X_test_H2)


y_pred_test_H2 = best_log_reg_model.predict(X_test_H2_scaled)
y_pred_proba_test_H2 = best_log_reg_model.predict_proba(X_test_H2_scaled)[:, 1]


# Counts, plus each count as a percentage of its true-label row.
conf_matrix_H2 = confusion_matrix(y_test_H2, y_pred_test_H2)
conf_matrix_percentage_H2 = conf_matrix_H2 / conf_matrix_H2.sum(axis=1).reshape(-1, 1) * 100


plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_H2, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})


# Overlay the percentage next to every count.
for i, j in np.ndindex(conf_matrix_H2.shape):
    plt.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage_H2[i, j]:.1f}%",
             ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (H2 Test Set)')
plt.show()


print("\nClassification Report (H2 Test Set):")
print(classification_report(y_test_H2, y_pred_test_H2))


fpr_H2, tpr_H2, _ = roc_curve(y_test_H2, y_pred_proba_test_H2)
roc_auc_H2 = roc_auc_score(y_test_H2, y_pred_proba_test_H2)

plt.figure(figsize=(8, 6))
plt.plot(fpr_H2, tpr_H2, label=f'AUC = {roc_auc_H2:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (H2 Test Set)')
plt.legend(loc='lower right')
plt.show()


top10 coefficients with optuna tuning

In [None]:

# Self-contained H2 evaluation of the tuned top-10-coefficient model. Scaler,
# search and final model are rebuilt on H1's `top_10_features` columns: the
# notebook-level `scaler` / `best_log_reg_model` are overwritten by every
# feature-subset cell and may have been fitted on different columns.
X_train_top10 = H1_train[top_10_features]
y_train = H1_train['is_canceled']

scaler = StandardScaler()
X_train_top10_scaled = scaler.fit_transform(X_train_top10)


def objective(trial):
    """Return the 3-fold cross-validated ROC AUC for one sampled configuration.

    Scores a logistic regression on ``X_train_top10_scaled`` / ``y_train``.
    """
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    candidate = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=2000, random_state=42)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_proba = cross_val_predict(candidate, X_train_top10_scaled, y_train, cv=folds, method='predict_proba')[:, 1]
    return roc_auc_score(y_train, oof_proba)


# Seeded sampler keeps the search repeatable.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)
print("Best Parameters (Optuna):", study.best_params)

best_log_reg_model = LogisticRegression(**study.best_params, max_iter=2000, random_state=42)
best_log_reg_model.fit(X_train_top10_scaled, y_train)

X_test_H2 = H2_test[top_10_features]
y_test_H2 = H2_test['is_canceled']


# Apply the H1 training-set scaling to H2 (no refit on H2 data).
X_test_H2_scaled = scaler.transform(X_test_H2)


y_pred_test_H2 = best_log_reg_model.predict(X_test_H2_scaled)
y_pred_proba_test_H2 = best_log_reg_model.predict_proba(X_test_H2_scaled)[:, 1]


# Counts, plus each count as a percentage of its true-label row.
conf_matrix_H2 = confusion_matrix(y_test_H2, y_pred_test_H2)
conf_matrix_percentage_H2 = conf_matrix_H2 / conf_matrix_H2.sum(axis=1).reshape(-1, 1) * 100


plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_H2, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the percentage next to every count.
for i, j in np.ndindex(conf_matrix_H2.shape):
    plt.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage_H2[i, j]:.1f}%",
             ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (H2 Test Set)')
plt.show()

print("\nClassification Report (H2 Test Set):")
print(classification_report(y_test_H2, y_pred_test_H2))


fpr_H2, tpr_H2, _ = roc_curve(y_test_H2, y_pred_proba_test_H2)
roc_auc_H2 = roc_auc_score(y_test_H2, y_pred_proba_test_H2)

plt.figure(figsize=(8, 6))
plt.plot(fpr_H2, tpr_H2, label=f'AUC = {roc_auc_H2:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (H2 Test Set, Top 10 Features with Best Parameters)')
plt.legend(loc='lower right')
plt.show()


top10 SHAP with optuna tuning

In [None]:

# Evaluate the tuned logistic-regression model on `H2_test` using the ten
# hard-coded columns below (labelled "Top 10 SHAP Features" in the plots).
# NOTE(review): `scaler` and `best_log_reg_model` are read from notebook state
# that is not created in this cell — confirm both were fitted on exactly these
# columns, in this order.
shap_feature_names = [
    'deposit_type', 'market_segment', 'total_of_special_requests', 'country',
    'lead_time', 'previous_cancellations', 'required_car_parking_spaces', 'adr',
    'stays_in_week_nights', 'stays_in_weekend_nights',
]
y_test_H2 = H2_test['is_canceled']
X_test_H2 = H2_test[shap_feature_names]

# Transform only: the scaler is reused as-is, never refitted on the test rows.
X_test_H2_scaled = scaler.transform(X_test_H2)

# Hard class labels and the second probability column (index 1).
y_pred_test_H2 = best_log_reg_model.predict(X_test_H2_scaled)
y_pred_proba_test_H2 = best_log_reg_model.predict_proba(X_test_H2_scaled)[:, 1]

# Confusion matrix as raw counts and as row-wise percentages.
conf_matrix_H2 = confusion_matrix(y_test_H2, y_pred_test_H2)
row_totals_H2 = conf_matrix_H2.sum(axis=1, keepdims=True)
conf_matrix_percentage_H2 = conf_matrix_H2 / row_totals_H2 * 100

# Heatmap of the counts with the percentage written into every cell.
_, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix_H2, annot=True, fmt='d', cmap='Blues', cbar=False,
            annot_kws={"size": 14}, ax=cm_ax)
for (row, col), pct in np.ndenumerate(conf_matrix_percentage_H2):
    cm_ax.text(col + 0.2, row + 0.2, f"{pct:.1f}%",
               ha='center', va='center', color='green', fontsize=15)

cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Confusion Matrix with Counts and Percentages (H2 Test Set, Top 10 SHAP Features)')
plt.show()

print("\nClassification Report (H2 Test Set, Top 10 SHAP Features):")
print(classification_report(y_test_H2, y_pred_test_H2))

# ROC curve and AUC from the predicted probabilities.
fpr_H2, tpr_H2, _ = roc_curve(y_test_H2, y_pred_proba_test_H2)
roc_auc_H2 = roc_auc_score(y_test_H2, y_pred_proba_test_H2)

_, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr_H2, tpr_H2, label=f'AUC = {roc_auc_H2:.2f}')
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve (H2 Test Set, Top 10 SHAP Features with Best Parameters)')
roc_ax.legend(loc='lower right')
plt.show()


Boruta

In [None]:
# Build the test features/target from `H2_test` using the six hard-coded
# columns below (labelled "Boruta Selected Features" in the plots).
# NOTE(review): `scaler` and `best_log_reg_model` are read from notebook state
# that is not created in this cell — confirm both were fitted on exactly these
# six columns.
boruta_feature_names = ['lead_time', 'country', 'deposit_type', 'agent', 'adr',
                        'total_of_special_requests']
y_test_H2 = H2_test['is_canceled']
X_test_H2_boruta = H2_test[boruta_feature_names]

# Standardise with the already-fitted scaler (transform only, no refit).
X_test_H2_boruta_scaled = scaler.transform(X_test_H2_boruta)

# Predict labels and the second probability column (index 1).
y_pred_test_H2 = best_log_reg_model.predict(X_test_H2_boruta_scaled)
y_pred_proba_test_H2 = best_log_reg_model.predict_proba(X_test_H2_boruta_scaled)[:, 1]

# Confusion matrix as raw counts and as row-wise percentages.
conf_matrix_H2 = confusion_matrix(y_test_H2, y_pred_test_H2)
row_totals_H2 = conf_matrix_H2.sum(axis=1, keepdims=True)
conf_matrix_percentage_H2 = conf_matrix_H2 / row_totals_H2 * 100

# Heatmap of the counts with the percentage written into every cell.
_, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix_H2, annot=True, fmt='d', cmap='Blues', cbar=False,
            annot_kws={"size": 14}, ax=cm_ax)
for (row, col), pct in np.ndenumerate(conf_matrix_percentage_H2):
    cm_ax.text(col + 0.2, row + 0.2, f"{pct:.1f}%",
               ha='center', va='center', color='green', fontsize=15)

cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Confusion Matrix with Counts and Percentages (H2 Test Set, Boruta Selected Features)')
plt.show()

# Per-class precision / recall / F1 on the test rows.
print("\nClassification Report (H2 Test Set, Boruta Selected Features):")
print(classification_report(y_test_H2, y_pred_test_H2))

# ROC curve and AUC from the predicted probabilities.
fpr_H2, tpr_H2, _ = roc_curve(y_test_H2, y_pred_proba_test_H2)
roc_auc_H2 = roc_auc_score(y_test_H2, y_pred_proba_test_H2)

_, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr_H2, tpr_H2, label=f'AUC = {roc_auc_H2:.2f}')
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve (H2 Test Set, Boruta Selected Features with Best Parameters)')
roc_ax.legend(loc='lower right')
plt.show()


## 2. Random Forest





### baseline model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Features and target come from `H1_train`; the label column is removed from
# the feature frame.
y_train = H1_train['is_canceled']
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])

# Standardise the features. This step is kept so the pipeline matches the
# other models in the notebook.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Baseline forest: 100 trees, fixed seed.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Out-of-fold predictions from a shuffled, stratified 3-fold split.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(rf_model, X_train_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(
    rf_model, X_train_scaled, y_train, cv=cv, method='predict_proba'
)[:, 1]

# Confusion matrix as raw counts and as row-wise percentages.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_percentage = conf_matrix / row_totals * 100

# Heatmap of the counts; the percentage is written into every cell at a 0.2
# offset from the cell's (column, row) origin.
_, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            annot_kws={"size": 14}, ax=cm_ax)
for (row, col), pct in np.ndenumerate(conf_matrix_percentage):
    cm_ax.text(col + 0.2, row + 0.2, f"{pct:.1f}%",
               ha='center', va='center', color='green', fontsize=15)

cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Confusion Matrix with Counts and Percentages (Random Forest)')
plt.show()

# Classification report on the out-of-fold predictions.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# One ROC curve per fold: refit on the fold's training rows, score its
# held-out rows. After the loop `rf_model` holds the fit from the last fold.
_, roc_ax = plt.subplots(figsize=(12, 8))
for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_scaled, y_train), start=1):
    rf_model.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
    y_fold_true = y_train.iloc[test_idx]
    y_pred_proba_fold = rf_model.predict_proba(X_train_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_fold_true, y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_fold_true, y_pred_proba_fold)
    roc_ax.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve (Random Forest, 3-fold CV)')
roc_ax.legend(loc='lower right')
plt.show()


feature importance

In [None]:
# Compute and display the forest's impurity-based feature importances.
# The baseline cell leaves `rf_model` fitted on only the last cross-validation
# fold's training split, so refit on the complete training set first; the
# Boruta importance cell further down does the same before reading
# `feature_importances_`.
# NOTE(review): assumes `X_train_scaled` / `y_train` are still the arrays the
# baseline cell created (same columns as `X_train`) — confirm cell order.
rf_model.fit(X_train_scaled, y_train)
feature_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Horizontal bar chart, most important feature at the top.
plt.figure(figsize=(10, 8))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance (Random Forest)')
plt.gca().invert_yaxis()
plt.show()

# Print the same ranking as a table.
print("Feature Importances:")
print(feature_importance_df)


SHAP

In [None]:
import shap

In [None]:
# Debug aid: print the shape of the current `shap_values` object.
# NOTE(review): in the visible part of the notebook `shap_values` is first
# assigned in a later cell — confirm it exists when this cell is run.
print(shap_values.shape)

In [None]:
# Debug aid: print the shape of `X_sample`.
# NOTE(review): `X_sample` is not assigned anywhere in the visible part of the
# notebook — confirm where it comes from or remove this cell.
print(X_sample.shape)

In [None]:
# Debug aid: compare the feature frame's shape with the shape of the data
# handed to SHAP.
# NOTE(review): in the visible part of the notebook `sample_data` is first
# assigned in a later cell — confirm it exists when this cell is run.
print("X_train shape:", X_train.shape)         # shape of the original training features
print("Sample data shape:", sample_data.shape)  # shape of the data passed to SHAP


In [None]:
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from tqdm import tqdm  # 进度条库

# Step 1: rank features by the forest's impurity-based importances and keep
# the ten highest-ranked names for reporting.
feature_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

top_features = feature_importance_df['Feature'].head(10).tolist()
print("Top 10 important features:", top_features)

# Step 2: draw the rows to explain and the background rows from the SCALED
# matrix with ALL columns. The explainer must see the same representation the
# forest was trained on; feeding it only the ten selected (unscaled) columns
# gives a feature count that does not match the model.
# NOTE(review): assumes `rf_model` was fitted on `X_train_scaled`, whose
# columns are in the same order as `X_train.columns` — confirm cell order.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
explain_rows = X_train_scaled_df.sample(n=min(500, len(X_train_scaled_df)), random_state=42)
background_rows = X_train_scaled_df.sample(n=min(200, len(X_train_scaled_df)), random_state=42)
# Top-10 view of the explained rows; used for plotting only.
sample_data = explain_rows[top_features]

# Step 3: build the explainer on a 200-row background sample (not the full
# training set) and compute SHAP values for every feature.
start_time = time.time()
print("Initializing SHAP TreeExplainer...")

# NOTE(review): the original passed approximate=True to the constructor. The
# per-call switch is the `approximate` argument of shap_values(); it is left at
# its default here, so exact values are computed. Pass approximate=True to
# shap_values() if runtime is a problem.
explainer = shap.TreeExplainer(rf_model, data=background_rows)
shap_values = explainer.shap_values(explain_rows)

# SHAP may return either a list with one array per class or a single
# (rows, features, classes) array; reduce both to the class-1 matrix.
if isinstance(shap_values, list):
    class1_shap = np.asarray(shap_values[1])
else:
    class1_shap = np.asarray(shap_values)
    if class1_shap.ndim == 3:
        class1_shap = class1_shap[:, :, 1]

# Keep only the columns that belong to the ten selected features.
all_columns = list(X_train.columns)
top_positions = [all_columns.index(name) for name in top_features]
class1_shap_top = class1_shap[:, top_positions]

# Step 4: median absolute SHAP value per selected feature, plotted as bars.
print("Calculating Median SHAP values...")
shap_median_importance = np.median(np.abs(class1_shap_top), axis=0)
shap_importance_df = pd.DataFrame({
    'Feature': top_features,
    'Median SHAP Value': shap_median_importance
}).sort_values(by='Median SHAP Value', ascending=False)

plt.figure(figsize=(10, 8))
plt.barh(shap_importance_df['Feature'], shap_importance_df['Median SHAP Value'])
plt.xlabel('Median SHAP Value')
plt.ylabel('Feature')
plt.title('Top 10 Feature Importance (Median SHAP Values) for Random Forest')
plt.gca().invert_yaxis()
plt.show()

# Step 5: SHAP summary (beeswarm) plot for the same ten features; the feature
# values shown are the standardised ones.
print("Generating SHAP summary plot...")
shap.summary_plot(class1_shap_top, sample_data, plot_type='dot', show=True)

# Report the elapsed wall-clock time.
end_time = time.time()
print(f"Execution completed in {end_time - start_time:.2f} seconds.")



In [None]:
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.ensemble import RandomForestClassifier

# Rank features by the existing forest's impurity-based importances and take
# the ten highest-ranked names.
feature_importances = rf_model.feature_importances_
importance_table = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
})
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)

top_features = feature_importance_df['Feature'].head(10).tolist()
print("Top 10 important features:", top_features)

# Train a smaller forest (50 trees, depth 5) on just those ten columns of the
# unscaled `X_train`.
X_train_reduced = X_train[top_features]
rf_model_reduced = RandomForestClassifier(random_state=42, max_depth=5, n_estimators=50)
rf_model_reduced.fit(X_train_reduced, y_train)

# 200 rows to explain.
sample_data = X_train_reduced.sample(n=200, random_state=42)

# KernelExplainer on the reduced forest's `predict` output, with a 10-centre
# k-means summary of the sample as background.
start_time = time.time()
print("Initializing SHAP KernelExplainer...")
background_summary = shap.kmeans(sample_data, 10)
explainer = shap.KernelExplainer(rf_model_reduced.predict, background_summary)
shap_values = explainer.shap_values(sample_data)

# Median absolute SHAP value per feature, plotted as horizontal bars.
print("Calculating Median SHAP values...")
shap_median_importance = np.median(np.abs(shap_values), axis=0)
shap_importance_df = pd.DataFrame({
    'Feature': top_features,
    'Median SHAP Value': shap_median_importance
}).sort_values(by='Median SHAP Value', ascending=False)

shap_fig, shap_ax = plt.subplots(figsize=(10, 8))
shap_ax.barh(shap_importance_df['Feature'], shap_importance_df['Median SHAP Value'])
shap_ax.set_xlabel('Median SHAP Value')
shap_ax.set_ylabel('Feature')
shap_ax.set_title('Top 10 Feature Importance (Median SHAP Values) for Reduced Random Forest')
shap_ax.invert_yaxis()
plt.show()

# SHAP summary (beeswarm) plot.
print("Generating SHAP summary plot...")
shap.summary_plot(shap_values, sample_data, plot_type='dot', show=True)

# Report the elapsed wall-clock time.
end_time = time.time()
print(f"Execution completed in {end_time - start_time:.2f} seconds.")



Boruta

boruta 1114

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Features and target come from `H1_train`; the label column is removed from
# the feature frame.
y_train = H1_train['is_canceled']
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])

# Standardise all candidate features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Run Boruta with a 100-tree forest as the underlying estimator.
rf_boruta = RandomForestClassifier(random_state=42, n_estimators=100)
boruta_selector = BorutaPy(rf_boruta, n_estimators='auto', random_state=42)
boruta_selector.fit(X_train_scaled, y_train)

# Columns flagged by the selector's `support_` mask.
selected_features = X_train.columns[boruta_selector.support_]
print("Selected features by Boruta:", selected_features)

# Restrict the training data to those columns. Note this REFITS `scaler`, so
# from here on it only knows the Boruta-selected columns.
X_train_boruta = X_train[selected_features]
X_train_boruta_scaled = scaler.fit_transform(X_train_boruta)

# Fresh forest on the reduced feature set, scored with out-of-fold predictions
# from a shuffled, stratified 3-fold split.
rf_model_boruta = RandomForestClassifier(random_state=42, n_estimators=100)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv_boruta = cross_val_predict(
    rf_model_boruta, X_train_boruta_scaled, y_train, cv=cv, method='predict'
)
y_pred_proba_cv_boruta = cross_val_predict(
    rf_model_boruta, X_train_boruta_scaled, y_train, cv=cv, method='predict_proba'
)[:, 1]

# Confusion matrix as raw counts and as row-wise percentages.
conf_matrix_boruta = confusion_matrix(y_train, y_pred_cv_boruta)
row_totals_boruta = conf_matrix_boruta.sum(axis=1, keepdims=True)
conf_matrix_percentage_boruta = conf_matrix_boruta / row_totals_boruta * 100

# Heatmap of the counts; the percentage is written into every cell at a 0.2
# offset from the cell's (column, row) origin.
_, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix_boruta, annot=True, fmt='d', cmap='Blues', cbar=False,
            annot_kws={"size": 14}, ax=cm_ax)
for (row, col), pct in np.ndenumerate(conf_matrix_percentage_boruta):
    cm_ax.text(col + 0.2, row + 0.2, f"{pct:.1f}%",
               ha='center', va='center', color='green', fontsize=15)

cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Confusion Matrix with Counts and Percentages (Random Forest with Boruta Features)')
plt.show()

# Classification report on the out-of-fold predictions.
print("\nClassification Report (Random Forest with Boruta Features):")
print(classification_report(y_train, y_pred_cv_boruta))

# One ROC curve per fold: refit on the fold's training rows, score its
# held-out rows.
_, roc_ax = plt.subplots(figsize=(12, 8))
for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_boruta_scaled, y_train), start=1):
    rf_model_boruta.fit(X_train_boruta_scaled[train_idx], y_train.iloc[train_idx])
    y_fold_true = y_train.iloc[test_idx]
    y_pred_proba_fold = rf_model_boruta.predict_proba(X_train_boruta_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_fold_true, y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_fold_true, y_pred_proba_fold)
    roc_ax.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve (Random Forest with Boruta Features, 3-fold CV)')
roc_ax.legend(loc='lower right')
plt.show()


In [None]:
# Bar chart of the impurity-based importances of the Boruta-selected features.
# The forest is refitted on the complete (scaled) training set first so the
# importances are not those of the last cross-validation fold.
rf_model_boruta.fit(X_train_boruta_scaled, y_train)
boruta_importances = rf_model_boruta.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': selected_features,
    'Importance': boruta_importances
}).sort_values(by='Importance', ascending=False)

# Horizontal bars, most important feature at the top.
importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
importance_ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
importance_ax.set_title('Feature Importance for Boruta-selected Features (Random Forest)')
importance_ax.invert_yaxis()
plt.show()

### different feature combinations

TOP10

feature_importance

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Rank features by the existing forest's impurity-based importances.
# NOTE(review): `rf_model` and `X_train` are whatever earlier cells left
# behind — confirm the forest was fitted on exactly `X_train.columns`.
feature_importances = rf_model.feature_importances_
importance_table = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
})
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)

# Names of the ten highest-ranked features.
top_features = feature_importance_df['Feature'].head(10).tolist()
print("Top 10 important features based on feature_importance_:", top_features)

# Horizontal bar chart of all importances, most important at the top.
importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
importance_ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
importance_ax.set_title('Feature Importance (Random Forest)')
importance_ax.invert_yaxis()
plt.show()

# Print the same ranking as a table.
print("Feature Importances:")
print(feature_importance_df)

# Rebuild the training data from the ten selected columns only.
y_train = H1_train['is_canceled']
X_train_top10 = H1_train[top_features]

# Standardise; this replaces the notebook-wide `scaler` and `X_train_scaled`.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_top10)

# 100-tree forest on the reduced feature set.
rf_model_top10 = RandomForestClassifier(random_state=42, n_estimators=100)

# Out-of-fold predictions from a shuffled, stratified 3-fold split.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(rf_model_top10, X_train_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(
    rf_model_top10, X_train_scaled, y_train, cv=cv, method='predict_proba'
)[:, 1]

# Confusion matrix as raw counts and as row-wise percentages.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_percentage = conf_matrix / row_totals * 100

# Heatmap of the counts; the percentage is written into every cell at a 0.2
# offset from the cell's (column, row) origin.
_, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            annot_kws={"size": 14}, ax=cm_ax)
for (row, col), pct in np.ndenumerate(conf_matrix_percentage):
    cm_ax.text(col + 0.2, row + 0.2, f"{pct:.1f}%",
               ha='center', va='center', color='green', fontsize=15)

cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Confusion Matrix with Counts and Percentages (Random Forest with Top 10 Features)')
plt.show()

# Classification report on the out-of-fold predictions.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# One ROC curve per fold: refit on the fold's training rows, score its
# held-out rows.
_, roc_ax = plt.subplots(figsize=(12, 8))
for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_scaled, y_train), start=1):
    rf_model_top10.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
    y_fold_true = y_train.iloc[test_idx]
    y_pred_proba_fold = rf_model_top10.predict_proba(X_train_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_fold_true, y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_fold_true, y_pred_proba_fold)
    roc_ax.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve (Random Forest with Top 10 Features, 3-fold CV)')
roc_ax.legend(loc='lower right')
plt.show()


SHAP

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# The ten features chosen via SHAP. They are stored under a cell-specific name
# on purpose: other cells build their data with
# `H1_train[selected_columns].drop(columns=['is_canceled'])`, which raises
# KeyError once `selected_columns` is overwritten with a list that has no
# 'is_canceled' entry.
shap_selected_columns = ['lead_time', 'country', 'deposit_type', 'adr',
                         'total_of_special_requests', 'agent', 'arrival_date_month',
                         'stays_in_week_nights', 'market_segment', 'previous_cancellations']

# Features and target from `H1_train`.
X_train = H1_train[shap_selected_columns]
y_train = H1_train['is_canceled']

# Standardise; this replaces the notebook-wide `scaler` and `X_train_scaled`.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# 100-tree forest with a fixed seed.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Out-of-fold predictions from a shuffled, stratified 3-fold split.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(rf_model, X_train_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(rf_model, X_train_scaled, y_train, cv=cv, method='predict_proba')[:, 1]

# Confusion matrix as raw counts and as row-wise percentages.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

# Heatmap of the counts.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Write the percentage into every cell at a 0.2 offset from the cell's
# (column, row) origin.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Random Forest)')
plt.show()

# Classification report on the out-of-fold predictions.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# One ROC curve per fold: refit on the fold's training rows, score its
# held-out rows. After the loop `rf_model` holds the fit from the last fold.
plt.figure(figsize=(12, 8))
fold_count = 1
for train_idx, test_idx in cv.split(X_train_scaled, y_train):
    rf_model.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = rf_model.predict_proba(X_train_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Random Forest, 3-fold CV)')
plt.legend(loc='lower right')
plt.show()


Boruta

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# The six features selected by Boruta. They are stored under a cell-specific
# name on purpose: other cells build their data with
# `H1_train[selected_columns].drop(columns=['is_canceled'])`, which raises
# KeyError once `selected_columns` is overwritten with a list that has no
# 'is_canceled' entry.
boruta_selected_columns = ['lead_time', 'country', 'deposit_type', 'agent', 'adr',
                           'total_of_special_requests']

# Features and target from `H1_train`.
X_train = H1_train[boruta_selected_columns]
y_train = H1_train['is_canceled']

# Standardise; this step is kept so the pipeline matches the other models.
# It replaces the notebook-wide `scaler` and `X_train_scaled`.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# 100-tree forest with a fixed seed.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Out-of-fold predictions from a shuffled, stratified 3-fold split.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(rf_model, X_train_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(rf_model, X_train_scaled, y_train, cv=cv, method='predict_proba')[:, 1]

# Confusion matrix as raw counts and as row-wise percentages.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

# Heatmap of the counts.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Write the percentage into every cell at a 0.2 offset from the cell's
# (column, row) origin.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Random Forest)')
plt.show()

# Classification report on the out-of-fold predictions.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# One ROC curve per fold: refit on the fold's training rows, score its
# held-out rows. After the loop `rf_model` holds the fit from the last fold.
plt.figure(figsize=(12, 8))
fold_count = 1
for train_idx, test_idx in cv.split(X_train_scaled, y_train):
    rf_model.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = rf_model.predict_proba(X_train_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Random Forest, 3-fold CV)')
plt.legend(loc='lower right')
plt.show()


### hyperparameter tuning

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def objective(trial):
    """Optuna objective: mean per-fold ROC-AUC of a random forest.

    Samples one hyperparameter configuration from `trial`, fits a
    RandomForestClassifier on each training split of a shuffled, stratified
    3-fold partition of the notebook-level `X_train_scaled` / `y_train`, and
    scores the held-out split with ROC-AUC.

    Args:
        trial: the Optuna trial used to draw the hyperparameters.

    Returns:
        The mean of the three fold-level ROC-AUC values.
    """
    # Search space; suggestions are drawn in this order.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None])
    }

    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    forest = RandomForestClassifier(**search_space, random_state=42)

    def fold_auc(train_idx, val_idx):
        # Refit the same estimator on this fold, then score its held-out rows.
        forest.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
        val_proba = forest.predict_proba(X_train_scaled[val_idx])[:, 1]
        return roc_auc_score(y_train.iloc[val_idx], val_proba)

    fold_scores = [
        fold_auc(train_idx, val_idx)
        for train_idx, val_idx in splitter.split(X_train_scaled, y_train)
    ]
    return np.mean(fold_scores)

# Prepare data.
# `selected_columns` may or may not contain the target, depending on which
# earlier cell assigned it last. Exclude the target explicitly instead of
# calling drop(), which raises KeyError when the label is absent.
feature_columns = [col for col in selected_columns if col != 'is_canceled']
X_train = H1_train[feature_columns]
y_train = H1_train['is_canceled']

# Scale features (replaces the notebook-wide `scaler` and `X_train_scaled`).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Run the Optuna study: 50 trials, maximising the objective's mean ROC-AUC.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Report the best configuration found.
best_params = study.best_params
print("\nBest parameters:", best_params)
print(f"Best ROC-AUC score: {study.best_value:.4f}")

# Forest with the best parameters.
best_rf = RandomForestClassifier(**best_params, random_state=42)

# Out-of-fold predictions from a shuffled, stratified 3-fold split.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_rf, X_train_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(best_rf, X_train_scaled, y_train, cv=cv, method='predict_proba')[:, 1]

# Optimization history. The Optuna matplotlib helpers create their own figure
# and return its Axes, so a preceding plt.figure() would only leave an empty
# extra figure; size and title the returned Axes instead.
history_ax = optuna.visualization.matplotlib.plot_optimization_history(study)
history_ax.figure.set_size_inches(10, 6)
history_ax.set_title('Optimization History')
plt.show()

# Hyperparameter importances (same handling as above).
importance_ax = optuna.visualization.matplotlib.plot_param_importances(study)
importance_ax.figure.set_size_inches(10, 6)
importance_ax.set_title('Parameter Importances')
plt.show()

# Confusion matrix as raw counts and as row-wise percentages.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Write the percentage into every cell at a 0.2 offset from the cell's
# (column, row) origin.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Optimized Random Forest)')
plt.show()

# Classification report on the out-of-fold predictions.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# One ROC curve per fold: refit on the fold's training rows, score its
# held-out rows.
plt.figure(figsize=(12, 8))
fold_count = 1
for train_idx, test_idx in cv.split(X_train_scaled, y_train):
    best_rf.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = best_rf.predict_proba(X_train_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Optimized Random Forest, 3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Feature importances of the tuned forest, refitted on the full training set.
best_rf.fit(X_train_scaled, y_train)
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_rf.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance (Optimized Random Forest)')
plt.show()

top10 feature importance

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Standardise the top-10 feature frame: fit the scaler on it, then transform
# the same rows. This replaces the notebook-wide `scaler` / `X_train_scaled`.
scaler = StandardScaler().fit(X_train_top10)
X_train_scaled = scaler.transform(X_train_top10)

# Objective function for the Optuna hyperparameter search.
def objective(trial):
    """Optuna objective: pooled out-of-fold ROC-AUC of a random forest.

    Draws one hyperparameter configuration from `trial`, collects out-of-fold
    probabilities for the notebook-level `X_train_scaled` / `y_train` from a
    shuffled, stratified 3-fold split, and scores them with a single ROC-AUC.

    Args:
        trial: the Optuna trial used to draw the hyperparameters.

    Returns:
        ROC-AUC of the out-of-fold probabilities against `y_train`.
    """
    # Search space; suggestions are drawn in this order.
    forest_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
    }
    candidate = RandomForestClassifier(random_state=42, **forest_params)

    # cross_val_predict returns rows in the original order, so the
    # probabilities line up with `y_train`.
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_proba = cross_val_predict(candidate, X_train_scaled, y_train, cv=folds, method='predict_proba')
    return roc_auc_score(y_train, oof_proba[:, 1])

# Run the Optuna study: 50 trials, maximising the objective's ROC-AUC.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Report the best configuration found.
print("Best Parameters (Optuna):", study.best_params)

# Build a forest with the best parameters and fit it on the whole training set.
best_rf_model = RandomForestClassifier(**study.best_params, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the SAME rows the model was fitted on: everything below is a
# training-set (in-sample) evaluation.
y_pred_train = best_rf_model.predict(X_train_scaled)
y_pred_proba_train = best_rf_model.predict_proba(X_train_scaled)[:, 1]

# Confusion matrix as raw counts and as row-wise percentages.
conf_matrix = confusion_matrix(y_train, y_pred_train)
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_percentage = conf_matrix / row_totals * 100

# Heatmap of the counts; the percentage is written into every cell at a 0.2
# offset from the cell's (column, row) origin.
_, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            annot_kws={"size": 14}, ax=cm_ax)
for (row, col), pct in np.ndenumerate(conf_matrix_percentage):
    cm_ax.text(col + 0.2, row + 0.2, f"{pct:.1f}%",
               ha='center', va='center', color='green', fontsize=15)

cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Confusion Matrix with Counts and Percentages (Random Forest with Best Parameters)')
plt.show()

# Classification report for the training-set predictions.
print("\nClassification Report (Training Set):")
print(classification_report(y_train, y_pred_train))

# ROC curve and AUC from the training-set probabilities.
fpr, tpr, _ = roc_curve(y_train, y_pred_proba_train)
roc_auc = roc_auc_score(y_train, y_pred_proba_train)

_, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve (Training Set, Random Forest with Best Parameters)')
roc_ax.legend(loc='lower right')
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import optuna

# Objective function for the Optuna hyperparameter search.
def objective(trial):
    """Optuna objective: pooled out-of-fold ROC-AUC of a random forest.

    Draws one hyperparameter configuration from `trial`, fits a
    RandomForestClassifier on each training split of a shuffled, stratified
    3-fold partition of the notebook-level `X_train_scaled` / `y_train`, and
    scores the pooled held-out probabilities with ROC-AUC.

    Args:
        trial: the Optuna trial used to draw the hyperparameters.

    Returns:
        ROC-AUC of the out-of-fold probabilities against `y_train`.
    """
    # Random-forest hyperparameters to tune.
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 5, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])

    rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    # 3-fold cross-validation. Each fold's probabilities are written back to
    # the positions of its validation rows. The folds are shuffled, so simply
    # appending them in fold order would not line up with `y_train` and the
    # AUC would compare predictions against the wrong labels.
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    y_pred_proba_cv = np.zeros(len(y_train), dtype=float)
    for train_idx, val_idx in cv.split(X_train_scaled, y_train):
        rf_model.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
        y_pred_proba_cv[val_idx] = rf_model.predict_proba(X_train_scaled[val_idx])[:, 1]

    # AUC over all out-of-fold predictions, now in the same row order as y_train.
    auc_score = roc_auc_score(y_train, y_pred_proba_cv)
    return auc_score

# Tune hyperparameters with Optuna, maximizing the value returned by `objective`
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Print the best hyperparameters
print("Best Parameters (Random Forest):", study.best_params)

# Build the random forest from the best hyperparameters
best_rf_model = RandomForestClassifier(
    n_estimators=study.best_params["n_estimators"],
    max_depth=study.best_params["max_depth"],
    min_samples_split=study.best_params["min_samples_split"],
    min_samples_leaf=study.best_params["min_samples_leaf"],
    max_features=study.best_params["max_features"],
    random_state=42
)

# Fit the model on the training set
best_rf_model.fit(X_train_scaled, y_train)

# Predict on the test set
X_test_scaled = scaler.transform(X_test)  # reuse the scaler fitted on the training set
y_pred_test = best_rf_model.predict(X_test_scaled)
y_pred_proba_test = best_rf_model.predict_proba(X_test_scaled)[:, 1]

# Test-set confusion matrix, plus a copy where each row is rescaled to sum to 100%
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
conf_matrix_percentage_test = conf_matrix_test / conf_matrix_test.sum(axis=1).reshape(-1, 1) * 100

# Plot the test-set confusion matrix: counts from the heatmap annotations,
# row percentages as extra green text at (j + 0.2, i + 0.2)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_test, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
for i in range(conf_matrix_test.shape[0]):
    for j in range(conf_matrix_test.shape[1]):
        percentage_text = f"{conf_matrix_percentage_test[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Test Set with Optimized Random Forest)')
plt.show()

# Print the test-set classification report
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))

# Plot the ROC curve from the positive-class probabilities
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_test)
roc_auc = roc_auc_score(y_test, y_pred_proba_test)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Test Set AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Test Set with Optimized Random Forest)')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Print the current value of `selected_columns` (set in an earlier cell)
print(selected_columns)

1126

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
import numpy as np

def objective(trial):
    """Score one random-forest configuration for the Optuna study.

    Samples a hyperparameter set from ``trial`` and returns the mean ROC AUC
    of a 5-fold ``cross_val_score`` run on the notebook-level
    ``X_train_scaled`` / ``y_train``.
    """
    # Search space; the suggest calls run in the order they are listed
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 5, 30),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
    )

    # Candidate model for this trial
    forest = RandomForestClassifier(**search_space, random_state=42)

    # One ROC AUC value per fold, computed with all available cores
    fold_auc = cross_val_score(
        forest, X_train_scaled, y_train,
        cv=5, scoring='roc_auc', n_jobs=-1,
    )

    # The study optimizes the average across folds
    return np.mean(fold_auc)

# Create the study object (higher objective values are better)
study = optuna.create_study(direction='maximize')

# Run the optimization
print("Starting hyperparameter optimization...")
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Print the best result
print("\nBest trial:")
trial = study.best_trial

print("  Value (ROC AUC): ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Create and train the final model with the best hyperparameters
best_params = study.best_params
best_rf_model = RandomForestClassifier(
    random_state=42,
    **best_params
)

# Train the model on the full training set
best_rf_model.fit(X_train_scaled, y_train)

# Cross-validated predictions: hard labels first, then positive-class probabilities.
# NOTE(review): `cross_val_predict` and `StratifiedKFold` are not imported in
# this cell; they must already be in the notebook namespace.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_rf_model, X_train_scaled, y_train, cv=cv)
y_pred_proba_cv = cross_val_predict(best_rf_model, X_train_scaled, y_train, cv=cv, method='predict_proba')[:, 1]

# Evaluate the best model
print("\nBest Model Evaluation:")
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Compute and print the ROC AUC score
roc_auc = roc_auc_score(y_train, y_pred_proba_cv)
print(f"\nROC AUC Score: {roc_auc:.4f}")

# Plot the ROC curve
plt.figure(figsize=(10, 6))
fpr, tpr, _ = roc_curve(y_train, y_pred_proba_cv)
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve of Optimized Random Forest Model')
plt.legend(loc='lower right')
plt.show()

# Plot the confusion matrix of the cross-validated predictions
conf_matrix = confusion_matrix(y_train, y_pred_cv)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix of Optimized Random Forest Model')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Plot the optimization history.
# Optuna's matplotlib helpers open their own figure and return its Axes, so
# size and title that Axes directly. Calling plt.figure() beforehand only
# produces an extra blank figure, and its figsize never reaches the real plot.
history_ax = optuna.visualization.matplotlib.plot_optimization_history(study)
history_ax.figure.set_size_inches(10, 6)
history_ax.set_title('Optimization History')
plt.show()

# Plot the hyperparameter importances (same pattern as above)
importance_ax = optuna.visualization.matplotlib.plot_param_importances(study)
importance_ax.figure.set_size_inches(10, 6)
importance_ax.set_title('Parameter Importances')
plt.show()

TOP10 SHAP

In [None]:
import optuna
from sklearn.model_selection import train_test_split

# The 10 features selected with SHAP
selected_columns = ['lead_time', 'country', 'deposit_type', 'adr',
                    'total_of_special_requests', 'agent', 'arrival_date_month',
                    'stays_in_week_nights', 'market_segment', 'previous_cancellations']

# Define the features and the target variable
X_train = H1_train[selected_columns]
y_train = H1_train['is_canceled']

# Standardize the data; the scaler is fitted on the training features only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Split the training set into a train part and a 20% validation part for
# Optuna tuning, stratified on the target so both parts keep its class balance
X_train_optuna, X_val_optuna, y_train_optuna, y_val_optuna = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Objective function for the Optuna study
def objective(trial):
    """Return the validation ROC AUC of one random-forest configuration.

    The forest is fitted on the notebook-level ``X_train_optuna`` /
    ``y_train_optuna`` split and scored on ``X_val_optuna`` /
    ``y_val_optuna`` using the positive-class probabilities.
    """
    # Search space; the suggest calls run in the order they are listed
    hyperparams = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 5, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
    }

    # Build the candidate forest and fit it on the tuning-train split
    forest = RandomForestClassifier(random_state=42, **hyperparams)
    forest.fit(X_train_optuna, y_train_optuna)

    # Score the held-out validation split on the positive-class column
    val_proba = forest.predict_proba(X_val_optuna)
    return roc_auc_score(y_val_optuna, val_proba[:, 1])

# Create the Optuna study (higher objective values are better) and run 50 trials
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Print the best hyperparameters and the best score
print("Best parameters:", study.best_params)
print("Best AUC score:", study.best_value)

# Train the final model with the best hyperparameters
best_params = study.best_params
final_model = RandomForestClassifier(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    max_features=best_params["max_features"],
    random_state=42
)

# Fit on the full (scaled) training set, not just the tuning-train split
final_model.fit(X_train_scaled, y_train)

# Evaluate the final model with cross-validation.
# NOTE(review): `cv` and `cross_val_predict` are not defined or imported in
# this cell; the splitter used here is whatever `cv` an earlier cell left in
# the notebook namespace. Confirm it is the intended one.
y_pred_cv_final = cross_val_predict(final_model, X_train_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv_final = cross_val_predict(final_model, X_train_scaled, y_train, cv=cv, method='predict_proba')[:, 1]

# Print the confusion matrix and the classification report
conf_matrix_final = confusion_matrix(y_train, y_pred_cv_final)
print("\nConfusion Matrix (Final Model):")
print(conf_matrix_final)
print("\nClassification Report (Final Model):")
print(classification_report(y_train, y_pred_cv_final))


In [None]:
# Plot the test-set confusion matrix.
# NOTE(review): this cell only reads `conf_matrix_test`,
# `conf_matrix_percentage_test`, `y_test`, `y_pred_test` and
# `y_pred_proba_test`; it computes none of them, and neither does the cell
# directly above. The figures therefore show whatever an earlier cell left in
# the notebook namespace. Confirm they belong to the intended model.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_test, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
for i in range(conf_matrix_test.shape[0]):
    for j in range(conf_matrix_test.shape[1]):
        percentage_text = f"{conf_matrix_percentage_test[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Test Set with Optimized Random Forest)')
plt.show()

# Print the test-set classification report
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))

# Plot the test-set ROC curve from the positive-class probabilities
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_test)
roc_auc = roc_auc_score(y_test, y_pred_proba_test)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Test Set AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Test Set with Optimized Random Forest)')
plt.legend(loc='lower right')
plt.show()

BORUTA

1116

In [None]:
# 导入必要库
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Features selected by Boruta
selected_columns = ['lead_time', 'country', 'deposit_type', 'agent', 'adr', 'total_of_special_requests']

# Use the Boruta-selected feature subset
X_train = H1_train[selected_columns]
y_train = H1_train['is_canceled']

# Standardize the data (random forests do not need scaling, but it is kept so
# the workflow stays consistent with the other cells)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Objective function for the Optuna study
def objective(trial):
    """Return the mean per-fold ROC AUC of one random-forest configuration.

    Samples a hyperparameter set from ``trial`` and evaluates it with
    stratified 3-fold cross-validation on the notebook-level
    ``X_train_scaled`` / ``y_train``.
    """
    # Search space; the suggest calls run in the order they are listed
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }

    # A single estimator object is refitted on every fold
    forest = RandomForestClassifier(random_state=42, **hyperparams)
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    fold_aucs = []
    for fit_rows, holdout_rows in splitter.split(X_train_scaled, y_train):
        forest.fit(X_train_scaled[fit_rows], y_train.iloc[fit_rows])
        holdout_proba = forest.predict_proba(X_train_scaled[holdout_rows])
        fold_aucs.append(roc_auc_score(y_train.iloc[holdout_rows], holdout_proba[:, 1]))

    # The study optimizes the average AUC across folds
    return np.mean(fold_aucs)

# Create the Optuna study object and run the optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print the best hyperparameters and the highest AUC
print("Best Parameters:", study.best_params)
print("Best AUC:", study.best_value)

# Rebuild the model with the best hyperparameters
best_params = study.best_params
optimized_rf_model = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    random_state=42
)

# Cross-validated predictions with the tuned model: hard labels first, then
# positive-class probabilities
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(optimized_rf_model, X_train_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(optimized_rf_model, X_train_scaled, y_train, cv=cv, method='predict_proba')[:, 1]

# Confusion matrix, plus a copy where each row is rescaled to sum to 100%
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

# Plot the confusion matrix (counts come from the heatmap annotations)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Add the row percentage inside each cell as extra green text at (j + 0.2, i + 0.2)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Optimized Random Forest)')
plt.show()

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Plot one ROC curve per fold; the model is refitted on each fold's training
# rows and scored on that fold's held-out rows
plt.figure(figsize=(12, 8))
fold_count = 1
for train_idx, test_idx in cv.split(X_train_scaled, y_train):
    optimized_rf_model.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = optimized_rf_model.predict_proba(X_train_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Optimized Random Forest, 3-fold CV)')
plt.legend(loc='lower right')
plt.show()


### test set

In [None]:
# Prepare train data.
# errors='ignore' turns the drop into a no-op when `selected_columns` does not
# list the target; the strict default raises KeyError in that case. Either way
# the label never ends up among the features.
X_train = H1_train[selected_columns].drop(columns=['is_canceled'], errors='ignore')
y_train = H1_train['is_canceled']

# Prepare test data with the same columns as training data
X_test = H1_test[X_train.columns]  # This ensures we use exactly the same columns
y_test = H1_test['is_canceled']

# Scale features: fit the scaler on the training features only, then apply
# the same transformation to the test features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Use the same scaler for test data

def objective(trial):
    """Return the mean per-fold ROC AUC of one random-forest configuration.

    Samples a hyperparameter set from ``trial`` and evaluates it with
    stratified 3-fold cross-validation on the notebook-level
    ``X_train_scaled`` / ``y_train``.
    """
    # Search space; the suggest calls run in the order they are listed
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
    )

    # A single estimator object is refitted on every fold
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    forest = RandomForestClassifier(**search_space, random_state=42)

    fold_aucs = []
    for fit_rows, holdout_rows in splitter.split(X_train_scaled, y_train):
        forest.fit(X_train_scaled[fit_rows], y_train.iloc[fit_rows])
        holdout_proba = forest.predict_proba(X_train_scaled[holdout_rows])[:, 1]
        fold_aucs.append(roc_auc_score(y_train.iloc[holdout_rows], holdout_proba))

    return np.mean(fold_aucs)

# Create and run Optuna study (50 trials, maximizing the objective's return value)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Get best parameters
best_params = study.best_params
print("\nBest parameters:", best_params)
print(f"Best ROC-AUC score: {study.best_value:.4f}")

# Train final model with best parameters on the full scaled training set
best_rf = RandomForestClassifier(**best_params, random_state=42)
best_rf.fit(X_train_scaled, y_train)

# Make predictions on test set: hard labels and positive-class probabilities
y_test_pred = best_rf.predict(X_test_scaled)
y_test_pred_proba = best_rf.predict_proba(X_test_scaled)[:, 1]

# Calculate test set metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Performance Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"ROC-AUC Score: {test_roc_auc:.4f}")

# Confusion Matrix for Test Set, plus a copy where each row is rescaled to sum to 100%
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_conf_matrix_percentage = test_conf_matrix / test_conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(test_conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Add the row percentage inside each cell as extra green text at (j + 0.2, i + 0.2)
for i in range(test_conf_matrix.shape[0]):
    for j in range(test_conf_matrix.shape[1]):
        percentage_text = f"{test_conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Test Set Confusion Matrix (Optimized Random Forest)')
plt.show()

# Classification Report for Test Set
print("\nTest Set Classification Report:")
print(classification_report(y_test, y_test_pred))

# ROC Curve for Test Set
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_pred_proba)
# Same computation as test_roc_auc above, repeated for the plot legend
test_auc = roc_auc_score(y_test, y_test_pred_proba)

plt.figure(figsize=(10, 6))
plt.plot(fpr_test, tpr_test, label=f'Test Set (AUC = {test_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve on Test Set (Optimized Random Forest)')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# Feature importance analysis: pair each training column with the fitted
# forest's feature_importances_ value and sort from most to least important
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_rf.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance (Optimized Random Forest)')
plt.show()

In [None]:
# Prepare train data.
# The target is removed from the feature frame if it is present;
# errors='ignore' keeps this from raising KeyError when `selected_columns`
# already excludes 'is_canceled'.
X_train = H1_train[selected_columns].drop(columns=['is_canceled'], errors='ignore')
y_train = H1_train['is_canceled']

# Prepare test data with the same columns as training data
X_test = H1_test[X_train.columns]  # This ensures we use exactly the same columns
y_test = H1_test['is_canceled']

# Scale features: the scaler is fitted on the training features only and
# then reused unchanged for the test features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Use the same scaler for test data

def objective(trial):
    """Optuna objective: average ROC AUC over three stratified folds.

    Reads the notebook-level ``X_train_scaled`` and ``y_train``; the
    hyperparameters of the random forest are sampled from ``trial``.
    """
    def _fold_auc(estimator, fit_rows, holdout_rows):
        # Fit on one fold's training rows, then score its held-out rows
        estimator.fit(X_train_scaled[fit_rows], y_train.iloc[fit_rows])
        positive_proba = estimator.predict_proba(X_train_scaled[holdout_rows])[:, 1]
        return roc_auc_score(y_train.iloc[holdout_rows], positive_proba)

    # Search space; the suggest calls run in the order they are listed
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
    }

    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    forest = RandomForestClassifier(**hyperparams, random_state=42)

    # The same estimator object is refitted fold by fold, in split order
    fold_aucs = [
        _fold_auc(forest, fit_rows, holdout_rows)
        for fit_rows, holdout_rows in folds.split(X_train_scaled, y_train)
    ]
    return np.mean(fold_aucs)

# Create and run Optuna study (50 trials, maximizing the objective's return value)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Get best parameters
best_params = study.best_params
print("\nBest parameters:", best_params)
print(f"Best ROC-AUC score: {study.best_value:.4f}")

# Train final model with best parameters on the full scaled training set
best_rf = RandomForestClassifier(**best_params, random_state=42)
best_rf.fit(X_train_scaled, y_train)

# Make predictions on test set: hard labels and positive-class probabilities
y_test_pred = best_rf.predict(X_test_scaled)
y_test_pred_proba = best_rf.predict_proba(X_test_scaled)[:, 1]

# Calculate test set metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Performance Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"ROC-AUC Score: {test_roc_auc:.4f}")

# Confusion Matrix for Test Set, plus a copy where each row is rescaled to sum to 100%
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_conf_matrix_percentage = test_conf_matrix / test_conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(test_conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Add the row percentage inside each cell as extra green text at (j + 0.2, i + 0.2)
for i in range(test_conf_matrix.shape[0]):
    for j in range(test_conf_matrix.shape[1]):
        percentage_text = f"{test_conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Test Set Confusion Matrix (Optimized Random Forest)')
plt.show()

# Classification Report for Test Set
print("\nTest Set Classification Report:")
print(classification_report(y_test, y_test_pred))

# ROC Curve for Test Set
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_pred_proba)
# Same computation as test_roc_auc above, repeated for the plot legend
test_auc = roc_auc_score(y_test, y_test_pred_proba)

plt.figure(figsize=(10, 6))
plt.plot(fpr_test, tpr_test, label=f'Test Set (AUC = {test_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve on Test Set (Optimized Random Forest)')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# Feature importance analysis: pair each training column with the fitted
# forest's feature_importances_ value and sort from most to least important
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_rf.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance (Optimized Random Forest)')
plt.show()

top10 feature importance

In [None]:
# Define the test-set features and target from H1_test.
# NOTE(review): `top_features`, `scaler` and `best_rf_model` are not defined
# in this cell; confirm the scaler and model in the notebook namespace were
# fitted on exactly these `top_features` columns.
X_test = H1_test[top_features]
y_test = H1_test['is_canceled']

# Standardize the test set with the scaler fitted on the training set
X_test_scaled = scaler.transform(X_test)

# Predict the test set with the best model: hard labels and positive-class probabilities
y_pred_test = best_rf_model.predict(X_test_scaled)
y_pred_proba_test = best_rf_model.predict_proba(X_test_scaled)[:, 1]

# Test-set confusion matrix, plus a copy where each row is rescaled to sum to 100%
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
conf_matrix_percentage_test = conf_matrix_test / conf_matrix_test.sum(axis=1).reshape(-1, 1) * 100

# Plot the test-set confusion matrix: counts from the heatmap annotations,
# row percentages as extra green text at (j + 0.2, i + 0.2)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_test, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
for i in range(conf_matrix_test.shape[0]):
    for j in range(conf_matrix_test.shape[1]):
        percentage_text = f"{conf_matrix_percentage_test[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Test Set)')
plt.show()

# Print the test-set classification report
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))

# Compute the test-set ROC curve and AUC
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_test)
roc_auc_test = roc_auc_score(y_test, y_pred_proba_test)
print(f"Test Set AUC: {roc_auc_test:.2f}")

# Plot the test-set ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Test Set AUC = {roc_auc_test:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Test Set)')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Define the test-set features and target from H1_test.
# NOTE(review): `top_features`, `scaler` and `best_rf_model` all come from
# earlier cells; verify that they match each other (same feature columns)
# before trusting the metrics below.
X_test = H1_test[top_features]
y_test = H1_test['is_canceled']

# Standardize the test set with the scaler fitted on the training set
X_test_scaled = scaler.transform(X_test)

# Predict the test set with the best model: hard labels and positive-class probabilities
y_pred_test = best_rf_model.predict(X_test_scaled)
y_pred_proba_test = best_rf_model.predict_proba(X_test_scaled)[:, 1]

# Test-set confusion matrix, plus a copy where each row is rescaled to sum to 100%
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
conf_matrix_percentage_test = conf_matrix_test / conf_matrix_test.sum(axis=1).reshape(-1, 1) * 100

# Plot the test-set confusion matrix: counts from the heatmap annotations,
# row percentages as extra green text at (j + 0.2, i + 0.2)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_test, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
for i in range(conf_matrix_test.shape[0]):
    for j in range(conf_matrix_test.shape[1]):
        percentage_text = f"{conf_matrix_percentage_test[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Test Set)')
plt.show()

# Print the test-set classification report
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))

# Compute the test-set ROC curve and AUC
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_test)
roc_auc_test = roc_auc_score(y_test, y_pred_proba_test)
print(f"Test Set AUC: {roc_auc_test:.2f}")

# Plot the test-set ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Test Set AUC = {roc_auc_test:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Test Set)')
plt.legend(loc='lower right')
plt.show()


TOP10 SHAP

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Define the test-set features and target from H1_test.
# NOTE(review): `selected_columns`, `scaler`, `X_train_scaled` and `y_train`
# are whatever earlier cells left in the notebook namespace; confirm they are
# the SHAP-feature versions before trusting the metrics below.
X_test = H1_test[selected_columns]  # features selected with SHAP
y_test = H1_test['is_canceled']  # test-set target variable

# Standardize the test set with the scaler fitted on the training set
X_test_scaled = scaler.transform(X_test)

# Build the random forest with fixed hyperparameters
# (presumably copied from an earlier Optuna run; verify)
best_rf_model = RandomForestClassifier(
    n_estimators=227,
    max_depth=30,
    min_samples_split=3,
    min_samples_leaf=1,
    max_features='log2',
    random_state=42
)

# Refit the model on the training set
best_rf_model.fit(X_train_scaled, y_train)

# Predict the test set: hard labels and positive-class probabilities
y_pred_test = best_rf_model.predict(X_test_scaled)
y_pred_proba_test = best_rf_model.predict_proba(X_test_scaled)[:, 1]

# Test-set confusion matrix, plus a copy where each row is rescaled to sum to 100%
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
conf_matrix_percentage_test = conf_matrix_test / conf_matrix_test.sum(axis=1).reshape(-1, 1) * 100

# Plot the test-set confusion matrix: counts from the heatmap annotations,
# row percentages as extra green text at (j + 0.2, i + 0.2)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_test, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
for i in range(conf_matrix_test.shape[0]):
    for j in range(conf_matrix_test.shape[1]):
        percentage_text = f"{conf_matrix_percentage_test[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Test Set)')
plt.show()

# Print the test-set classification report
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))

# Compute the test-set ROC curve and AUC
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_test)
roc_auc_test = roc_auc_score(y_test, y_pred_proba_test)
print(f"Test Set AUC: {roc_auc_test:.2f}")

# Plot the test-set ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Test Set AUC = {roc_auc_test:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Test Set)')
plt.legend(loc='lower right')
plt.show()


BORUTA

In [None]:
# Build the final model with fixed hyperparameters
# (presumably copied from an earlier Optuna run; verify)
final_rf_model = RandomForestClassifier(
    n_estimators=186,
    max_depth=24,
    min_samples_split=6,
    min_samples_leaf=1,
    max_features='log2',
    random_state=42
)

# Prepare the test data
X_test = H1_test[X_train.columns]  # make sure the test set uses exactly the same features as the training set
y_test = H1_test['is_canceled']

# Standardize the test data
X_test_scaled = scaler.transform(X_test)  # transform the test set with the scaler fitted on the training set

# Train the model on the training set
final_rf_model.fit(X_train_scaled, y_train)

# Predict on the test set: hard labels and positive-class probabilities
y_test_pred = final_rf_model.predict(X_test_scaled)
y_test_proba = final_rf_model.predict_proba(X_test_scaled)[:, 1]

# Evaluate performance on the test set
# Confusion matrix, plus a copy where each row is rescaled to sum to 100%
conf_matrix_test = confusion_matrix(y_test, y_test_pred)
conf_matrix_test_percentage = conf_matrix_test / conf_matrix_test.sum(axis=1).reshape(-1, 1) * 100

# Plot the confusion matrix (counts come from the heatmap annotations)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_test, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Add the row percentage inside each cell as extra green text at (j + 0.2, i + 0.2)
for i in range(conf_matrix_test.shape[0]):
    for j in range(conf_matrix_test.shape[1]):
        percentage_text = f"{conf_matrix_test_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Test Set)')
plt.show()

# Print the classification report
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_test_pred))

# Plot the test-set ROC curve
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_proba)
roc_auc_test = roc_auc_score(y_test, y_test_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr_test, tpr_test, label=f'ROC Curve (AUC = {roc_auc_test:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve on Test Set')
plt.legend(loc='lower right')
plt.show()

# Print the AUC
print(f"Test Set AUC: {roc_auc_test:.2f}")


### cross-dataset evaluation: H2 test set

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Cross-dataset check: fit a random forest on the H1 training split and
# score it on the H2 test split.

# Training target and features (every selected column except the target).
y_train = H1_train['is_canceled']
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])

# H2 features are taken in the training column order.
y_test_H2 = H2_test['is_canceled']
X_test_H2 = H2_test[X_train.columns]

# Standardise with statistics learned from the H1 training data only;
# the same fitted scaler is then applied to H2.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_H2_scaled = scaler.transform(X_test_H2)

# Hard-coded hyper-parameters (labelled "optimized" by the author).
best_params = dict(
    n_estimators=98,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    class_weight='balanced',
)

# Fit the forest with a fixed seed.
best_rf = RandomForestClassifier(random_state=42, **best_params)
best_rf.fit(X_train_scaled, y_train)

# Hard labels and positive-class probabilities for H2.
y_test_H2_pred = best_rf.predict(X_test_H2_scaled)
y_test_H2_pred_proba = best_rf.predict_proba(X_test_H2_scaled)[:, 1]

# Headline metrics.
test_H2_accuracy = accuracy_score(y_test_H2, y_test_H2_pred)
test_H2_roc_auc = roc_auc_score(y_test_H2, y_test_H2_pred_proba)

print("\nH2 Test Set Performance Metrics:")
print(f"Accuracy: {test_H2_accuracy:.4f}")
print(f"ROC-AUC Score: {test_H2_roc_auc:.4f}")

# Confusion matrix: raw counts and per-true-label row percentages.
test_H2_conf_matrix = confusion_matrix(y_test_H2, y_test_H2_pred)
test_H2_conf_matrix_percentage = (
    test_H2_conf_matrix / test_H2_conf_matrix.sum(axis=1, keepdims=True) * 100
)

plt.figure(figsize=(8, 6))
sns.heatmap(
    test_H2_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    annot_kws=dict(size=14),
)

# Row percentage drawn in each cell, offset from the cell origin by (0.2, 0.2).
for i, j in np.ndindex(test_H2_conf_matrix.shape):
    percentage_text = f"{test_H2_conf_matrix_percentage[i, j]:.1f}%"
    plt.text(
        j + 0.2, i + 0.2, percentage_text,
        ha='center', va='center', color='green', fontsize=15,
    )

plt.gca().set(
    xlabel='Predicted Label',
    ylabel='True Label',
    title='H2 Test Set Confusion Matrix (Optimized Random Forest)',
)
plt.show()

# Per-class precision / recall / F1.
print("\nH2 Test Set Classification Report:")
print(classification_report(y_test_H2, y_test_H2_pred))

# ROC curve; the AUC is the same quantity already computed above.
fpr_test_H2, tpr_test_H2, _ = roc_curve(y_test_H2, y_test_H2_pred_proba)
test_H2_auc = roc_auc_score(y_test_H2, y_test_H2_pred_proba)

plt.figure(figsize=(10, 6))
plt.plot(fpr_test_H2, tpr_test_H2, label=f'H2 Test Set (AUC = {test_H2_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve on H2 Test Set (Optimized Random Forest)',
)
plt.legend(loc='lower right')
plt.grid(True)
plt.show()


top10 feature importance

In [None]:
# Score `best_rf_model` on the H2 test split restricted to `top_features`.
# NOTE(review): `best_rf_model`, `scaler` and `top_features` are not created in
# this cell; it uses whatever earlier-executed cells left behind.
# `scaler.transform` presumably requires a scaler fitted on exactly
# `top_features` -- confirm the execution order before trusting these numbers.

# Target and features for H2.
y_test_H2 = H2_test['is_canceled']
X_test_H2 = H2_test[top_features]

# Apply (do not refit) the existing scaler.
X_test_H2_scaled = scaler.transform(X_test_H2)

# Hard labels and positive-class probabilities.
y_pred_test_H2 = best_rf_model.predict(X_test_H2_scaled)
y_pred_proba_test_H2 = best_rf_model.predict_proba(X_test_H2_scaled)[:, 1]

# Confusion matrix: raw counts and per-true-label row percentages.
conf_matrix_test_H2 = confusion_matrix(y_test_H2, y_pred_test_H2)
conf_matrix_percentage_test_H2 = (
    conf_matrix_test_H2 / conf_matrix_test_H2.sum(axis=1, keepdims=True) * 100
)

# Heatmap of counts with the row percentage overlaid in each cell.
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix_test_H2,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    annot_kws=dict(size=14),
)
for i, j in np.ndindex(conf_matrix_test_H2.shape):
    percentage_text = f"{conf_matrix_percentage_test_H2[i, j]:.1f}%"
    plt.text(
        j + 0.2, i + 0.2, percentage_text,
        ha='center', va='center', color='green', fontsize=15,
    )

plt.gca().set(
    xlabel='Predicted Label',
    ylabel='True Label',
    title='Confusion Matrix with Counts and Percentages (H2 Test Set)',
)
plt.show()

# Per-class precision / recall / F1.
print("\nClassification Report (H2 Test Set):")
print(classification_report(y_test_H2, y_pred_test_H2))

# ROC curve and AUC from the positive-class probabilities.
fpr_H2, tpr_H2, _ = roc_curve(y_test_H2, y_pred_proba_test_H2)
roc_auc_test_H2 = roc_auc_score(y_test_H2, y_pred_proba_test_H2)
print(f"H2 Test Set AUC: {roc_auc_test_H2:.2f}")

# ROC plot against the chance diagonal.
plt.figure(figsize=(8, 6))
plt.plot(fpr_H2, tpr_H2, label=f'H2 Test Set AUC = {roc_auc_test_H2:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve (H2 Test Set)',
)
plt.legend(loc='lower right')
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Rebuild the random forest from the Optuna study's best parameters, fit it on
# the current scaled training matrix and score it on H2 (`top_features` only).
# NOTE(review): `scaler`, `X_train_scaled`, `y_train`, `study` and
# `top_features` all come from earlier cells. The model is fitted on
# `X_train_scaled` but predicts on the `top_features` columns of H2, so this
# presumably only works when `X_train_scaled` holds exactly those columns --
# confirm.

# Target and features for H2.
y_test_H2 = H2_test['is_canceled']
X_test_H2 = H2_test[top_features]

# Apply (do not refit) the existing scaler.
X_test_H2_scaled = scaler.transform(X_test_H2)

# Same five tuned hyper-parameters as stored in the study, fixed seed.
best_rf_model = RandomForestClassifier(
    random_state=42,
    **{
        name: study.best_params[name]
        for name in (
            "n_estimators",
            "max_depth",
            "min_samples_split",
            "min_samples_leaf",
            "max_features",
        )
    },
)

# Fit on the training data.
best_rf_model.fit(X_train_scaled, y_train)

# Hard labels and positive-class probabilities for H2.
y_pred_test_H2 = best_rf_model.predict(X_test_H2_scaled)
y_pred_proba_test_H2 = best_rf_model.predict_proba(X_test_H2_scaled)[:, 1]

# Confusion matrix: raw counts and per-true-label row percentages.
conf_matrix_test_H2 = confusion_matrix(y_test_H2, y_pred_test_H2)
conf_matrix_percentage_test_H2 = (
    conf_matrix_test_H2 / conf_matrix_test_H2.sum(axis=1, keepdims=True) * 100
)

# Heatmap of counts with the row percentage overlaid in each cell.
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix_test_H2,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    annot_kws=dict(size=14),
)
for i, j in np.ndindex(conf_matrix_test_H2.shape):
    percentage_text = f"{conf_matrix_percentage_test_H2[i, j]:.1f}%"
    plt.text(
        j + 0.2, i + 0.2, percentage_text,
        ha='center', va='center', color='green', fontsize=15,
    )

plt.gca().set(
    xlabel='Predicted Label',
    ylabel='True Label',
    title='Confusion Matrix (H2 Test Set with Optimized Random Forest)',
)
plt.show()

# Per-class precision / recall / F1.
print("\nClassification Report (H2 Test Set):")
print(classification_report(y_test_H2, y_pred_test_H2))

# ROC curve and AUC from the positive-class probabilities.
fpr_H2, tpr_H2, _ = roc_curve(y_test_H2, y_pred_proba_test_H2)
roc_auc_H2 = roc_auc_score(y_test_H2, y_pred_proba_test_H2)

plt.figure(figsize=(8, 6))
plt.plot(fpr_H2, tpr_H2, label=f'H2 Test Set AUC = {roc_auc_H2:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve (H2 Test Set with Optimized Random Forest)',
)
plt.legend(loc='lower right')
plt.show()


TOP10 SHAP

In [None]:
# Score `best_rf_model` on the H2 test split using the columns of `X_train`.
# NOTE(review): `X_train`, `scaler` and `best_rf_model` are inherited from
# earlier cells; presumably they were all built on the same feature set --
# confirm before comparing results.

# H2 target, and H2 features in the training column order.
y_test_H2 = H2_test['is_canceled']
X_test_H2 = H2_test[X_train.columns]

# Apply (do not refit) the existing scaler.
X_test_H2_scaled = scaler.transform(X_test_H2)

# Hard labels and positive-class probabilities.
y_pred_test_H2 = best_rf_model.predict(X_test_H2_scaled)
y_pred_proba_test_H2 = best_rf_model.predict_proba(X_test_H2_scaled)[:, 1]

# Confusion matrix: raw counts and per-true-label row percentages.
conf_matrix_test_H2 = confusion_matrix(y_test_H2, y_pred_test_H2)
conf_matrix_percentage_test_H2 = (
    conf_matrix_test_H2 / conf_matrix_test_H2.sum(axis=1, keepdims=True) * 100
)

# Heatmap of counts with the row percentage overlaid in each cell.
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix_test_H2,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    annot_kws=dict(size=14),
)
for i, j in np.ndindex(conf_matrix_test_H2.shape):
    percentage_text = f"{conf_matrix_percentage_test_H2[i, j]:.1f}%"
    plt.text(
        j + 0.2, i + 0.2, percentage_text,
        ha='center', va='center', color='green', fontsize=15,
    )

plt.gca().set(
    xlabel='Predicted Label',
    ylabel='True Label',
    title='Confusion Matrix with Counts and Percentages (H2 Test Set)',
)
plt.show()

# Per-class precision / recall / F1.
print("\nClassification Report (H2 Test Set):")
print(classification_report(y_test_H2, y_pred_test_H2))

# ROC curve and AUC from the positive-class probabilities.
fpr_H2, tpr_H2, _ = roc_curve(y_test_H2, y_pred_proba_test_H2)
roc_auc_test_H2 = roc_auc_score(y_test_H2, y_pred_proba_test_H2)
print(f"H2 Test Set AUC: {roc_auc_test_H2:.2f}")

# ROC plot against the chance diagonal.
plt.figure(figsize=(8, 6))
plt.plot(fpr_H2, tpr_H2, label=f'H2 Test Set AUC = {roc_auc_test_H2:.2f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve (H2 Test Set)',
)
plt.legend(loc='lower right')
plt.show()


BORUTA

In [None]:
# Score the already-fitted `final_rf_model` on the H2 test split.
# NOTE(review): `X_train`, `scaler` and `final_rf_model` come from earlier
# cells; presumably all three were built on the same feature set -- confirm.

# H2 target, and H2 features in the training column order.
y_test_H2 = H2_test['is_canceled']
X_test_H2 = H2_test[X_train.columns]

# Apply (do not refit) the existing scaler.
X_test_H2_scaled = scaler.transform(X_test_H2)

# Hard labels and positive-class probabilities.
y_test_H2_pred = final_rf_model.predict(X_test_H2_scaled)
y_test_H2_proba = final_rf_model.predict_proba(X_test_H2_scaled)[:, 1]

# Confusion matrix: raw counts and per-true-label row percentages.
conf_matrix_test_H2 = confusion_matrix(y_test_H2, y_test_H2_pred)
conf_matrix_test_H2_percentage = (
    conf_matrix_test_H2 / conf_matrix_test_H2.sum(axis=1, keepdims=True) * 100
)

# Heatmap of the counts.
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix_test_H2,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    annot_kws=dict(size=14),
)

# Row percentage drawn in each cell, offset from the cell origin by (0.2, 0.2).
for i, j in np.ndindex(conf_matrix_test_H2.shape):
    percentage_text = f"{conf_matrix_test_H2_percentage[i, j]:.1f}%"
    plt.text(
        j + 0.2, i + 0.2, percentage_text,
        ha='center', va='center', color='green', fontsize=15,
    )

plt.gca().set(
    xlabel='Predicted Label',
    ylabel='True Label',
    title='Confusion Matrix with Counts and Percentages (H2 Test Set)',
)
plt.show()

# Per-class precision / recall / F1.
print("\nClassification Report on H2 Test Set:")
print(classification_report(y_test_H2, y_test_H2_pred))

# ROC curve and AUC from the positive-class probabilities.
fpr_test_H2, tpr_test_H2, _ = roc_curve(y_test_H2, y_test_H2_proba)
roc_auc_test_H2 = roc_auc_score(y_test_H2, y_test_H2_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr_test_H2, tpr_test_H2, label=f'ROC Curve (AUC = {roc_auc_test_H2:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve on H2 Test Set',
)
plt.legend(loc='lower right')
plt.show()

# Report the AUC as text as well.
print(f"H2 Test Set AUC: {roc_auc_test_H2:.2f}")


## 3. TabNet




### baseline model

1125

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch

# Baseline TabNet on H1: 3-fold stratified cross-validation on all selected
# features, followed by a refit on the full training set to read TabNet's
# built-in feature importances.

# Features are every selected column except the target.
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])
y_train = H1_train['is_canceled']

# Standardise the features.
# NOTE(review): the scaler is fitted on all training rows before the CV split,
# so each fold's held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# TabNet hyper-parameters (meanings per the pytorch-tabnet documentation).
tabnet_params = {
    'n_d': 64,  # width of the decision prediction layer
    'n_a': 64,  # width of the attention embedding for each mask
    'n_steps': 5,  # number of steps in the architecture
    'gamma': 1.5,  # coefficient for feature reuse in the masks
    'n_independent': 2,  # independent GLU layers at each step
    'n_shared': 2,  # shared GLU layers at each step
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2),
    'scheduler_params': dict(mode="min",
                           patience=5,
                           min_lr=1e-5,
                           factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'mask_type': 'entmax',
    'seed': 42
}

# Out-of-fold predictions are written back at each row's original position so
# they stay aligned with `y_train`.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = np.zeros_like(y_train)
y_pred_proba_cv = np.zeros_like(y_train, dtype=float)

# One ROC curve per fold is drawn onto this figure inside the loop.
plt.figure(figsize=(12, 8))

for fold_count, (train_idx, test_idx) in enumerate(
        cv.split(X_train_scaled, y_train), start=1):
    # A fresh model per fold so no state carries over between folds.
    clf = TabNetClassifier(**tabnet_params)

    # Split the scaled matrix / target for this fold.
    X_fold_train = X_train_scaled[train_idx]
    y_fold_train = y_train.iloc[train_idx].values
    X_fold_test = X_train_scaled[test_idx]
    y_fold_test = y_train.iloc[test_idx].values

    # NOTE(review): the held-out fold is also passed as `eval_set`, so early
    # stopping (patience=10) presumably monitors the same rows that are scored
    # below, which would make the CV metrics optimistic -- confirm intent.
    clf.fit(
        X_fold_train, y_fold_train,
        eval_set=[(X_fold_test, y_fold_test)],
        max_epochs=100,
        patience=10,
        batch_size=1024,
        virtual_batch_size=128
    )

    # Run inference once per fold and reuse the probabilities for storage,
    # the ROC curve and the AUC (previously computed three separate times).
    fold_proba = clf.predict_proba(X_fold_test)[:, 1]
    y_pred_cv[test_idx] = clf.predict(X_fold_test)
    y_pred_proba_cv[test_idx] = fold_proba

    # Per-fold ROC curve.
    fpr, tpr, _ = roc_curve(y_fold_test, fold_proba)
    roc_auc_fold = roc_auc_score(y_fold_test, fold_proba)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

# Finish the ROC figure.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for H1 Dataset with TabNet (3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix over all out-of-fold predictions: counts and row percentages.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Row percentage drawn in each cell, offset from the cell origin by (0.2, 0.2).
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('H1 Confusion Matrix with TabNet')
plt.show()

# Per-class precision / recall / F1 on the out-of-fold predictions.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# AUC over the pooled out-of-fold probabilities.
print(f"\nOverall ROC AUC Score: {roc_auc_score(y_train, y_pred_proba_cv):.4f}")

# Refit on every training row (no eval_set) for the feature-importance analysis.
final_model = TabNetClassifier(**tabnet_params)
final_model.fit(
    X_train_scaled, y_train.values,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128
)

# TabNet's own importances, sorted from most to least important.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': final_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the importances.
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in TabNet Model (H1 Dataset)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Summary of the training data: size, width and class balance in percent.
print("\nDataset Information:")
print(f"Total Training Samples: {len(X_train)}")
print(f"Number of Features: {X_train.shape[1]}")
print(f"Class Distribution in Training Set:")
print(y_train.value_counts(normalize=True).round(4) * 100)

SHAP

In [None]:
import shap

# Explain `tabnet_model` with SHAP's generic Explainer, using X_train both as
# the background data and as the data to explain.
# NOTE(review): `tabnet_model` is not created by the baseline cell above (which
# builds `clf` and `final_model`); in this part of the notebook it is only
# assigned in the later Boruta cells, so this cell depends on execution order.
# NOTE(review): X_train here is presumably the unscaled DataFrame, while the
# baseline models were fitted on X_train_scaled -- confirm the inputs match
# what `tabnet_model` was trained on.
explainer = shap.Explainer(tabnet_model.predict_proba, X_train)

# Compute SHAP values for every row of X_train.
shap_values = explainer(X_train)

# Bar-style summary plot of the SHAP values.
shap.summary_plot(shap_values, X_train, plot_type="bar")


In [None]:
import shap
import numpy as np
import matplotlib.pyplot as plt
import time
import pandas as pd

# Explain `tabnet_model` with SHAP's KernelExplainer on a small sample and
# report the median |SHAP| of the ten features ranked highest in
# `feature_importance_df`.
# NOTE(review): `tabnet_model` and `feature_importance_df` come from other
# cells; presumably the model was trained on the same columns as X_train --
# confirm.
top_features = feature_importance_df['Feature'].head(10).tolist()
print("Top 10 important features:", top_features)

# KernelExplainer is slow, so only the first 200 rows (all features) are used.
sample_data = X_train[:200]

# Build the explainer; the background set is 10 k-means centroids of the sample.
start_time = time.time()
print("Initializing SHAP KernelExplainer...")
explainer = shap.KernelExplainer(tabnet_model.predict_proba, shap.kmeans(sample_data, 10))
shap_values = explainer.shap_values(sample_data)

# Pick the SHAP values of the positive class (index 1, "canceled").
# Depending on the installed SHAP release, `shap_values` is either a list with
# one (n_samples, n_features) array per class, or a single array shaped
# (n_samples, n_features, n_classes). Indexing the latter with [1] would
# select the second *sample*, which yields arrays of the wrong length below.
if isinstance(shap_values, list):
    shap_values_positive = shap_values[1]
else:
    shap_values_positive = np.asarray(shap_values)[:, :, 1]

# Median absolute SHAP value per feature, over the explained rows.
print("Calculating Median SHAP values...")
shap_median_importance = np.median(np.abs(shap_values_positive), axis=0)

# Look each top feature up by column name. Slicing the first ten entries
# instead would attach the top-feature names to whatever the first ten
# columns of the data happen to be.
if hasattr(sample_data, 'columns'):
    shap_by_feature = pd.Series(shap_median_importance, index=sample_data.columns)
    top_feature_shap = shap_by_feature.loc[top_features].to_numpy()
else:
    # No column names available (plain array input): fall back to the
    # original positional pairing.
    # NOTE(review): this is only correct if the array's first columns are in
    # `top_features` order.
    top_feature_shap = shap_median_importance[:len(top_features)]

shap_importance_df = pd.DataFrame({
    'Feature': top_features,
    'Median SHAP Value': top_feature_shap
}).sort_values(by='Median SHAP Value', ascending=False)

# Horizontal bar chart, largest value at the top.
plt.figure(figsize=(10, 8))
plt.barh(shap_importance_df['Feature'], shap_importance_df['Median SHAP Value'])
plt.xlabel('Median SHAP Value')
plt.ylabel('Feature')
plt.title('Top 10 Feature Importance (Median SHAP Values) for TabNet')
plt.gca().invert_yaxis()
plt.show()

# Dot summary plot over the full feature set for the positive class.
print("Generating SHAP summary plot...")
shap.summary_plot(shap_values_positive, sample_data, plot_type='dot', show=True)

# Report the wall-clock time of the SHAP analysis.
end_time = time.time()
print(f"Execution completed in {end_time - start_time:.2f} seconds.")



In [None]:
# Debug check: print the lengths of the two sequences that the SHAP cells
# combine into one DataFrame (they must match for that construction to work).
print("shap_median_importance:", len(shap_median_importance))
print("top_features:", len(top_features))


In [None]:
import shap
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time

# Second attempt at the KernelExplainer analysis, this time on only the first
# ten columns of X_train.
# NOTE(review): `tabnet_model` and `feature_importance_df` come from other
# cells; this cell depends on notebook execution order.

# Take the ten highest-ranked features from feature_importance_df.
top_features = feature_importance_df['Feature'].head(10).tolist()
print("Top 10 important features:", top_features)

# Use the first 200 samples for the SHAP analysis.
# NOTE(review): `X_train[:, :10]` is NumPy-style indexing; if X_train is still
# the pandas DataFrame built in the TabNet cells this line raises -- confirm
# the expected type. The first ten columns are also not necessarily the
# `top_features` columns, and `tabnet_model` presumably expects the same number
# of input columns it was trained on.
sample_data = X_train[:, :10][:200]  # first 10 features of the full feature set, first 200 samples, to cut computation time

# Run the analysis with SHAP's KernelExplainer (10 k-means centroids as background).
start_time = time.time()
print("Initializing SHAP KernelExplainer...")
explainer = shap.KernelExplainer(tabnet_model.predict_proba, shap.kmeans(sample_data, 10))
shap_values = explainer.shap_values(sample_data)

# Median absolute SHAP value per feature, for class 1 only.
# NOTE(review): `shap_values[1]` assumes a list with one array per class; some
# SHAP releases presumably return a single 3-D array instead, in which case
# [1] selects the second sample -- verify against the installed version.
print("Calculating Median SHAP values...")
shap_median_importance = np.median(np.abs(shap_values[1]), axis=0)  # SHAP values of class 1
shap_importance_df = pd.DataFrame({
    'Feature': top_features,
    'Median SHAP Value': shap_median_importance[:len(top_features)]  # keep only the first 10 SHAP values
}).sort_values(by='Median SHAP Value', ascending=False)

# Horizontal bar chart of the SHAP importances, largest at the top.
plt.figure(figsize=(10, 8))
plt.barh(shap_importance_df['Feature'], shap_importance_df['Median SHAP Value'])
plt.xlabel('Median SHAP Value')
plt.ylabel('Feature')
plt.title('Top 10 Feature Importance (Median SHAP Values) for TabNet')
plt.gca().invert_yaxis()
plt.show()

# SHAP dot summary plot for the sampled data.
print("Generating SHAP summary plot...")
shap.summary_plot(shap_values[1], sample_data, plot_type='dot', show=True)

# Report the wall-clock time of the SHAP analysis.
end_time = time.time()
print(f"Execution completed in {end_time - start_time:.2f} seconds.")


Boruta

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# TabNet with the Boruta-selected features: 3-fold stratified CV on H1, then a
# confusion matrix, classification report and pooled ROC curve built from the
# out-of-fold predictions.

# `selected_features` is expected to already hold the Boruta selection.
# TabNet is fed plain NumPy arrays.
X_train_boruta = H1_train[selected_features].values
y_train = H1_train['is_canceled'].values

# Default-configured TabNet with a fixed seed.
# NOTE(review): the same estimator object is re-fitted in every fold;
# presumably pytorch-tabnet rebuilds the network on each fit() unless
# warm_start=True is passed -- confirm for the installed version.
tabnet_model = TabNetClassifier(seed=42)

# 3-fold stratified CV with shuffling.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Out-of-fold predictions are stored at each row's ORIGINAL position.
# Appending them fold by fold (as before) puts them in fold order, which does
# not match the row order of `y_train` once the folds are shuffled, so every
# metric computed against `y_train` was comparing mismatched pairs.
y_pred_cv = np.zeros_like(y_train)
y_pred_proba_cv = np.zeros(len(y_train), dtype=float)

for fold_count, (train_idx, test_idx) in enumerate(
        cv.split(X_train_boruta, y_train), start=1):
    print(f"Training fold {fold_count}...")
    X_train_fold, X_test_fold = X_train_boruta[train_idx], X_train_boruta[test_idx]
    y_train_fold, y_test_fold = y_train[train_idx], y_train[test_idx]

    # Fit on this fold's training rows.
    tabnet_model.fit(X_train_fold, y_train_fold, max_epochs=100, patience=20, batch_size=1024, virtual_batch_size=128)

    # Hard labels for the held-out rows.
    y_pred_fold = tabnet_model.predict(X_test_fold)
    y_pred_cv[test_idx] = y_pred_fold

    # Positive-class probabilities for the held-out rows.
    y_pred_proba_fold = tabnet_model.predict_proba(X_test_fold)[:, 1]
    y_pred_proba_cv[test_idx] = y_pred_proba_fold

# Confusion matrix: raw counts and per-true-label row percentages.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

# Heatmap of the counts.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Row percentage drawn in each cell, offset from the cell origin by (0.2, 0.2).
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (TabNet with Boruta-selected Features)')
plt.show()

# Per-class precision / recall / F1 on the out-of-fold predictions.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Single ROC curve over the pooled out-of-fold probabilities.
plt.figure(figsize=(12, 8))
fpr, tpr, _ = roc_curve(y_train, y_pred_proba_cv)
roc_auc = roc_auc_score(y_train, y_pred_proba_cv)
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (TabNet with Boruta-selected Features)')
plt.legend(loc='lower right')
plt.show()


In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import pandas as pd

# TabNet with a hard-coded Boruta feature list: 3-fold stratified CV on H1,
# then a confusion matrix, classification report and one ROC curve per fold,
# all built from out-of-fold predictions.

# Features chosen by Boruta.
selected_features = ['lead_time', 'country', 'deposit_type', 'agent', 'adr', 'total_of_special_requests']

# Training matrix / target restricted to those features.
X_train_boruta = H1_train[selected_features]
y_train = H1_train['is_canceled']

# TabNet is fed plain NumPy arrays.
X_train_boruta = X_train_boruta.values
y_train = y_train.values

# Default-configured TabNet with a fixed seed.
# NOTE(review): the same estimator object is re-fitted in every fold;
# presumably pytorch-tabnet rebuilds the network on each fit() unless
# warm_start=True is passed -- confirm for the installed version.
tabnet_model = TabNetClassifier(seed=42)

# 3-fold stratified CV with shuffling.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Out-of-fold predictions are stored at each row's ORIGINAL position.
# Appending them fold by fold (as before) puts them in fold order, which does
# not match the row order of `y_train` once the folds are shuffled.
y_pred_cv = np.zeros_like(y_train)
y_pred_proba_cv = np.zeros(len(y_train), dtype=float)

# Held-out row indices of every fold, kept for the per-fold ROC curves below.
fold_test_indices = []

for fold_count, (train_idx, test_idx) in enumerate(
        cv.split(X_train_boruta, y_train), start=1):
    print(f"Training fold {fold_count}...")
    X_train_fold, X_test_fold = X_train_boruta[train_idx], X_train_boruta[test_idx]
    y_train_fold, y_test_fold = y_train[train_idx], y_train[test_idx]

    # Fit on this fold's training rows.
    tabnet_model.fit(X_train_fold, y_train_fold, max_epochs=100, patience=20, batch_size=1024, virtual_batch_size=128)

    # Hard labels for the held-out rows.
    y_pred_fold = tabnet_model.predict(X_test_fold)
    y_pred_cv[test_idx] = y_pred_fold

    # Positive-class probabilities for the held-out rows.
    y_pred_proba_fold = tabnet_model.predict_proba(X_test_fold)[:, 1]
    y_pred_proba_cv[test_idx] = y_pred_proba_fold

    fold_test_indices.append(test_idx)

# Confusion matrix: raw counts and per-true-label row percentages.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

# Heatmap of the counts.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Row percentage drawn in each cell, offset from the cell origin by (0.2, 0.2).
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (TabNet with Boruta-selected Features)')
plt.show()

# Per-class precision / recall / F1 on the out-of-fold predictions.
print("\nClassification Report (TabNet with Boruta-selected Features):")
print(classification_report(y_train, y_pred_cv))

# One ROC curve per fold, from the probabilities produced by the model that
# did NOT see that fold during training. Re-predicting here with
# `tabnet_model` (as before) would use the model fitted in the last fold,
# which was trained on the held-out rows of the earlier folds.
plt.figure(figsize=(12, 8))
for fold_count, test_idx in enumerate(fold_test_indices, start=1):
    y_pred_proba_fold = y_pred_proba_cv[test_idx]
    fpr, tpr, _ = roc_curve(y_train[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (TabNet with Boruta-selected Features, 3-fold CV)')
plt.legend(loc='lower right')
plt.show()


### different feature combinations

feature importance top10

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import pandas as pd

# TabNet restricted to its own top-10 features: fit an initial model on all
# selected features, keep the ten with the highest built-in importance, then
# run 3-fold stratified CV and a final refit on those ten features only.

# Initial model used only to rank the features.
initial_model = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5, gamma=1.5,
    n_independent=2, n_shared=2,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.5),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    mask_type='entmax',
    seed=42
)

# Features are every selected column except the target.
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])
y_train = H1_train['is_canceled']

# Standardise the full feature matrix.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit the initial model on every training row.
initial_model.fit(
    X_train_scaled, y_train.values,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128
)

# Rank features by TabNet's importances and keep the ten highest.
# NOTE(review): the ranking uses all training rows, including those later held
# out in the CV folds, so the CV scores below are presumably optimistic.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': initial_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

top_10_features = feature_importance.head(10)['Feature'].tolist()
print("\nSelected Top 10 Features:")
print(top_10_features)

# Rebuild the training matrix with only those features. This REFITS `scaler`,
# so from here on it holds statistics for the ten selected columns only.
X_train_top10 = X_train[top_10_features]
X_train_scaled_top10 = scaler.fit_transform(X_train_top10)

# TabNet hyper-parameters (same values as the initial model).
tabnet_params = {
    'n_d': 64,
    'n_a': 64,
    'n_steps': 5,
    'gamma': 1.5,
    'n_independent': 2,
    'n_shared': 2,
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2),
    'scheduler_params': dict(mode="min",
                           patience=5,
                           min_lr=1e-5,
                           factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'mask_type': 'entmax',
    'seed': 42
}

# Out-of-fold predictions are written back at each row's original position so
# they stay aligned with `y_train`.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = np.zeros_like(y_train)
y_pred_proba_cv = np.zeros_like(y_train, dtype=float)

# One ROC curve per fold is drawn onto this figure inside the loop.
plt.figure(figsize=(12, 8))

for fold_count, (train_idx, test_idx) in enumerate(
        cv.split(X_train_scaled_top10, y_train), start=1):
    # A fresh model per fold so no state carries over between folds.
    clf = TabNetClassifier(**tabnet_params)

    # Split the scaled matrix / target for this fold.
    X_fold_train = X_train_scaled_top10[train_idx]
    y_fold_train = y_train.iloc[train_idx].values
    X_fold_test = X_train_scaled_top10[test_idx]
    y_fold_test = y_train.iloc[test_idx].values

    # NOTE(review): the held-out fold is also passed as `eval_set`, so early
    # stopping (patience=10) presumably monitors the same rows that are scored
    # below -- confirm intent.
    clf.fit(
        X_fold_train, y_fold_train,
        eval_set=[(X_fold_test, y_fold_test)],
        max_epochs=100,
        patience=10,
        batch_size=1024,
        virtual_batch_size=128
    )

    # Run inference once per fold and reuse the probabilities for storage,
    # the ROC curve and the AUC (previously computed three separate times).
    fold_proba = clf.predict_proba(X_fold_test)[:, 1]
    y_pred_cv[test_idx] = clf.predict(X_fold_test)
    y_pred_proba_cv[test_idx] = fold_proba

    # Per-fold ROC curve.
    fpr, tpr, _ = roc_curve(y_fold_test, fold_proba)
    roc_auc_fold = roc_auc_score(y_fold_test, fold_proba)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

# Finish the ROC figure.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for H1 Dataset with TabNet (Top 10 Features, 3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix over all out-of-fold predictions: counts and row percentages.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Row percentage drawn in each cell, offset from the cell origin by (0.2, 0.2).
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('H1 Confusion Matrix with TabNet (Top 10 Features)')
plt.show()

# Per-class precision / recall / F1 on the out-of-fold predictions.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# AUC over the pooled out-of-fold probabilities.
print(f"\nOverall ROC AUC Score: {roc_auc_score(y_train, y_pred_proba_cv):.4f}")

# Refit on every training row (top-10 features, no eval_set) for the
# feature-importance analysis.
final_model = TabNetClassifier(**tabnet_params)
final_model.fit(
    X_train_scaled_top10, y_train.values,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128
)

# Importances of the ten retained features, sorted from most to least important.
feature_importance = pd.DataFrame({
    'Feature': X_train_top10.columns,
    'Importance': final_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the importances.
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in TabNet Model (H1 Dataset, Top 10 Features)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Summary of the training data: size, width and class balance in percent.
print("\nDataset Information:")
print(f"Total Training Samples: {len(X_train_top10)}")
print(f"Number of Features: {X_train_top10.shape[1]}")
print(f"Class Distribution in Training Set:")
print(y_train.value_counts(normalize=True).round(4) * 100)

SHAP top10

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import pandas as pd
import shap

# Prepare the initial data: every selected column except the target.
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])
y_train = H1_train['is_canceled']

# Standardize the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Initial model trained on all features; it is only used to compute SHAP values.
initial_model = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5, gamma=1.5,
    n_independent=2, n_shared=2,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.5),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    mask_type='entmax',
    seed=42
)

# Train the initial model.
initial_model.fit(
    X_train_scaled, y_train.values,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128
)

# Compute SHAP values: a k-means summary (20 centroids) of the training data is
# the background set, and only the first 100 rows are explained.
background = shap.kmeans(X_train_scaled, k=20)
explainer = shap.KernelExplainer(initial_model.predict_proba, background)
shap_values = explainer.shap_values(X_train_scaled[:100])

# Extract the SHAP values of the positive class (index 1). Older SHAP releases
# return a list with one (samples, features) array per class, newer releases
# return a single (samples, features, classes) array; support both layouts.
if isinstance(shap_values, list):
    shap_values_positive = np.asarray(shap_values[1])
else:
    shap_values_positive = np.asarray(shap_values)
    if shap_values_positive.ndim == 3:
        shap_values_positive = shap_values_positive[:, :, 1]

# Mean absolute SHAP value per feature.
mean_abs_shap = np.mean(np.abs(shap_values_positive), axis=0)
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': mean_abs_shap
}).sort_values(by='Importance', ascending=False)

# Keep the 10 features with the largest mean absolute SHAP value.
top_10_features = feature_importance.head(10)['Feature'].tolist()
print("\nSelected Top 10 Features by SHAP:")
print(top_10_features)

# Rebuild the training matrix with the top 10 features only (the scaler is refit).
X_train_top10 = X_train[top_10_features]
X_train_scaled_top10 = scaler.fit_transform(X_train_top10)

# TabNet hyperparameters.
tabnet_params = {
    'n_d': 64,
    'n_a': 64,
    'n_steps': 5,
    'gamma': 1.5,
    'n_independent': 2,
    'n_shared': 2,
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2),
    'scheduler_params': dict(mode="min",
                           patience=5,
                           min_lr=1e-5,
                           factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'mask_type': 'entmax',
    'seed': 42
}

# Arrays that collect the out-of-fold predictions.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = np.zeros_like(y_train)
y_pred_proba_cv = np.zeros_like(y_train, dtype=float)

# Run the cross-validation; one ROC curve per fold is drawn on the same figure.
# NOTE(review): each test fold is also passed as eval_set, so it is the set
# monitored during training; the fold metrics are therefore not fully held out.
fold_count = 1
plt.figure(figsize=(12, 8))

for train_idx, test_idx in cv.split(X_train_scaled_top10, y_train):
    # Fresh TabNet model for this fold.
    clf = TabNetClassifier(**tabnet_params)

    # Slice the fold data.
    X_fold_train = X_train_scaled_top10[train_idx]
    y_fold_train = y_train.iloc[train_idx].values
    X_fold_test = X_train_scaled_top10[test_idx]
    y_fold_test = y_train.iloc[test_idx].values

    # Train the model.
    clf.fit(
        X_fold_train, y_fold_train,
        eval_set=[(X_fold_test, y_fold_test)],
        max_epochs=100,
        patience=10,
        batch_size=1024,
        virtual_batch_size=128
    )

    # Store the fold predictions; the probabilities are computed once and reused.
    fold_proba = clf.predict_proba(X_fold_test)[:, 1]
    y_pred_cv[test_idx] = clf.predict(X_fold_test)
    y_pred_proba_cv[test_idx] = fold_proba

    # Compute and draw the ROC curve of this fold.
    fpr, tpr, _ = roc_curve(y_fold_test, fold_proba)
    roc_auc_fold = roc_auc_score(y_fold_test, fold_proba)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

# Finish the ROC figure.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for H1 Dataset with TabNet (SHAP Top 10 Features, 3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix of the out-of-fold predictions, plus row-wise percentages.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Overlay the percentage of each cell near the top-left corner of the cell.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('H1 Confusion Matrix with TabNet (SHAP Top 10 Features)')
plt.show()

# Classification report.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Overall ROC AUC over all out-of-fold probabilities.
print(f"\nOverall ROC AUC Score: {roc_auc_score(y_train, y_pred_proba_cv):.4f}")

# Train a final model on the full training set for the feature-importance analysis.
final_model = TabNetClassifier(**tabnet_params)
final_model.fit(
    X_train_scaled_top10, y_train.values,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128
)

# TabNet's own feature importances.
feature_importance_final = pd.DataFrame({
    'Feature': X_train_top10.columns,
    'Importance': final_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Plot the TabNet feature importances.
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance_final, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in TabNet Model (SHAP Top 10 Features)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Plot the SHAP importance of the top 10 features. After sort_values the frame
# index still holds each feature's original column position, so its first 10
# entries select the matching SHAP columns and data columns.
top_10_positions = feature_importance.index[:10].to_numpy()
plt.figure(figsize=(12, 6))
# show=False keeps the figure open so the title below lands on the SHAP plot
# instead of on a new, empty figure.
shap.summary_plot(shap_values_positive[:, top_10_positions],
                  X_train.iloc[:100, top_10_positions],
                  plot_type="bar",
                  show=False)
plt.title('SHAP Feature Importance (Top 10 Features)')
plt.tight_layout()
plt.show()

# Print dataset information.
print("\nDataset Information:")
print(f"Total Training Samples: {len(X_train_top10)}")
print(f"Number of Features: {X_train_top10.shape[1]}")
print(f"Class Distribution in Training Set:")
print(y_train.value_counts(normalize=True).round(4) * 100)

BORUTA 4

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import pandas as pd

# Build the target vector and the initial feature matrix (target column removed).
y_train = H1_train['is_canceled']
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])

# Standardize all features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Boruta feature selection wrapped around a shallow, class-balanced random forest.
rf = RandomForestClassifier(max_depth=5, class_weight='balanced', n_jobs=-1)
boruta = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=42)
boruta.fit(X_train_scaled, y_train.values)

# Column names flagged by Boruta's support mask.
selected_features = X_train.columns[boruta.support_].tolist()
print("\nSelected Features by Boruta:")
print(selected_features)
print(f"\nTotal number of selected features: {len(selected_features)}")

# Rebuild the training matrix from the selected features (the scaler is refit).
X_train_selected = X_train[selected_features]
X_train_scaled_selected = scaler.fit_transform(X_train_selected)

# TabNet hyperparameters.
tabnet_params = dict(
    n_d=64,
    n_a=64,
    n_steps=5,
    gamma=1.5,
    n_independent=2,
    n_shared=2,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': 2e-2},
    scheduler_params={'mode': "min", 'patience': 5, 'min_lr': 1e-5, 'factor': 0.5},
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    mask_type='entmax',
    seed=42,
)

# Containers for the out-of-fold predictions.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = np.zeros_like(y_train)
y_pred_proba_cv = np.zeros_like(y_train, dtype=float)

# Cross-validation: every fold adds one ROC curve to a shared figure.
plt.figure(figsize=(12, 8))

for fold_count, (train_idx, test_idx) in enumerate(
        cv.split(X_train_scaled_selected, y_train), start=1):
    # Fresh TabNet model for this fold.
    clf = TabNetClassifier(**tabnet_params)

    # Fold data.
    X_fold_train, X_fold_test = X_train_scaled_selected[train_idx], X_train_scaled_selected[test_idx]
    y_fold_train, y_fold_test = y_train.iloc[train_idx].values, y_train.iloc[test_idx].values

    # Train; the test fold is the set monitored during training.
    clf.fit(
        X_fold_train, y_fold_train,
        eval_set=[(X_fold_test, y_fold_test)],
        max_epochs=100,
        patience=10,
        batch_size=1024,
        virtual_batch_size=128
    )

    # Out-of-fold labels and positive-class probabilities.
    y_pred_cv[test_idx] = clf.predict(X_fold_test)
    fold_proba = clf.predict_proba(X_fold_test)[:, 1]
    y_pred_proba_cv[test_idx] = fold_proba

    # ROC curve of this fold.
    roc_auc_fold = roc_auc_score(y_fold_test, fold_proba)
    fpr, tpr, _ = roc_curve(y_fold_test, fold_proba)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

# Finish the ROC figure.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.title('ROC Curve for H1 Dataset with TabNet (Boruta Selected Features, 3-fold CV)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix of the out-of-fold predictions with row-wise percentages.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1, keepdims=True) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Overlay the percentage of each cell.
for (i, j), cell_percentage in np.ndenumerate(conf_matrix_percentage):
    percentage_text = f"{cell_percentage:.1f}%"
    plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.title('H1 Confusion Matrix with TabNet (Boruta Selected Features)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Classification report.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Overall ROC AUC over all out-of-fold probabilities.
overall_auc = roc_auc_score(y_train, y_pred_proba_cv)
print(f"\nOverall ROC AUC Score: {overall_auc:.4f}")

# Final model on the full training set, used for the feature-importance analysis.
final_model = TabNetClassifier(**tabnet_params)
final_model.fit(
    X_train_scaled_selected, y_train.values,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128
)

# TabNet feature importances, largest first.
feature_importance = pd.DataFrame({
    'Feature': X_train_selected.columns,
    'Importance': final_model.feature_importances_
})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Plot the feature importances.
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in TabNet Model (H1 Dataset, Boruta Selected Features)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Dataset summary.
n_samples, n_features = len(X_train_selected), X_train_selected.shape[1]
print("\nDataset Information:")
print(f"Total Training Samples: {n_samples}")
print(f"Number of Features: {n_features}")
print(f"Class Distribution in Training Set:")
print(y_train.value_counts(normalize=True).round(4) * 100)

### hyperparameter tuning

BASELINE

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold, train_test_split
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import optuna
import pandas as pd

# Prepare the data: target vector, then the feature matrix without the target.
y_train = H1_train['is_canceled']
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])

# Standardize the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)

# Optuna objective function.
def objective(trial):
    """Train one TabNet candidate and return its validation ROC AUC.

    Hyperparameters are sampled from ``trial``; the model is trained on 80% of
    ``X_train_scaled`` and scored on the remaining 20%.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC of the positive-class probabilities on the validation split.
    """
    tabnet_params = {
        'n_d': trial.suggest_categorical('n_d', [8, 16, 32, 64]),
        'n_a': trial.suggest_categorical('n_a', [8, 16, 32, 64]),
        'n_steps': trial.suggest_int('n_steps', 3, 5),
        'gamma': trial.suggest_float('gamma', 1.0, 1.5),
        'n_independent': trial.suggest_int('n_independent', 1, 3),
        'n_shared': trial.suggest_int('n_shared', 1, 3),
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {
            'lr': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.005])
        },
        'scheduler_params': {
            'mode': 'min',
            'patience': 5,
            'min_lr': 1e-5,
            'factor': 0.5
        },
        'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
        'mask_type': 'entmax',
        'seed': 42
    }

    clf = TabNetClassifier(**tabnet_params)

    # Single hold-out split. It is stratified so both parts keep the class
    # ratio, and the targets are plain NumPy arrays like in the other fit calls.
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train_scaled, y_train.values,
        test_size=0.2, random_state=42, stratify=y_train.values
    )

    clf.fit(
        X_train_split, y_train_split,
        eval_set=[(X_val_split, y_val_split)],
        max_epochs=30,
        patience=5,
        batch_size=2048,
        virtual_batch_size=256
    )

    pred_proba = clf.predict_proba(X_val_split)[:, 1]
    return roc_auc_score(y_val_split, pred_proba)

# Run the Optuna search. The sampler is seeded so repeated runs propose the
# same trials, in line with the fixed seeds used by the objective.
print("Starting Optuna optimization...")
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=15)

# Print the best parameters.
print("\nBest parameters:", study.best_trial.params)
print("Best AUC score:", study.best_trial.value)

# Assemble the final parameter set from the best trial.
best_params = study.best_trial.params
final_params = {
    'n_d': best_params['n_d'],
    'n_a': best_params['n_a'],
    'n_steps': best_params['n_steps'],
    'gamma': best_params['gamma'],
    'n_independent': best_params['n_independent'],
    'n_shared': best_params['n_shared'],
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': {'lr': best_params['learning_rate']},
    'scheduler_params': {
        'mode': 'min',
        'patience': 5,
        'min_lr': 1e-5,
        'factor': 0.5
    },
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'mask_type': 'entmax',
    'seed': 42
}

# Train the final model with the best parameters on the full training set.
print("\nTraining final model with best parameters...")
best_model = TabNetClassifier(**final_params)
best_model.fit(
    X_train_scaled, y_train.values,
    eval_set=[(X_train_scaled, y_train.values)],
    max_epochs=100,
    batch_size=2048,
    virtual_batch_size=256
)

# Predictions and positive-class probabilities.
# NOTE(review): everything below is computed on the same data the model was
# fitted on, so it describes training fit rather than generalization.
y_pred = best_model.predict(X_train_scaled)
y_pred_proba = best_model.predict_proba(X_train_scaled)[:, 1]

# ROC curve.
plt.figure(figsize=(12, 8))
fpr, tpr, _ = roc_curve(y_train, y_pred_proba)
roc_auc = roc_auc_score(y_train, y_pred_proba)
plt.plot(fpr, tpr, label=f'Optimized TabNet (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for H1 Dataset with Optimized TabNet')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix with row-wise percentages.
conf_matrix = confusion_matrix(y_train, y_pred)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)

# Overlay the percentage of each cell.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=9)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('H1 Confusion Matrix with Optimized TabNet')
plt.show()

# Classification report.
print("\nClassification Report:")
print(classification_report(y_train, y_pred))

# Feature importances of the tuned model, largest first.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Plot the feature importances.
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in Optimized TabNet Model (H1 Dataset)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Optuna optimization history. The Optuna helper creates its own figure and
# returns its Axes, so size and title are set on that Axes instead of opening
# a separate (blank) figure first.
history_ax = optuna.visualization.matplotlib.plot_optimization_history(study)
history_ax.figure.set_size_inches(10, 6)
history_ax.set_title('Optuna Optimization History')
plt.show()

# Dataset summary.
print("\nDataset Information:")
print(f"Total Training Samples: {len(X_train)}")
print(f"Number of Features: {X_train.shape[1]}")
print(f"Class Distribution in Training Set:")
print(y_train.value_counts(normalize=True).round(4) * 100)

Feature importance top 10

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold, train_test_split
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import optuna
import pandas as pd

# Build the target vector and the initial feature matrix (target column removed).
y_train = H1_train['is_canceled']
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])

# Standardize the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Initial model trained on all features; its importances pick the top 10.
initial_model_kwargs = dict(
    n_d=64, n_a=64, n_steps=5, gamma=1.5,
    n_independent=2, n_shared=2,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': 2e-2},
    scheduler_params={'mode': "min", 'patience': 5, 'min_lr': 1e-5, 'factor': 0.5},
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    mask_type='entmax',
    seed=42,
)
initial_model = TabNetClassifier(**initial_model_kwargs)

# Fit the initial model to obtain the feature importances.
initial_model.fit(
    X_train_scaled, y_train.values,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128
)

# Rank the features by importance, largest first, and keep the first 10 names.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': initial_model.feature_importances_
})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

top_10_features = feature_importance['Feature'].iloc[:10].tolist()
print("\nSelected Top 10 Features:")
print(top_10_features)

# Rebuild the training matrix with the top 10 features only (the scaler is refit).
X_train_top10 = X_train[top_10_features]
X_train_scaled_top10 = scaler.fit_transform(X_train_top10)

# Optuna objective function.
def objective(trial):
    """Train one TabNet candidate on the top-10 features and return its validation ROC AUC.

    Hyperparameters are sampled from ``trial``; the model is trained on 80% of
    ``X_train_scaled_top10`` and scored on the remaining 20%.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC of the positive-class probabilities on the validation split.
    """
    tabnet_params = {
        'n_d': trial.suggest_categorical('n_d', [8, 16, 32, 64]),
        'n_a': trial.suggest_categorical('n_a', [8, 16, 32, 64]),
        'n_steps': trial.suggest_int('n_steps', 3, 5),
        'gamma': trial.suggest_float('gamma', 1.0, 1.5),
        'n_independent': trial.suggest_int('n_independent', 1, 3),
        'n_shared': trial.suggest_int('n_shared', 1, 3),
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {
            'lr': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.005])
        },
        'scheduler_params': {
            'mode': 'min',
            'patience': 5,
            'min_lr': 1e-5,
            'factor': 0.5
        },
        'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
        'mask_type': 'entmax',
        'seed': 42
    }

    clf = TabNetClassifier(**tabnet_params)

    # Single hold-out split. It is stratified so both parts keep the class
    # ratio, and the targets are plain NumPy arrays like in the other fit calls.
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train_scaled_top10, y_train.values,
        test_size=0.2, random_state=42, stratify=y_train.values
    )

    clf.fit(
        X_train_split, y_train_split,
        eval_set=[(X_val_split, y_val_split)],
        max_epochs=30,
        patience=5,
        batch_size=2048,
        virtual_batch_size=256
    )

    pred_proba = clf.predict_proba(X_val_split)[:, 1]
    return roc_auc_score(y_val_split, pred_proba)

# Run the Optuna search. The sampler is seeded so repeated runs propose the
# same trials, in line with the fixed seeds used by the objective.
print("Starting Optuna optimization...")
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=15)

# Print the best parameters.
print("\nBest parameters:", study.best_trial.params)
print("Best AUC score:", study.best_trial.value)

# Assemble the final parameter set from the best trial.
best_params = study.best_trial.params
final_params = {
    'n_d': best_params['n_d'],
    'n_a': best_params['n_a'],
    'n_steps': best_params['n_steps'],
    'gamma': best_params['gamma'],
    'n_independent': best_params['n_independent'],
    'n_shared': best_params['n_shared'],
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': {'lr': best_params['learning_rate']},
    'scheduler_params': {
        'mode': 'min',
        'patience': 5,
        'min_lr': 1e-5,
        'factor': 0.5
    },
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'mask_type': 'entmax',
    'seed': 42
}

# Train the final model with the best parameters on the full training set.
print("\nTraining final model with best parameters...")
best_model = TabNetClassifier(**final_params)
best_model.fit(
    X_train_scaled_top10, y_train.values,
    eval_set=[(X_train_scaled_top10, y_train.values)],
    max_epochs=100,
    batch_size=2048,
    virtual_batch_size=256
)

# Predictions and positive-class probabilities.
# NOTE(review): everything below is computed on the same data the model was
# fitted on, so it describes training fit rather than generalization.
y_pred = best_model.predict(X_train_scaled_top10)
y_pred_proba = best_model.predict_proba(X_train_scaled_top10)[:, 1]

# ROC curve.
plt.figure(figsize=(12, 8))
fpr, tpr, _ = roc_curve(y_train, y_pred_proba)
roc_auc = roc_auc_score(y_train, y_pred_proba)
plt.plot(fpr, tpr, label=f'Optimized TabNet (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for H1 Dataset with Optimized TabNet (Top 10 Features)')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix with row-wise percentages.
conf_matrix = confusion_matrix(y_train, y_pred)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)

# Overlay the percentage of each cell.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=9)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('H1 Confusion Matrix with Optimized TabNet (Top 10 Features)')
plt.show()

# Classification report.
print("\nClassification Report:")
print(classification_report(y_train, y_pred))

# Feature importances of the tuned model, largest first.
feature_importance = pd.DataFrame({
    'Feature': top_10_features,
    'Importance': best_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Plot the feature importances.
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in Optimized TabNet Model (Top 10 Features)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Optuna optimization history. The Optuna helper creates its own figure and
# returns its Axes, so size and title are set on that Axes instead of opening
# a separate (blank) figure first.
history_ax = optuna.visualization.matplotlib.plot_optimization_history(study)
history_ax.figure.set_size_inches(10, 6)
history_ax.set_title('Optuna Optimization History (Top 10 Features)')
plt.show()

# Dataset summary.
print("\nDataset Information:")
print(f"Total Training Samples: {len(X_train_top10)}")
print(f"Number of Features: {X_train_top10.shape[1]}")
print(f"Class Distribution in Training Set:")
print(y_train.value_counts(normalize=True).round(4) * 100)

BORUTA

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import optuna
import pandas as pd

# Build the target vector and the initial feature matrix (target column removed).
y_train = H1_train['is_canceled']
X_train = H1_train[selected_columns].drop(columns=['is_canceled'])

# Standardize the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Boruta feature selection wrapped around a shallow, class-balanced random forest.
rf = RandomForestClassifier(max_depth=5, class_weight='balanced', n_jobs=-1)
boruta = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=42)
boruta.fit(X_train_scaled, y_train.values)

# Column names flagged by Boruta's support mask.
boruta_mask = boruta.support_
selected_features = X_train.columns[boruta_mask].tolist()
n_selected = len(selected_features)
print("\nSelected Features by Boruta:")
print(selected_features)
print(f"\nTotal number of selected features: {n_selected}")

# Rebuild the training matrix from the selected features (the scaler is refit).
X_train_selected = X_train[selected_features]
X_train_scaled_selected = scaler.fit_transform(X_train_selected)

# Optuna objective function.
def objective(trial):
    """Train one TabNet candidate on the Boruta features and return its validation ROC AUC.

    Hyperparameters are sampled from ``trial``; the model is trained on 80% of
    ``X_train_scaled_selected`` and scored on the remaining 20%.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC of the positive-class probabilities on the validation split.
    """
    tabnet_params = {
        'n_d': trial.suggest_categorical('n_d', [8, 16, 32, 64]),
        'n_a': trial.suggest_categorical('n_a', [8, 16, 32, 64]),
        'n_steps': trial.suggest_int('n_steps', 3, 5),
        'gamma': trial.suggest_float('gamma', 1.0, 1.5),
        'n_independent': trial.suggest_int('n_independent', 1, 3),
        'n_shared': trial.suggest_int('n_shared', 1, 3),
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {
            'lr': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.005])
        },
        'scheduler_params': {
            'mode': 'min',
            'patience': 5,
            'min_lr': 1e-5,
            'factor': 0.5
        },
        'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
        'mask_type': 'entmax',
        'seed': 42
    }

    clf = TabNetClassifier(**tabnet_params)

    # Single hold-out split. It is stratified so both parts keep the class
    # ratio, and the targets are plain NumPy arrays like in the other fit calls.
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train_scaled_selected, y_train.values,
        test_size=0.2, random_state=42, stratify=y_train.values
    )

    clf.fit(
        X_train_split, y_train_split,
        eval_set=[(X_val_split, y_val_split)],
        max_epochs=30,
        patience=5,
        batch_size=2048,
        virtual_batch_size=256
    )

    pred_proba = clf.predict_proba(X_val_split)[:, 1]
    return roc_auc_score(y_val_split, pred_proba)

# Run the Optuna search. The sampler is seeded so repeated runs propose the
# same trials, in line with the fixed seeds used by the objective.
print("Starting Optuna optimization...")
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=15)

# Print the best parameters.
print("\nBest parameters:", study.best_trial.params)
print("Best AUC score:", study.best_trial.value)

# Assemble the final parameter set from the best trial.
best_params = study.best_trial.params
final_params = {
    'n_d': best_params['n_d'],
    'n_a': best_params['n_a'],
    'n_steps': best_params['n_steps'],
    'gamma': best_params['gamma'],
    'n_independent': best_params['n_independent'],
    'n_shared': best_params['n_shared'],
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': {'lr': best_params['learning_rate']},
    'scheduler_params': {
        'mode': 'min',
        'patience': 5,
        'min_lr': 1e-5,
        'factor': 0.5
    },
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'mask_type': 'entmax',
    'seed': 42
}

# Train the final model with the best parameters on the full training set.
print("\nTraining final model with best parameters...")
best_model = TabNetClassifier(**final_params)
best_model.fit(
    X_train_scaled_selected, y_train.values,
    eval_set=[(X_train_scaled_selected, y_train.values)],
    max_epochs=100,
    batch_size=2048,
    virtual_batch_size=256
)

# Predictions and positive-class probabilities.
# NOTE(review): everything below is computed on the same data the model was
# fitted on, so it describes training fit rather than generalization.
y_pred = best_model.predict(X_train_scaled_selected)
y_pred_proba = best_model.predict_proba(X_train_scaled_selected)[:, 1]

# ROC curve.
plt.figure(figsize=(12, 8))
fpr, tpr, _ = roc_curve(y_train, y_pred_proba)
roc_auc = roc_auc_score(y_train, y_pred_proba)
plt.plot(fpr, tpr, label=f'Optimized TabNet (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for H1 Dataset with Optimized TabNet (Boruta Features)')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix with row-wise percentages.
conf_matrix = confusion_matrix(y_train, y_pred)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)

# Overlay the percentage of each cell.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=9)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('H1 Confusion Matrix with Optimized TabNet (Boruta Features)')
plt.show()

# Classification report.
print("\nClassification Report:")
print(classification_report(y_train, y_pred))

# Feature importances of the tuned model, largest first.
feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': best_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Plot the feature importances.
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in Optimized TabNet Model (Boruta Features)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Optuna optimization history. The Optuna helper creates its own figure and
# returns its Axes, so size and title are set on that Axes instead of opening
# a separate (blank) figure first.
history_ax = optuna.visualization.matplotlib.plot_optimization_history(study)
history_ax.figure.set_size_inches(10, 6)
history_ax.set_title('Optuna Optimization History (Boruta Features)')
plt.show()

# Dataset summary.
print("\nDataset Information:")
print(f"Total Training Samples: {len(X_train_selected)}")
print(f"Number of Features: {X_train_selected.shape[1]}")
print(f"Class Distribution in Training Set:")
print(y_train.value_counts(normalize=True).round(4) * 100)

### test set

BASELINE

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import pandas as pd


# 最佳参数配置
best_params = {
    'n_d': 64,
    'n_a': 32,
    'n_steps': 3,
    'gamma': 1.0072198139700859,
    'n_independent': 1,
    'n_shared': 1,
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': {'lr': 0.02},
    'scheduler_params': {
        'mode': 'min',
        'patience': 5,
        'min_lr': 1e-5,
        'factor': 0.5
    },
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'mask_type': 'entmax',
    'seed': 42
}

# 准备数据
# H1数据
X_h1_train = H1_train[selected_columns].drop(columns=['is_canceled'])
y_h1_train = H1_train['is_canceled']
X_h1_test = H1_test[selected_columns].drop(columns=['is_canceled'])
y_h1_test = H1_test['is_canceled']

# H2数据
X_h2_test = H2_test[selected_columns].drop(columns=['is_canceled'])
y_h2_test = H2_test['is_canceled']

# 标准化数据
scaler = StandardScaler()
X_h1_train_scaled = scaler.fit_transform(X_h1_train)
X_h1_test_scaled = scaler.transform(X_h1_test)
X_h2_test_scaled = scaler.transform(X_h2_test)

# 训练模型
print("Training model on H1 training data...")
model = TabNetClassifier(**best_params)
model.fit(
    X_h1_train_scaled, y_h1_train,
    eval_set=[(X_h1_train_scaled, y_h1_train)],
    max_epochs=100,
    batch_size=2048,
    virtual_batch_size=256
)

# 在H1测试集上评估
print("\nEvaluating on H1 test set...")
y_h1_pred = model.predict(X_h1_test_scaled)
y_h1_pred_proba = model.predict_proba(X_h1_test_scaled)[:, 1]

# 在H2测试集上评估
print("\nEvaluating on H2 test set...")
y_h2_pred = model.predict(X_h2_test_scaled)
y_h2_pred_proba = model.predict_proba(X_h2_test_scaled)[:, 1]

# 绘制ROC曲线对比
plt.figure(figsize=(12, 8))
# H1 ROC
fpr_h1, tpr_h1, _ = roc_curve(y_h1_test, y_h1_pred_proba)
roc_auc_h1 = roc_auc_score(y_h1_test, y_h1_pred_proba)
plt.plot(fpr_h1, tpr_h1, label=f'H1 Test (AUC = {roc_auc_h1:.2f})', color='blue')

# H2 ROC
fpr_h2, tpr_h2, _ = roc_curve(y_h2_test, y_h2_pred_proba)
roc_auc_h2 = roc_auc_score(y_h2_test, y_h2_pred_proba)
plt.plot(fpr_h2, tpr_h2, label=f'H2 Test (AUC = {roc_auc_h2:.2f})', color='red')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison: H1 vs H2 Test Sets')
plt.legend(loc='lower right')
plt.show()

# 创建混淆矩阵对比图
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# H1混淆矩阵
conf_matrix_h1 = confusion_matrix(y_h1_test, y_h1_pred)
conf_matrix_percentage_h1 = conf_matrix_h1 / conf_matrix_h1.sum(axis=1).reshape(-1, 1) * 100

sns.heatmap(conf_matrix_h1, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax1)
for i in range(conf_matrix_h1.shape[0]):
    for j in range(conf_matrix_h1.shape[1]):
        percentage_text = f"{conf_matrix_percentage_h1[i, j]:.1f}%"
        ax1.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=9)
ax1.set_title('H1 Test Set Confusion Matrix')
ax1.set_xlabel('Predicted Label')
ax1.set_ylabel('True Label')

# H2混淆矩阵
conf_matrix_h2 = confusion_matrix(y_h2_test, y_h2_pred)
conf_matrix_percentage_h2 = conf_matrix_h2 / conf_matrix_h2.sum(axis=1).reshape(-1, 1) * 100

sns.heatmap(conf_matrix_h2, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax2)
for i in range(conf_matrix_h2.shape[0]):
    for j in range(conf_matrix_h2.shape[1]):
        percentage_text = f"{conf_matrix_percentage_h2[i, j]:.1f}%"
        ax2.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=9)
ax2.set_title('H2 Test Set Confusion Matrix')
ax2.set_xlabel('Predicted Label')
ax2.set_ylabel('True Label')

plt.tight_layout()
plt.show()

# Compare the text metrics of the two test sets.
def print_metrics(y_true, y_pred, y_pred_proba, dataset_name):
    """Print the classification report and ROC AUC score for one dataset.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, passed to ``classification_report``.
        y_pred_proba: Scores passed to ``roc_auc_score``.
        dataset_name: Label used in the printed header.
    """
    print(f"\n{dataset_name} Metrics:")
    print("Classification Report:")
    report = classification_report(y_true, y_pred)
    print(report)
    auc_value = roc_auc_score(y_true, y_pred_proba)
    print(f"ROC AUC Score: {auc_value:.4f}")


for split_name, split_truth, split_labels, split_scores in (
    ("H1 Test Set", y_h1_test, y_h1_pred, y_h1_pred_proba),
    ("H2 Test Set", y_h2_test, y_h2_pred, y_h2_pred_proba),
):
    print_metrics(split_truth, split_labels, split_scores, split_name)

# Feature-importance analysis: pair every training column with the score
# exposed by `model.feature_importances_`, then rank from highest to lowest.
feature_importance = pd.DataFrame({
    'Feature': X_h1_train.columns,
    'Importance': model.feature_importances_,
})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranked scores.
plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance, palette='viridis')
plt.title('Feature Importance in TabNet Model')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()


# 更新最后的性能指标对比部分
from sklearn.metrics import precision_score, recall_score, f1_score

# Compute every headline metric for both test sets (list order: H1, then H2).
metrics = {
    'Accuracy': [accuracy_score(y_h1_test, y_h1_pred), accuracy_score(y_h2_test, y_h2_pred)],
    'ROC AUC': [roc_auc_score(y_h1_test, y_h1_pred_proba), roc_auc_score(y_h2_test, y_h2_pred_proba)],
    'Precision': [precision_score(y_h1_test, y_h1_pred), precision_score(y_h2_test, y_h2_pred)],
    'Recall': [recall_score(y_h1_test, y_h1_pred), recall_score(y_h2_test, y_h2_pred)],
    'F1 Score': [f1_score(y_h1_test, y_h1_pred), f1_score(y_h2_test, y_h2_pred)]
}

df_metrics = pd.DataFrame(metrics, index=['H1 Test', 'H2 Test'])

# Larger figure so all metrics fit.
plt.figure(figsize=(15, 8))

# Grouped bar chart: one group per dataset, one bar per metric.
# The bar width is derived from the number of metrics so that a whole group
# spans less than one x unit. With the previous fixed width of 0.35, five bars
# spanned 1.75 units and the H1 bars were drawn on top of the H2 group.
n_metrics = len(df_metrics.columns)
group_width = 0.8
bar_width = group_width / n_metrics
x = np.arange(len(df_metrics.index))

# One colour per metric (cycled if there are more metrics than colours).
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEEAD']
for i, (metric, values) in enumerate(df_metrics.items()):
    positions = x + i * bar_width
    plt.bar(positions, values, bar_width,
            label=metric, color=colors[i % len(colors)])
    # Value label centred on top of each bar; uses the bar's own x position
    # so labels stay aligned with the bars whatever the width is.
    for position, value in zip(positions, values):
        plt.text(position, value, f'{value:.3f}',
                 ha='center', va='bottom', rotation=0)

# Chart styling.
plt.title('Performance Metrics Comparison: H1 vs H2', fontsize=14, pad=20)
plt.xlabel('Dataset', fontsize=12)
plt.ylabel('Score', fontsize=12)
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')

# Put each dataset tick under the middle of its group of bars.
plt.xticks(x + bar_width * (n_metrics - 1) / 2, df_metrics.index)

# Adjust the layout so the legend outside the axes stays visible.
plt.tight_layout()
plt.show()

# Print the metric comparison table.
print("\nDetailed Performance Metrics:")
print(df_metrics.round(4))

# Print the size and class balance (in percent) of each test set.
print("\nDataset Information:")
for section_heading, feature_frame, label_series in (
    ("H1 Test Set:", X_h1_test, y_h1_test),
    ("\nH2 Test Set:", X_h2_test, y_h2_test),
):
    print(section_heading)
    print(f"Total Samples: {len(feature_frame)}")
    print("Class Distribution:")
    class_share = label_series.value_counts(normalize=True).round(4) * 100
    print(class_share)

In [None]:
# 更新性能指标可视化部分
import numpy as np

# Prepare the data: one [H1, H2] pair of scores per metric.
datasets = ['H1 Test', 'H2 Test']
label_scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}
metrics = {
    metric_name: [scorer(y_h1_test, y_h1_pred), scorer(y_h2_test, y_h2_pred)]
    for metric_name, scorer in label_scorers.items()
}
# ROC AUC is computed from the probability scores rather than the hard labels.
metrics['ROC AUC'] = [roc_auc_score(y_h1_test, y_h1_pred_proba),
                      roc_auc_score(y_h2_test, y_h2_pred_proba)]

# Figure set-up.
plt.figure(figsize=(12, 10))
x = np.arange(len(datasets))
width = 0.15  # width of a single bar

# Colour scheme, one entry per metric.
colors = ['#4e79a7', '#f28e2b', '#59a14f', '#e15759', '#76b7b2']

# Draw one bar series per metric, each shifted right by one bar width.
for multiplier, (attribute, values) in enumerate(metrics.items()):
    offset = width * multiplier
    rects = plt.bar(x + offset, values, width, label=attribute, color=colors[multiplier])

# Chart styling.
plt.ylabel('Score')
plt.title('Performance Metrics Comparison: H1 vs H2')
plt.xticks(x + width * 2, datasets)  # tick under the middle of each group
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
plt.grid(True, axis='y', linestyle='--', alpha=0.3)


def autolabel(rects):
    """Write each bar's height (3 decimals) as vertical text above the bar."""
    for bar in rects:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width()/2.
        plt.text(bar_center, bar_top, f'{bar_top:.3f}',
                 ha='center', va='bottom', rotation=90)


# Label every bar series on the current axes.
for container in plt.gca().containers:
    autolabel(container)

# Adjust layout and render.
plt.tight_layout()
plt.show()

# Print the detailed metric table.
df_metrics = pd.DataFrame(metrics, index=datasets)
print("\nDetailed Performance Metrics:")
print(df_metrics.round(4))

feature importance top10

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                           roc_auc_score, roc_curve, precision_score, recall_score, f1_score)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import pandas as pd

# Prepare the data.
# H1 data: features are `selected_columns` minus the target column.
X_h1_train = H1_train[selected_columns].drop(columns=['is_canceled'])
y_h1_train = H1_train['is_canceled']
X_h1_test = H1_test[selected_columns].drop(columns=['is_canceled'])
y_h1_test = H1_test['is_canceled']

# H2 data: only a test split is used here (cross-dataset evaluation).
X_h2_test = H2_test[selected_columns].drop(columns=['is_canceled'])
y_h2_test = H2_test['is_canceled']

# Hyper-parameters for the final model (not used by `initial_model` below).
# NOTE(review): presumably the result of an earlier tuning run — confirm.
best_params = {
    'n_d': 8,
    'n_a': 32,
    'n_steps': 4,
    'gamma': 1.1704703445472,
    'n_independent': 3,
    'n_shared': 2,
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': {'lr': 0.02},
    'scheduler_params': {
        'mode': 'min',
        'patience': 5,
        'min_lr': 1e-5,
        'factor': 0.5
    },
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'mask_type': 'entmax',
    'seed': 42
}

# First obtain the top-10 features: this model is trained on all features and
# is only used for its `feature_importances_`.
initial_model = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5, gamma=1.5,
    n_independent=2, n_shared=2,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.5),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    mask_type='entmax',
    seed=42
)

# Standardise the full H1 training matrix.
scaler = StandardScaler()
X_h1_train_scaled = scaler.fit_transform(X_h1_train)

# Train the initial model to get feature importances.
# NOTE(review): no `eval_set` is passed, so `patience=10` presumably cannot
# trigger early stopping — confirm against the pytorch-tabnet documentation.
initial_model.fit(
    X_h1_train_scaled, y_h1_train.values,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128
)

# Rank the features by importance, highest first.
feature_importance = pd.DataFrame({
    'Feature': X_h1_train.columns,
    'Importance': initial_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Keep the names of the ten highest-ranked features.
top_10_features = feature_importance.head(10)['Feature'].tolist()
print("\nSelected Top 10 Features:")
print(top_10_features)

# Restrict all three feature matrices to the selected top-10 columns.
X_h1_train_top10, X_h1_test_top10, X_h2_test_top10 = (
    frame[top_10_features] for frame in (X_h1_train, X_h1_test, X_h2_test)
)

# Standardise: fit a fresh scaler on the reduced H1 training matrix only and
# apply those statistics to the training set and both test sets.
scaler = StandardScaler()
scaler.fit(X_h1_train_top10)
X_h1_train_scaled, X_h1_test_scaled, X_h2_test_scaled = (
    scaler.transform(frame)
    for frame in (X_h1_train_top10, X_h1_test_top10, X_h2_test_top10)
)

# Train the final model with `best_params` on the top-10 features.
final_model = TabNetClassifier(**best_params)
final_model.fit(
    X_h1_train_scaled,
    y_h1_train.values,
    max_epochs=100,
    batch_size=2048,
    virtual_batch_size=256,
)

# Predict hard labels and the second `predict_proba` column for both test sets.
y_h1_pred = final_model.predict(X_h1_test_scaled)
y_h1_pred_proba = final_model.predict_proba(X_h1_test_scaled)[:, 1]
y_h2_pred = final_model.predict(X_h2_test_scaled)
y_h2_pred_proba = final_model.predict_proba(X_h2_test_scaled)[:, 1]

# Overlay the ROC curves of the H1 and H2 test sets on one figure.
plt.figure(figsize=(12, 8))

# Curve coordinates and AUC per test set (H1 first, then H2).
fpr_h1, tpr_h1, _ = roc_curve(y_h1_test, y_h1_pred_proba)
roc_auc_h1 = roc_auc_score(y_h1_test, y_h1_pred_proba)
fpr_h2, tpr_h2, _ = roc_curve(y_h2_test, y_h2_pred_proba)
roc_auc_h2 = roc_auc_score(y_h2_test, y_h2_pred_proba)

for roc_x, roc_y, roc_label, roc_color in (
    (fpr_h1, tpr_h1, f'H1 Test (AUC = {roc_auc_h1:.2f})', 'blue'),
    (fpr_h2, tpr_h2, f'H2 Test (AUC = {roc_auc_h2:.2f})', 'red'),
):
    plt.plot(roc_x, roc_y, label=roc_label, color=roc_color)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison: H1 vs H2 Test Sets (Top 10 Features)')
plt.legend(loc='lower right')
plt.show()

# Side-by-side confusion matrices for the two test sets.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Raw counts plus row-normalised percentages (row / row total * 100).
conf_matrix_h1 = confusion_matrix(y_h1_test, y_h1_pred)
conf_matrix_percentage_h1 = conf_matrix_h1 / conf_matrix_h1.sum(axis=1).reshape(-1, 1) * 100
conf_matrix_h2 = confusion_matrix(y_h2_test, y_h2_pred)
conf_matrix_percentage_h2 = conf_matrix_h2 / conf_matrix_h2.sum(axis=1).reshape(-1, 1) * 100

for panel_ax, counts, percentages, panel_title in (
    (ax1, conf_matrix_h1, conf_matrix_percentage_h1,
     'H1 Test Set Confusion Matrix (Top 10 Features)'),
    (ax2, conf_matrix_h2, conf_matrix_percentage_h2,
     'H2 Test Set Confusion Matrix (Top 10 Features)'),
):
    # Heatmap annotated with the integer counts.
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', cbar=False, ax=panel_ax)
    # Row percentage of every cell as green text at (j + 0.2, i + 0.2).
    for (i, j), pct in np.ndenumerate(percentages):
        panel_ax.text(j + 0.2, i + 0.2, f"{pct:.1f}%",
                      ha='center', va='center', color='green', fontsize=9)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Predicted Label')
    panel_ax.set_ylabel('True Label')

plt.tight_layout()
plt.show()

# Metric comparison chart: one [H1, H2] pair of scores per metric.
datasets = ['H1 Test', 'H2 Test']
label_scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}
metrics = {
    metric_name: [scorer(y_h1_test, y_h1_pred), scorer(y_h2_test, y_h2_pred)]
    for metric_name, scorer in label_scorers.items()
}
# ROC AUC is computed from the probability scores rather than the hard labels.
metrics['ROC AUC'] = [roc_auc_score(y_h1_test, y_h1_pred_proba),
                      roc_auc_score(y_h2_test, y_h2_pred_proba)]

plt.figure(figsize=(12, 10))
x = np.arange(len(datasets))
width = 0.15
colors = ['#4e79a7', '#f28e2b', '#59a14f', '#e15759', '#76b7b2']

# One bar series per metric, each shifted right by one bar width.
for multiplier, (attribute, values) in enumerate(metrics.items()):
    offset = width * multiplier
    rects = plt.bar(x + offset, values, width, label=attribute, color=colors[multiplier])

plt.ylabel('Score')
plt.title('Performance Metrics Comparison: H1 vs H2 (Top 10 Features)')
plt.xticks(x + width * 2, datasets)
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
plt.grid(True, axis='y', linestyle='--', alpha=0.3)


def autolabel(rects):
    """Write each bar's height (3 decimals) as vertical text above the bar."""
    for bar in rects:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width()/2.
        plt.text(bar_center, bar_top, f'{bar_top:.3f}',
                 ha='center', va='bottom', rotation=90)


for container in plt.gca().containers:
    autolabel(container)

plt.tight_layout()
plt.show()

# Detailed metric table.
df_metrics = pd.DataFrame(metrics, index=datasets)
print("\nDetailed Performance Metrics:")
print(df_metrics.round(4))

# Per-class classification reports.
for report_heading, report_truth, report_labels in (
    ("\nH1 Test Set Classification Report:", y_h1_test, y_h1_pred),
    ("\nH2 Test Set Classification Report:", y_h2_test, y_h2_pred),
):
    print(report_heading)
    print(classification_report(report_truth, report_labels))

BORUTA

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                           roc_auc_score, roc_curve, precision_score, recall_score, f1_score)
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import pandas as pd

# Prepare the data.
# H1 data: features are `selected_columns` minus the target column.
X_h1_train = H1_train[selected_columns].drop(columns=['is_canceled'])
y_h1_train = H1_train['is_canceled']
X_h1_test = H1_test[selected_columns].drop(columns=['is_canceled'])
y_h1_test = H1_test['is_canceled']

# H2 data: only a test split is used here (cross-dataset evaluation).
X_h2_test = H2_test[selected_columns].drop(columns=['is_canceled'])
y_h2_test = H2_test['is_canceled']

# Hyper-parameters for the final TabNet model trained after feature selection.
# NOTE(review): presumably the result of an earlier tuning run — confirm.
best_params = {
    'n_d': 64,
    'n_a': 64,
    'n_steps': 5,
    'gamma': 1.0003359875270394,
    'n_independent': 3,
    'n_shared': 3,
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': {'lr': 0.02},
    'scheduler_params': {
        'mode': 'min',
        'patience': 5,
        'min_lr': 1e-5,
        'factor': 0.5
    },
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'mask_type': 'entmax',
    'seed': 42
}

# Standardise the full H1 training matrix as input for Boruta.
scaler = StandardScaler()
X_h1_train_scaled = scaler.fit_transform(X_h1_train)

# Feature selection with Boruta, wrapped around a shallow, class-balanced
# random forest.
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
boruta = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=42)

# Run Boruta on the H1 training data only.
boruta.fit(X_h1_train_scaled, y_h1_train.values)

# Column names whose entry in the boolean mask `boruta.support_` is True.
selected_features = X_h1_train.columns[boruta.support_].tolist()
print("\nSelected Features by Boruta:")
print(selected_features)
print(f"\nTotal number of selected features: {len(selected_features)}")

# Restrict all three feature matrices to the Boruta-selected columns.
X_h1_train_selected, X_h1_test_selected, X_h2_test_selected = (
    frame[selected_features] for frame in (X_h1_train, X_h1_test, X_h2_test)
)

# Standardise: fit a fresh scaler on the reduced H1 training matrix only and
# apply those statistics to the training set and both test sets.
scaler = StandardScaler()
scaler.fit(X_h1_train_selected)
X_h1_train_scaled, X_h1_test_scaled, X_h2_test_scaled = (
    scaler.transform(frame)
    for frame in (X_h1_train_selected, X_h1_test_selected, X_h2_test_selected)
)

# Train the final model with `best_params` on the selected features.
final_model = TabNetClassifier(**best_params)
final_model.fit(
    X_h1_train_scaled,
    y_h1_train.values,
    max_epochs=100,
    batch_size=2048,
    virtual_batch_size=256,
)

# Predict hard labels and the second `predict_proba` column for both test sets.
y_h1_pred = final_model.predict(X_h1_test_scaled)
y_h1_pred_proba = final_model.predict_proba(X_h1_test_scaled)[:, 1]
y_h2_pred = final_model.predict(X_h2_test_scaled)
y_h2_pred_proba = final_model.predict_proba(X_h2_test_scaled)[:, 1]

# Overlay the ROC curves of the H1 and H2 test sets on one figure.
plt.figure(figsize=(12, 8))

# Curve coordinates and AUC per test set (H1 first, then H2).
fpr_h1, tpr_h1, _ = roc_curve(y_h1_test, y_h1_pred_proba)
roc_auc_h1 = roc_auc_score(y_h1_test, y_h1_pred_proba)
fpr_h2, tpr_h2, _ = roc_curve(y_h2_test, y_h2_pred_proba)
roc_auc_h2 = roc_auc_score(y_h2_test, y_h2_pred_proba)

for roc_x, roc_y, roc_label, roc_color in (
    (fpr_h1, tpr_h1, f'H1 Test (AUC = {roc_auc_h1:.2f})', 'blue'),
    (fpr_h2, tpr_h2, f'H2 Test (AUC = {roc_auc_h2:.2f})', 'red'),
):
    plt.plot(roc_x, roc_y, label=roc_label, color=roc_color)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison: H1 vs H2 Test Sets (Boruta Features)')
plt.legend(loc='lower right')
plt.show()

# Side-by-side confusion matrices for the two test sets.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Raw counts plus row-normalised percentages (row / row total * 100).
conf_matrix_h1 = confusion_matrix(y_h1_test, y_h1_pred)
conf_matrix_percentage_h1 = conf_matrix_h1 / conf_matrix_h1.sum(axis=1).reshape(-1, 1) * 100
conf_matrix_h2 = confusion_matrix(y_h2_test, y_h2_pred)
conf_matrix_percentage_h2 = conf_matrix_h2 / conf_matrix_h2.sum(axis=1).reshape(-1, 1) * 100

for panel_ax, counts, percentages, panel_title in (
    (ax1, conf_matrix_h1, conf_matrix_percentage_h1,
     'H1 Test Set Confusion Matrix (Boruta Features)'),
    (ax2, conf_matrix_h2, conf_matrix_percentage_h2,
     'H2 Test Set Confusion Matrix (Boruta Features)'),
):
    # Heatmap annotated with the integer counts.
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', cbar=False, ax=panel_ax)
    # Row percentage of every cell as green text at (j + 0.2, i + 0.2).
    for (i, j), pct in np.ndenumerate(percentages):
        panel_ax.text(j + 0.2, i + 0.2, f"{pct:.1f}%",
                      ha='center', va='center', color='green', fontsize=9)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Predicted Label')
    panel_ax.set_ylabel('True Label')

plt.tight_layout()
plt.show()

# Metric comparison chart: one [H1, H2] pair of scores per metric.
datasets = ['H1 Test', 'H2 Test']
label_scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}
metrics = {
    metric_name: [scorer(y_h1_test, y_h1_pred), scorer(y_h2_test, y_h2_pred)]
    for metric_name, scorer in label_scorers.items()
}
# ROC AUC is computed from the probability scores rather than the hard labels.
metrics['ROC AUC'] = [roc_auc_score(y_h1_test, y_h1_pred_proba),
                      roc_auc_score(y_h2_test, y_h2_pred_proba)]

plt.figure(figsize=(12, 10))
x = np.arange(len(datasets))
width = 0.15
colors = ['#4e79a7', '#f28e2b', '#59a14f', '#e15759', '#76b7b2']

# One bar series per metric, each shifted right by one bar width.
for multiplier, (attribute, values) in enumerate(metrics.items()):
    offset = width * multiplier
    rects = plt.bar(x + offset, values, width, label=attribute, color=colors[multiplier])

plt.ylabel('Score')
plt.title('Performance Metrics Comparison: H1 vs H2 (Boruta Features)')
plt.xticks(x + width * 2, datasets)
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
plt.grid(True, axis='y', linestyle='--', alpha=0.3)


def autolabel(rects):
    """Write each bar's height (3 decimals) as vertical text above the bar."""
    for bar in rects:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width()/2.
        plt.text(bar_center, bar_top, f'{bar_top:.3f}',
                 ha='center', va='bottom', rotation=90)


for container in plt.gca().containers:
    autolabel(container)

plt.tight_layout()
plt.show()

# Detailed metric table.
df_metrics = pd.DataFrame(metrics, index=datasets)
print("\nDetailed Performance Metrics:")
print(df_metrics.round(4))

# Per-class classification reports.
for report_heading, report_truth, report_labels in (
    ("\nH1 Test Set Classification Report:", y_h1_test, y_h1_pred),
    ("\nH2 Test Set Classification Report:", y_h2_test, y_h2_pred),
):
    print(report_heading)
    print(classification_report(report_truth, report_labels))

### cross-dataset evaluation: H2 test set

in test set

BASELINE

feature importance top10

SHAP top10

BORUTA

#H2

## 1. Logistic Regression





### baseline model

#### imbalanced data

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Define features and target from H2_train.
# NOTE(review): this cell sits under the "Logistic Regression" / "baseline
# model" headings but fits a RandomForestClassifier — confirm which model was
# intended here.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardise the data (random forests usually do not need it; kept for
# consistency with the other cells). The scaler is fitted on the whole
# training set, i.e. before the cross-validation split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Create the random forest model.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # n_estimators and other hyper-parameters can be tuned

# 3-fold stratified cross-validation: out-of-fold labels and probabilities.
# The two calls each refit the model on every fold.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(rf_model, X_train_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(rf_model, X_train_scaled, y_train, cv=cv, method='predict_proba')[:, 1]

# Confusion matrix: raw counts and row-normalised percentages.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

# Plot the confusion matrix.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Show the row percentage inside each cell, drawn at (j + 0.2, i + 0.2).
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Random Forest)')
plt.show()

# Print the classification report for the out-of-fold predictions.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Plot one ROC curve per fold. The folds are regenerated from the same `cv`
# object and the model is refitted on each training part.
plt.figure(figsize=(12, 8))
fold_count = 1
for train_idx, test_idx in cv.split(X_train_scaled, y_train):
    rf_model.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = rf_model.predict_proba(X_train_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

# Diagonal reference line and axis labels.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Random Forest, 3-fold CV)')
plt.legend(loc='lower right')
plt.show()

#### handling imbalanced datasets

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Local import: the imbalanced-learn pipeline applies a sampler only while
# fitting, which is what allows resampling inside each cross-validation fold.
from imblearn.pipeline import Pipeline as ImbPipeline

# Assumes H2_train and selected_columns were defined by earlier cells.
# Define features and target.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Globally standardised copy. Kept so the notebook-level names `scaler` and
# `X_train_scaled` stay defined as before; the cross-validation below scales
# inside each fold instead of using it.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Balancing strategies to compare, including the untouched data.
samplers = {
    'No Sampling': None,  # untouched data
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

# One dict of metrics per balancing method.
metrics_data = []

# Cross-validation set-up.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Resample, train and evaluate for every balancing method.
for method, sampler in samplers.items():
    print(f"=== {method} ===")

    # Logistic regression model.
    log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)

    # Scaling and (optional) resampling are pipeline steps, so they are fitted
    # on the training part of each fold only. Resampling the whole data set
    # before splitting put copies / SMOTE neighbours of validation rows into
    # the training folds and scored the model on a synthetic, balanced class
    # distribution, which inflated the scores of the resampling methods.
    steps = [('scaler', StandardScaler())]
    if sampler is not None:
        steps.append(('sampler', sampler))
    steps.append(('classifier', log_reg_model))
    cv_pipeline = ImbPipeline(steps)

    # Out-of-fold predictions for the ORIGINAL training rows.
    y_pred_cv = cross_val_predict(cv_pipeline, X_train, y_train, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(cv_pipeline, X_train, y_train, cv=cv, method='predict_proba')[:, 1]

    # Metrics are computed against the original labels, so every method is
    # judged on the same samples and class distribution.
    accuracy = accuracy_score(y_train, y_pred_cv)
    precision = precision_score(y_train, y_pred_cv, pos_label=1)
    recall = recall_score(y_train, y_pred_cv, pos_label=1)
    f1 = f1_score(y_train, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_train, y_pred_proba_cv)

    # Confusion matrix: raw counts and row-normalised percentages.
    conf_matrix = confusion_matrix(y_train, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    # Plot the confusion matrix.
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Show the row percentage inside each cell.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix with Counts and Percentages ({method})')
    plt.show()

    # Print the classification report.
    report = classification_report(y_train, y_pred_cv)
    print(report)

    # Plot the ROC curve of the pooled out-of-fold probabilities.
    fpr, tpr, _ = roc_curve(y_train, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # Store the metrics of this method.
    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Convert to a DataFrame (best ROC AUC first) and show the summary.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print(metrics_df)

# Bar chart of the summary table.
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


SMOTE gives the best results.

### different feature combinations

coefficient

1116

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Local import: the imbalanced-learn pipeline applies a sampler only while
# fitting, which is what allows resampling inside each cross-validation fold.
from imblearn.pipeline import Pipeline as ImbPipeline

# Assumes H2_train and selected_columns were defined by earlier cells.
# Define features and target.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardise all features for the coefficient-based ranking.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit a logistic regression on the unbalanced data to rank the features.
# NOTE(review): this ranking still uses the full training set, so the feature
# selection itself happens outside the cross-validation below.
log_reg_model_raw = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model_raw.fit(X_train_scaled, y_train)

# Rank features by the absolute value of their coefficient and keep the top 10.
coefficients = log_reg_model_raw.coef_[0]
features = X_train.columns  # column names of X_train are used as feature names
coef_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})
coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
coef_df_sorted = coef_df.sort_values(by='Abs_Coefficient', ascending=False).head(10)

# Extract the top-10 features. `X_train_scaled_top` is kept so the
# notebook-level name stays defined as before; the cross-validation below
# scales inside each fold instead of using it.
top_features = coef_df_sorted['Feature'].tolist()
X_train_top = X_train[top_features]
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Balancing strategies to compare.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for method, sampler in samplers.items():
    print(f"=== {method} ===")

    log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)

    # Scaling and (optional) resampling are pipeline steps, so they are fitted
    # on the training part of each fold only. Resampling the whole data set
    # before splitting put copies / SMOTE neighbours of validation rows into
    # the training folds and scored the model on a synthetic, balanced class
    # distribution, which inflated the scores of the resampling methods.
    steps = [('scaler', StandardScaler())]
    if sampler is not None:
        steps.append(('sampler', sampler))
    steps.append(('classifier', log_reg_model))
    cv_pipeline = ImbPipeline(steps)

    # Out-of-fold predictions for the ORIGINAL training rows.
    y_pred_cv = cross_val_predict(cv_pipeline, X_train_top, y_train, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(cv_pipeline, X_train_top, y_train, cv=cv, method='predict_proba')[:, 1]

    # Metrics are computed against the original labels, so every method is
    # judged on the same samples and class distribution.
    accuracy = accuracy_score(y_train, y_pred_cv)
    precision = precision_score(y_train, y_pred_cv, pos_label=1)
    recall = recall_score(y_train, y_pred_cv, pos_label=1)
    f1 = f1_score(y_train, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_train, y_pred_proba_cv)

    # Confusion matrix: raw counts and row-normalised percentages.
    conf_matrix = confusion_matrix(y_train, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Show the row percentage inside each cell.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix with Counts and Percentages ({method})')
    plt.show()

    report = classification_report(y_train, y_pred_cv)
    print(report)

    # ROC curve of the pooled out-of-fold probabilities.
    fpr, tpr, _ = roc_curve(y_train, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Summary table, best ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print(metrics_df)

# Bar chart of the summary table.
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods (Top 10 Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



SHAP

In [None]:
import shap

# Compute SHAP values.
# NOTE(review): `log_reg_model_smote` and `X_smote` are not defined in this
# cell — presumably a model fitted on SMOTE-resampled data in another cell;
# confirm they refer to the H2 data used in this section.
explainer = shap.LinearExplainer(log_reg_model_smote, X_smote, feature_perturbation="interventional")
shap_values = explainer.shap_values(X_smote)

# Plot the SHAP feature-importance bar chart, labelled with `features`.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_smote, feature_names=features, plot_type="bar")


In [None]:
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Compute SHAP values.
# NOTE(review): `log_reg_model_smote`, `X_smote`, `features`, `X_train`,
# `X_train_scaled` and `y_train` all come from other cells — confirm they
# describe the same dataset and column order.
explainer = shap.LinearExplainer(log_reg_model_smote, X_smote, feature_perturbation="interventional")
shap_values = explainer.shap_values(X_smote)

# Plot the SHAP feature-importance bar chart.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_smote, feature_names=features, plot_type="bar")

# Top-10 SHAP features: rank by the mean absolute SHAP value per column.
shap_sum = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame([features, shap_sum]).T
importance_df.columns = ['Feature', 'SHAP Importance']
importance_df.sort_values(by='SHAP Importance', ascending=False, inplace=True)
top10_shap_features = importance_df.head(10)['Feature'].values

# Keep the top-10 columns of the scaled matrix via a boolean column mask
# (columns stay in their original X_train order, not in ranking order).
X_train_top10_shap = X_train_scaled[:, X_train.columns.isin(top10_shap_features)]

# Logistic regression model for the reduced feature set.
log_reg_model_top10_shap = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)

# 3-fold stratified cross-validation: out-of-fold labels and probabilities.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv_shap = cross_val_predict(log_reg_model_top10_shap, X_train_top10_shap, y_train, cv=cv, method='predict')
y_pred_proba_cv_shap = cross_val_predict(log_reg_model_top10_shap, X_train_top10_shap, y_train, cv=cv, method='predict_proba')[:, 1]

# Compute and plot the confusion matrix (counts plus row percentages).
conf_matrix_shap = confusion_matrix(y_train, y_pred_cv_shap)
conf_matrix_percentage_shap = conf_matrix_shap / conf_matrix_shap.sum(axis=1).reshape(-1, 1) * 100
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_shap, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
for i in range(conf_matrix_shap.shape[0]):
    for j in range(conf_matrix_shap.shape[1]):
        percentage_text = f"{conf_matrix_percentage_shap[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (SHAP Logistic Regression)')
plt.show()

# Classification report and one ROC curve per fold. The folds are regenerated
# from the same `cv` object and the model is refitted on each training part.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv_shap))
plt.figure(figsize=(12, 8))
fold_count = 1
for train_idx, test_idx in cv.split(X_train_top10_shap, y_train):
    log_reg_model_top10_shap.fit(X_train_top10_shap[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold_shap = log_reg_model_top10_shap.predict_proba(X_train_top10_shap[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold_shap)
    roc_auc_fold_shap = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold_shap)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold_shap:.2f})')
    fold_count += 1
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (SHAP Logistic Regression, 3-fold CV)')
plt.legend(loc='lower right')
plt.show()


shap

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import shap
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Assumes H2_train and selected_columns were defined by an earlier cell.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardise the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit an initial logistic regression on all features; it is used below to rank features with SHAP.
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model.fit(X_train_scaled, y_train)

# Compute SHAP values for the fitted linear model, using the scaled training data as background.
explainer = shap.LinearExplainer(log_reg_model, X_train_scaled, feature_perturbation="interventional")
shap_values = explainer.shap_values(X_train_scaled)

# Plot the SHAP feature-importance bar chart.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_train_scaled, feature_names=X_train.columns, plot_type="bar")

# Rank features by mean absolute SHAP value (averaged over rows) and keep the top 10.
shap_sum = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame({'Feature': X_train.columns, 'SHAP Importance': shap_sum})
importance_df.sort_values(by='SHAP Importance', ascending=False, inplace=True)
top10_shap_features = importance_df.head(10)['Feature'].values

# Restrict the data to the top-10 SHAP features.
X_train_top10_shap = X_train[top10_shap_features]
# Re-fits the same scaler object on the 10 selected columns, replacing the statistics fitted on all columns above.
X_train_scaled_top10_shap = scaler.fit_transform(X_train_top10_shap)

# Class-balancing strategies to compare; None means the data is used as-is.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Train and evaluate one model per balancing strategy.
# NOTE(review): resampling is applied to the whole training set before the CV split and every metric is
# computed against y_resampled, so validation folds contain resampled rows and the class balance differs
# per method - scores are not directly comparable across methods.
for method, sampler in samplers.items():
    print(f"=== {method} ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_top10_shap, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top10_shap, y_train)

    # Out-of-fold labels and class-1 probabilities from the same 3-fold splitter.
    log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
    y_pred_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict_proba')[:, 1]

    # Metrics with class 1 as the positive class.
    accuracy = accuracy_score(y_resampled, y_pred_cv)
    precision = precision_score(y_resampled, y_pred_cv, pos_label=1)
    recall = recall_score(y_resampled, y_pred_cv, pos_label=1)
    f1 = f1_score(y_resampled, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_resampled, y_pred_proba_cv)

    # Confusion matrix as counts and as row-wise percentages (each row sums to 100%).
    conf_matrix = confusion_matrix(y_resampled, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Overlay the percentage in every cell, offset from the count drawn by the heatmap.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix with Counts and Percentages ({method})')
    plt.show()

    report = classification_report(y_resampled, y_pred_cv)
    print(report)

    # ROC curve built from the pooled out-of-fold probabilities.
    fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Summary table, best ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print(metrics_df)

# Grouped bar chart comparing every metric across balancing methods.
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods (Top 10 SHAP Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


Boruta

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Assumes X_train_scaled and y_train were prepared by an earlier cell.

# Data preparation: balance the classes with SMOTE.
# NOTE(review): SMOTE is applied before the CV split and the metrics below are computed against
# y_smote, so validation folds contain synthetic rows.
smote_sampler = SMOTE(random_state=42)
X_smote, y_smote = smote_sampler.fit_resample(X_train_scaled, y_train)

# Positional views for fold indexing. cv.split() yields integer positions; indexing a pandas
# Series/DataFrame with them directly is label/column based and is only correct by accident
# when the index happens to be a default RangeIndex.
X_smote_values = np.asarray(X_smote)
y_smote_values = np.asarray(y_smote)

# Logistic regression model.
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# 3-fold stratified cross-validation.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# One ROC curve per validation fold.
plt.figure(figsize=(12, 8))
for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_smote, y_smote), start=1):
    X_train_fold, X_test_fold = X_smote_values[train_idx], X_smote_values[test_idx]
    y_train_fold, y_test_fold = y_smote_values[train_idx], y_smote_values[test_idx]

    log_reg.fit(X_train_fold, y_train_fold)
    y_pred_proba_fold = log_reg.predict_proba(X_test_fold)[:, 1]
    fpr, tpr, _ = roc_curve(y_test_fold, y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_test_fold, y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Logistic Regression, 3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Out-of-fold predictions for every (resampled) row.
y_pred = cross_val_predict(log_reg, X_smote, y_smote, cv=cv, method="predict")
y_pred_proba = cross_val_predict(log_reg, X_smote, y_smote, cv=cv, method="predict_proba")[:, 1]

# Confusion matrix as counts and as row-wise percentages (each row sums to 100%).
conf_matrix = confusion_matrix(y_smote, y_pred)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1)[:, None] * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Overlay the percentage in every cell, offset from the count drawn by the heatmap.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Logistic Regression)')
plt.show()

# Print the classification report.
print("\nClassification Report:")
print(classification_report(y_smote, y_pred))


boruta

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Assumes H2_train and selected_columns were defined by an earlier cell.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardise the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Feature selection with Boruta, wrapped around a random forest.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
boruta_selector = BorutaPy(
    rf,
    n_estimators='auto',
    verbose=2,
    random_state=42
)

# Run Boruta on the scaled feature matrix.
boruta_selector.fit(X_train_scaled, y_train)

# Collect the selected features; support_ is used here as a mask over the columns of X_train.
selected_features = X_train.columns[boruta_selector.support_]
print("\nBoruta selected features:")
print("========================")
for feature in selected_features:
    print(feature)
print("\nNumber of selected features:", len(selected_features))

# Restrict the data to the Boruta-selected features.
X_train_boruta = X_train[selected_features]
# Re-fits the same scaler object on the selected columns, replacing the statistics fitted on all columns above.
X_train_scaled_boruta = scaler.fit_transform(X_train_boruta)

# Class-balancing strategies to compare; None means the data is used as-is.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Train and evaluate one model per balancing strategy.
# NOTE(review): resampling is applied to the whole training set before the CV split and every metric is
# computed against y_resampled, so validation folds contain resampled rows and the class balance differs
# per method - scores are not directly comparable across methods.
for method, sampler in samplers.items():
    print(f"=== {method} ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_boruta, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_boruta, y_train)

    # Out-of-fold labels and class-1 probabilities from the same 3-fold splitter.
    log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
    y_pred_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict_proba')[:, 1]

    # Metrics with class 1 as the positive class.
    accuracy = accuracy_score(y_resampled, y_pred_cv)
    precision = precision_score(y_resampled, y_pred_cv, pos_label=1)
    recall = recall_score(y_resampled, y_pred_cv, pos_label=1)
    f1 = f1_score(y_resampled, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_resampled, y_pred_proba_cv)

    # Confusion matrix as counts and as row-wise percentages (each row sums to 100%).
    conf_matrix = confusion_matrix(y_resampled, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Overlay the percentage in every cell, offset from the count drawn by the heatmap.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix with Counts and Percentages ({method})')
    plt.show()

    report = classification_report(y_resampled, y_pred_cv)
    print(report)

    # ROC curve built from the pooled out-of-fold probabilities.
    fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Summary table, best ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print(metrics_df)

# Grouped bar chart comparing every metric across balancing methods.
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods (Boruta Selected Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### hyperparameter tuning

baseline model

In [None]:
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import optuna
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, make_scorer
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Assumes H2_train and selected_columns were defined by an earlier cell.
# Define the features and the target variable.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardise the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Class-balancing strategies to compare, including the untouched data.
samplers = {
    'No Sampling': None,  # raw, unresampled data
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

# Per-method metrics collected for the summary table.
metrics_data = []

# Cross-validation splitter shared by the tuning objective and the final evaluation.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Objective function for hyperparameter tuning.
def objective(trial, X_resampled, y_resampled):
    """Return the mean cross-validated F1 score for one Optuna trial.

    Samples ``C`` log-uniformly from [1e-4, 1e2] and ``solver`` from
    {'lbfgs', 'liblinear'}, then scores a LogisticRegression with the
    module-level ``cv`` splitter.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_resampled: Feature matrix to cross-validate on.
        y_resampled: Labels matching ``X_resampled``.

    Returns:
        Mean F1 score over the CV folds (the study maximises this value).
    """
    # suggest_float(..., log=True) is the supported replacement for the deprecated
    # suggest_loguniform; it samples the same distribution under the same parameter name.
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    solver = trial.suggest_categorical('solver', ['lbfgs', 'liblinear'])

    # max_iter is fixed rather than tuned, so it is not part of study.best_params.
    log_reg_model = LogisticRegression(C=C, solver=solver, max_iter=2000, random_state=42)

    # F1 score is the optimisation target.
    f1 = cross_val_score(log_reg_model, X_resampled, y_resampled, cv=cv, scoring=make_scorer(f1_score)).mean()
    return f1

# Resample, tune, train and evaluate once per balancing strategy.
# NOTE(review): resampling is applied to the whole training set before the CV split and every metric is
# computed against y_resampled, so validation folds contain resampled rows and the class balance differs
# per method - scores are not directly comparable across methods.
for method, sampler in samplers.items():
    print(f"=== {method} ===")
    if sampler is None:
        # No sampling: use the scaled data as-is.
        X_resampled, y_resampled = X_train_scaled, y_train
    else:
        # Apply the sampling strategy.
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled, y_train)

    # Hyperparameter search with Optuna; stops after 50 trials or 600 seconds, whichever comes first.
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_resampled, y_resampled), n_trials=50, timeout=600)
    best_params = study.best_params

    # Print the best parameters found for this method.
    print(f"Best parameters for {method}: {best_params}")

    # Rebuild the model with the best hyperparameters (same max_iter as inside the objective).
    log_reg_model_optimized = LogisticRegression(**best_params, max_iter=2000, random_state=42)

    # 3-fold cross-validated labels and class-1 probabilities.
    y_pred_cv = cross_val_predict(log_reg_model_optimized, X_resampled, y_resampled, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(log_reg_model_optimized, X_resampled, y_resampled, cv=cv, method='predict_proba')[:, 1]

    # Compute the metrics with class 1 as the positive class.
    accuracy = accuracy_score(y_resampled, y_pred_cv)
    precision = precision_score(y_resampled, y_pred_cv, pos_label=1)
    recall = recall_score(y_resampled, y_pred_cv, pos_label=1)
    f1 = f1_score(y_resampled, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_resampled, y_pred_proba_cv)

    # Store this method's metrics.
    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

    # Confusion matrix as counts and as row-wise percentages (each row sums to 100%).
    conf_matrix = confusion_matrix(y_resampled, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    # Plot the confusion matrix.
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Overlay the percentage in every cell, offset from the count drawn by the heatmap.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix with Counts and Percentages ({method})')
    plt.show()

    # Print the classification report.
    report = classification_report(y_resampled, y_pred_cv)
    print(report)

    # ROC curve built from the pooled out-of-fold probabilities.
    fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

# Convert to a DataFrame and show the summary, best ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).drop_duplicates().sort_values(by='ROC AUC', ascending=False)
print(metrics_df)

# Grouped bar chart of the summary table.
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods with Optimized Logistic Regression')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


coefficient 4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import optuna
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Assumes H2_train and selected_columns were defined by an earlier cell.
# Define the features and the target variable.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardise the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Extract the top-10 features from a model fitted on the unresampled data.
log_reg_model_raw = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model_raw.fit(X_train_scaled, y_train)

# Rank features by the absolute value of their fitted coefficient and keep the 10 largest.
coefficients = log_reg_model_raw.coef_[0]
features = X_train.columns
coef_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})
coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
coef_df_sorted = coef_df.sort_values(by='Abs_Coefficient', ascending=False).head(10)

top_features = coef_df_sorted['Feature'].tolist()
X_train_top = X_train[top_features]
# Re-fits the same scaler object on the 10 selected columns, replacing the statistics fitted on all columns above.
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Class-balancing strategies to compare; None means the data is used as-is.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

# Cross-validation splitter shared by the tuning objective and the final evaluation.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Optuna objective for hyperparameter tuning.
def objective(trial, X, y):
    """Return the F1 score of out-of-fold predictions for one Optuna trial.

    Samples ``C`` log-uniformly from [1e-4, 10.0] and ``solver`` from
    {'liblinear', 'lbfgs'}; ``max_iter`` is fixed at 2000 and is therefore
    not part of ``study.best_params``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix to cross-validate on.
        y: Labels matching ``X``.

    Returns:
        F1 score (positive class 1) computed over the pooled out-of-fold
        predictions from the module-level ``cv`` splitter.
    """
    param = {
        # suggest_float(..., log=True) is the supported replacement for the deprecated
        # suggest_loguniform; it samples the same distribution under the same parameter name.
        'C': trial.suggest_float('C', 1e-4, 10.0, log=True),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'lbfgs']),
        'max_iter': 2000
    }
    model = LogisticRegression(**param, random_state=42)
    y_pred_cv = cross_val_predict(model, X, y, cv=cv, method='predict')
    return f1_score(y, y_pred_cv, pos_label=1)

# Per-method metrics (and best hyperparameters) collected for the summary table.
metrics_data = []

# NOTE(review): resampling is applied to the whole training set before the CV split and every metric is
# computed against y_resampled, so validation folds contain resampled rows and the class balance differs
# per method - scores are not directly comparable across methods.
for method, sampler in samplers.items():
    print(f"=== {method} ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_top, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)

    # Hyperparameter search with Optuna.
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_resampled, y_resampled), n_trials=30)

    best_params = study.best_params
    print(f"Best params for {method}: {best_params}")

    # best_params only holds the sampled values (C, solver). objective() trains with a fixed
    # max_iter=2000, so it has to be passed again here; otherwise the final model silently falls
    # back to the default iteration limit and is not the model that was tuned.
    best_model = LogisticRegression(**best_params, max_iter=2000, random_state=42)
    y_pred_cv = cross_val_predict(best_model, X_resampled, y_resampled, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(best_model, X_resampled, y_resampled, cv=cv, method='predict_proba')[:, 1]

    # Metrics with class 1 as the positive class.
    accuracy = accuracy_score(y_resampled, y_pred_cv)
    precision = precision_score(y_resampled, y_pred_cv, pos_label=1)
    recall = recall_score(y_resampled, y_pred_cv, pos_label=1)
    f1 = f1_score(y_resampled, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_resampled, y_pred_proba_cv)

    # Confusion matrix as counts and as row-wise percentages (each row sums to 100%).
    conf_matrix = confusion_matrix(y_resampled, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    # Confusion-matrix heatmap.
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Overlay the percentage in every cell, offset from the count drawn by the heatmap.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix with Counts and Percentages ({method})')
    plt.show()

    report = classification_report(y_resampled, y_pred_cv)
    print(report)

    # ROC curve built from the pooled out-of-fold probabilities.
    fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc,
        'Best Params': best_params
    })

# Convert to a DataFrame and show the summary, best ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print(metrics_df)

# Grouped bar chart comparing every metric across balancing methods.
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods with Optimized Logistic Regression')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


SHAP smote

In [None]:
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, make_scorer, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
import optuna

# Compute SHAP values.
# log_reg_model_smote, X_smote and features are not defined in this cell; they must already exist
# from earlier cells.
explainer = shap.LinearExplainer(log_reg_model_smote, X_smote, feature_perturbation="interventional")
shap_values = explainer.shap_values(X_smote)

# Plot the SHAP feature-importance bar chart.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_smote, feature_names=features, plot_type="bar")

# Rank features by mean absolute SHAP value (averaged over rows) and keep the top 10.
shap_sum = np.abs(shap_values).mean(axis=0)
# Two rows (names, importances) transposed into two columns.
importance_df = pd.DataFrame([features, shap_sum]).T
importance_df.columns = ['Feature', 'SHAP Importance']
importance_df.sort_values(by='SHAP Importance', ascending=False, inplace=True)
top10_shap_features = importance_df.head(10)['Feature'].values

# Keep only the top-10 columns of the scaled (not SMOTE-resampled) training matrix.
# The boolean mask keeps the columns in their original order, not in importance order.
X_train_top10_shap = X_train_scaled[:, X_train.columns.isin(top10_shap_features)]

# Objective function for hyperparameter tuning.
def objective(trial):
    """Return the mean cross-validated F1 score for one Optuna trial.

    Samples ``C`` log-uniformly from [1e-4, 1e2] and ``solver`` from
    {'lbfgs', 'liblinear'}, then scores a LogisticRegression on the
    module-level ``X_train_top10_shap`` / ``y_train`` with ``cv=3``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean F1 score over the 3 folds (the study maximises this value).
    """
    # suggest_float(..., log=True) is the supported replacement for the deprecated
    # suggest_loguniform; it samples the same distribution under the same parameter name.
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)  # inverse regularisation strength
    solver = trial.suggest_categorical('solver', ['lbfgs', 'liblinear'])

    # max_iter is fixed rather than tuned, so it is not part of study.best_params.
    log_reg_model = LogisticRegression(C=C, solver=solver, max_iter=2000, random_state=42)

    # F1 score is the optimisation target.
    # NOTE(review): cv=3 is a plain integer here, whereas the final evaluation in this cell uses a
    # shuffled StratifiedKFold - the tuning folds and the evaluation folds are not the same.
    f1 = cross_val_score(log_reg_model, X_train_top10_shap, y_train, cv=3, scoring=make_scorer(f1_score)).mean()
    return f1

# Hyperparameter search with Optuna; stops after 50 trials or 600 seconds, whichever comes first.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, timeout=600)

# Print the best hyperparameters.
print("Best trial:")
print(f"  Value: {study.best_value}")
print(f"  Params: {study.best_params}")

# Rebuild the model with the best hyperparameters (same max_iter as inside the objective).
best_params = study.best_params
log_reg_model_optimized = LogisticRegression(**best_params, max_iter=2000, random_state=42)

# 3-fold stratified cross-validation: out-of-fold labels and class-1 probabilities.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv_optimized = cross_val_predict(log_reg_model_optimized, X_train_top10_shap, y_train, cv=cv, method='predict')
y_pred_proba_cv_optimized = cross_val_predict(log_reg_model_optimized, X_train_top10_shap, y_train, cv=cv, method='predict_proba')[:, 1]

# Compute and plot the confusion matrix (raw counts plus row-wise percentages).
conf_matrix_optimized = confusion_matrix(y_train, y_pred_cv_optimized)
# Divide every row by its own total so each row sums to 100%.
conf_matrix_percentage_optimized = conf_matrix_optimized / conf_matrix_optimized.sum(axis=1).reshape(-1, 1) * 100
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_optimized, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the percentage in every cell, offset from the count drawn by the heatmap.
for i in range(conf_matrix_optimized.shape[0]):
    for j in range(conf_matrix_optimized.shape[1]):
        percentage_text = f"{conf_matrix_percentage_optimized[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (Optimized Logistic Regression)')
plt.show()

# Classification report and per-fold ROC curves.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv_optimized))
plt.figure(figsize=(12, 8))
fold_count = 1
# Re-fit the model on each training fold to draw one ROC curve per validation fold.
# X_train_top10_shap is a NumPy slice of X_train_scaled, so it is indexed directly, while the
# labels are indexed positionally with .iloc.
for train_idx, test_idx in cv.split(X_train_top10_shap, y_train):
    log_reg_model_optimized.fit(X_train_top10_shap[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold_optimized = log_reg_model_optimized.predict_proba(X_train_top10_shap[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold_optimized)
    roc_auc_fold_optimized = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold_optimized)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold_optimized:.2f})')
    fold_count += 1
# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Optimized Logistic Regression, 3-fold CV)')
plt.legend(loc='lower right')
plt.show()



shap 1117

In [None]:
import optuna
from sklearn.linear_model import LogisticRegression

# Objective function for Optuna hyperparameter tuning.
def objective(trial, X_train, y_train):
    """Return the F1 score of out-of-fold predictions for one Optuna trial.

    Samples ``C`` (log-uniform in [1e-4, 1e4]), ``solver`` and ``penalty``.
    The sampled names are valid LogisticRegression keyword arguments, so
    ``study.best_params`` can be unpacked straight into the estimator.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Feature matrix to cross-validate on.
        y_train: Labels matching ``X_train``.

    Returns:
        F1 score (positive class 1) over the pooled out-of-fold predictions
        of a shuffled 3-fold stratified CV.

    Raises:
        optuna.exceptions.TrialPruned: If the sampled solver does not
            support the sampled penalty.
    """
    # suggest_float(..., log=True) is the supported replacement for the deprecated suggest_loguniform.
    C = trial.suggest_float('C', 1e-4, 1e4, log=True)
    solver = trial.suggest_categorical('solver', ['lbfgs', 'liblinear'])
    # The choice list must be identical in every trial: Optuna rejects a categorical parameter
    # whose choices change between trials ("does not support dynamic value space").
    # None (no regularisation) replaces the string 'none', which current scikit-learn no longer
    # accepts; this requires scikit-learn >= 1.2.
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', None])

    # Skip solver/penalty pairs that LogisticRegression rejects instead of varying the choices:
    # lbfgs has no l1 support and liblinear cannot run unpenalised.
    if (solver == 'lbfgs' and penalty == 'l1') or (solver == 'liblinear' and penalty is None):
        raise optuna.exceptions.TrialPruned()

    # Build the logistic regression model for this trial.
    model = LogisticRegression(C=C, solver=solver, penalty=penalty, max_iter=2000, random_state=42)

    # Evaluate with cross-validation.
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    y_pred_cv = cross_val_predict(model, X_train, y_train, cv=cv, method='predict')
    return f1_score(y_train, y_pred_cv, pos_label=1)

# Per-method metrics (and best hyperparameters) collected for the summary table.
metrics_data = []

# Tune and evaluate once per balancing strategy.
# samplers, X_train_scaled_top10_shap, y_train, cv and the metric functions are not defined in this
# cell; they must already exist from earlier cells.
# NOTE(review): resampling is applied to the whole training set before the CV split and every metric is
# computed against y_resampled, so validation folds contain resampled rows and the class balance differs
# per method - scores are not directly comparable across methods.
for method, sampler in samplers.items():
    print(f"=== {method} ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_top10_shap, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top10_shap, y_train)

    # Hyperparameter search with Optuna.
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_resampled, y_resampled), n_trials=30)  # 30 search trials
    best_params = study.best_params

    # Build the model from the best parameters.
    log_reg_model = LogisticRegression(**best_params, max_iter=2000, random_state=42)
    # cross_val_predict clones the estimator, so this fit does not influence the CV predictions
    # below; it only leaves a model fitted on the full (resampled) data in log_reg_model.
    log_reg_model.fit(X_resampled, y_resampled)

    # Out-of-fold labels and class-1 probabilities.
    y_pred_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict_proba')[:, 1]

    # Compute the evaluation metrics with class 1 as the positive class.
    accuracy = accuracy_score(y_resampled, y_pred_cv)
    precision = precision_score(y_resampled, y_pred_cv, pos_label=1)
    recall = recall_score(y_resampled, y_pred_cv, pos_label=1)
    f1 = f1_score(y_resampled, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_resampled, y_pred_proba_cv)

    # Confusion matrix as counts and as row-wise percentages (each row sums to 100%).
    conf_matrix = confusion_matrix(y_resampled, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Overlay the percentage in every cell, offset from the count drawn by the heatmap.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix with Counts and Percentages ({method})')
    plt.show()

    # Print the classification report.
    print(f"\nClassification Report for {method}:")
    print(classification_report(y_resampled, y_pred_cv))

    # ROC curve built from the pooled out-of-fold probabilities.
    fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # Store the evaluation results.
    metrics_data.append({
        'Balancing Method': method,
        'Best Params': best_params,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Convert to a DataFrame, best ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print(metrics_df)

# Grouped bar chart comparing every metric across balancing methods.
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods with Optuna (Top 10 SHAP Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
def objective(trial, X_train, y_train):
    """Return the F1 score of out-of-fold predictions for one Optuna trial.

    Samples ``C`` (log-uniform in [1e-4, 1e4]), ``solver`` and ``penalty``.
    The sampled names are valid LogisticRegression keyword arguments, so
    ``study.best_params`` can be unpacked straight into the estimator.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Feature matrix to cross-validate on.
        y_train: Labels matching ``X_train``.

    Returns:
        F1 score (positive class 1) over the pooled out-of-fold predictions
        of a shuffled 3-fold stratified CV.

    Raises:
        optuna.exceptions.TrialPruned: If the sampled solver does not
            support the sampled penalty.
    """
    # suggest_float(..., log=True) is the supported replacement for the deprecated suggest_loguniform.
    C = trial.suggest_float('C', 1e-4, 1e4, log=True)
    solver = trial.suggest_categorical('solver', ['lbfgs', 'liblinear'])
    # None (no regularisation) replaces the string 'none', which current scikit-learn no longer
    # accepts; this requires scikit-learn >= 1.2.
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', None])

    # Skip unsupported combinations: lbfgs has no l1 support and liblinear cannot run
    # unpenalised. Left unpruned, fit() raises and the exception aborts the whole study.
    if (solver == 'lbfgs' and penalty == 'l1') or (solver == 'liblinear' and penalty is None):
        raise optuna.exceptions.TrialPruned()

    # Build the logistic regression model for this trial.
    model = LogisticRegression(C=C, solver=solver, penalty=penalty, max_iter=2000, random_state=42)

    # Evaluate with cross-validation.
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    y_pred_cv = cross_val_predict(model, X_train, y_train, cv=cv, method='predict')
    return f1_score(y_train, y_pred_cv, pos_label=1)


In [None]:
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import shap
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Assumes H2_train and selected_columns were defined by an earlier cell.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardise the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit an initial logistic regression on all features; it is used below to rank features with SHAP.
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model.fit(X_train_scaled, y_train)

# Compute SHAP values for the fitted linear model, using the scaled training data as background.
explainer = shap.LinearExplainer(log_reg_model, X_train_scaled, feature_perturbation="interventional")
shap_values = explainer.shap_values(X_train_scaled)

# Plot the SHAP feature-importance bar chart.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_train_scaled, feature_names=X_train.columns, plot_type="bar")

# Rank features by mean absolute SHAP value (averaged over rows) and keep the top 10.
shap_sum = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame({'Feature': X_train.columns, 'SHAP Importance': shap_sum})
importance_df.sort_values(by='SHAP Importance', ascending=False, inplace=True)
top10_shap_features = importance_df.head(10)['Feature'].values

# Restrict the data to the top-10 SHAP features.
X_train_top10_shap = X_train[top10_shap_features]
# Re-fits the same scaler object on the 10 selected columns, replacing the statistics fitted on all columns above.
X_train_scaled_top10_shap = scaler.fit_transform(X_train_top10_shap)

# Class-balancing strategies to compare; None means the data is used as-is.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

# Objective function for Optuna hyperparameter tuning.
def objective(trial, X_train, y_train):
    """Return the F1 score of out-of-fold predictions for one Optuna trial.

    Samples ``C`` (log-uniform in [1e-4, 1e4]), ``solver`` and ``penalty``.
    The sampled names are valid LogisticRegression keyword arguments, so
    ``study.best_params`` can be unpacked straight into the estimator.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Feature matrix to cross-validate on.
        y_train: Labels matching ``X_train``.

    Returns:
        F1 score (positive class 1) over the pooled out-of-fold predictions
        of a shuffled 3-fold stratified CV.

    Raises:
        optuna.exceptions.TrialPruned: If the sampled solver does not
            support the sampled penalty.
    """
    # suggest_float(..., log=True) is the supported replacement for the deprecated suggest_loguniform.
    C = trial.suggest_float('C', 1e-4, 1e4, log=True)
    solver = trial.suggest_categorical('solver', ['lbfgs', 'liblinear'])
    # None (no regularisation) replaces the string 'none', which current scikit-learn no longer
    # accepts; this requires scikit-learn >= 1.2.
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', None])

    # Skip unsupported combinations: lbfgs has no l1 support and liblinear cannot run
    # unpenalised. Left unpruned, fit() raises and the exception aborts the whole study.
    if (solver == 'lbfgs' and penalty == 'l1') or (solver == 'liblinear' and penalty is None):
        raise optuna.exceptions.TrialPruned()

    # Build the logistic regression model for this trial.
    model = LogisticRegression(C=C, solver=solver, penalty=penalty, max_iter=2000, random_state=42)

    # Evaluate with cross-validation.
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    y_pred_cv = cross_val_predict(model, X_train, y_train, cv=cv, method='predict')
    return f1_score(y_train, y_pred_cv, pos_label=1)

# Collects one result row per balancing method
metrics_data = []

# Splitter for the final cross-validated predictions. It is defined here
# explicitly because the `cv` inside objective() is local to that function;
# without this line the loop depends on a `cv` left behind by another cell.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Tune and evaluate a model for each of the four sampling methods.
# NOTE(review): resampling is applied before cross-validation, so every metric
# below is measured on the resampled data rather than the original class
# distribution.
for method, sampler in samplers.items():
    print(f"=== {method} ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_top10_shap, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top10_shap, y_train)

    # Hyperparameter search with Optuna (30 trials)
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_resampled, y_resampled), n_trials=30)
    best_params = study.best_params

    # Build the model from the best parameters. Optuna stores the unpenalised
    # option as the string 'none'; scikit-learn >= 1.2 expects None instead.
    model_params = dict(best_params)
    if model_params.get('penalty') == 'none':
        model_params['penalty'] = None
    log_reg_model = LogisticRegression(**model_params, max_iter=2000, random_state=42)
    log_reg_model.fit(X_resampled, y_resampled)

    # Out-of-fold class predictions and positive-class probabilities
    y_pred_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict_proba')[:, 1]

    # Evaluation metrics
    accuracy = accuracy_score(y_resampled, y_pred_cv)
    precision = precision_score(y_resampled, y_pred_cv, pos_label=1)
    recall = recall_score(y_resampled, y_pred_cv, pos_label=1)
    f1 = f1_score(y_resampled, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_resampled, y_pred_proba_cv)

    # Confusion matrix with raw counts plus row-wise percentages
    conf_matrix = confusion_matrix(y_resampled, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix with Counts and Percentages ({method})')
    plt.show()

    # Classification report
    print(f"\nClassification Report for {method}:")
    print(classification_report(y_resampled, y_pred_cv))

    # ROC curve
    fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # Store the results (the reported parameters are Optuna's, unmodified)
    metrics_data.append({
        'Balancing Method': method,
        'Best Params': best_params,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Convert to a DataFrame sorted by ROC AUC
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print(metrics_df)

# Bar chart comparing the balancing methods
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods with Optuna (Top 10 SHAP Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import shap
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Define the objective function for Optuna
def objective(trial, X, y, sampler_type=None):
    """Return the mean 3-fold ROC AUC of one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``C``, ``class_weight``, ``solver`` and, for
        SMOTE, ``k_neighbors``.
    X : array-like
        Feature matrix.
    y : array-like
        Binary target.
    sampler_type : {'Undersample', 'Oversample', 'SMOTE'} or None
        Resampling applied to ``X``/``y`` before cross-validation; any other
        value (including ``None``) leaves the data untouched.

    Returns
    -------
    float
        Mean ROC AUC over the validation folds.
    """
    # Define hyperparameters to optimize
    params = {
        'C': trial.suggest_float('C', 1e-5, 100, log=True),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'max_iter': 2000,
        'random_state': 42,
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'newton-cg', 'sag'])
    }

    # Apply sampling if specified
    if sampler_type == 'Undersample':
        sampler = RandomUnderSampler(random_state=42)
        X_resampled, y_resampled = sampler.fit_resample(X, y)
    elif sampler_type == 'Oversample':
        sampler = RandomOverSampler(random_state=42)
        X_resampled, y_resampled = sampler.fit_resample(X, y)
    elif sampler_type == 'SMOTE':
        k_neighbors = trial.suggest_int('k_neighbors', 2, 10)
        sampler = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_resampled, y_resampled = sampler.fit_resample(X, y)
    else:
        X_resampled, y_resampled = X, y

    # cv.split() yields positional indices. Converting to NumPy arrays makes
    # `[train_idx]` positional; on a pandas Series/DataFrame with a
    # non-default index the same expression would be a label lookup and
    # raise KeyError or select the wrong rows.
    X_resampled = np.asarray(X_resampled)
    y_resampled = np.asarray(y_resampled)

    # Create and evaluate model using cross-validation
    model = LogisticRegression(**params)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in cv.split(X_resampled, y_resampled):
        X_train_fold, X_val_fold = X_resampled[train_idx], X_resampled[val_idx]
        y_train_fold, y_val_fold = y_resampled[train_idx], y_resampled[val_idx]

        # fit() retrains from scratch on each fold
        model.fit(X_train_fold, y_train_fold)
        y_pred_proba = model.predict_proba(X_val_fold)[:, 1]
        scores.append(roc_auc_score(y_val_fold, y_pred_proba))

    return np.mean(scores)

def optimize_and_evaluate(X_train, y_train, sampler_type=None, n_trials=100):
    """Optimize model for a specific sampling method and evaluate results.

    Parameters
    ----------
    X_train : array-like
        Feature matrix.
    y_train : array-like
        Binary target.
    sampler_type : {'Undersample', 'Oversample', 'SMOTE'} or None
        Resampling strategy; ``None`` means no resampling.
    n_trials : int
        Number of Optuna trials to run.

    Returns
    -------
    tuple
        ``(metrics, y_pred_cv, y_pred_proba_cv, y_resampled)`` where
        ``metrics`` is a dict of scores and the best parameters, the two
        prediction arrays are out-of-fold predictions on the (re)sampled
        data, and ``y_resampled`` is the matching target.
    """
    # Create study
    study = optuna.create_study(direction='maximize', study_name=f'{sampler_type}_optimization')

    # Optimize
    study.optimize(lambda trial: objective(trial, X_train, y_train, sampler_type), n_trials=n_trials)

    # Get best parameters
    best_params = study.best_params
    best_params.update({'max_iter': 2000, 'random_state': 42})

    # `k_neighbors` is a SMOTE setting that the objective samples alongside
    # the model parameters. It must not reach LogisticRegression, which would
    # reject it with a TypeError.
    model_params = {key: value for key, value in best_params.items() if key != 'k_neighbors'}

    # Resample the training data with the chosen strategy
    if sampler_type == 'Undersample':
        sampler = RandomUnderSampler(random_state=42)
        X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    elif sampler_type == 'Oversample':
        sampler = RandomOverSampler(random_state=42)
        X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    elif sampler_type == 'SMOTE':
        sampler = SMOTE(random_state=42, k_neighbors=best_params.get('k_neighbors', 5))
        X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    else:
        X_resampled, y_resampled = X_train, y_train

    # Out-of-fold predictions with the best model configuration
    model = LogisticRegression(**model_params)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    y_pred_cv = cross_val_predict(model, X_resampled, y_resampled, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(model, X_resampled, y_resampled, cv=cv, method='predict_proba')[:, 1]

    # Calculate metrics (the reported parameters still include k_neighbors)
    metrics = {
        'Balancing Method': sampler_type if sampler_type else 'No Sampling',
        'Best Parameters': best_params,
        'Best Score': study.best_value,
        'Accuracy': accuracy_score(y_resampled, y_pred_cv),
        'Precision': precision_score(y_resampled, y_pred_cv),
        'Recall': recall_score(y_resampled, y_pred_cv),
        'F1 Score': f1_score(y_resampled, y_pred_cv),
        'ROC AUC': roc_auc_score(y_resampled, y_pred_proba_cv)
    }

    return metrics, y_pred_cv, y_pred_proba_cv, y_resampled

# Main execution code
def run_optimization(X_train, y_train, n_trials=100):
    """Select the top-10 SHAP features, then tune and plot every sampling method.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; its ``columns`` are used as feature names.
    y_train : array-like
        Binary target.
    n_trials : int
        Optuna trials per sampling method.

    Returns
    -------
    pandas.DataFrame
        One row of metrics per sampling method.
    """
    # Scale all features and fit a baseline model for the SHAP ranking
    scaler = StandardScaler()
    scaled_all = scaler.fit_transform(X_train)

    baseline = LogisticRegression(random_state=42)
    baseline.fit(scaled_all, y_train)
    shap_matrix = shap.LinearExplainer(baseline, scaled_all).shap_values(scaled_all)

    # Rank by mean absolute SHAP value and keep the ten strongest features
    ranking = pd.DataFrame({
        'Feature': X_train.columns,
        'SHAP Importance': np.abs(shap_matrix).mean(axis=0),
    }).sort_values(by='SHAP Importance', ascending=False)
    top10_features = ranking.head(10)['Feature'].values

    # Rescale the reduced feature set (refits the same scaler)
    scaled_top10 = scaler.fit_transform(X_train[top10_features])

    results = []
    for method in (None, 'Undersample', 'Oversample', 'SMOTE'):
        label = method if method else 'No Sampling'
        print(f"\nOptimizing for {label}...")
        metrics, y_pred, y_pred_proba, y_resampled = optimize_and_evaluate(
            scaled_top10, y_train, method, n_trials
        )
        results.append(metrics)

        # Confusion matrix: counts in the cells, row percentages overlaid
        plt.figure(figsize=(8, 6))
        counts = confusion_matrix(y_resampled, y_pred)
        row_percent = counts / counts.sum(axis=1, keepdims=True) * 100
        sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', cbar=False)
        for (row, col), share in np.ndenumerate(row_percent):
            plt.text(col + 0.2, row + 0.2, f"{share:.1f}%", ha='center', va='center', color='green')
        plt.title(f'Confusion Matrix ({label})')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.show()

        # ROC curve against the chance diagonal
        false_pos, true_pos, _ = roc_curve(y_resampled, y_pred_proba)
        plt.figure(figsize=(8, 6))
        plt.plot(false_pos, true_pos, label=f'ROC (AUC = {metrics["ROC AUC"]:.3f})')
        plt.plot([0, 1], [0, 1], 'k--', label='Random')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve ({label})')
        plt.legend()
        plt.show()

    # Tabulate the per-method results
    results_df = pd.DataFrame(results)
    score_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']
    print("\nOptimization Results:")
    print(results_df[['Balancing Method', 'Best Score'] + score_columns])

    # Side-by-side bar chart of the metrics
    results_df.set_index('Balancing Method')[score_columns].plot(kind='bar', figsize=(12, 6))
    plt.title('Comparison of Optimized Models')
    plt.ylabel('Score')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    return results_df

# Usage example:
# Assuming X_train and y_train are your feature matrix and target vector
# results = run_optimization(X_train, y_train, n_trials=100)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import shap
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Start from the original (unsampled) training data
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Define and fit an initial logistic regression model
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
log_reg_model.fit(X_train_scaled, y_train)

# Compute SHAP values for the fitted linear model
explainer = shap.LinearExplainer(log_reg_model, X_train_scaled, feature_perturbation="interventional")
shap_values = explainer.shap_values(X_train_scaled)

# Plot the SHAP feature-importance bar chart
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_train_scaled, feature_names=X_train.columns, plot_type="bar")
plt.tight_layout()
plt.show()

# Rank features by mean absolute SHAP value and keep the top 10
shap_sum = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame({'Feature': X_train.columns, 'SHAP Importance': shap_sum})
importance_df.sort_values(by='SHAP Importance', ascending=False, inplace=True)
top10_shap_features = importance_df.head(10)['Feature'].values

# Restrict the training data to the top-10 SHAP features.
# NOTE: this refits the same `scaler` object on the 10-column subset.
X_train_top10_shap = X_train[top10_shap_features]
X_train_scaled_top10_shap = scaler.fit_transform(X_train_top10_shap)

# Optuna objective: ROC AUC of a logistic regression under one sampler
def objective(trial, X, y, sampler=None):
    """Return the out-of-fold ROC AUC for one sampled ``C``/``class_weight``.

    ``sampler`` is any object exposing ``fit_resample(X, y)``; when it is
    ``None`` the data is used as-is. Resampling happens before the 3-fold
    stratified split, and the score is computed against the resampled target.
    """
    # Hyperparameters under search (solver and iteration cap are fixed)
    regularization = trial.suggest_float('C', 1e-5, 100, log=True)
    weighting = trial.suggest_categorical('class_weight', ['balanced', None])

    # Optional class balancing
    features, labels = (X, y) if sampler is None else sampler.fit_resample(X, y)

    classifier = LogisticRegression(
        C=regularization,
        class_weight=weighting,
        solver='lbfgs',
        max_iter=2000,
        random_state=42,
    )
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    # Positive-class probabilities predicted for each held-out fold
    fold_probabilities = cross_val_predict(
        classifier, features, labels, cv=splitter, method='predict_proba'
    )
    return roc_auc_score(labels, fold_probabilities[:, 1])

# Sampling strategies to compare ('No Sampling' is represented by None)
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

# Collects one result row per balancing method
metrics_data = []

# Tune and evaluate a model for each sampling method
for method, sampler in samplers.items():
    print(f"\n=== Optimizing {method} ===")

    # Create the Optuna study and run 50 trials
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train_scaled_top10_shap, y_train, sampler),
                  n_trials=50)

    # Best parameters, completed with the settings that objective() keeps fixed
    best_params = study.best_params
    best_params.update({'solver': 'lbfgs', 'max_iter': 2000, 'random_state': 42})

    # Resample the training data the same way objective() does
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_top10_shap, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top10_shap, y_train)

    # Out-of-fold predictions from the model built with the best parameters
    log_reg_model = LogisticRegression(**best_params)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    y_pred_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict_proba')[:, 1]

    # Evaluation metrics (computed against the resampled target)
    accuracy = accuracy_score(y_resampled, y_pred_cv)
    precision = precision_score(y_resampled, y_pred_cv, pos_label=1)
    recall = recall_score(y_resampled, y_pred_cv, pos_label=1)
    f1 = f1_score(y_resampled, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_resampled, y_pred_proba_cv)

    # Print the best parameters and the classification report
    print(f"\nBest parameters: {best_params}")
    print(f"Best ROC AUC score: {study.best_value:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_resampled, y_pred_cv))

    # Confusion matrix with raw counts plus row-wise percentages
    conf_matrix = confusion_matrix(y_resampled, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix with Counts and Percentages ({method})')
    plt.show()

    # ROC curve
    fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # Store the evaluation metrics
    metrics_data.append({
        'Balancing Method': method,
        'Best Parameters': best_params,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Build and display the metrics DataFrame, best ROC AUC first
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Results:")
print(metrics_df)

# Bar chart comparing the metrics across methods
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(
    kind='bar',
    figsize=(10, 6)
)
plt.title('Comparison of Optimized Balancing Methods (Top 10 SHAP Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Print the best parameters found for each method
print("\nBest Parameters for Each Method:")
for metric in metrics_data:
    print(f"\n{metric['Balancing Method']}:")
    print(f"Parameters: {metric['Best Parameters']}")

Boruta 4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import optuna
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Assumes H2_train and selected_columns were defined by earlier cells
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Feature selection with Boruta, driven by a random forest
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
boruta_selector = BorutaPy(
    rf,
    n_estimators='auto',
    verbose=2,
    random_state=42
)

# Run Boruta on the scaled training data
boruta_selector.fit(X_train_scaled, y_train)

# Names of the features Boruta marked as selected (support_ mask)
selected_features = X_train.columns[boruta_selector.support_]
print("\nBoruta selected features:")
print("========================")
for feature in selected_features:
    print(feature)
print("\nNumber of selected features:", len(selected_features))

# Keep only the Boruta-selected features.
# NOTE: this refits the same `scaler` object on the reduced column set.
X_train_boruta = X_train[selected_features]
X_train_scaled_boruta = scaler.fit_transform(X_train_boruta)

# Sampling strategies to compare ('No Sampling' is represented by None)
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

# Early-stopping callback for Optuna studies
class OptunaCallback:
    """Stop a study once the score is good enough or has stopped improving.

    Pass an instance in ``study.optimize(..., callbacks=[...])``. After each
    trial the callback calls ``study.stop()`` when either

    * the best score seen in the current study reaches
      ``early_stopping_value``, or
    * ``early_stopping_rounds`` consecutive trials brought no improvement.

    The counters are tracked per study: when the callback is invoked with a
    different study object than last time, they are reset, so one instance
    can safely be reused across several studies.
    """

    def __init__(self, early_stopping_rounds, early_stopping_value):
        self.early_stopping_rounds = early_stopping_rounds
        self.early_stopping_value = early_stopping_value
        self.best_score = None
        self.no_improvement_count = 0
        # Study the two counters above currently belong to
        self._study = None

    def __call__(self, study, trial):
        # A new study must not inherit the previous study's best score or
        # stagnation count, otherwise it could be stopped prematurely.
        if study is not self._study:
            self._study = study
            self.best_score = None
            self.no_improvement_count = 0

        current_score = trial.value
        if current_score is None:
            # Trials without a value (e.g. pruned or failed) cannot be
            # compared numerically; count them as "no improvement".
            self.no_improvement_count += 1
        elif self.best_score is None or current_score > self.best_score:
            self.best_score = current_score
            self.no_improvement_count = 0
        else:
            self.no_improvement_count += 1

        target_reached = (
            self.best_score is not None
            and self.best_score >= self.early_stopping_value
        )
        stagnated = self.no_improvement_count >= self.early_stopping_rounds
        if target_reached or stagnated:
            study.stop()

def objective(trial, X, y, sampler=None):
    """Return the mean 3-fold ROC AUC of an L2 logistic regression.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``C``, ``max_iter`` and ``tol``.
    X : array-like
        Feature matrix.
    y : array-like
        Binary target.
    sampler : object with ``fit_resample(X, y)`` or None
        Optional resampler applied before cross-validation.

    Returns
    -------
    float
        Mean ``roc_auc`` over the folds, or ``-inf`` if cross-validation
        raised (the error is printed, and the trial is scored as worst
        possible instead of aborting the study).
    """
    # Conservative search space. suggest_float(log=True) replaces the
    # deprecated suggest_loguniform and samples the same distribution.
    params = {
        'C': trial.suggest_float('C', 1e-3, 1e3, log=True),  # narrowed range
        'max_iter': trial.suggest_int('max_iter', 500, 1000),  # raised minimum iteration count
        'tol': trial.suggest_float('tol', 1e-4, 1e-2, log=True),  # relaxed convergence tolerance
    }

    # Resample if a sampler was given
    if sampler is not None:
        X_resampled, y_resampled = sampler.fit_resample(X, y)
    else:
        X_resampled, y_resampled = X, y

    # Build the model
    model = LogisticRegression(
        penalty='l2',  # L2 regularization, chosen for stability
        solver='lbfgs',  # solver chosen for stability
        random_state=42,
        **params
    )

    try:
        # Score the model with 3-fold cross-validation
        scores = cross_val_score(
            model,
            X_resampled,
            y_resampled,
            cv=3,
            scoring='roc_auc'
        )
        return scores.mean()
    except Exception as e:
        # Deliberate best-effort: report the failure and rank the trial last
        print(f"Error in trial: {str(e)}")
        return float('-inf')

# Tune the hyperparameters separately for each sampling method
best_params = {}
best_scores = {}
n_trials = 30  # deliberately small trial budget

for method, sampler in samplers.items():
    print(f"\nOptimizing for {method}")

    # Use a fresh callback for every study so each study starts with its own
    # early-stopping counters instead of the previous method's.
    early_stopping_callback = OptunaCallback(
        early_stopping_rounds=10,  # stop after 10 trials without improvement
        early_stopping_value=0.95  # stop once the ROC AUC reaches 0.95
    )

    study = optuna.create_study(direction='maximize')
    try:
        study.optimize(
            lambda trial: objective(trial, X_train_scaled_boruta, y_train, sampler),
            n_trials=n_trials,
            timeout=300,  # 5-minute time limit per study
            callbacks=[early_stopping_callback]
        )

        best_params[method] = study.best_params
        best_scores[method] = study.best_value

        print(f"Best parameters for {method}:")
        print(study.best_params)
        print(f"Best ROC-AUC score: {study.best_value:.4f}")

        # Plot the optimization history. The Optuna helper creates its own
        # figure and returns the axes, so the size and title are set on that
        # object; calling plt.figure() first would only leave an empty figure.
        ax = optuna.visualization.matplotlib.plot_optimization_history(study)
        ax.figure.set_size_inches(10, 6)
        ax.set_title(f'Optimization History for {method}')
        plt.show()

    except Exception as e:
        # Best-effort: report the failure and move on to the next method
        print(f"Optimization failed for {method}: {str(e)}")
        continue

# Final evaluation with the tuned parameters
metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for method, sampler in samplers.items():
    # Skip methods whose optimization did not produce parameters
    if method not in best_params:
        continue

    print(f"\n=== {method} with optimized parameters ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_boruta, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_boruta, y_train)

    # Build the model from the tuned parameters
    optimized_params = best_params[method]
    log_reg_model = LogisticRegression(
        penalty='l2',
        solver='lbfgs',
        random_state=42,
        **optimized_params
    )

    try:
        # Out-of-fold class predictions and positive-class probabilities
        y_pred_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict')
        y_pred_proba_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict_proba')[:, 1]

        # Evaluation metrics (computed against the resampled target)
        accuracy = accuracy_score(y_resampled, y_pred_cv)
        precision = precision_score(y_resampled, y_pred_cv)
        recall = recall_score(y_resampled, y_pred_cv)
        f1 = f1_score(y_resampled, y_pred_cv)
        roc_auc = roc_auc_score(y_resampled, y_pred_proba_cv)

        metrics_data.append({
            'Balancing Method': method,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'ROC AUC': roc_auc
        })

        # Confusion matrix
        conf_matrix = confusion_matrix(y_resampled, y_pred_cv)
        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {method}')
        plt.show()

        # ROC curve
        fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba_cv)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {method}')
        plt.legend()
        plt.show()

    except Exception as e:
        # Best-effort: report the failure and move on to the next method
        print(f"Evaluation failed for {method}: {str(e)}")
        continue

# Show the final results (only if at least one method was evaluated)
if metrics_data:
    metrics_df = pd.DataFrame(metrics_data)
    print("\nFinal Results:")
    print(metrics_df)

    # Save the metrics and the per-method best parameters to CSV
    metrics_df.to_csv('optimization_results.csv', index=False)
    pd.DataFrame.from_dict(best_params, orient='index').to_csv('best_params.csv')

In [None]:
# Previous imports and setup remain the same...
# This cell relies on `samplers`, `best_params`, `X_train_scaled_boruta` and
# `y_train` created by the preceding Boruta cell.

metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for method, sampler in samplers.items():
    # Skip methods whose optimization did not produce parameters
    if method not in best_params:
        continue

    print(f"\n=== {method} with optimized parameters ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_boruta, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_boruta, y_train)

    # Build the model from the tuned parameters
    optimized_params = best_params[method]
    log_reg_model = LogisticRegression(
        penalty='l2',
        solver='lbfgs',
        random_state=42,
        **optimized_params
    )

    try:
        # Out-of-fold class predictions and positive-class probabilities
        y_pred_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict')
        y_pred_proba_cv = cross_val_predict(log_reg_model, X_resampled, y_resampled, cv=cv, method='predict_proba')[:, 1]

        # Evaluation metrics (computed against the resampled target)
        accuracy = accuracy_score(y_resampled, y_pred_cv)
        precision = precision_score(y_resampled, y_pred_cv)
        recall = recall_score(y_resampled, y_pred_cv)
        f1 = f1_score(y_resampled, y_pred_cv)
        roc_auc = roc_auc_score(y_resampled, y_pred_proba_cv)

        # Enhanced confusion matrix: raw counts plus row-wise percentages
        conf_matrix = confusion_matrix(y_resampled, y_pred_cv)
        conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

        # Overlay the percentage labels
        for i in range(conf_matrix.shape[0]):
            for j in range(conf_matrix.shape[1]):
                percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
                plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.title(f'Confusion Matrix with Counts and Percentages ({method})')
        plt.show()

        # Enhanced ROC curve with a labelled chance diagonal
        fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba_cv)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve ({method})')
        plt.legend(loc='lower right')
        plt.show()

        # Store the detailed evaluation metrics
        metrics_data.append({
            'Balancing Method': method,
            'Best Parameters': best_params[method],
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'ROC AUC': roc_auc
        })

    except Exception as e:
        # Best-effort: report the failure and move on to the next method
        print(f"Evaluation failed for {method}: {str(e)}")
        continue

# Build the metrics DataFrame. When no method was evaluated successfully
# `metrics_data` is empty and the frame has no 'ROC AUC' column, so sorting,
# plotting and saving are only attempted when there is something to report.
metrics_df = pd.DataFrame(metrics_data)

if metrics_df.empty:
    print("\nNo balancing method was evaluated successfully; nothing to report.")
else:
    metrics_df = metrics_df.sort_values(by='ROC AUC', ascending=False)
    print("\nFinal Results:")
    print(metrics_df)

    # Bar chart comparing the metrics across methods
    metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(
        kind='bar',
        figsize=(10, 6)
    )
    plt.title('Comparison of Optimized Balancing Methods')
    plt.ylabel('Score')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Print the best parameters found for each method
    print("\nBest Parameters for Each Method:")
    for metric in metrics_data:
        print(f"\n{metric['Balancing Method']}:")
        print(f"Parameters: {metric['Best Parameters']}")

    # Save the metrics and the per-method best parameters to CSV
    metrics_df.to_csv('optimization_results.csv', index=False)
    pd.DataFrame.from_dict(best_params, orient='index').to_csv('best_params.csv')

### test set

baseline 4

In [None]:
# Re-create the scaler and fit it on the training features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Scale the H2 test features with the statistics learned on the training set
H2_test_scaled = scaler.transform(H2_test[selected_columns].drop(columns=['is_canceled']))
y_test = H2_test['is_canceled']

# One result row per balancing method
test_results = []

# Tune, fit and test a model for each of the four sampling methods
for method, sampler in samplers.items():
    print(f"=== Testing {method} on H2 Test Set ===")

    # Balance the training data only; the test set keeps its original class
    # distribution. Without this step all four "methods" would train on the
    # same unsampled data.
    if sampler is None:
        X_fit, y_fit = X_train_scaled, y_train
    else:
        X_fit, y_fit = sampler.fit_resample(X_train_scaled, y_train)

    # Run an actual search on the balanced training data. A single-trial study
    # only returns one randomly sampled configuration, not tuned parameters.
    # `objective` is whichever definition was executed last in the notebook.
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_fit, y_fit), n_trials=30)
    best_params = study.best_params

    # Merge the defaults with the tuned values (tuned values win). Passing
    # max_iter/random_state as separate keywords would raise a duplicate-
    # keyword TypeError whenever the objective tunes one of them itself.
    model_params = {'max_iter': 2000, 'random_state': 42, **best_params}
    log_reg_model_optimized = LogisticRegression(**model_params)

    # Fit on the (re)sampled training data and predict on the H2 test set
    log_reg_model_optimized.fit(X_fit, y_fit)
    y_test_pred = log_reg_model_optimized.predict(H2_test_scaled)
    y_test_pred_proba = log_reg_model_optimized.predict_proba(H2_test_scaled)[:, 1]

    # Test metrics
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, pos_label=1)
    recall = recall_score(y_test, y_test_pred, pos_label=1)
    f1 = f1_score(y_test, y_test_pred, pos_label=1)
    roc_auc = roc_auc_score(y_test, y_test_pred_proba)

    # Store the test results
    test_results.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

    # Classification report
    print(f"\nClassification Report for {method}:")
    print(classification_report(y_test, y_test_pred))

    # ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method}) on H2 Test Set')
    plt.legend(loc='lower right')
    plt.show()

# Convert the test results to a DataFrame sorted by ROC AUC and display it
test_results_df = pd.DataFrame(test_results).sort_values(by='ROC AUC', ascending=False)
print(test_results_df)

# Bar chart comparing the methods on the test set
test_results_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods on H2 Test Set')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


top10 feature importance 4

In [None]:
# Collect the metrics of the best model for each balancing method.
# This cell relies on `samplers`, `objective`, `cv`, `X_train_scaled_top`,
# `X_test_scaled_top`, `y_train` and `y_test` defined by other cells.
metrics_data = []
test_metrics_data = []

for method, sampler in samplers.items():
    print(f"=== {method} ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_top, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)

    # Tune the hyperparameters with Optuna (30 trials)
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_resampled, y_resampled), n_trials=30)

    best_params = study.best_params
    print(f"Best params for {method}: {best_params}")

    # Fit the model with the best parameters on the (re)sampled training data
    best_model = LogisticRegression(**best_params, random_state=42)
    best_model.fit(X_resampled, y_resampled)

    # "Train" predictions are out-of-fold cross-validation predictions on the
    # (re)sampled training data, not predictions of the fitted best_model
    y_train_pred = cross_val_predict(best_model, X_resampled, y_resampled, cv=cv, method='predict')
    y_train_pred_proba = cross_val_predict(best_model, X_resampled, y_resampled, cv=cv, method='predict_proba')[:, 1]

    # Test-set predictions from the fitted model
    y_test_pred = best_model.predict(X_test_scaled_top)
    y_test_pred_proba = best_model.predict_proba(X_test_scaled_top)[:, 1]

    # Training-set metrics
    accuracy_train = accuracy_score(y_resampled, y_train_pred)
    precision_train = precision_score(y_resampled, y_train_pred, pos_label=1)
    recall_train = recall_score(y_resampled, y_train_pred, pos_label=1)
    f1_train = f1_score(y_resampled, y_train_pred, pos_label=1)
    roc_auc_train = roc_auc_score(y_resampled, y_train_pred_proba)

    # Test-set metrics
    accuracy_test = accuracy_score(y_test, y_test_pred)
    precision_test = precision_score(y_test, y_test_pred, pos_label=1)
    recall_test = recall_score(y_test, y_test_pred, pos_label=1)
    f1_test = f1_score(y_test, y_test_pred, pos_label=1)
    roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

    # Store the training and test results
    metrics_data.append({
        'Balancing Method': method,
        'Train Accuracy': accuracy_train,
        'Train Precision': precision_train,
        'Train Recall': recall_train,
        'Train F1': f1_train,
        'Train ROC AUC': roc_auc_train
    })
    test_metrics_data.append({
        'Balancing Method': method,
        'Test Accuracy': accuracy_test,
        'Test Precision': precision_test,
        'Test Recall': recall_test,
        'Test F1': f1_test,
        'Test ROC AUC': roc_auc_test
    })

# Convert to DataFrames and display them
train_metrics_df = pd.DataFrame(metrics_data)
test_metrics_df = pd.DataFrame(test_metrics_data)

print("Train Metrics:")
print(train_metrics_df)
print("\nTest Metrics:")
print(test_metrics_df)

# Bar chart comparing the methods on the test set
test_metrics_df.set_index('Balancing Method')[['Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1', 'Test ROC AUC']].plot(kind='bar', figsize=(12, 8))
plt.title('Comparison of Balancing Methods on H2_test')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


coefficient 4: test set & H1 test set

In [None]:
# Tune a logistic regression per balancing strategy on the H2 training data,
# evaluate it on the H2 test set, then reuse each strategy's best hyperparameters
# to train and evaluate a model on the H1 split.
from sklearn.preprocessing import StandardScaler

# Best hyperparameters per balancing strategy, reused in the H1 section below
best_params_dict = {}

# Accumulators for cross-validated training metrics and hold-out test metrics
metrics_data = []
test_metrics_data = []

for method, sampler in samplers.items():
    print(f"=== {method} ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_top, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)

    # Hyperparameter search with Optuna (the study maximizes the objective's return value)
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_resampled, y_resampled), n_trials=30)

    # Remember the winning hyperparameters for this strategy
    best_params = study.best_params
    best_params_dict[method] = best_params
    print(f"Best params for {method}: {best_params}")

    # Train the final model for this strategy
    best_model = LogisticRegression(**best_params, random_state=42)
    best_model.fit(X_resampled, y_resampled)

    # Out-of-fold predictions on the (resampled) training data
    y_train_pred = cross_val_predict(best_model, X_resampled, y_resampled, cv=cv, method='predict')
    y_train_pred_proba = cross_val_predict(best_model, X_resampled, y_resampled, cv=cv, method='predict_proba')[:, 1]

    # Predictions on the hold-out test set
    y_test_pred = best_model.predict(X_test_scaled_top)
    y_test_pred_proba = best_model.predict_proba(X_test_scaled_top)[:, 1]

    # Training-side metrics
    accuracy_train = accuracy_score(y_resampled, y_train_pred)
    precision_train = precision_score(y_resampled, y_train_pred, pos_label=1)
    recall_train = recall_score(y_resampled, y_train_pred, pos_label=1)
    f1_train = f1_score(y_resampled, y_train_pred, pos_label=1)
    roc_auc_train = roc_auc_score(y_resampled, y_train_pred_proba)

    # Test-side metrics
    accuracy_test = accuracy_score(y_test, y_test_pred)
    precision_test = precision_score(y_test, y_test_pred, pos_label=1)
    recall_test = recall_score(y_test, y_test_pred, pos_label=1)
    f1_test = f1_score(y_test, y_test_pred, pos_label=1)
    roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

    # Store one row per strategy in each table
    metrics_data.append({
        'Balancing Method': method,
        'Train Accuracy': accuracy_train,
        'Train Precision': precision_train,
        'Train Recall': recall_train,
        'Train F1': f1_train,
        'Train ROC AUC': roc_auc_train
    })
    test_metrics_data.append({
        'Balancing Method': method,
        'Test Accuracy': accuracy_test,
        'Test Precision': precision_test,
        'Test Recall': recall_test,
        'Test F1': f1_test,
        'Test ROC AUC': roc_auc_test
    })

# Tabulate and print the results
train_metrics_df = pd.DataFrame(metrics_data)
test_metrics_df = pd.DataFrame(test_metrics_data)

print("Train Metrics:")
print(train_metrics_df)
print("\nTest Metrics:")
print(test_metrics_df)

# Bar chart of the test-set scores
test_metrics_df.set_index('Balancing Method')[['Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1', 'Test ROC AUC']].plot(kind='bar', figsize=(12, 8))
plt.title('Comparison of Balancing Methods on H2_test')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# ======================
# Evaluation on H1_test
# ======================

# H1_train and H1_test are expected to be defined by an earlier cell
X_train_h1 = H1_train[selected_columns].drop(columns=['is_canceled'])
y_train_h1 = H1_train['is_canceled']
X_test_h1 = H1_test[selected_columns].drop(columns=['is_canceled'])
y_test_h1 = H1_test['is_canceled']

# Standardize H1 with its own scaler. Refitting the shared `scaler` here would
# overwrite whatever statistics it was fitted with earlier, so every later
# `scaler.transform(...)` in other cells would silently use H1 statistics
# (or fail on a column mismatch).
scaler_h1 = StandardScaler()
X_train_h1_scaled = scaler_h1.fit_transform(X_train_h1)
X_test_h1_scaled = scaler_h1.transform(X_test_h1)

# Evaluate each strategy's best hyperparameters on H1_test
test_metrics_data_h1 = []

# NOTE(review): only the hyperparameters differ between iterations here; the H1
# training data is not resampled, so the "Balancing Method" label refers to the
# strategy the parameters were tuned under, not to how H1_train was balanced.
for method, params in best_params_dict.items():
    print(f"=== Testing {method} Model on H1_test ===")

    # Build the model from the stored hyperparameters and fit it on H1_train
    best_model = LogisticRegression(**params, random_state=42)
    best_model.fit(X_train_h1_scaled, y_train_h1)

    # Predict on H1_test
    y_test_pred_h1 = best_model.predict(X_test_h1_scaled)
    y_test_pred_proba_h1 = best_model.predict_proba(X_test_h1_scaled)[:, 1]

    # H1_test metrics
    accuracy_test_h1 = accuracy_score(y_test_h1, y_test_pred_h1)
    precision_test_h1 = precision_score(y_test_h1, y_test_pred_h1, pos_label=1)
    recall_test_h1 = recall_score(y_test_h1, y_test_pred_h1, pos_label=1)
    f1_test_h1 = f1_score(y_test_h1, y_test_pred_h1, pos_label=1)
    roc_auc_test_h1 = roc_auc_score(y_test_h1, y_test_pred_proba_h1)

    # Store the H1_test row for this strategy
    test_metrics_data_h1.append({
        'Balancing Method': method,
        'Test Accuracy': accuracy_test_h1,
        'Test Precision': precision_test_h1,
        'Test Recall': recall_test_h1,
        'Test F1': f1_test_h1,
        'Test ROC AUC': roc_auc_test_h1
    })

# Tabulate the H1_test results, best ROC AUC first
test_metrics_df_h1 = pd.DataFrame(test_metrics_data_h1).sort_values(by='Test ROC AUC', ascending=False)
print("\nH1 Test Metrics:")
print(test_metrics_df_h1)

# Bar chart of the H1_test scores
test_metrics_df_h1.set_index('Balancing Method')[['Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1', 'Test ROC AUC']].plot(kind='bar', figsize=(12, 8))
plt.title('Comparison of Balancing Methods on H1_test')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


shap4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

# Evaluate, on the H2 test set, one logistic regression per balancing method using
# the top-10 SHAP features, then compare training and test scores.

# Prepare the test data
X_test = H2_test[selected_columns].drop(columns=['is_canceled'])
y_test = H2_test['is_canceled']

# Restrict the test data to the top-10 SHAP features and scale it
# NOTE(review): assumes `scaler` is currently fitted on exactly the
# top10_shap_features columns -- confirm; other cells in this notebook refit or
# rebind the same `scaler` object.
X_test_top10 = X_test[top10_shap_features]
X_test_scaled_top10 = scaler.transform(X_test_top10)

# Collects one result row per balancing method
test_metrics_data = []

# Evaluate each tuned model on the test set
# NOTE(review): every entry of `metrics_data` must carry 'Balancing Method',
# 'Best Parameters', 'ROC AUC' and 'F1 Score' keys; other cells rebind
# `metrics_data` with differently named keys, so this cell depends on which
# cell ran last -- verify.
for metric in metrics_data:
    method = metric['Balancing Method']
    best_params = metric['Best Parameters']
    print(f"\n=== Testing {method} ===")

    # Build the model from the stored best parameters
    log_reg_model = LogisticRegression(**best_params)

    # Rebuild the training data for this balancing method
    # (X_train_scaled_top10_shap is defined outside this cell)
    if method == 'No Sampling':
        X_train_final, y_train_final = X_train_scaled_top10_shap, y_train
    elif method == 'Undersample':
        sampler = RandomUnderSampler(random_state=42)
        X_train_final, y_train_final = sampler.fit_resample(X_train_scaled_top10_shap, y_train)
    elif method == 'Oversample':
        sampler = RandomOverSampler(random_state=42)
        X_train_final, y_train_final = sampler.fit_resample(X_train_scaled_top10_shap, y_train)
    else:  # SMOTE (also reached for any unrecognized method name)
        sampler = SMOTE(random_state=42)
        X_train_final, y_train_final = sampler.fit_resample(X_train_scaled_top10_shap, y_train)

    # Train the model
    log_reg_model.fit(X_train_final, y_train_final)

    # Predict on the test set (column 1 of predict_proba is taken as the positive-class score)
    y_pred = log_reg_model.predict(X_test_scaled_top10)
    y_pred_proba = log_reg_model.predict_proba(X_test_scaled_top10)[:, 1]

    # Compute the evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label=1)
    recall = recall_score(y_test, y_pred, pos_label=1)
    f1 = f1_score(y_test, y_pred, pos_label=1)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Print the classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix; each row is also expressed as a percentage of its row total
    conf_matrix = confusion_matrix(y_test, y_pred)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Overlay the row percentage in each cell, offset from the count annotation
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Test Set Confusion Matrix ({method})')
    plt.show()

    # ROC curve with the chance diagonal for reference
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Test Set ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # Store the metrics for this method
    test_metrics_data.append({
        'Balancing Method': method,
        'Best Parameters': best_params,
        'Test Accuracy': accuracy,
        'Test Precision': precision,
        'Test Recall': recall,
        'Test F1 Score': f1,
        'Test ROC AUC': roc_auc
    })

# Build and print the test-set metrics table, best ROC AUC first
test_metrics_df = pd.DataFrame(test_metrics_data).sort_values(by='Test ROC AUC', ascending=False)
print("\nTest Set Results:")
print(test_metrics_df)

# Bar chart comparing the test-set metrics across methods
test_metrics_df.set_index('Balancing Method')[['Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1 Score', 'Test ROC AUC']].plot(
    kind='bar',
    figsize=(10, 6)
)
plt.title('Comparison of Optimized Models on Test Set (Top 10 SHAP Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Train vs. test comparison; both lists are paired by position, which matches
# because test_metrics_data was filled by iterating metrics_data in order
comparison_data = []
for train_metric, test_metric in zip(metrics_data, test_metrics_data):
    method = train_metric['Balancing Method']
    comparison_data.append({
        'Method': method,
        'Train AUC': train_metric['ROC AUC'],
        'Test AUC': test_metric['Test ROC AUC'],
        'AUC Difference': train_metric['ROC AUC'] - test_metric['Test ROC AUC'],
        'Train F1': train_metric['F1 Score'],
        'Test F1': test_metric['Test F1 Score'],
        'F1 Difference': train_metric['F1 Score'] - test_metric['Test F1 Score']
    })

comparison_df = pd.DataFrame(comparison_data)
print("\nTrain vs Test Performance Comparison:")
print(comparison_df)

# Side-by-side bars of train vs. test ROC AUC per method
plt.figure(figsize=(12, 6))
x = np.arange(len(comparison_df))
width = 0.35

plt.bar(x - width/2, comparison_df['Train AUC'], width, label='Train AUC')
plt.bar(x + width/2, comparison_df['Test AUC'], width, label='Test AUC')

plt.xlabel('Balancing Method')
plt.ylabel('ROC AUC Score')
plt.title('Train vs Test ROC AUC Comparison')
plt.xticks(x, comparison_df['Method'], rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

boruta 4 types (a complete comparison on both the H2 test set and the H1 test set is in the cross-dataset section)

In [None]:
# Final H2-test evaluation of logistic regression on the Boruta-selected features,
# one model per balancing method, each with its own fixed hyperparameters.

# Training features kept by Boruta, scaled (this refits the shared scaler on them)
X_train_boruta = X_train[selected_features]
X_train_scaled_boruta = scaler.fit_transform(X_train_boruta)

# Same feature subset of the H2 test set, scaled with the training statistics
y_test = H2_test["is_canceled"]
X_test_boruta = H2_test[selected_features]
X_test_scaled_boruta = scaler.transform(X_test_boruta)

# Hyperparameters per balancing method (hard-coded values)
best_params = {
    "No Sampling": dict(C=0.3417502706151835, max_iter=573, tol=0.00229469424632226),
    "Undersample": dict(C=0.0031211003628310603, max_iter=552, tol=0.0004304546227332599),
    "Oversample": dict(C=150.3689576094797, max_iter=759, tol=0.0003455003585976619),
    "SMOTE": dict(C=1.4183467727092938, max_iter=655, tol=0.00017510842206403264),
}

# Balancing strategies; None means the training data is used as-is
samplers = {
    "No Sampling": None,
    "Undersample": RandomUnderSampler(random_state=42),
    "Oversample": RandomOverSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
}

# One result row per balancing method
final_metrics_data = []

for method, sampler in samplers.items():
    print(f"\n=== {method} Final Testing with Optimized Parameters ===")

    # Hyperparameters for this method
    optimized_params = best_params[method]

    # Use the scaled training data directly unless a sampler is configured
    X_resampled, y_resampled = X_train_scaled_boruta, y_train
    if sampler is not None:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_boruta, y_train)

    # Build and train the model
    log_reg_model = LogisticRegression(solver="lbfgs", random_state=42, **optimized_params)
    log_reg_model.fit(X_resampled, y_resampled)

    # Test-set predictions: hard labels and positive-class scores
    y_pred_test = log_reg_model.predict(X_test_scaled_boruta)
    test_proba = log_reg_model.predict_proba(X_test_scaled_boruta)
    y_pred_proba_test = test_proba[:, 1]

    # Evaluation metrics
    accuracy = accuracy_score(y_test, y_pred_test)
    precision, recall, f1 = (
        scorer(y_test, y_pred_test)
        for scorer in (precision_score, recall_score, f1_score)
    )
    roc_auc = roc_auc_score(y_test, y_pred_proba_test)

    # Record the row for this method
    final_metrics_data.append(dict(zip(
        ("Balancing Method", "Test Accuracy", "Test Precision",
         "Test Recall", "Test F1 Score", "Test ROC AUC"),
        (method, accuracy, precision, recall, f1, roc_auc),
    )))

    # Confusion-matrix heatmap
    conf_matrix = confusion_matrix(y_test, y_pred_test)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        cbar=False,
        annot_kws={"size": 14},
    )
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(f"Confusion Matrix ({method})")
    plt.show()

    # ROC curve with the chance diagonal
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_test)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f"{method} (AUC = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], "k--", label="Random Guess")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"ROC Curve ({method})")
    plt.legend(loc="lower right")
    plt.show()

# Final results table, best ROC AUC first
final_metrics_df = pd.DataFrame(final_metrics_data)
final_metrics_df = final_metrics_df.sort_values(by="Test ROC AUC", ascending=False)
print("\nFinal Test Results:")
print(final_metrics_df)

# Persist the results
final_metrics_df.to_csv("final_test_results_boruta.csv", index=False)

# Comparison bar chart
boruta_score_columns = ["Test Accuracy", "Test Precision", "Test Recall", "Test F1 Score", "Test ROC AUC"]
final_metrics_df.set_index("Balancing Method")[boruta_score_columns].plot(kind="bar", figsize=(10, 6))
plt.title("Comparison of Balancing Methods (Boruta Selected Features)")
plt.ylabel("Score")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


### cross-dataset evaluation: H1 test set

baseline 4

In [None]:
# Cross-dataset baseline: train logistic regression on the H2 training data under
# each balancing method and evaluate it on the H1 test set.

# Re-create the scaler and fit it on the H2 training features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Scale the H1 test features with the H2 training statistics
H1_test_scaled = scaler.transform(H1_test[selected_columns].drop(columns=['is_canceled']))
y_test = H1_test['is_canceled']

# One result row per balancing method
test_results = []

# Number of Optuna trials per method.
# NOTE(review): a fresh study does not recover previously computed best
# parameters; with a single trial, `study.best_params` is simply the one sampled
# configuration. Raise this value to perform a real search.
H1_BASELINE_N_TRIALS = 1

for method, sampler in samplers.items():
    print(f"=== Testing {method} on H1 Test Set ===")

    # Apply the balancing method. Previously `sampler` was never used, so all
    # four "methods" were trained on the same unbalanced data.
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled, y_train)

    # Hyperparameter search on the data the model will actually be trained on
    study = optuna.create_study(direction='maximize')
    study.optimize(
        lambda trial: objective(trial, X_resampled, y_resampled),
        n_trials=H1_BASELINE_N_TRIALS,
    )
    best_params = study.best_params

    # max_iter=2000 is only a default: if the objective also tunes max_iter,
    # passing it as a second keyword would raise
    # "TypeError: got multiple values for keyword argument 'max_iter'".
    model_params = {'max_iter': 2000, **best_params}
    log_reg_model_optimized = LogisticRegression(**model_params, random_state=42)

    # Fit on the (re)balanced H2 training data, predict on H1_test
    log_reg_model_optimized.fit(X_resampled, y_resampled)
    y_test_pred = log_reg_model_optimized.predict(H1_test_scaled)
    y_test_pred_proba = log_reg_model_optimized.predict_proba(H1_test_scaled)[:, 1]

    # Test metrics (class 1 is the positive class)
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, pos_label=1)
    recall = recall_score(y_test, y_test_pred, pos_label=1)
    f1 = f1_score(y_test, y_test_pred, pos_label=1)
    roc_auc = roc_auc_score(y_test, y_test_pred_proba)

    # Store the row for this method
    test_results.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

    # Classification report
    print(f"\nClassification Report for {method}:")
    print(classification_report(y_test, y_test_pred))

    # ROC curve with the chance diagonal
    fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method}) on H1 Test Set')
    plt.legend(loc='lower right')
    plt.show()

# Results table, best ROC AUC first
test_results_df = pd.DataFrame(test_results).sort_values(by='ROC AUC', ascending=False)
print(test_results_df)

# Bar chart comparing the methods
test_results_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods on H1 Test Set')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


SHAP 4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

# Evaluate, on the H1 test set, one logistic regression per balancing method using
# the top-10 SHAP features, then compare H2-train, H2-test and H1-test scores.

# Prepare the H1 test data
X_h1_test = H1_test[selected_columns].drop(columns=['is_canceled'])
y_h1_test = H1_test['is_canceled']

# Restrict the H1 test data to the top-10 SHAP features and scale it
# NOTE(review): assumes `scaler` is currently fitted on exactly the
# top10_shap_features columns -- confirm; other cells in this notebook refit or
# rebind the same `scaler` object.
X_h1_test_top10 = X_h1_test[top10_shap_features]
X_h1_test_scaled_top10 = scaler.transform(X_h1_test_top10)

# Collects one H1 result row per balancing method
h1_test_metrics_data = []

# Evaluate each tuned model on the H1 test set
# NOTE(review): every entry of `metrics_data` must carry 'Balancing Method',
# 'Best Parameters', 'ROC AUC' and 'F1 Score' keys; other cells rebind
# `metrics_data` with differently named keys, so this cell depends on which
# cell ran last -- verify.
for metric in metrics_data:
    method = metric['Balancing Method']
    best_params = metric['Best Parameters']
    print(f"\n=== Testing on H1 Dataset - {method} ===")

    # Build the model from the stored best parameters
    log_reg_model = LogisticRegression(**best_params)

    # Rebuild the training data for this balancing method
    # (X_train_scaled_top10_shap is defined outside this cell)
    if method == 'No Sampling':
        X_train_final, y_train_final = X_train_scaled_top10_shap, y_train
    elif method == 'Undersample':
        sampler = RandomUnderSampler(random_state=42)
        X_train_final, y_train_final = sampler.fit_resample(X_train_scaled_top10_shap, y_train)
    elif method == 'Oversample':
        sampler = RandomOverSampler(random_state=42)
        X_train_final, y_train_final = sampler.fit_resample(X_train_scaled_top10_shap, y_train)
    else:  # SMOTE (also reached for any unrecognized method name)
        sampler = SMOTE(random_state=42)
        X_train_final, y_train_final = sampler.fit_resample(X_train_scaled_top10_shap, y_train)

    # Train the model
    log_reg_model.fit(X_train_final, y_train_final)

    # Predict on the H1 test set (column 1 of predict_proba is taken as the positive-class score)
    y_pred = log_reg_model.predict(X_h1_test_scaled_top10)
    y_pred_proba = log_reg_model.predict_proba(X_h1_test_scaled_top10)[:, 1]

    # Compute the evaluation metrics
    accuracy = accuracy_score(y_h1_test, y_pred)
    precision = precision_score(y_h1_test, y_pred, pos_label=1)
    recall = recall_score(y_h1_test, y_pred, pos_label=1)
    f1 = f1_score(y_h1_test, y_pred, pos_label=1)
    roc_auc = roc_auc_score(y_h1_test, y_pred_proba)

    # Print the classification report
    print("\nClassification Report (H1 Test Set):")
    print(classification_report(y_h1_test, y_pred))

    # Confusion matrix; each row is also expressed as a percentage of its row total
    conf_matrix = confusion_matrix(y_h1_test, y_pred)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Overlay the row percentage in each cell, offset from the count annotation
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'H1 Test Set Confusion Matrix ({method})')
    plt.show()

    # ROC curve with the chance diagonal for reference
    fpr, tpr, _ = roc_curve(y_h1_test, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'H1 Test Set ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # Store the metrics for this method
    h1_test_metrics_data.append({
        'Balancing Method': method,
        'Best Parameters': best_params,
        'H1 Test Accuracy': accuracy,
        'H1 Test Precision': precision,
        'H1 Test Recall': recall,
        'H1 Test F1 Score': f1,
        'H1 Test ROC AUC': roc_auc
    })

# Build and print the H1 test-set metrics table, best ROC AUC first
h1_test_metrics_df = pd.DataFrame(h1_test_metrics_data).sort_values(by='H1 Test ROC AUC', ascending=False)
print("\nH1 Test Set Results:")
print(h1_test_metrics_df)

# Bar chart comparing the H1 test-set metrics across methods
h1_test_metrics_df.set_index('Balancing Method')[
    ['H1 Test Accuracy', 'H1 Test Precision', 'H1 Test Recall', 'H1 Test F1 Score', 'H1 Test ROC AUC']
].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Optimized Models on H1 Test Set (Top 10 SHAP Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Performance comparison across H2 train, H2 test and H1 test; the three lists
# are paired by position.
# NOTE(review): `test_metrics_data` must hold entries with 'Test ROC AUC' and
# 'Test F1 Score' keys in the same method order as `metrics_data`; other cells
# rebind `test_metrics_data` with a 'Test F1' key instead -- verify which cell
# produced it.
comparison_data = []
for train_metric, h2_test_metric, h1_test_metric in zip(metrics_data, test_metrics_data, h1_test_metrics_data):
    method = train_metric['Balancing Method']
    comparison_data.append({
        'Method': method,
        'H2 Train AUC': train_metric['ROC AUC'],
        'H2 Test AUC': h2_test_metric['Test ROC AUC'],
        'H1 Test AUC': h1_test_metric['H1 Test ROC AUC'],
        'H2 Train F1': train_metric['F1 Score'],
        'H2 Test F1': h2_test_metric['Test F1 Score'],
        'H1 Test F1': h1_test_metric['H1 Test F1 Score']
    })

comparison_df = pd.DataFrame(comparison_data)
print("\nH2 Train vs H2 Test vs H1 Test Performance Comparison:")
print(comparison_df)

# Grouped bars of ROC AUC on the three datasets, one group per method
plt.figure(figsize=(12, 6))
x = np.arange(len(comparison_df))
width = 0.25

plt.bar(x - width, comparison_df['H2 Train AUC'], width, label='H2 Train AUC')
plt.bar(x, comparison_df['H2 Test AUC'], width, label='H2 Test AUC')
plt.bar(x + width, comparison_df['H1 Test AUC'], width, label='H1 Test AUC')

plt.xlabel('Balancing Method')
plt.ylabel('ROC AUC Score')
plt.title('Performance Comparison Across Datasets')
plt.xticks(x, comparison_df['Method'], rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

# ROC AUC gaps: H2 train minus H2 test, and H2 test minus H1 test
performance_diff_df = pd.DataFrame({
    'Method': comparison_df['Method'],
    'H2 Train-Test Diff': comparison_df['H2 Train AUC'] - comparison_df['H2 Test AUC'],
    'H2-H1 Test Diff': comparison_df['H2 Test AUC'] - comparison_df['H1 Test AUC']
})
print("\nPerformance Differences:")
print(performance_diff_df)

boruta 4

In [None]:
# Evaluate logistic regression on the Boruta-selected features against both the
# H1 and the H2 test sets, one model per balancing method.
from sklearn.preprocessing import StandardScaler

# Fit a dedicated scaler on the Boruta-selected H2 training features. The shared
# `scaler` is refitted or rebound by other cells (for example on the full feature
# set), so relying on it here would either fail on a column mismatch or scale the
# test sets with statistics that do not match the training matrix.
boruta_scaler = StandardScaler()
X_train_boruta_scaled = boruta_scaler.fit_transform(X_train[selected_features])

# H1 test set restricted to the Boruta features, scaled with the training statistics
X_test_boruta_H1 = H1_test[selected_features]
X_test_scaled_boruta_H1 = boruta_scaler.transform(X_test_boruta_H1)
y_test_H1 = H1_test["is_canceled"]

# H2 test set, prepared the same way
X_test_boruta_H2 = H2_test[selected_features]
X_test_scaled_boruta_H2 = boruta_scaler.transform(X_test_boruta_H2)
y_test_H2 = H2_test["is_canceled"]

# One result row per balancing method and test set
final_metrics_data_H1 = []
final_metrics_data_H2 = []

# Train once per balancing method, then score on H1 and H2
for method, sampler in samplers.items():
    print(f"\n=== {method} Final Testing on H1 and H2 with Optimized Parameters ===")

    # Hyperparameters for this method (`best_params` maps method name -> params)
    optimized_params = best_params[method]

    if sampler is None:
        X_resampled, y_resampled = X_train_boruta_scaled, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_boruta_scaled, y_train)

    # Build the model from the stored hyperparameters
    log_reg_model = LogisticRegression(
        solver="lbfgs",
        random_state=42,
        **optimized_params
    )

    # Train the model
    log_reg_model.fit(X_resampled, y_resampled)

    # Predict on H1_test
    y_pred_test_H1 = log_reg_model.predict(X_test_scaled_boruta_H1)
    y_pred_proba_test_H1 = log_reg_model.predict_proba(X_test_scaled_boruta_H1)[:, 1]

    # H1 metrics
    accuracy_H1 = accuracy_score(y_test_H1, y_pred_test_H1)
    precision_H1 = precision_score(y_test_H1, y_pred_test_H1)
    recall_H1 = recall_score(y_test_H1, y_pred_test_H1)
    f1_H1 = f1_score(y_test_H1, y_pred_test_H1)
    roc_auc_H1 = roc_auc_score(y_test_H1, y_pred_proba_test_H1)

    # Store the H1 row
    final_metrics_data_H1.append({
        "Balancing Method": method,
        "Test Accuracy": accuracy_H1,
        "Test Precision": precision_H1,
        "Test Recall": recall_H1,
        "Test F1 Score": f1_H1,
        "Test ROC AUC": roc_auc_H1
    })

    # Predict on H2_test
    y_pred_test_H2 = log_reg_model.predict(X_test_scaled_boruta_H2)
    y_pred_proba_test_H2 = log_reg_model.predict_proba(X_test_scaled_boruta_H2)[:, 1]

    # H2 metrics
    accuracy_H2 = accuracy_score(y_test_H2, y_pred_test_H2)
    precision_H2 = precision_score(y_test_H2, y_pred_test_H2)
    recall_H2 = recall_score(y_test_H2, y_pred_test_H2)
    f1_H2 = f1_score(y_test_H2, y_pred_test_H2)
    roc_auc_H2 = roc_auc_score(y_test_H2, y_pred_proba_test_H2)

    # Store the H2 row
    final_metrics_data_H2.append({
        "Balancing Method": method,
        "Test Accuracy": accuracy_H2,
        "Test Precision": precision_H2,
        "Test Recall": recall_H2,
        "Test F1 Score": f1_H2,
        "Test ROC AUC": roc_auc_H2
    })

# Result tables for H1 and H2, best ROC AUC first
final_metrics_df_H1 = pd.DataFrame(final_metrics_data_H1).sort_values(by="Test ROC AUC", ascending=False)
final_metrics_df_H2 = pd.DataFrame(final_metrics_data_H2).sort_values(by="Test ROC AUC", ascending=False)

# Print the results
print("\nFinal Test Results on H1:")
print(final_metrics_df_H1)

print("\nFinal Test Results on H2:")
print(final_metrics_df_H2)

# Persist the results
final_metrics_df_H1.to_csv("final_test_results_boruta_H1.csv", index=False)
final_metrics_df_H2.to_csv("final_test_results_boruta_H2.csv", index=False)

# Comparison charts, one per test set
final_metrics_df_H1.set_index("Balancing Method")[["Test Accuracy", "Test Precision", "Test Recall", "Test F1 Score", "Test ROC AUC"]].plot(
    kind="bar", figsize=(10, 6)
)
plt.title("Comparison of Balancing Methods on H1 Test Set (Boruta Selected Features)")
plt.ylabel("Score")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

final_metrics_df_H2.set_index("Balancing Method")[["Test Accuracy", "Test Precision", "Test Recall", "Test F1 Score", "Test ROC AUC"]].plot(
    kind="bar", figsize=(10, 6)
)
plt.title("Comparison of Balancing Methods on H2 Test Set (Boruta Selected Features)")
plt.ylabel("Score")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## 2. Random Forest





baseline 4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import optuna
import warnings
warnings.filterwarnings('ignore')

# Prerequisite: H2_train and selected_columns are defined by earlier cells.
# Split the H2 training frame into the target and the feature matrix.
y_train = H2_train['is_canceled']
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])

# Standardize all features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Default-parameter random forest on every feature, used only to rank the features
rf_model_raw = RandomForestClassifier(random_state=42)
rf_model_raw.fit(X_train_scaled, y_train)

# Pair each feature name with its importance and keep the ten largest
features = X_train.columns
feature_importances = rf_model_raw.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
)
importance_ranking = importance_df.sort_values(by='Importance', ascending=False)
importance_df_sorted = importance_ranking.head(10)

# Restrict the training data to those ten features and rescale
# (this refits `scaler` on the reduced column set)
top_features = list(importance_df_sorted['Feature'])
X_train_top = X_train[top_features]
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Optuna objective for the random-forest hyperparameter search
def objective(trial, X, y):
    """Score one sampled RandomForest configuration.

    Draws the forest hyperparameters from ``trial``, evaluates the resulting
    classifier with 3-fold cross-validation on ``(X, y)`` and returns the mean
    ROC AUC over the folds.
    """
    # Sample the search space (the order of the suggest calls is kept stable)
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1,
    )

    fold_auc = cross_val_score(forest, X, y, cv=3, scoring='roc_auc')
    return fold_auc.mean()

# Balancing strategies; None means the training data is used as-is
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# For each balancing strategy: tune a random forest on the top-10 features with
# Optuna, then score the tuned model with out-of-fold predictions.
for method, sampler in samplers.items():
    print(f"\n=== {method} ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_top, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)

    # Run the Optuna search
    print(f"Optimizing hyperparameters for {method}...")
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_resampled, y_resampled),
                  n_trials=50, show_progress_bar=True)

    # Report the best configuration
    print(f"Best parameters for {method}:")
    print(study.best_params)
    print(f"Best ROC-AUC score: {study.best_value:.4f}")

    # Out-of-fold predictions of the tuned model. The forests are cross-validated
    # once: a random forest's predicted label is the class with the highest
    # predicted probability, so the hard labels are derived from the probability
    # matrix instead of refitting every fold a second time with method='predict'.
    rf_model = RandomForestClassifier(**study.best_params, random_state=42, n_jobs=-1)
    cv_proba = cross_val_predict(rf_model, X_resampled, y_resampled, cv=cv, method='predict_proba')
    y_pred_proba_cv = cv_proba[:, 1]
    # Probability columns follow the sorted class labels
    class_labels = np.unique(y_resampled)
    y_pred_cv = class_labels[np.argmax(cv_proba, axis=1)]

    accuracy = accuracy_score(y_resampled, y_pred_cv)
    precision = precision_score(y_resampled, y_pred_cv, pos_label=1)
    recall = recall_score(y_resampled, y_pred_cv, pos_label=1)
    f1 = f1_score(y_resampled, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_resampled, y_pred_proba_cv)

    # Confusion matrix; each row is also expressed as a percentage of its row total
    conf_matrix = confusion_matrix(y_resampled, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Overlay the row percentage in each cell, offset from the count annotation
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix ({method})\nBest ROC-AUC: {study.best_value:.4f}')
    plt.show()

    report = classification_report(y_resampled, y_pred_cv)
    print(report)

    # ROC curve with the chance diagonal
    fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    metrics_data.append({
        'Balancing Method': method,
        'Best ROC-AUC': study.best_value,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Results table, best ROC AUC first
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Results Comparison:")
print(metrics_df)

# Performance comparison chart. The size is passed to DataFrame.plot directly:
# it creates its own figure, so a preceding plt.figure(...) call only produced an
# extra blank figure and its figsize never reached the chart.
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'Best ROC-AUC']].plot(kind='bar', figsize=(12, 6))
plt.title('Comparison of Balancing Methods with Optimized Parameters')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Feature-importance chart for the ten selected features
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df_sorted, x='Importance', y='Feature', palette='viridis')
plt.title('Top 10 Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()


### different feature combinations

feature importance top10 4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

from imblearn.pipeline import Pipeline as ImbPipeline

# Assumes H2_train and selected_columns were defined by earlier cells.
# Separate the feature matrix from the target column.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardize the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit a forest on the un-resampled data to obtain impurity-based importances.
rf_model_raw = RandomForestClassifier(random_state=42)
rf_model_raw.fit(X_train_scaled, y_train)

# Rank the features and keep the ten most important ones.
feature_importances = rf_model_raw.feature_importances_
features = X_train.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False).head(10)

# Restrict the data to the Top 10 features (the scaler is re-fitted on them).
top_features = importance_df_sorted['Feature'].tolist()
X_train_top = X_train[top_features]
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Class-balancing strategies to compare.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Evaluate every balancing strategy on the Top 10 features.
for method, sampler in samplers.items():
    print(f"=== {method} ===")
    rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
    # The sampler is wrapped in an imbalanced-learn pipeline so that it is
    # applied to the training part of each fold only.  Resampling the whole set
    # before cross-validation puts duplicated/synthetic copies of training rows
    # into the validation folds and scores the model on an artificial class
    # balance, both of which inflate the reported metrics.
    if sampler is None:
        estimator = rf_model
    else:
        estimator = ImbPipeline([('sampler', sampler), ('model', rf_model)])

    y_pred_cv = cross_val_predict(estimator, X_train_scaled_top, y_train, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(estimator, X_train_scaled_top, y_train, cv=cv, method='predict_proba')[:, 1]

    # All metrics are computed against the original (un-resampled) labels.
    accuracy = accuracy_score(y_train, y_pred_cv)
    precision = precision_score(y_train, y_pred_cv, pos_label=1)
    recall = recall_score(y_train, y_pred_cv, pos_label=1)
    f1 = f1_score(y_train, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_train, y_pred_proba_cv)

    # Confusion matrix with row-wise (per true class) percentages.
    conf_matrix = confusion_matrix(y_train, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            # Offset from the cell centre so the percentage does not cover the count.
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix ({method})')
    plt.show()

    report = classification_report(y_train, y_pred_cv)
    print(report)

    fpr, tpr, _ = roc_curve(y_train, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Summary table, best ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print(metrics_df)

metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods (Top 10 Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Visualize the feature importances.
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df_sorted, x='Importance', y='Feature', palette='viridis')
plt.title('Top 10 Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

from imblearn.pipeline import Pipeline as ImbPipeline

# Assumes H2_train and selected_columns were defined by earlier cells.
# Separate the feature matrix from the target column.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardize the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit a forest on the un-resampled data to obtain impurity-based importances.
rf_model_raw = RandomForestClassifier(random_state=42)
rf_model_raw.fit(X_train_scaled, y_train)

# Rank the features and keep the ten most important ones.
feature_importances = rf_model_raw.feature_importances_
features = X_train.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False).head(10)

# Restrict the data to the Top 10 features (the scaler is re-fitted on them).
top_features = importance_df_sorted['Feature'].tolist()
X_train_top = X_train[top_features]
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Class-balancing strategies to compare.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Evaluate every balancing strategy on the Top 10 features.
for method, sampler in samplers.items():
    print(f"=== {method} ===")
    rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
    # The sampler is wrapped in an imbalanced-learn pipeline so that it is
    # applied to the training part of each fold only.  Resampling the whole set
    # before cross-validation puts duplicated/synthetic copies of training rows
    # into the validation folds and scores the model on an artificial class
    # balance, both of which inflate the reported metrics.
    if sampler is None:
        estimator = rf_model
    else:
        estimator = ImbPipeline([('sampler', sampler), ('model', rf_model)])

    y_pred_cv = cross_val_predict(estimator, X_train_scaled_top, y_train, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(estimator, X_train_scaled_top, y_train, cv=cv, method='predict_proba')[:, 1]

    # All metrics are computed against the original (un-resampled) labels.
    accuracy = accuracy_score(y_train, y_pred_cv)
    precision = precision_score(y_train, y_pred_cv, pos_label=1)
    recall = recall_score(y_train, y_pred_cv, pos_label=1)
    f1 = f1_score(y_train, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_train, y_pred_proba_cv)

    # Confusion matrix with row-wise (per true class) percentages.
    conf_matrix = confusion_matrix(y_train, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            # Offset from the cell centre so the percentage does not cover the count.
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix ({method})')
    plt.show()

    report = classification_report(y_train, y_pred_cv)
    print(report)

    fpr, tpr, _ = roc_curve(y_train, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Summary table, best ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print(metrics_df)

metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods (Top 10 Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Visualize the feature importances.
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df_sorted, x='Importance', y='Feature', palette='viridis')
plt.title('Top 10 Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

from imblearn.pipeline import Pipeline as ImbPipeline

# Assumes H2_train and selected_columns were defined by earlier cells.
# Separate the feature matrix from the target column.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardize the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit a forest on the un-resampled data to obtain impurity-based importances.
rf_model_raw = RandomForestClassifier(random_state=42)
rf_model_raw.fit(X_train_scaled, y_train)

# Rank the features and keep the ten most important ones.
feature_importances = rf_model_raw.feature_importances_
features = X_train.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False).head(10)

# Restrict the data to the Top 10 features (the scaler is re-fitted on them).
top_features = importance_df_sorted['Feature'].tolist()
X_train_top = X_train[top_features]
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Class-balancing strategies to compare.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Evaluate every balancing strategy on the Top 10 features.
for method, sampler in samplers.items():
    print(f"=== {method} ===")
    rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
    # The sampler is wrapped in an imbalanced-learn pipeline so that it is
    # applied to the training part of each fold only.  Resampling the whole set
    # before cross-validation puts duplicated/synthetic copies of training rows
    # into the validation folds and scores the model on an artificial class
    # balance, both of which inflate the reported metrics.
    if sampler is None:
        estimator = rf_model
    else:
        estimator = ImbPipeline([('sampler', sampler), ('model', rf_model)])

    y_pred_cv = cross_val_predict(estimator, X_train_scaled_top, y_train, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(estimator, X_train_scaled_top, y_train, cv=cv, method='predict_proba')[:, 1]

    # All metrics are computed against the original (un-resampled) labels.
    accuracy = accuracy_score(y_train, y_pred_cv)
    precision = precision_score(y_train, y_pred_cv, pos_label=1)
    recall = recall_score(y_train, y_pred_cv, pos_label=1)
    f1 = f1_score(y_train, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_train, y_pred_proba_cv)

    # Confusion matrix with row-wise (per true class) percentages.
    conf_matrix = confusion_matrix(y_train, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            # Offset from the cell centre so the percentage does not cover the count.
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix ({method})')
    plt.show()

    report = classification_report(y_train, y_pred_cv)
    print(report)

    fpr, tpr, _ = roc_curve(y_train, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Summary table, best ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print(metrics_df)

metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods (Top 10 Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Visualize the feature importances.
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df_sorted, x='Importance', y='Feature', palette='viridis')
plt.title('Top 10 Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()


SHAP top10 4

In [None]:
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler

# Assumes H2_train and selected_columns were defined by earlier cells.
# Separate the feature matrix from the target column.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardize the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit the initial random forest on all features.
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_model.fit(X_train_scaled, y_train)

# Compute SHAP values and keep those of the positive class (label 1).
# Depending on the installed SHAP release, shap_values() returns either a list
# with one (n_samples, n_features) array per class or a single
# (n_samples, n_features, n_classes) array.  Indexing the latter with [1]
# would select the second *sample*, so both layouts are handled explicitly.
explainer = shap.TreeExplainer(rf_model)
raw_shap_values = explainer.shap_values(X_train_scaled)
if isinstance(raw_shap_values, list):
    shap_values = raw_shap_values[1]
elif np.ndim(raw_shap_values) == 3:
    shap_values = raw_shap_values[:, :, 1]
else:
    shap_values = raw_shap_values

# SHAP feature-importance bar plot.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_train, feature_names=X_train.columns, plot_type="bar")

# Rank the features by mean absolute SHAP value and keep the Top 10.
shap_sum = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame({'Feature': X_train.columns, 'SHAP Importance': shap_sum})
importance_df.sort_values(by='SHAP Importance', ascending=False, inplace=True)
top10_shap_features = importance_df.head(10)['Feature'].values

# Restrict the data to the Top 10 features (the scaler is re-fitted on them).
X_train_top10_shap = X_train[top10_shap_features]
X_train_scaled_top10_shap = scaler.fit_transform(X_train_top10_shap)

# 3-fold stratified cross-validation.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Out-of-fold predictions for the Top 10 model.
rf_model_top10_shap = RandomForestClassifier(random_state=42, n_jobs=-1)
y_pred_cv_shap = cross_val_predict(rf_model_top10_shap, X_train_scaled_top10_shap, y_train, cv=cv, method='predict')
y_pred_proba_cv_shap = cross_val_predict(rf_model_top10_shap, X_train_scaled_top10_shap, y_train, cv=cv, method='predict_proba')[:, 1]

# Confusion matrix with counts and row-wise (per true class) percentages.
conf_matrix_shap = confusion_matrix(y_train, y_pred_cv_shap)
conf_matrix_percentage_shap = conf_matrix_shap / conf_matrix_shap.sum(axis=1).reshape(-1, 1) * 100
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_shap, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
for i in range(conf_matrix_shap.shape[0]):
    for j in range(conf_matrix_shap.shape[1]):
        percentage_text = f"{conf_matrix_percentage_shap[i, j]:.1f}%"
        plt.text(j + 0.5, i + 0.5, percentage_text, ha='center', va='center', color='green', fontsize=12)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix with Counts and Percentages (SHAP Random Forest)')
plt.show()

# Classification report and per-fold ROC curves.
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv_shap))

plt.figure(figsize=(12, 8))
fold_count = 1
for train_idx, test_idx in cv.split(X_train_scaled_top10_shap, y_train):
    # Refit on the training part of the fold and score the held-out part.
    rf_model_top10_shap.fit(X_train_scaled_top10_shap[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold_shap = rf_model_top10_shap.predict_proba(X_train_scaled_top10_shap[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold_shap)
    roc_auc_fold_shap = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold_shap)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold_shap:.2f})')
    fold_count += 1
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (SHAP Random Forest, 3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Bar chart of the Top 10 SHAP importances.
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df.head(10), x='SHAP Importance', y='Feature', palette='viridis')
plt.title('Top 10 SHAP Feature Importances')
plt.xlabel('SHAP Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()


In [None]:
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.ensemble import RandomForestClassifier

def analyze_feature_importance(X_train, y_train, rf_model_raw, n_features=10, sample_size=200):
    """
    Rank the most important features of a fitted forest by median |SHAP| value.

    The ``n_features`` columns with the highest ``feature_importances_`` in
    ``rf_model_raw`` are selected, a small random forest is re-fitted on just
    those columns, and a SHAP ``KernelExplainer`` is run on a random sample of
    rows.  A bar chart of the median absolute SHAP values and a SHAP summary
    plot are displayed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.  Its columns are paired positionally with
        ``rf_model_raw.feature_importances_``.
    y_train : array-like
        Training labels used to fit the reduced model.
    rf_model_raw : fitted estimator
        Any object exposing ``feature_importances_``.
    n_features : int, default 10
        Number of top features to keep.
    sample_size : int, default 200
        Number of rows to explain; capped at the number of available rows.

    Returns
    -------
    tuple
        ``(shap_importance_df, explainer, shap_values_target)``.  If any step
        raises, the error and traceback are printed and
        ``(None, None, None)`` is returned instead.
    """
    try:
        # Rank the features by the importances of the original model.
        feature_importances = rf_model_raw.feature_importances_
        feature_importance_df = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': feature_importances
        }).sort_values(by='Importance', ascending=False)

        # Keep the n most important features.
        top_features = feature_importance_df['Feature'].head(n_features).tolist()
        print(f"Top {n_features} important features:", top_features)

        # Re-train on the reduced feature set only.
        X_train_reduced = X_train[top_features]

        # Fill missing values so the model and the explainer get complete data.
        if X_train_reduced.isnull().any().any():
            X_train_reduced = X_train_reduced.fillna(0)

        # Deliberately small model: KernelExplainer calls it many times.
        rf_model_reduced = RandomForestClassifier(
            n_estimators=50,
            max_depth=5,
            random_state=42
        )
        rf_model_reduced.fit(X_train_reduced, y_train)

        # Sample the rows to explain, never asking for more than are available.
        sample_size = min(sample_size, len(X_train_reduced))
        sample_data = X_train_reduced.sample(n=sample_size, random_state=42)

        # Use a clean 0..n-1 index for the sampled frame.
        sample_data = sample_data.reset_index(drop=True)

        print("Initializing SHAP KernelExplainer...")
        # k-means cannot produce more clusters than there are sampled rows.
        n_background = min(10, sample_size)
        background = shap.kmeans(sample_data, n_background)
        explainer = shap.KernelExplainer(
            rf_model_reduced.predict_proba,
            background,
            link="logit"
        )

        # Compute the SHAP values.
        shap_values = explainer.shap_values(sample_data)

        # Keep the positive-class values.  Depending on the installed SHAP
        # release the result is a list with one array per class or a single
        # (n_samples, n_features, n_classes) array; a 2-D array is used as is.
        if isinstance(shap_values, list):
            shap_values_target = shap_values[1]
        elif np.ndim(shap_values) == 3:
            shap_values_target = shap_values[:, :, 1]
        else:
            shap_values_target = shap_values

        # Median absolute SHAP value per feature.
        print("Calculating Median SHAP values...")
        shap_median_importance = np.median(np.abs(shap_values_target), axis=0)

        shap_importance_df = pd.DataFrame({
            'Feature': top_features,
            'Median SHAP Value': shap_median_importance
        }).sort_values(by='Median SHAP Value', ascending=False)

        # Horizontal bar chart, most important feature on top.
        plt.figure(figsize=(10, 8))
        plt.barh(shap_importance_df['Feature'], shap_importance_df['Median SHAP Value'])
        plt.xlabel('Median SHAP Value')
        plt.ylabel('Feature')
        plt.title(f'Top {n_features} Feature Importance (Median SHAP Values)')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show()

        # SHAP summary plot
        print("Generating SHAP summary plot...")
        shap.summary_plot(
            shap_values_target,
            sample_data,
            plot_type='dot',
            show=True
        )

        return shap_importance_df, explainer, shap_values_target

    except Exception as e:
        # Best-effort analysis: report the failure instead of aborting the notebook.
        print(f"Error occurred: {str(e)}")
        print("Traceback:")
        import traceback
        traceback.print_exc()
        return None, None, None

# 使用示例：
# results, explainer, shap_values = analyze_feature_importance(
#     X_train,
#     y_train,
#     rf_model_raw,
#     n_features=10,
#     sample_size=200
# )

In [None]:
# Debug check: print the lengths of the feature list and the SHAP importances.
# NOTE(review): in the visible code `shap_median_importance` is only assigned
# as a local variable inside analyze_feature_importance(); this cell needs a
# module-level variable of that name - confirm where it is defined.
print(f"Length of top_features: {len(top_features)}")
print(f"Length of shap_median_importance: {len(shap_median_importance)}")

# Print the contents (first few elements, for side-by-side comparison).
print("Top features:", top_features)
print("SHAP median importance values (first 10):", shap_median_importance[:10])


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the same reduced training data and report accuracy and the
# per-class classification report.
# NOTE(review): in the visible code `rf_model_reduced` and `X_train_reduced`
# are only assigned as locals inside analyze_feature_importance(), and
# `y_train_reduced` is not assigned anywhere - confirm these exist at module
# level before running this cell.
y_pred = rf_model_reduced.predict(X_train_reduced)
print("Training Accuracy:", accuracy_score(y_train_reduced, y_pred))
print("Classification Report:\n", classification_report(y_train_reduced, y_pred))


Boruta 4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
from boruta import BorutaPy
import numpy as np

from imblearn.pipeline import Pipeline as ImbPipeline

# Assumes H2_train and selected_columns were defined by earlier cells.
# Separate the feature matrix from the target column.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardize the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Feature selection with Boruta.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
boruta_selector = BorutaPy(
    rf,
    n_estimators='auto',
    verbose=2,
    random_state=42
)

# Run Boruta.
boruta_selector.fit(X_train_scaled, y_train)

# Features confirmed by Boruta.
selected_feat_mask = boruta_selector.support_
selected_features = X_train.columns[selected_feat_mask].tolist()
if not selected_features:
    raise ValueError("Boruta did not confirm any feature; nothing to train on.")

# Boruta assigns rank 1 to every confirmed feature, so `ranking_` cannot order
# them: sorting by it makes the "top 10" cut arbitrary and every bar in the
# chart the same length.  The confirmed features are therefore ranked by the
# importances of a forest fitted on exactly those features.
ranking_forest = RandomForestClassifier(n_jobs=-1, random_state=42)
ranking_forest.fit(X_train_scaled[:, selected_feat_mask], y_train)
importance_df = pd.DataFrame({
    'Feature': selected_features,
    'Importance': ranking_forest.feature_importances_,
    'Selected': True
}).sort_values(by='Importance', ascending=False)

# Keep at most the ten most important confirmed features.
top_features = importance_df['Feature'].head(10).tolist()
X_train_top = X_train[top_features]
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Class-balancing strategies to compare.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Evaluate every balancing strategy on the Boruta-selected features.
for method, sampler in samplers.items():
    print(f"=== {method} ===")
    rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
    # The sampler is wrapped in an imbalanced-learn pipeline so that it is
    # applied to the training part of each fold only.  Resampling the whole set
    # before cross-validation puts duplicated/synthetic copies of training rows
    # into the validation folds and scores the model on an artificial class
    # balance, both of which inflate the reported metrics.
    if sampler is None:
        estimator = rf_model
    else:
        estimator = ImbPipeline([('sampler', sampler), ('model', rf_model)])

    y_pred_cv = cross_val_predict(estimator, X_train_scaled_top, y_train, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(estimator, X_train_scaled_top, y_train, cv=cv, method='predict_proba')[:, 1]

    # All metrics are computed against the original (un-resampled) labels.
    accuracy = accuracy_score(y_train, y_pred_cv)
    precision = precision_score(y_train, y_pred_cv, pos_label=1)
    recall = recall_score(y_train, y_pred_cv, pos_label=1)
    f1 = f1_score(y_train, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_train, y_pred_proba_cv)

    # Confusion matrix with row-wise (per true class) percentages.
    conf_matrix = confusion_matrix(y_train, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            # Offset from the cell centre so the percentage does not cover the count.
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix ({method})')
    plt.show()

    report = classification_report(y_train, y_pred_cv)
    print(report)

    fpr, tpr, _ = roc_curve(y_train, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Summary table, best ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print(metrics_df)

metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods (Boruta Selected Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Visualize the importances of the Boruta-selected features.
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df.head(10), x='Importance', y='Feature', palette='viridis')
plt.title('Top Features Selected by Boruta')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

### hyperparameter tuning

Baseline 4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import optuna
import warnings
warnings.filterwarnings('ignore')

from imblearn.pipeline import Pipeline as ImbPipeline

# Assumes H2_train and selected_columns were defined by earlier cells.
# Separate the feature matrix from the target column.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardize the features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)


def _with_sampler(model, sampler):
    """Return `model` alone, or a sampler+model pipeline that resamples training folds only."""
    if sampler is None:
        return model
    return ImbPipeline([('sampler', sampler), ('model', model)])


# Optuna objective.
def objective(trial, X, y, sampler=None, cv=3):
    """
    Return the mean cross-validated ROC AUC of a random forest for one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameters.
    X, y : array-like
        Un-resampled features and labels.
    sampler : imbalanced-learn sampler or None, default None
        When given, it is applied inside every training fold through a pipeline.
    cv : int or cross-validation splitter, default 3
        Passed straight to ``cross_val_score``.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'random_state': 42,
        'n_jobs': -1
    }

    rf = RandomForestClassifier(**params)
    scores = cross_val_score(_with_sampler(rf, sampler), X, y, cv=cv, scoring='roc_auc')
    return scores.mean()


# Class-balancing strategies to compare.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
best_params_by_method = {}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Tune and evaluate a model on all features for every balancing strategy.
# Resampling happens inside the cross-validation folds (training part only):
# resampling the whole set first would leak duplicated/synthetic rows into the
# validation folds and score the model on an artificial class balance.
for method, sampler in samplers.items():
    print(f"\n=== {method} ===")

    # Run the Optuna search with the same splitter that is used for evaluation.
    print(f"Optimizing hyperparameters for {method}...")
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train_scaled, y_train, sampler=sampler, cv=cv),
                  n_trials=50, show_progress_bar=True)
    best_params_by_method[method] = study.best_params

    # Report the best trial.
    print(f"Best parameters for {method}:")
    print(study.best_params)
    print(f"Best ROC-AUC score: {study.best_value:.4f}")

    # Out-of-fold predictions with the best hyperparameters.
    rf_model = RandomForestClassifier(**study.best_params, random_state=42, n_jobs=-1)
    estimator = _with_sampler(rf_model, sampler)
    y_pred_cv = cross_val_predict(estimator, X_train_scaled, y_train, cv=cv, method='predict')
    y_pred_proba_cv = cross_val_predict(estimator, X_train_scaled, y_train, cv=cv, method='predict_proba')[:, 1]

    # All metrics are computed against the original (un-resampled) labels.
    accuracy = accuracy_score(y_train, y_pred_cv)
    precision = precision_score(y_train, y_pred_cv, pos_label=1)
    recall = recall_score(y_train, y_pred_cv, pos_label=1)
    f1 = f1_score(y_train, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_train, y_pred_proba_cv)

    # Confusion matrix with row-wise (per true class) percentages.
    conf_matrix = confusion_matrix(y_train, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            # Offset from the cell centre so the percentage does not cover the count.
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix ({method})\nBest ROC-AUC: {study.best_value:.4f}')
    plt.show()

    report = classification_report(y_train, y_pred_cv)
    print(report)

    fpr, tpr, _ = roc_curve(y_train, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    metrics_data.append({
        'Balancing Method': method,
        'Best ROC-AUC': study.best_value,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Build the comparison table, best ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Results Comparison:")
print(metrics_df)

# Plot the metric comparison.  DataFrame.plot() opens its own figure when no
# `ax` is passed, so the size is given through `figsize=`; a preceding
# plt.figure() call would only leave an empty extra figure behind.
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'Best ROC-AUC']].plot(kind='bar', figsize=(12, 6))
plt.title('Comparison of Balancing Methods with Optimized Parameters (Baseline Model)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Feature importances of the baseline model.  It is fitted on the un-resampled
# data, so it uses the hyperparameters tuned for 'No Sampling' rather than
# whatever study happened to run last in the loop above.
rf_model_baseline = RandomForestClassifier(**best_params_by_method['No Sampling'], random_state=42, n_jobs=-1)
rf_model_baseline.fit(X_train_scaled, y_train)
feature_importances = rf_model_baseline.feature_importances_
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=importance_df.head(15), x='Importance', y='Feature', palette='viridis')
plt.title('Top 15 Feature Importances (Baseline Model)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

feature importance top10 4

In [None]:
import optuna
from sklearn.model_selection import cross_val_score

# Optuna objective for the Top 10 feature model.
def objective(trial):
    """Return the mean cross-validated ROC AUC of a forest built from one trial.

    Reads X_resampled, y_resampled and cv from the notebook's global scope.
    """
    # Search space; the entries are drawn in the order they are listed.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    # Candidate model for this trial.
    candidate = RandomForestClassifier(random_state=42, n_jobs=-1, **search_space)

    # The average AUC over the folds is the value Optuna maximizes.
    fold_auc = cross_val_score(candidate, X_resampled, y_resampled, cv=cv, scoring='roc_auc')
    return fold_auc.mean()

# Choose the resampling method (SMOTE here) and balance the top-feature training set.
# NOTE(review): the data is resampled before cross-validation, so resampled
# points can end up in both the training and validation folds and the scores
# below are measured on resampled data - likely optimistic; confirm intended.
sampler = SMOTE(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)

# Create the Optuna study and run 50 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Report the best trial.
print("Best trial:")
print(f"  Value: {study.best_trial.value}")
print(f"  Params: {study.best_trial.params}")

# Fit the final model on the full resampled set with the best hyperparameters.
best_params = study.best_trial.params
rf_model_optimized = RandomForestClassifier(
    **best_params,
    random_state=42,
    n_jobs=-1
)
rf_model_optimized.fit(X_resampled, y_resampled)

# Cross-validated evaluation. A random forest predicts the class with the
# highest mean probability, so the labels are derived from a single
# predict_proba pass instead of re-fitting every fold a second time.
proba_cv_optimized = cross_val_predict(rf_model_optimized, X_resampled, y_resampled, cv=cv, method='predict_proba')
y_pred_proba_cv_optimized = proba_cv_optimized[:, 1]
# Probability columns follow the sorted class labels; argmax ties go to the first class.
y_pred_cv_optimized = np.unique(y_resampled)[np.argmax(proba_cv_optimized, axis=1)]

accuracy_optimized = accuracy_score(y_resampled, y_pred_cv_optimized)
precision_optimized = precision_score(y_resampled, y_pred_cv_optimized, pos_label=1)
recall_optimized = recall_score(y_resampled, y_pred_cv_optimized, pos_label=1)
f1_optimized = f1_score(y_resampled, y_pred_cv_optimized, pos_label=1)
roc_auc_optimized = roc_auc_score(y_resampled, y_pred_proba_cv_optimized)

print("Optimized Model Performance:")
print(f"Accuracy: {accuracy_optimized:.4f}")
print(f"Precision: {precision_optimized:.4f}")
print(f"Recall: {recall_optimized:.4f}")
print(f"F1 Score: {f1_optimized:.4f}")
print(f"ROC AUC: {roc_auc_optimized:.4f}")


In [None]:
import optuna
from sklearn.model_selection import cross_val_score

# Optuna objective function
def objective(trial):
    """Return the mean cross-validated ROC-AUC of a random forest for one trial.

    Four forest hyperparameters are sampled from ``trial``. The model is
    scored on the module-level ``X_resampled`` / ``y_resampled`` using the
    module-level ``cv`` splitter, all of which must exist when the study runs.
    """
    # Search space; the suggest calls run in the order the keys are listed.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **search_space)

    # Mean AUC over the folds is the value Optuna maximizes.
    fold_auc = cross_val_score(forest, X_resampled, y_resampled, cv=cv, scoring='roc_auc')
    return fold_auc.mean()

# Choose the resampling method (SMOTE here) and balance the top-feature training set.
# NOTE(review): the data is resampled before cross-validation, so resampled
# points can end up in both the training and validation folds and the scores
# below are measured on resampled data - likely optimistic; confirm intended.
sampler = SMOTE(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)

# Create the Optuna study and run 50 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Report the best trial.
print("Best trial:")
print(f"  Value: {study.best_trial.value}")
print(f"  Params: {study.best_trial.params}")

# Fit the final model on the full resampled set with the best hyperparameters.
best_params = study.best_trial.params
rf_model_optimized = RandomForestClassifier(
    **best_params,
    random_state=42,
    n_jobs=-1
)
rf_model_optimized.fit(X_resampled, y_resampled)

# Cross-validated evaluation. A random forest predicts the class with the
# highest mean probability, so the labels are derived from a single
# predict_proba pass instead of re-fitting every fold a second time.
proba_cv_optimized = cross_val_predict(rf_model_optimized, X_resampled, y_resampled, cv=cv, method='predict_proba')
y_pred_proba_cv_optimized = proba_cv_optimized[:, 1]
# Probability columns follow the sorted class labels; argmax ties go to the first class.
y_pred_cv_optimized = np.unique(y_resampled)[np.argmax(proba_cv_optimized, axis=1)]

accuracy_optimized = accuracy_score(y_resampled, y_pred_cv_optimized)
precision_optimized = precision_score(y_resampled, y_pred_cv_optimized, pos_label=1)
recall_optimized = recall_score(y_resampled, y_pred_cv_optimized, pos_label=1)
f1_optimized = f1_score(y_resampled, y_pred_cv_optimized, pos_label=1)
roc_auc_optimized = roc_auc_score(y_resampled, y_pred_proba_cv_optimized)

print("Optimized Model Performance:")
print(f"Accuracy: {accuracy_optimized:.4f}")
print(f"Precision: {precision_optimized:.4f}")
print(f"Recall: {recall_optimized:.4f}")
print(f"F1 Score: {f1_optimized:.4f}")
print(f"ROC AUC: {roc_auc_optimized:.4f}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import optuna
import warnings
warnings.filterwarnings('ignore')

# Assumes H2_train and selected_columns were defined in an earlier cell.
# Split the training frame into features and target.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardize all features; `scaler` stays fitted on the full feature set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit a default random forest on the un-resampled data to rank the features.
rf_model_raw = RandomForestClassifier(random_state=42)
rf_model_raw.fit(X_train_scaled, y_train)

# Keep the ten features with the highest impurity-based importance.
feature_importances = rf_model_raw.feature_importances_
features = X_train.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False).head(10)

# Build the top-10 feature matrix.
top_features = importance_df_sorted['Feature'].tolist()
X_train_top = X_train[top_features]
# Use a dedicated scaler for the 10-column subset. Re-fitting the shared
# `scaler` here would leave it expecting 10 columns, and later cells that
# call scaler.transform() on the full feature set would fail.
scaler_top = StandardScaler()
X_train_scaled_top = scaler_top.fit_transform(X_train_top)

# Optuna objective function
def objective(trial, X, y):
    """Return the mean 3-fold ROC-AUC of a random forest for one Optuna trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix to cross-validate on.
        y: Target labels aligned with ``X``.
    """
    # Tunable hyperparameters; the suggest calls run in the listed order.
    tuned = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 15),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    )

    # Seed and parallelism are fixed rather than tuned.
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **tuned)
    return cross_val_score(forest, X, y, cv=3, scoring='roc_auc').mean()

# Resampling strategies to compare; None means the data is used as-is.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# For each strategy: resample the top-10 feature matrix, tune a forest with
# Optuna, then score the tuned forest with cross-validated predictions.
# NOTE(review): resampling happens before cross-validation, so resampled
# points can sit in both training and validation folds and every metric is
# measured on the resampled data - likely optimistic; confirm intended.
for method, sampler in samplers.items():
    print(f"\n=== {method} ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_top, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)

    # Hyperparameter search (50 trials).
    print(f"Optimizing hyperparameters for {method}...")
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_resampled, y_resampled),
                  n_trials=50, show_progress_bar=True)

    # Report the best parameters found.
    print(f"Best parameters for {method}:")
    print(study.best_params)
    print(f"Best ROC-AUC score: {study.best_value:.4f}")

    # Evaluate the tuned forest. A random forest predicts the class with the
    # highest mean probability, so labels are derived from one predict_proba
    # pass instead of cross-validating a second time with method='predict'.
    rf_model = RandomForestClassifier(**study.best_params, random_state=42, n_jobs=-1)
    proba_cv = cross_val_predict(rf_model, X_resampled, y_resampled, cv=cv, method='predict_proba')
    y_pred_proba_cv = proba_cv[:, 1]
    # Probability columns follow the sorted class labels; argmax ties go to the first class.
    y_pred_cv = np.unique(y_resampled)[np.argmax(proba_cv, axis=1)]

    accuracy = accuracy_score(y_resampled, y_pred_cv)
    precision = precision_score(y_resampled, y_pred_cv, pos_label=1)
    recall = recall_score(y_resampled, y_pred_cv, pos_label=1)
    f1 = f1_score(y_resampled, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_resampled, y_pred_proba_cv)

    # Confusion matrix with per-true-class (row) percentages overlaid.
    conf_matrix = confusion_matrix(y_resampled, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix ({method})\nBest ROC-AUC: {study.best_value:.4f}')
    plt.show()

    report = classification_report(y_resampled, y_pred_cv)
    print(report)

    # ROC curve against the random-guess diagonal.
    fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # Collect one row per balancing method for the final comparison.
    metrics_data.append({
        'Balancing Method': method,
        'Best ROC-AUC': study.best_value,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Build the comparison table, best cross-validated ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Results Comparison:")
print(metrics_df)

# Bar chart of every metric per balancing method.
# DataFrame.plot opens its own figure unless it is handed an Axes, so the
# size is passed to it directly; a preceding plt.figure() call would only
# leave an empty figure behind and the chart would ignore the requested size.
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'Best ROC-AUC']].plot(kind='bar', figsize=(12, 6))
plt.title('Comparison of Balancing Methods with Optimized Parameters')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Horizontal bar chart of the ten selected feature importances.
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df_sorted, x='Importance', y='Feature', palette='viridis')
plt.title('Top 10 Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import optuna
import warnings
warnings.filterwarnings('ignore')

# Assumes H2_train and selected_columns were defined in an earlier cell.
# Split the training frame into features and target.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardize all features; `scaler` stays fitted on the full feature set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit a default random forest on the un-resampled data to rank the features.
rf_model_raw = RandomForestClassifier(random_state=42)
rf_model_raw.fit(X_train_scaled, y_train)

# Keep the ten features with the highest impurity-based importance.
feature_importances = rf_model_raw.feature_importances_
features = X_train.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False).head(10)

# Build the top-10 feature matrix.
top_features = importance_df_sorted['Feature'].tolist()
X_train_top = X_train[top_features]
# Use a dedicated scaler for the 10-column subset. Re-fitting the shared
# `scaler` here would leave it expecting 10 columns, and later cells that
# call scaler.transform() on the full feature set would fail.
scaler_top = StandardScaler()
X_train_scaled_top = scaler_top.fit_transform(X_train_top)

# Optuna objective function
def objective(trial, X, y):
    """Return the mean 3-fold ROC-AUC of a random forest for one Optuna trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix to cross-validate on.
        y: Target labels aligned with ``X``.
    """
    # Tunable hyperparameters; the suggest calls run in the listed order.
    tuned = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 15),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    )

    # Seed and parallelism are fixed rather than tuned.
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **tuned)
    return cross_val_score(forest, X, y, cv=3, scoring='roc_auc').mean()

# Resampling strategies to compare; None means the data is used as-is.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# For each strategy: resample the top-10 feature matrix, tune a forest with
# Optuna, then score the tuned forest with cross-validated predictions.
# NOTE(review): resampling happens before cross-validation, so resampled
# points can sit in both training and validation folds and every metric is
# measured on the resampled data - likely optimistic; confirm intended.
for method, sampler in samplers.items():
    print(f"\n=== {method} ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_top, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)

    # Hyperparameter search (50 trials).
    print(f"Optimizing hyperparameters for {method}...")
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_resampled, y_resampled),
                  n_trials=50, show_progress_bar=True)

    # Report the best parameters found.
    print(f"Best parameters for {method}:")
    print(study.best_params)
    print(f"Best ROC-AUC score: {study.best_value:.4f}")

    # Evaluate the tuned forest. A random forest predicts the class with the
    # highest mean probability, so labels are derived from one predict_proba
    # pass instead of cross-validating a second time with method='predict'.
    rf_model = RandomForestClassifier(**study.best_params, random_state=42, n_jobs=-1)
    proba_cv = cross_val_predict(rf_model, X_resampled, y_resampled, cv=cv, method='predict_proba')
    y_pred_proba_cv = proba_cv[:, 1]
    # Probability columns follow the sorted class labels; argmax ties go to the first class.
    y_pred_cv = np.unique(y_resampled)[np.argmax(proba_cv, axis=1)]

    accuracy = accuracy_score(y_resampled, y_pred_cv)
    precision = precision_score(y_resampled, y_pred_cv, pos_label=1)
    recall = recall_score(y_resampled, y_pred_cv, pos_label=1)
    f1 = f1_score(y_resampled, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_resampled, y_pred_proba_cv)

    # Confusion matrix with per-true-class (row) percentages overlaid.
    conf_matrix = confusion_matrix(y_resampled, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix ({method})\nBest ROC-AUC: {study.best_value:.4f}')
    plt.show()

    report = classification_report(y_resampled, y_pred_cv)
    print(report)

    # ROC curve against the random-guess diagonal.
    fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # Collect one row per balancing method for the final comparison.
    metrics_data.append({
        'Balancing Method': method,
        'Best ROC-AUC': study.best_value,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Build the comparison table, best cross-validated ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Results Comparison:")
print(metrics_df)

# Bar chart of every metric per balancing method.
# DataFrame.plot opens its own figure unless it is handed an Axes, so the
# size is passed to it directly; a preceding plt.figure() call would only
# leave an empty figure behind and the chart would ignore the requested size.
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'Best ROC-AUC']].plot(kind='bar', figsize=(12, 6))
plt.title('Comparison of Balancing Methods with Optimized Parameters')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Horizontal bar chart of the ten selected feature importances.
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df_sorted, x='Importance', y='Feature', palette='viridis')
plt.title('Top 10 Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()


SHAP top10 4

Boruta 4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
from boruta import BorutaPy
import optuna
import warnings
warnings.filterwarnings('ignore')

# Assumes H2_train and selected_columns were defined in an earlier cell.
# Split the training frame into features and target.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardize all features; `scaler` stays fitted on the full feature set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Feature selection with Boruta wrapped around a random forest.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
boruta_selector = BorutaPy(
    rf,
    n_estimators='auto',
    verbose=2,
    random_state=42
)

# Run Boruta on the scaled training data.
boruta_selector.fit(X_train_scaled, y_train)

# Boolean mask and names of the features Boruta confirmed.
selected_feat_mask = boruta_selector.support_
selected_features = X_train.columns[selected_feat_mask].tolist()

# Turn the Boruta ranking (lower is better) into a score (higher is better)
# and keep only the confirmed features.
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': boruta_selector.ranking_,
    'Selected': selected_feat_mask
})
importance_df['Importance'] = importance_df['Importance'].max() - importance_df['Importance'] + 1
# Confirmed features share the same Boruta rank, so their scores are tied.
# A stable sort keeps them in the original column order, which makes the
# "first 10" pick below deterministic instead of depending on how an
# unstable sort happens to arrange the ties.
# NOTE(review): with tied scores this is not an importance-based top 10 -
# confirm that taking the first 10 confirmed columns is acceptable.
importance_df = importance_df[importance_df['Selected']].sort_values(
    by='Importance', ascending=False, kind='mergesort'
)

# If more than 10 features were confirmed, keep only the first 10.
top_features = importance_df['Feature'].head(10).tolist()
X_train_top = X_train[top_features]
# Use a dedicated scaler for the subset. Re-fitting the shared `scaler` here
# would leave it expecting only the subset's columns, and later cells that
# call scaler.transform() on the full feature set would fail.
scaler_top = StandardScaler()
X_train_scaled_top = scaler_top.fit_transform(X_train_top)

# Optuna objective function
def objective(trial, X, y):
    """Return the mean 3-fold ROC-AUC of a random forest for one Optuna trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix to cross-validate on.
        y: Target labels aligned with ``X``.
    """
    # Hyperparameter search space; the suggest calls run in the listed order.
    tuned = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 15),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    )

    # Seed and parallelism are fixed rather than tuned.
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **tuned)
    return cross_val_score(forest, X, y, cv=3, scoring='roc_auc').mean()

# Resampling strategies to compare; None means the data is used as-is.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
# Tuned hyperparameters per balancing method, reused by the final report so
# the printed parameters are exactly the ones that produced the metrics.
best_params_by_method = {}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# For each strategy: resample the Boruta feature matrix, tune a forest with
# Optuna, then score the tuned forest with cross-validated predictions.
# NOTE(review): resampling happens before cross-validation, so resampled
# points can sit in both training and validation folds and every metric is
# measured on the resampled data - likely optimistic; confirm intended.
for method, sampler in samplers.items():
    print(f"\n=== {method} ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_top, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)

    # Hyperparameter search (50 trials).
    print(f"Optimizing hyperparameters for {method}...")
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_resampled, y_resampled),
                  n_trials=50, show_progress_bar=True)

    # Report and remember the best parameters.
    best_params = study.best_params
    best_params_by_method[method] = best_params
    print(f"\nBest parameters for {method}:")
    print(best_params)
    print(f"Best ROC-AUC score: {study.best_value:.4f}")

    # Evaluate the tuned forest. A random forest predicts the class with the
    # highest mean probability, so labels are derived from one predict_proba
    # pass instead of cross-validating a second time with method='predict'.
    rf_model = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
    proba_cv = cross_val_predict(rf_model, X_resampled, y_resampled, cv=cv, method='predict_proba')
    y_pred_proba_cv = proba_cv[:, 1]
    # Probability columns follow the sorted class labels; argmax ties go to the first class.
    y_pred_cv = np.unique(y_resampled)[np.argmax(proba_cv, axis=1)]

    # Metrics for the positive class (label 1).
    accuracy = accuracy_score(y_resampled, y_pred_cv)
    precision = precision_score(y_resampled, y_pred_cv, pos_label=1)
    recall = recall_score(y_resampled, y_pred_cv, pos_label=1)
    f1 = f1_score(y_resampled, y_pred_cv, pos_label=1)
    roc_auc = roc_auc_score(y_resampled, y_pred_proba_cv)

    # Confusion matrix with per-true-class (row) percentages overlaid.
    conf_matrix = confusion_matrix(y_resampled, y_pred_cv)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix ({method})\nBest ROC-AUC: {study.best_value:.4f}')
    plt.show()

    # Classification report.
    report = classification_report(y_resampled, y_pred_cv)
    print(report)

    # ROC curve against the random-guess diagonal.
    fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba_cv)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # Collect one row per balancing method for the final comparison.
    metrics_data.append({
        'Balancing Method': method,
        'Best ROC-AUC': study.best_value,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Build and show the comparison table, best cross-validated ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Performance Comparison:")
print(metrics_df)

# Bar chart of every metric per balancing method.
# DataFrame.plot opens its own figure unless it is handed an Axes, so the
# size is passed to it directly; a preceding plt.figure() call would only
# leave an empty figure behind and the chart would ignore the requested size.
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'Best ROC-AUC']].plot(kind='bar', figsize=(12, 6))
plt.title('Comparison of Balancing Methods with Optimized Parameters')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Bar chart of the features selected by Boruta.
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df.head(10), x='Importance', y='Feature', palette='viridis')
plt.title('Top Features Selected by Boruta')
plt.xlabel('Importance Rank')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Report the best-performing method and the hyperparameters that produced
# its metrics. The parameters come from the study already run in the loop;
# launching a fresh unseeded search here would double the cost and could
# print parameters that differ from the ones actually evaluated.
best_method = metrics_df.iloc[0]['Balancing Method']
print(f"\nBest performing method: {best_method}")
print("Best hyperparameters:")
best_sampler = samplers[best_method]
# X_final / y_final hold the training data as resampled for the best method.
if best_sampler is None:
    X_final, y_final = X_train_scaled_top, y_train
else:
    X_final, y_final = best_sampler.fit_resample(X_train_scaled_top, y_train)
print(best_params_by_method[best_method])

### test set

Baseline 4（H1&H2）

1126

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
import warnings
warnings.filterwarnings('ignore')

def evaluate_test_sets(X_train_scaled, y_train, X_test_scaled, y_test, params, set_name):
    """
    Fit a random forest with ``params`` and report its test-set performance.

    Shows a confusion-matrix heatmap and a ROC curve, prints the
    classification report, and returns a dict with the keys 'Accuracy',
    'Precision', 'Recall', 'F1 Score' and 'ROC AUC'. ``set_name`` is only
    used in plot titles and printed headings.
    """
    # Fit on the training data exactly as passed in.
    forest = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
    forest.fit(X_train_scaled, y_train)

    # Hard labels plus the probability of the second class column.
    predicted = forest.predict(X_test_scaled)
    positive_proba = forest.predict_proba(X_test_scaled)[:, 1]

    scores = {
        'Accuracy': accuracy_score(y_test, predicted),
        'Precision': precision_score(y_test, predicted),
        'Recall': recall_score(y_test, predicted),
        'F1 Score': f1_score(y_test, predicted),
        'ROC AUC': roc_auc_score(y_test, positive_proba)
    }
    auc_value = scores['ROC AUC']

    # Confusion-matrix counts and their share of each true-label row.
    counts = confusion_matrix(y_test, predicted)
    row_share = counts / counts.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', cbar=False)
    # Overlay the row percentage in each cell, offset from the count.
    for (row, col), share in np.ndenumerate(row_share):
        plt.text(col + 0.2, row + 0.2, f"{share:.1f}%", ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - {set_name}\nROC-AUC: {auc_value:.4f}')
    plt.show()

    # ROC curve against the random-guess diagonal.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(false_pos_rate, true_pos_rate, label=f'ROC (AUC = {auc_value:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {set_name}')
    plt.legend(loc='lower right')
    plt.show()

    print(f"\nClassification Report - {set_name}:")
    print(classification_report(y_test, predicted))

    return scores

def evaluate_all_models(X_train_scaled, y_train, X_test_scaled, y_test, best_params, set_name):
    """
    Evaluate one model per entry of ``best_params`` on a single test set.

    Each entry maps a method label to a hyperparameter dict. Every model is
    trained on the same ``X_train_scaled`` / ``y_train``: no resampling is
    applied here, so the methods differ only in their hyperparameters.

    Args:
        X_train_scaled: Training features.
        y_train: Training labels.
        X_test_scaled: Test features, scaled like the training features.
        y_test: Test labels.
        best_params: Mapping of method label to RandomForestClassifier kwargs.
        set_name: Label for the test set, used in titles and headings.

    Returns:
        DataFrame indexed by 'Method' with one column per metric.
    """
    results = []
    for method, params in best_params.items():
        print(f"\n=== {method} - {set_name} ===")
        metrics = evaluate_test_sets(X_train_scaled, y_train, X_test_scaled, y_test, params, f"{method} - {set_name}")
        metrics['Method'] = method
        results.append(metrics)

    # One row per method.
    results_df = pd.DataFrame(results).set_index('Method')

    # DataFrame.plot opens its own figure unless it is handed an Axes, so the
    # size is passed to it directly; a preceding plt.figure() call would only
    # leave an empty figure behind.
    results_df.plot(kind='bar', figsize=(12, 6))
    plt.title(f'Model Performance Comparison - {set_name}')
    plt.ylabel('Score')
    plt.xticks(rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

    return results_df

# Best parameters from previous optimization
# Hard-coded per balancing method; each value is passed as keyword arguments
# to RandomForestClassifier by evaluate_test_sets.
# NOTE(review): the evaluation below trains every entry on the same
# un-resampled X_train_scaled, so the method labels only select a
# hyperparameter set - confirm that this is intended.
best_params = {
    'No Sampling': {
        'n_estimators': 210,
        'max_depth': 15,
        'min_samples_split': 4,
        'min_samples_leaf': 1,
        'max_features': 'sqrt'
    },
    'Undersample': {
        'n_estimators': 202,
        'max_depth': 15,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 'sqrt'
    },
    'Oversample': {
        'n_estimators': 283,
        'max_depth': 15,
        'min_samples_split': 4,
        'min_samples_leaf': 1,
        'max_features': 'sqrt'
    },
    'SMOTE': {
        'n_estimators': 226,
        'max_depth': 15,
        'min_samples_split': 6,
        'min_samples_leaf': 1,
        'max_features': 'sqrt'
    }
}

# Prepare test data (assuming you have these variables defined)
# Make sure to run the data preparation code first to have these variables available
X_test = H1_test[X_train.columns]  # Use same columns as training data
y_test = H1_test['is_canceled']
X_test_H2 = H2_test[X_train.columns]  # Use same columns as training data
y_test_H2 = H2_test['is_canceled']

# Scale test data using the same scaler used for training data
# NOTE(review): assumes `scaler` is still fitted on all of X_train.columns;
# if an earlier cell re-fitted it on a feature subset, transform() will
# reject these full-width frames - confirm the scaler state before running.
X_test_scaled = scaler.transform(X_test)
X_test_H2_scaled = scaler.transform(X_test_H2)

# Evaluate H1 test set
print("\nEvaluating H1 Test Set:")
h1_results = evaluate_all_models(X_train_scaled, y_train, X_test_scaled, y_test, best_params, "H1 Test Set")

# Evaluate H2 test set
print("\nEvaluating H2 Test Set:")
h2_results = evaluate_all_models(X_train_scaled, y_train, X_test_H2_scaled, y_test_H2, best_params, "H2 Test Set")

# Print final comparison
print("\nH1 Test Set Results:")
print(h1_results)
print("\nH2 Test Set Results:")
print(h2_results)

In [None]:
# Print the H2 and then the H1 test-set results as fixed-width tables.
# Each value is padded to the same width as its header (10), so the columns
# line up; a bare .6f is only 8 characters wide and drifts out from under
# the header.
for title, results in (
    ("H2 Test Set Results Summary:", h2_results),
    ("\nH1 Test Set Results Summary:", h1_results),
):
    print(title)
    print("="*70)
    print(f"{'Method':<15} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1 Score':<10} {'ROC AUC':<10}")
    print('-'*70)
    for method, metrics in results.iterrows():
        print(f"{method:<15} {metrics['Accuracy']:<10.6f} {metrics['Precision']:<10.6f} {metrics['Recall']:<10.6f} {metrics['F1 Score']:<10.6f} {metrics['ROC AUC']:<10.6f}")

feature importance top10 4

In [None]:
# Sanity checks comparing the training frame with both test frames.
datasets = {"X_train": X_train, "X_test": X_test, "X_test_H2": X_test_H2}

# 1. Print the column names of every dataset to see whether they match.
for name, frame in datasets.items():
    print(f"{name} columns:", frame.columns.tolist())

# 2. Check that the dtypes are consistent.
for name, frame in datasets.items():
    print(f"\n{name} dtypes:\n", frame.dtypes)

# 3. Look for missing values.
for name, frame in datasets.items():
    print(f"\nMissing values in {name}:\n", frame.isnull().sum())

# 4. Check the dataset shapes.
print("\nShapes:")
for name, frame in datasets.items():
    print(f"{name}:", frame.shape)

In [None]:
# 1. Convert the frames to plain numpy arrays to sidestep feature-name checks.
X_train_array, X_test_array, X_test_H2_array = (
    X_train.to_numpy(),
    X_test.to_numpy(),
    X_test_H2.to_numpy(),
)

# 2. Fit the scaler on the training data and transform it.
scaler = StandardScaler()
scaler.fit(X_train_array)
X_train_scaled = scaler.transform(X_train_array)

# 3. Transform both test sets with that same fitted scaler.
X_test_scaled, X_test_H2_scaled = (
    scaler.transform(X_test_array),
    scaler.transform(X_test_H2_array),
)

# 4. Verify the shapes after scaling.
print("Scaled shapes:")
for label, scaled in (
    ("X_train_scaled:", X_train_scaled),
    ("X_test_scaled:", X_test_scaled),
    ("X_test_H2_scaled:", X_test_H2_scaled),
):
    print(label, scaled.shape)

In [None]:
def evaluate_all_models(X_train_scaled, y_train, X_test_scaled, y_test, X_test_H2_scaled, y_test_H2, best_params):
    """Train one forest per balancing method and evaluate it on both test sets.

    For each method the training data is resampled (or left untouched for
    'No Sampling'), a RandomForestClassifier is fitted with
    ``best_params[method]``, and the model is scored on the H1 and H2 test
    sets.

    Args:
        X_train_scaled: Scaled training features.
        y_train: Training labels.
        X_test_scaled: Scaled H1 test features.
        y_test: H1 test labels.
        X_test_H2_scaled: Scaled H2 test features.
        y_test_H2: H2 test labels.
        best_params: Mapping of method name to RandomForestClassifier kwargs;
            it must contain a key for every method in ``samplers`` below.

    Returns:
        DataFrame with one row per (method, test set) pair.
    """
    samplers = {
        'No Sampling': None,
        'Undersample': RandomUnderSampler(random_state=42),
        'Oversample': RandomOverSampler(random_state=42),
        'SMOTE': SMOTE(random_state=42)
    }

    results = []

    for method, sampler in samplers.items():
        print(f"\n{'='*50}")
        print(f"Evaluating {method}")
        print('='*50)

        # Resample the training data only; the test sets are never resampled.
        if sampler is None:
            X_resampled, y_resampled = X_train_scaled, y_train
        else:
            X_resampled, y_resampled = sampler.fit_resample(X_train_scaled, y_train)

        # Fit the model with the tuned parameters for this method.
        rf_model = RandomForestClassifier(**best_params[method], random_state=42, n_jobs=-1)
        rf_model.fit(X_resampled, y_resampled)

        # Evaluate on the H1 test set.
        # NOTE(review): the hotel names in the two headings below are taken
        # as-is; confirm which of H1/H2 is the city and the resort hotel.
        print("\nResults on H1 Test Set (City Hotel):")
        y_pred_h1 = rf_model.predict(X_test_scaled)
        y_pred_proba_h1 = rf_model.predict_proba(X_test_scaled)[:, 1]

        # The method name is passed explicitly: it is a local of this loop
        # and is not visible inside calculate_metrics otherwise.
        h1_metrics = calculate_metrics(y_test, y_pred_h1, y_pred_proba_h1, "H1", method=method)
        plot_evaluation_charts(y_test, y_pred_h1, y_pred_proba_h1, method, "H1 Test Set")

        # Evaluate on the H2 test set.
        print("\nResults on H2 Test Set (Resort Hotel):")
        y_pred_h2 = rf_model.predict(X_test_H2_scaled)
        y_pred_proba_h2 = rf_model.predict_proba(X_test_H2_scaled)[:, 1]

        h2_metrics = calculate_metrics(y_test_H2, y_pred_h2, y_pred_proba_h2, "H2", method=method)
        plot_evaluation_charts(y_test_H2, y_pred_h2, y_pred_proba_h2, method, "H2 Test Set")

        # Keep both rows for the final table.
        results.extend([h1_metrics, h2_metrics])

    # One row per (method, test set).
    results_df = pd.DataFrame(results)
    return results_df

def calculate_metrics(y_true, y_pred, y_pred_proba, test_set, method=None):
    """Compute the evaluation metrics for one model on one test set.

    Prints the classification report and returns the metrics as a dict.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_pred_proba: Predicted score or probability used for the ROC AUC.
        test_set: Label of the test set, stored under 'Test Set'.
        method: Name of the balancing method, stored under 'Method'. It is an
            explicit parameter because the caller's loop variable is not in
            scope here; relying on a same-named global would raise NameError
            or silently record a stale value.

    Returns:
        Dict with 'Test Set', 'Method', 'Accuracy', 'Precision', 'Recall',
        'F1 Score' and 'ROC AUC'.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    print(f"\nClassification Report for {test_set}:")
    print(classification_report(y_true, y_pred))

    return {
        'Test Set': test_set,
        'Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    }

def plot_evaluation_charts(y_true, y_pred, y_pred_proba, method, test_set):
    """Show a confusion-matrix heatmap and a ROC curve side by side.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels (used for the confusion matrix).
        y_pred_proba: Predicted positive-class scores (used for the ROC curve).
        method: Sampling-strategy name shown in both panel titles.
        test_set: Test-set name shown in both panel titles.
    """
    class_names = ['Not Canceled', 'Canceled']

    plt.figure(figsize=(10, 4))

    # Left panel: confusion matrix as raw counts
    plt.subplot(1, 2, 1)
    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=list(class_names),
    )
    plt.title(f'Confusion Matrix\n{method} - {test_set}')

    # Right panel: ROC curve plus the chance diagonal
    plt.subplot(1, 2, 2)
    false_pos, true_pos, _ = roc_curve(y_true, y_pred_proba)
    auc_value = roc_auc_score(y_true, y_pred_proba)
    plt.plot(false_pos, true_pos, label=f'ROC (AUC = {auc_value:.3f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve\n{method} - {test_set}')
    plt.legend()

    plt.tight_layout()
    plt.show()

# Best hyper-parameters per sampling strategy.
# The keys must match the sampler names used inside evaluate_all_models,
# because that function looks them up with best_params[method].
# NOTE(review): presumably these values come from an earlier tuning run — confirm.
best_params = {
    'SMOTE': {
        'n_estimators': 241,
        'max_depth': 15,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 'log2'
    },
    'Oversample': {
        'n_estimators': 178,
        'max_depth': 15,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 'log2'
    },
    'Undersample': {
        'n_estimators': 300,
        'max_depth': 15,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 'sqrt'
    },
    'No Sampling': {
        'n_estimators': 270,
        'max_depth': 15,
        'min_samples_split': 8,
        'min_samples_leaf': 1,
        'max_features': 'sqrt'
    }
}

# Run the evaluation for every sampling strategy on both test sets
results_df = evaluate_all_models(
    X_train_scaled, y_train,
    X_test_scaled, y_test,
    X_test_H2_scaled, y_test_H2,
    best_params
)

# Print the final summary, best ROC AUC first within each test set
print("\nFinal Results Summary:")
print(results_df.sort_values(['Test Set', 'ROC AUC'], ascending=[True, False]))

# Draw one grouped bar chart per test set comparing all metrics
plt.figure(figsize=(15, 6))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

for i, test_set in enumerate(['H1', 'H2']):
    plt.subplot(1, 2, i+1)
    test_results = results_df[results_df['Test Set'] == test_set]
    # Draw into the current subplot instead of letting pandas open a new figure
    test_results[metrics].plot(kind='bar', ax=plt.gca())
    plt.title(f'{test_set} Test Set Results')
    # Replace the numeric row positions on the x axis with the strategy names
    plt.xticks(range(len(test_results)), test_results['Method'], rotation=45)
    plt.ylim(0, 1)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
def evaluate_all_models(X_train_scaled, y_train, X_test_scaled, y_test, X_test_H2_scaled, y_test_H2, best_params):
    """Train one random forest per sampling strategy and evaluate it on two test sets.

    For each of the four strategies ('No Sampling', 'Undersample',
    'Oversample', 'SMOTE') the training data is optionally resampled, a
    ``RandomForestClassifier`` is fitted with ``best_params[strategy]`` and the
    model is scored and plotted on both test sets.

    Args:
        X_train_scaled: Training features, passed to the sampler / the forest.
        y_train: Training labels.
        X_test_scaled: Features of the test set reported as "H1".
        y_test: Labels of the test set reported as "H1".
        X_test_H2_scaled: Features of the test set reported as "H2".
        y_test_H2: Labels of the test set reported as "H2".
        best_params: Mapping from strategy name to the keyword arguments used
            to build the ``RandomForestClassifier`` for that strategy.

    Returns:
        A ``pandas.DataFrame`` with two rows per strategy (one per test set)
        and the columns produced by ``calculate_metrics``.
    """
    samplers = {
        'No Sampling': None,
        'Undersample': RandomUnderSampler(random_state=42),
        'Oversample': RandomOverSampler(random_state=42),
        'SMOTE': SMOTE(random_state=42)
    }

    results = []

    for method, sampler in samplers.items():
        print(f"\n{'='*50}")
        print(f"Evaluating {method}")
        print('='*50)

        # Resample the training data only; the test sets are never resampled
        if sampler is None:
            X_resampled, y_resampled = X_train_scaled, y_train
        else:
            X_resampled, y_resampled = sampler.fit_resample(X_train_scaled, y_train)

        # Fit the forest with the hyper-parameters chosen for this strategy
        rf_model = RandomForestClassifier(**best_params[method], random_state=42, n_jobs=-1)
        rf_model.fit(X_resampled, y_resampled)

        # Evaluate on the H1 test set
        # NOTE(review): the printed hotel names below are kept as-is; confirm
        # that H1/H2 really map to City/Resort in this notebook.
        print("\nResults on H1 Test Set (City Hotel):")
        y_pred_h1 = rf_model.predict(X_test_scaled)
        y_pred_proba_h1 = rf_model.predict_proba(X_test_scaled)[:, 1]

        # Pass the strategy name explicitly so every result row is labelled
        # with the strategy that actually produced it.
        h1_metrics = calculate_metrics(y_test, y_pred_h1, y_pred_proba_h1, "H1", method=method)
        plot_evaluation_charts(y_test, y_pred_h1, y_pred_proba_h1, method, "H1 Test Set")

        # Evaluate on the H2 test set
        print("\nResults on H2 Test Set (Resort Hotel):")
        y_pred_h2 = rf_model.predict(X_test_H2_scaled)
        y_pred_proba_h2 = rf_model.predict_proba(X_test_H2_scaled)[:, 1]

        h2_metrics = calculate_metrics(y_test_H2, y_pred_h2, y_pred_proba_h2, "H2", method=method)
        plot_evaluation_charts(y_test_H2, y_pred_h2, y_pred_proba_h2, method, "H2 Test Set")

        # Collect both rows for the summary table
        results.extend([h1_metrics, h2_metrics])

    # Build the summary table
    results_df = pd.DataFrame(results)
    return results_df


def calculate_metrics(y_true, y_pred, y_pred_proba, test_set, method=None):
    """Compute classification metrics and print a classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_pred_proba: Predicted scores for the positive class (used for ROC AUC).
        test_set: Label stored in the 'Test Set' column and shown in the report header.
        method: Name of the sampling strategy stored in the 'Method' column.
            Previously this was read from an undefined/stale global variable;
            it is now an explicit, optional argument (``None`` if omitted).

    Returns:
        A dict with the keys 'Test Set', 'Method', 'Accuracy', 'Precision',
        'Recall', 'F1 Score' and 'ROC AUC'.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    print(f"\nClassification Report for {test_set}:")
    print(classification_report(y_true, y_pred))

    return {
        'Test Set': test_set,
        'Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    }

def plot_evaluation_charts(y_true, y_pred, y_pred_proba, method, test_set):
    """Show a confusion-matrix heatmap and a ROC curve side by side.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels (used for the confusion matrix).
        y_pred_proba: Predicted positive-class scores (used for the ROC curve).
        method: Sampling-strategy name shown in both panel titles.
        test_set: Test-set name shown in both panel titles.
    """
    class_names = ['Not Canceled', 'Canceled']

    plt.figure(figsize=(10, 4))

    # Left panel: confusion matrix as raw counts
    plt.subplot(1, 2, 1)
    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=list(class_names),
    )
    plt.title(f'Confusion Matrix\n{method} - {test_set}')

    # Right panel: ROC curve plus the chance diagonal
    plt.subplot(1, 2, 2)
    false_pos, true_pos, _ = roc_curve(y_true, y_pred_proba)
    auc_value = roc_auc_score(y_true, y_pred_proba)
    plt.plot(false_pos, true_pos, label=f'ROC (AUC = {auc_value:.3f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve\n{method} - {test_set}')
    plt.legend()

    plt.tight_layout()
    plt.show()

# Best hyper-parameters per sampling strategy.
# The keys must match the sampler names used inside evaluate_all_models,
# because that function looks them up with best_params[method].
# NOTE(review): presumably these values come from an earlier tuning run — confirm.
best_params = {
    'SMOTE': {
        'n_estimators': 241,
        'max_depth': 15,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 'log2'
    },
    'Oversample': {
        'n_estimators': 178,
        'max_depth': 15,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 'log2'
    },
    'Undersample': {
        'n_estimators': 300,
        'max_depth': 15,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 'sqrt'
    },
    'No Sampling': {
        'n_estimators': 270,
        'max_depth': 15,
        'min_samples_split': 8,
        'min_samples_leaf': 1,
        'max_features': 'sqrt'
    }
}

# Run the evaluation for every sampling strategy on both test sets
results_df = evaluate_all_models(
    X_train_scaled, y_train,
    X_test_scaled, y_test,
    X_test_H2_scaled, y_test_H2,
    best_params
)

# Print the final summary, best ROC AUC first within each test set
print("\nFinal Results Summary:")
print(results_df.sort_values(['Test Set', 'ROC AUC'], ascending=[True, False]))

# Draw one grouped bar chart per test set comparing all metrics
plt.figure(figsize=(15, 6))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

for i, test_set in enumerate(['H1', 'H2']):
    plt.subplot(1, 2, i+1)
    test_results = results_df[results_df['Test Set'] == test_set]
    # Draw into the current subplot instead of letting pandas open a new figure
    test_results[metrics].plot(kind='bar', ax=plt.gca())
    plt.title(f'{test_set} Test Set Results')
    # Replace the numeric row positions on the x axis with the strategy names
    plt.xticks(range(len(test_results)), test_results['Method'], rotation=45)
    plt.ylim(0, 1)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

SHAP top10 4

Boruta 4（ H1H2）

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
from boruta import BorutaPy
import warnings
warnings.filterwarnings('ignore')

def evaluate_and_visualize(y_true, y_pred, y_pred_proba, method_name, dataset_name):
    """Score one set of predictions, plot the results and return the metrics.

    Shows a confusion matrix (counts annotated with row-wise percentages) and a
    ROC curve, then prints the classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_pred_proba: Predicted positive-class scores.
        method_name: Sampling-strategy name used in titles and in the result row.
        dataset_name: Dataset name used in titles and in the result row.

    Returns:
        A dict with the keys 'Method', 'Dataset', 'Accuracy', 'Precision',
        'Recall', 'F1 Score' and 'ROC AUC'.
    """
    # Compute every metric up front so the result row is ready to return
    scores = {
        'Method': method_name,
        'Dataset': dataset_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, pos_label=1),
        'Recall': recall_score(y_true, y_pred, pos_label=1),
        'F1 Score': f1_score(y_true, y_pred, pos_label=1),
        'ROC AUC': roc_auc_score(y_true, y_pred_proba),
    }
    auc_value = scores['ROC AUC']

    # Confusion matrix: raw counts plus each cell's share of its true-label row
    counts = confusion_matrix(y_true, y_pred)
    row_share = counts / counts.sum(axis=1, keepdims=True) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for (row, col), share in np.ndenumerate(row_share):
        # Offset the percentage so it does not overlap the count annotation
        plt.text(col + 0.2, row + 0.2, f"{share:.1f}%",
                 ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - {method_name}\n{dataset_name}')
    plt.show()

    # ROC curve with the chance diagonal for reference
    false_pos, true_pos, _ = roc_curve(y_true, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(false_pos, true_pos, label=f'ROC curve (AUC = {auc_value:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {method_name}\n{dataset_name}')
    plt.legend(loc='lower right')
    plt.show()

    # Text report
    print(f"\nClassification Report - {method_name} - {dataset_name}")
    print(classification_report(y_true, y_pred))

    return scores

# Restrict the training data to the previously selected features
# (top_features, X_train and scaler are defined in earlier cells).
X_train_top = X_train[top_features]
# NOTE: fit_transform refits the shared `scaler` on the selected columns only,
# so from here on `scaler` expects exactly these features.
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Prepare the test data, scaled with the statistics fitted on the training data
X_H2_test = H2_test[top_features]
X_H2_test_scaled = scaler.transform(X_H2_test)
y_H2_test = H2_test['is_canceled']

X_H1_test = H1_test[top_features]
X_H1_test_scaled = scaler.transform(X_H1_test)
y_H1_test = H1_test['is_canceled']

# Sampling strategies and the random-forest hyper-parameters used with each.
# 'sampler' is None when the training data is used as-is; 'params' is unpacked
# into RandomForestClassifier in the evaluation loop below.
sampling_methods = {
    'No Sampling': {
        'sampler': None,
        'params': {
            'n_estimators': 270,
            'max_depth': 15,
            'min_samples_split': 8,
            'min_samples_leaf': 1,
            'max_features': 'sqrt'
        }
    },
    'Undersample': {
        'sampler': RandomUnderSampler(random_state=42),
        'params': {
            'n_estimators': 300,
            'max_depth': 15,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt'
        }
    },
    'Oversample': {
        'sampler': RandomOverSampler(random_state=42),
        'params': {
            'n_estimators': 178,
            'max_depth': 15,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'log2'
        }
    },
    'SMOTE': {
        'sampler': SMOTE(random_state=42),
        'params': {
            'n_estimators': 241,
            'max_depth': 15,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'log2'
        }
    }
}

# Collects one result dict per (sampling method, test set) pair
all_results = []

# Evaluate every sampling method
for method_name, method_info in sampling_methods.items():
    print(f"\n=== Evaluating {method_name} ===")

    # Resample the training data with the method's sampler (if any)
    if method_info['sampler'] is None:
        X_resampled, y_resampled = X_train_scaled_top, y_train
    else:
        X_resampled, y_resampled = method_info['sampler'].fit_resample(X_train_scaled_top, y_train)

    # Train the forest with this method's hyper-parameters
    rf_model = RandomForestClassifier(**method_info['params'], random_state=42, n_jobs=-1)
    rf_model.fit(X_resampled, y_resampled)

    # Evaluate on the H2 test set
    y_pred_H2 = rf_model.predict(X_H2_test_scaled)
    y_pred_proba_H2 = rf_model.predict_proba(X_H2_test_scaled)[:, 1]
    h2_results = evaluate_and_visualize(y_H2_test, y_pred_H2, y_pred_proba_H2, method_name, 'H2 Test Set')
    all_results.append(h2_results)

    # Evaluate on the H1 test set
    y_pred_H1 = rf_model.predict(X_H1_test_scaled)
    y_pred_proba_H1 = rf_model.predict_proba(X_H1_test_scaled)[:, 1]
    h1_results = evaluate_and_visualize(y_H1_test, y_pred_H1, y_pred_proba_H1, method_name, 'H1 Test Set')
    all_results.append(h1_results)

# Build the comparison table
results_df = pd.DataFrame(all_results)
print("\nFinal Results Comparison:")
print(results_df)

# Create the figure with one subplot per test set
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# Split the results by test set and index them by sampling method
h2_data = results_df[results_df['Dataset'] == 'H2 Test Set'].set_index('Method')
h1_data = results_df[results_df['Dataset'] == 'H1 Test Set'].set_index('Method')
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

# One bar colour per metric
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

# Results on the H2 test set
h2_data[metrics].plot(kind='bar', ax=ax1, width=0.8, color=colors)
ax1.set_title('Performance on H2 Test Set', pad=20, fontsize=12)
ax1.set_ylabel('Score', fontsize=10)
ax1.set_ylim(0, 1)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax1.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
ax1.tick_params(axis='x', labelrotation=30)

# Results on the H1 test set
h1_data[metrics].plot(kind='bar', ax=ax2, width=0.8, color=colors)
ax2.set_title('Performance on H1 Test Set', pad=20, fontsize=12)
ax2.set_ylabel('Score', fontsize=10)
ax2.set_ylim(0, 1)
ax2.grid(axis='y', linestyle='--', alpha=0.7)
ax2.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
ax2.tick_params(axis='x', labelrotation=30)

# Adjust the layout so the legends placed outside the axes stay visible
plt.tight_layout()
plt.subplots_adjust(right=0.85, wspace=0.25)

# Show the chart
plt.show()

# Report, for each test set, the sampling method with the highest ROC AUC
for dataset in ['H2 Test Set', 'H1 Test Set']:
    dataset_results = results_df[results_df['Dataset'] == dataset]
    # idxmax returns the index label of the best row; .loc fetches that row
    best_method = dataset_results.loc[dataset_results['ROC AUC'].idxmax()]
    print(f"\nBest method for {dataset}:")
    print(f"Method: {best_method['Method']}")
    print(f"ROC AUC: {best_method['ROC AUC']:.4f}")
    print(f"Accuracy: {best_method['Accuracy']:.4f}")
    print(f"F1 Score: {best_method['F1 Score']:.4f}")

### cross-dataset evaluation: H1 test set

The previous cell already contains the cross-dataset test results: it evaluates each model on both the H2 and the H1 test sets.

 ## 3.TabNet




### baseline model

baseline 4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import torch
import warnings
warnings.filterwarnings('ignore')

# Assumes H2_train and selected_columns are already defined by earlier cells
# Define the features and the target variable
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardise the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train an initial TabNet on all features to obtain feature importances
initial_tabnet = TabNetClassifier(
    n_d=8,
    n_a=8,
    n_steps=3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax',
    device_name='auto'
)

# NOTE(review): no eval_set is passed here, so `patience` presumably has no
# effect on this fit — confirm against the pytorch-tabnet documentation.
initial_tabnet.fit(
    X_train_scaled, y_train,
    max_epochs=50,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128
)

# Read the feature importances from the fitted model
feature_importances = initial_tabnet.feature_importances_
features = X_train.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False).head(10)

# Keep the top 10 features
top_features = importance_df_sorted['Feature'].tolist()
X_train_top = X_train[top_features]
# NOTE: this refits the shared `scaler` on the 10 selected columns only
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Base TabNet configuration
def get_tabnet_config():
    """Return a fresh dict of keyword arguments for ``TabNetClassifier``."""
    # Network architecture
    architecture = dict(
        n_d=8,            # number of dimensions
        n_a=8,            # attention dimension
        n_steps=3,        # number of steps
        gamma=1.3,        # coefficient of the feature-selection unit
        n_independent=2,  # number of independent layers
        n_shared=2,       # number of shared layers
    )
    # Optimiser and learning-rate schedule
    optimisation = dict(
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': 2e-2},
        scheduler_params=dict(step_size=10, gamma=0.9),
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
    )
    return {
        **architecture,
        **optimisation,
        'mask_type': 'sparsemax',
        'device_name': 'auto',
    }

# Sampling strategies (None means: train on the data as-is)
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
# NOTE(review): `cv` is defined here but not used anywhere in this cell
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Train one model per sampling strategy on the top 10 features
for method, sampler in samplers.items():
    print(f"\n=== {method} ===")
    if sampler is None:
        X_resampled, y_resampled = X_train_scaled_top, y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)

    # Initialise the TabNet model
    tabnet_config = get_tabnet_config()
    tabnet_model = TabNetClassifier(**tabnet_config)

    # Train the model
    tabnet_model.fit(
        X_resampled, y_resampled,
        max_epochs=50,
        patience=10,
        batch_size=1024,
        virtual_batch_size=128
    )

    # Predict and evaluate
    # NOTE(review): predictions are made on the same (resampled) data the model
    # was fitted on, so every metric below measures training fit, not
    # generalisation to unseen data.
    y_pred = tabnet_model.predict(X_resampled)
    y_pred_proba = tabnet_model.predict_proba(X_resampled)[:, 1]

    # Compute the metrics
    accuracy = accuracy_score(y_resampled, y_pred)
    precision = precision_score(y_resampled, y_pred, pos_label=1)
    recall = recall_score(y_resampled, y_pred, pos_label=1)
    f1 = f1_score(y_resampled, y_pred, pos_label=1)
    roc_auc = roc_auc_score(y_resampled, y_pred_proba)

    # Plot the confusion matrix (counts plus row-wise percentages)
    conf_matrix = confusion_matrix(y_resampled, y_pred)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            # Offset the percentage so it does not overlap the count annotation
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix ({method})\nROC-AUC: {roc_auc:.4f}')
    plt.show()

    # Print the classification report
    report = classification_report(y_resampled, y_pred)
    print(report)

    # Plot the ROC curve
    fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # Collect the metrics for the comparison table
    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Build the comparison table, best ROC AUC first
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Results Comparison:")
print(metrics_df)

# Plot the performance comparison.
# DataFrame.plot opens its own figure unless an Axes is supplied, which left
# the (12, 6) figure below empty; draw into that figure's axes instead.
plt.figure(figsize=(12, 6))
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', ax=plt.gca())
plt.title('Comparison of Balancing Methods with TabNet')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Visualise the importances of the top 10 features (horizontal bars)
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df_sorted, x='Importance', y='Feature', palette='viridis')
plt.title('Top 10 Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

### different feature combinations

feature importance top10 4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import torch

# Reuse the top 10 features obtained earlier
X_train_top = X_train[top_features]
# NOTE: this refits the shared `scaler` on the selected columns only
X_train_scaled_top = scaler.fit_transform(X_train_top)
# Wrap in a DataFrame to keep the column names.
# NOTE(review): no index is passed, so the rows get a fresh RangeIndex rather
# than X_train's original index.
X_train_scaled_top = pd.DataFrame(X_train_scaled_top, columns=top_features)

# Base TabNet configuration
def get_tabnet_config():
    """Return a fresh dict of keyword arguments for ``TabNetClassifier``."""
    # Network architecture
    architecture = dict(n_d=16, n_a=16, n_steps=4, gamma=1.3)
    # Optimiser and learning-rate schedule
    optimisation = dict(
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': 2e-2},
        scheduler_params=dict(step_size=10, gamma=0.9),
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
    )
    return {
        **architecture,
        **optimisation,
        'mask_type': 'sparsemax',
        'device_name': 'auto',
    }

# Sampling strategies (None means: train on the data as-is)
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Train and cross-validate a TabNet model for each sampling strategy
for method, sampler in samplers.items():
    print(f"\n=== {method} ===")

    # NOTE(review): resampling is applied to the whole training set before the
    # CV split below, so validation folds also contain resampled rows.
    if sampler is None:
        X_resampled = X_train_scaled_top.values
        y_resampled = y_train.values
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)
        X_resampled = np.array(X_resampled)  # make sure this is a numpy array
        y_resampled = np.array(y_resampled)

    # Out-of-fold labels and predictions, pooled across all folds
    all_y_true = []
    all_y_pred = []
    all_y_pred_proba = []

    # Run the cross-validation
    for fold, (train_idx, val_idx) in enumerate(cv.split(X_resampled, y_resampled)):
        print(f"Training fold {fold + 1}/3...")

        # Split the data for this fold
        X_fold_train = X_resampled[train_idx]
        y_fold_train = y_resampled[train_idx]
        X_fold_val = X_resampled[val_idx]
        y_fold_val = y_resampled[val_idx]

        # Initialise and train a fresh model for this fold.
        # NOTE(review): the validation fold passed as eval_set is the same
        # fold that is scored below.
        tabnet_model = TabNetClassifier(**get_tabnet_config())
        tabnet_model.fit(
            X_fold_train, y_fold_train,
            eval_set=[(X_fold_val, y_fold_val)],
            max_epochs=100,
            patience=15,
            batch_size=1024,
            virtual_batch_size=128
        )

        # Predict on the validation fold and store the results
        fold_preds = tabnet_model.predict(X_fold_val)
        fold_pred_probas = tabnet_model.predict_proba(X_fold_val)[:, 1]

        all_y_true.extend(y_fold_val)
        all_y_pred.extend(fold_preds)
        all_y_pred_proba.extend(fold_pred_probas)

    # Convert to numpy arrays for the metric computations
    all_y_true = np.array(all_y_true)
    all_y_pred = np.array(all_y_pred)
    all_y_pred_proba = np.array(all_y_pred_proba)

    # Compute the evaluation metrics over the pooled out-of-fold predictions
    accuracy = accuracy_score(all_y_true, all_y_pred)
    precision = precision_score(all_y_true, all_y_pred)
    recall = recall_score(all_y_true, all_y_pred)
    f1 = f1_score(all_y_true, all_y_pred)
    roc_auc = roc_auc_score(all_y_true, all_y_pred_proba)

    # Plot the confusion matrix (counts plus row-wise percentages)
    conf_matrix = confusion_matrix(all_y_true, all_y_pred)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            # Offset the percentage so it does not overlap the count annotation
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - TabNet ({method})')
    plt.show()

    # Print the classification report
    print("\nClassification Report:")
    print(classification_report(all_y_true, all_y_pred))

    # Plot the ROC curve
    fpr, tpr, _ = roc_curve(all_y_true, all_y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - TabNet ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # Collect the metrics for the comparison table
    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Build and show the comparison table, best ROC AUC first
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Results Comparison (TabNet):")
print(metrics_df)

# Plot the performance comparison.
# DataFrame.plot opens its own figure unless an Axes is supplied, which left
# the (12, 6) figure below empty; draw into that figure's axes instead.
plt.figure(figsize=(12, 6))
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', ax=plt.gca())
plt.title('Comparison of Balancing Methods - TabNet (Top 10 Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

SHAP top10 4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetClassifier
import shap
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import torch
import warnings
warnings.filterwarnings('ignore')

# Prepare the data (H2_train and selected_columns come from earlier cells)
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardise the features and keep the column names
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

# Initialise the TabNet model that will be explained with SHAP
initial_model = TabNetClassifier(
    n_d=8,
    n_a=8,
    n_steps=3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax',
    device_name='auto'
)

# Train the model on all features
initial_model.fit(
    X_train_scaled.values, y_train.values,
    max_epochs=50,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128
)

# Estimate feature importance with SHAP
background = shap.kmeans(X_train_scaled.values, 10)  # summarised background dataset
explainer = shap.KernelExplainer(initial_model.predict_proba, background)
# Explain only a subset of rows to keep KernelExplainer tractable
shap_sample = X_train_scaled.values[:100]
shap_values = explainer.shap_values(shap_sample)

# Extract the SHAP values of the positive class as (n_samples, n_features).
# Depending on the SHAP version, a multi-output explainer returns either a
# list with one array per class or a single array with the classes on the
# last axis; blindly indexing `shap_values[1]` picks a *sample* in the latter
# case and yields an importance vector of the wrong length.
if isinstance(shap_values, list):
    shap_values_pos = np.asarray(shap_values[1])
else:
    shap_values_pos = np.asarray(shap_values)
    if shap_values_pos.ndim == 3:
        shap_values_pos = shap_values_pos[:, :, 1]

# Mean absolute SHAP value per feature
feature_importance = np.abs(shap_values_pos).mean(axis=0)

# Build the feature-importance table
importance_df = pd.DataFrame({
    'Feature': X_train_scaled.columns,
    'Importance': feature_importance
})

# Confirm the shapes line up
print("Feature importance shape after processing:", feature_importance.shape)
print("Number of features:", len(X_train_scaled.columns))

# Sort and keep the top features
importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False)
top_features = importance_df_sorted['Feature'].head(10).tolist()

# Visualise the SHAP importances.
# The feature matrix must contain exactly the rows that were explained, so
# pass the same subset rather than the full training matrix.
plt.figure(figsize=(12, 6))
shap.summary_plot(shap_values_pos, shap_sample,
                 feature_names=X_train_scaled.columns,
                 show=False)
plt.title("SHAP Feature Importance")
plt.tight_layout()
plt.show()

# Prepare the data restricted to the SHAP-selected top features
X_train_top = X_train[top_features]
# NOTE: this refits the shared `scaler` on the selected columns only
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Sampling strategies (None means: train on the data as-is)
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
# NOTE(review): `cv` is defined here but not used anywhere in this cell
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Train one model per sampling strategy on the SHAP-selected top 10 features
for method, sampler in samplers.items():
    print(f"\n=== {method} ===")
    if sampler is None:
        X_resampled = X_train_scaled_top
        y_resampled = y_train.values
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)

    # Initialise the TabNet model
    tabnet_model = TabNetClassifier(
        n_d=8,
        n_a=8,
        n_steps=3,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        scheduler_params={"step_size":10, "gamma":0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='sparsemax',
        device_name='auto'
    )

    # Train the model
    tabnet_model.fit(
        X_resampled, y_resampled,
        max_epochs=50,
        patience=10,
        batch_size=1024,
        virtual_batch_size=128
    )

    # Predict and evaluate
    # NOTE(review): predictions are made on the same (resampled) data the model
    # was fitted on, so every metric below measures training fit, not
    # generalisation to unseen data.
    y_pred = tabnet_model.predict(X_resampled)
    y_pred_proba = tabnet_model.predict_proba(X_resampled)[:, 1]

    # Compute the metrics
    accuracy = accuracy_score(y_resampled, y_pred)
    precision = precision_score(y_resampled, y_pred)
    recall = recall_score(y_resampled, y_pred)
    f1 = f1_score(y_resampled, y_pred)
    roc_auc = roc_auc_score(y_resampled, y_pred_proba)

    # Plot the confusion matrix (counts plus row-wise percentages)
    conf_matrix = confusion_matrix(y_resampled, y_pred)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            # Offset the percentage so it does not overlap the count annotation
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix ({method})\nROC-AUC: {roc_auc:.4f}')
    plt.show()

    # Print the classification report
    print("\nClassification Report:")
    print(classification_report(y_resampled, y_pred))

    # Plot the ROC curve
    fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # Collect the metrics for the comparison table
    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Build the comparison table, best ROC AUC first
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Results Comparison:")
print(metrics_df)

# Plot the performance comparison.
# DataFrame.plot opens its own figure unless an Axes is supplied, which left
# the (12, 6) figure below empty; draw into that figure's axes instead.
plt.figure(figsize=(12, 6))
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(kind='bar', ax=plt.gca())
plt.title('Comparison of Balancing Methods with TabNet (SHAP-selected features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Save the SHAP analysis results and the model performance to a JSON file
# NOTE: this rebinds the global name `results`
results = {
    'shap_top_features': top_features,
    'feature_importance': importance_df_sorted.to_dict(),
    'model_metrics': metrics_df.to_dict()
}

import json
# Written to the current working directory
with open('tabnet_shap_analysis_results.json', 'w') as f:
    json.dump(results, f, indent=4)

In [None]:
# Debug output: print the shape of shap_values[1], the shape of
# feature_importance and the number of columns in X_train_scaled.
print("shap_values[1] shape:", np.array(shap_values[1]).shape)
print("feature_importance shape:", feature_importance.shape)
print("X_train_scaled.columns length:", len(X_train_scaled.columns))

triple

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetClassifier
import shap
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import torch
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

# Prepare the data: predictors are the pre-selected columns minus the target.
X_train = H2_train[selected_columns].drop(columns=['is_canceled'])
y_train = H2_train['is_canceled']

# Standardize the features; keep a DataFrame so columns can be picked by name.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

# 1. Feature Importance (TabNet's built-in importances)
initial_model = TabNetClassifier(
    n_d=8, n_a=8, n_steps=3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax'
)

initial_model.fit(
    X_train_scaled.values, y_train.values,
    max_epochs=50,
    patience=10,
    batch_size=1024
)

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': initial_model.feature_importances_
})
importance_features = feature_importance.nlargest(10, 'Importance')['Feature'].tolist()

# 2. SHAP Analysis (first 100 rows only; the background is 10 k-means centres)
background = shap.kmeans(X_train_scaled.values[:100], 10)
explainer = shap.KernelExplainer(initial_model.predict_proba, background)
raw_shap_values = explainer.shap_values(X_train_scaled.values[:100])

# The return layout depends on the installed SHAP release: either a list with
# one (n_samples, n_features) array per class, or a single array with the
# class on the last axis. Indexing with [1] is only correct for the list
# form, so pick the positive-class slice explicitly for each layout.
if isinstance(raw_shap_values, list):
    shap_values = np.asarray(raw_shap_values[1])
else:
    raw_shap_values = np.asarray(raw_shap_values)
    if raw_shap_values.ndim == 3:
        shap_values = raw_shap_values[..., 1]
    else:
        shap_values = raw_shap_values

shap_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': np.abs(shap_values).mean(0)
})
shap_features = shap_importance.nlargest(10, 'Importance')['Feature'].tolist()

# 3. Boruta
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
boruta = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=42)
boruta.fit(X_train_scaled.values, y_train.values)

boruta_features = X_train.columns[boruta.support_].tolist()
# NOTE: when more than 10 features are confirmed, the first 10 in column
# order are kept (this is not a ranking by importance).
if len(boruta_features) > 10:
    boruta_features = boruta_features[:10]

# Print the features chosen by each method.
print("\nSelected Features by Each Method:")
print("Feature Importance:", importance_features)
print("\nSHAP:", shap_features)
print("\nBoruta:", boruta_features)

# Feature-selection methods and the feature list each one produced.
feature_sets = {
    'Feature Importance': importance_features,
    'SHAP': shap_features,
    'Boruta': boruta_features
}

# Class-balancing methods (None = train on the data as-is).
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

# Results for every (feature method, sampler) combination.
all_results = {}

# Train and evaluate one TabNet per feature set and balancing method.
for feature_method, selected_features in feature_sets.items():
    print(f"\n{'='*50}")
    print(f"Processing {feature_method} features")
    print('='*50)

    # Data restricted to this feature set.
    X_selected = X_train_scaled[selected_features]
    all_results[feature_method] = {}

    # Hold out the validation split BEFORE any resampling, so duplicated or
    # synthetic rows never reach the validation data, and every sampler is
    # scored on the same untouched rows. TabNet rejects pandas objects, so
    # everything is converted to numpy arrays here.
    y_labels = np.asarray(y_train)
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_selected.values, y_labels,
        test_size=0.2, random_state=42, stratify=y_labels
    )

    for sampler_name, sampler in samplers.items():
        print(f"\nTraining with {sampler_name}")

        # Balance the training portion only.
        if sampler is None:
            X_resampled = X_train_split
            y_resampled = y_train_split
        else:
            X_resampled, y_resampled = sampler.fit_resample(X_train_split, y_train_split)
            X_resampled = np.asarray(X_resampled)
            y_resampled = np.asarray(y_resampled)

        # Initialise and train the model.
        model = TabNetClassifier(
            n_d=8, n_a=8, n_steps=3,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=2e-2),
            scheduler_params={"step_size":10, "gamma":0.9},
            scheduler_fn=torch.optim.lr_scheduler.StepLR,
            mask_type='sparsemax'
        )

        # NOTE: the validation split also drives early stopping, so the
        # scores below are somewhat optimistic.
        model.fit(
            X_resampled, y_resampled,
            eval_set=[(X_val_split, y_val_split)],
            max_epochs=50,
            patience=10,
            batch_size=1024
        )

        # Predict and evaluate on the held-out split.
        y_pred = model.predict(X_val_split)
        y_pred_proba = model.predict_proba(X_val_split)[:, 1]

        # Store the results.
        all_results[feature_method][sampler_name] = {
            'accuracy': accuracy_score(y_val_split, y_pred),
            'precision': precision_score(y_val_split, y_pred),
            'recall': recall_score(y_val_split, y_pred),
            'f1': f1_score(y_val_split, y_pred),
            'roc_auc': roc_auc_score(y_val_split, y_pred_proba),
            'predictions': y_pred,
            'probabilities': y_pred_proba,
            'true_values': y_val_split
        }

        # Print the classification report.
        print(f"\nClassification Report for {feature_method} - {sampler_name}:")
        print(classification_report(y_val_split, y_pred))

        # Draw the confusion matrix: counts in the cells, row percentages in green.
        plt.figure(figsize=(8, 6))
        conf_matrix = confusion_matrix(y_val_split, y_pred)
        conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
        for i in range(conf_matrix.shape[0]):
            for j in range(conf_matrix.shape[1]):
                percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
                plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=9)

        plt.title(f'Confusion Matrix\n{feature_method} - {sampler_name}')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.show()

# Build the combined performance table: one row per (feature method, sampler).
comparison_data = []
for feature_method in feature_sets.keys():
    for sampler_name in samplers.keys():
        results = all_results[feature_method][sampler_name]
        comparison_data.append({
            'Feature Method': feature_method,
            'Sampling Method': sampler_name,
            'Accuracy': results['accuracy'],
            'Precision': results['precision'],
            'Recall': results['recall'],
            'F1 Score': results['f1'],
            'ROC AUC': results['roc_auc']
        })

comparison_df = pd.DataFrame(comparison_data)

# One grouped bar chart per metric (samplers on the x-axis, one bar per
# feature-selection method).
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']
for metric in metrics:
    comparison_pivot = comparison_df.pivot(
        index='Sampling Method',
        columns='Feature Method',
        values=metric
    )
    # DataFrame.plot opens its own figure; pass the size via `figsize`
    # instead of calling plt.figure() first, which leaves an empty figure.
    comparison_pivot.plot(kind='bar', figsize=(12, 6))
    plt.title(f'{metric} Comparison')
    plt.xlabel('Sampling Method')
    plt.ylabel(metric)
    plt.legend(title='Feature Selection Method', bbox_to_anchor=(1.05, 1))
    plt.tight_layout()
    plt.show()

# One ROC figure per feature-selection method, one curve per sampler.
for feature_method in feature_sets.keys():
    plt.figure(figsize=(10, 8))
    for sampler_name in samplers.keys():
        results = all_results[feature_method][sampler_name]
        fpr, tpr, _ = roc_curve(results['true_values'], results['probabilities'])
        plt.plot(fpr, tpr,
                label=f'{sampler_name} (AUC = {results["roc_auc"]:.3f})')

    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curves Comparison - {feature_method}')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.show()

# Collect the complete results for export.
final_results = {
    'feature_sets': {k: v for k, v in feature_sets.items()},
    'metrics': comparison_df.to_dict('records'),
    'feature_importance_scores': {
        'tabnet': feature_importance.to_dict(),
        'shap': shap_importance.to_dict(),
        'boruta': {'selected_features': boruta_features}
    }
}

# Save as a JSON file. `json` is imported here because this cell does not
# import it anywhere else and would otherwise depend on an earlier cell.
import json
with open('complete_comparison_results.json', 'w') as f:
    json.dump(final_results, f, indent=4)

# Show the best (feature method, sampler) combination for each metric.
print("\nBest Combinations by Different Metrics:")
for metric in metrics:
    # idxmax() returns an index label, so look the row up with .loc
    # (label-based) rather than .iloc (position-based).
    best_idx = comparison_df[metric].idxmax()
    best_combo = comparison_df.loc[best_idx]
    print(f"\nBest {metric}:")
    print(f"Feature Method: {best_combo['Feature Method']}")
    print(f"Sampling Method: {best_combo['Sampling Method']}")
    print(f"Score: {best_combo[metric]:.4f}")

BORUTA 4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import torch
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

# Step 1: Boruta feature selection, wrapped around a shallow,
# class-weighted random forest.
print("Starting Boruta feature selection...")
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42, max_iter=100)

# Boruta is fitted on the raw numpy values of the predictors and the target.
X_for_boruta, y_for_boruta = X_train.values, y_train.values
boruta_selector.fit(X_for_boruta, y_for_boruta)

# Keep the column names flagged in the selector's support mask.
feature_names = X_train.columns.tolist()
boruta_features = [
    name
    for name, keep in zip(feature_names, boruta_selector.support_)
    if keep
]

print(f"\nBoruta selected {len(boruta_features)} features:")
print(boruta_features)

# Restrict the training data to the selected columns and re-fit the scaler on them.
X_train_boruta = X_train[boruta_features]
X_train_scaled_boruta = pd.DataFrame(
    scaler.fit_transform(X_train_boruta),
    columns=boruta_features,
)

# Base TabNet configuration
def get_tabnet_config():
    """Return a fresh dict of keyword arguments for ``TabNetClassifier``.

    A new dict is built on every call, so callers may modify the result
    without affecting later calls.
    """
    step_lr_settings = dict(step_size=10, gamma=0.9)
    adam_settings = {'lr': 2e-2}
    return dict(
        n_d=16,
        n_a=16,
        n_steps=4,
        gamma=1.3,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=adam_settings,
        scheduler_params=step_lr_settings,
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='sparsemax',
        device_name='auto',
    )

# Class-balancing methods (None = train on the data as-is).
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

metrics_data = []
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Folds are always drawn from the ORIGINAL data. np.asarray accepts both a
# pandas Series and an ndarray for the target.
X_all = X_train_scaled_boruta.values
y_all = np.asarray(y_train)

# Train TabNet with each balancing method.
for method, sampler in samplers.items():
    print(f"\n=== {method} ===")

    # Out-of-fold predictions collected across all folds.
    all_y_true = []
    all_y_pred = []
    all_y_pred_proba = []

    # Cross-validation.
    for fold, (train_idx, val_idx) in enumerate(cv.split(X_all, y_all)):
        print(f"Training fold {fold + 1}/{cv.get_n_splits()}...")

        X_fold_train = X_all[train_idx]
        y_fold_train = y_all[train_idx]
        X_fold_val = X_all[val_idx]
        y_fold_val = y_all[val_idx]

        # Resample the training part of the fold only. Resampling before the
        # split would place copies (or SMOTE interpolations) of training rows
        # in the validation fold and inflate every score.
        if sampler is not None:
            X_fold_train, y_fold_train = sampler.fit_resample(X_fold_train, y_fold_train)
            X_fold_train = np.asarray(X_fold_train)
            y_fold_train = np.asarray(y_fold_train)

        tabnet_model = TabNetClassifier(**get_tabnet_config())
        tabnet_model.fit(
            X_fold_train, y_fold_train,
            eval_set=[(X_fold_val, y_fold_val)],
            max_epochs=100,
            patience=15,
            batch_size=1024,
            virtual_batch_size=128
        )

        fold_preds = tabnet_model.predict(X_fold_val)
        fold_pred_probas = tabnet_model.predict_proba(X_fold_val)[:, 1]

        all_y_true.extend(y_fold_val)
        all_y_pred.extend(fold_preds)
        all_y_pred_proba.extend(fold_pred_probas)

    # Compute the evaluation metrics on the pooled out-of-fold predictions.
    all_y_true = np.array(all_y_true)
    all_y_pred = np.array(all_y_pred)
    all_y_pred_proba = np.array(all_y_pred_proba)

    accuracy = accuracy_score(all_y_true, all_y_pred)
    precision = precision_score(all_y_true, all_y_pred)
    recall = recall_score(all_y_true, all_y_pred)
    f1 = f1_score(all_y_true, all_y_pred)
    roc_auc = roc_auc_score(all_y_true, all_y_pred_proba)

    # Draw the confusion matrix: counts in the cells, row percentages in green.
    conf_matrix = confusion_matrix(all_y_true, all_y_pred)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - TabNet with Boruta Features ({method})')
    plt.show()

    # Print the classification report.
    print("\nClassification Report:")
    print(classification_report(all_y_true, all_y_pred))

    # Draw the ROC curve.
    fpr, tpr, _ = roc_curve(all_y_true, all_y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - TabNet with Boruta Features ({method})')
    plt.legend(loc='lower right')
    plt.show()

    metrics_data.append({
        'Balancing Method': method,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

# Build the results-comparison table, best ROC AUC first.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Results Comparison (TabNet with Boruta Features):")
print(metrics_df)

# Plot the performance comparison.
# DataFrame.plot opens its own figure, so the size is passed via `figsize`;
# a separate plt.figure() call beforehand would only leave an empty figure.
metrics_df.set_index('Balancing Method')[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']].plot(
    kind='bar', figsize=(12, 6)
)
plt.title('Comparison of Balancing Methods - TabNet (Boruta Selected Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Save the Boruta feature selection and the model evaluation results.
results = {
    'boruta_features': boruta_features,
    'metrics': metrics_df.to_dict()
}

import json
with open('tabnet_boruta_results.json', 'w') as f:
    json.dump(results, f, indent=4)

### hyperparameter tuning

BASELINE 4

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
import torch
import optuna
import warnings
warnings.filterwarnings('ignore')

# Prepare the data using all features: re-fit the scaler on the full
# predictor table and hold both predictors and target as numpy arrays.
X_train_scaled = np.array(scaler.fit_transform(X_train))
y_train = np.array(y_train)

def objective(trial, X, y):
    """Optuna objective: mean ROC-AUC of TabNet over a 3-fold stratified CV.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix, indexed positionally by row (``X[idx]``).
        y: Binary target, indexed positionally (``y[idx]``).

    Returns:
        The mean of the per-fold validation ROC-AUC scores.
    """
    # Reduced search space; values are drawn in the same order as the
    # parameter names are listed.
    n_d = trial.suggest_int('n_d', 8, 32)
    n_a = trial.suggest_int('n_a', 8, 32)
    n_steps = trial.suggest_int('n_steps', 3, 5)
    gamma = trial.suggest_float('gamma', 1.0, 1.5)
    lambda_sparse = trial.suggest_float('lambda_sparse', 1e-6, 1e-3, log=True)
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)

    hyperparams = dict(
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        n_independent=2,
        n_shared=2,
        momentum=0.02,
        lambda_sparse=lambda_sparse,
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': learning_rate},
        scheduler_params={'step_size': 10, 'gamma': 0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='sparsemax',
    )

    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_aucs = []

    for fit_rows, holdout_rows in splitter.split(X, y):
        X_fit, y_fit = X[fit_rows], y[fit_rows]
        X_holdout, y_holdout = X[holdout_rows], y[holdout_rows]

        classifier = TabNetClassifier(device_name='auto', **hyperparams)
        # Shorter training budget (epochs, patience, batch size) to keep the search fast.
        classifier.fit(
            X_fit, y_fit,
            eval_set=[(X_holdout, y_holdout)],
            max_epochs=50,
            patience=10,
            batch_size=512,
            virtual_batch_size=64
        )

        positive_proba = classifier.predict_proba(X_holdout)[:, 1]
        fold_aucs.append(roc_auc_score(y_holdout, positive_proba))

    return np.mean(fold_aucs)

# Run the Optuna search (15 trials, maximising the objective's mean ROC-AUC).
print("\n=== Optimizing Baseline Model (All Features) ===")
study = optuna.create_study(direction='maximize')
study.optimize(
    lambda trial: objective(trial, X_train_scaled, y_train),
    n_trials=15,
    show_progress_bar=True,
)

print("\nBest parameters:")
print(study.best_params)
print(f"Best ROC-AUC: {study.best_value:.4f}")

# Configuration for the final model: the tuned values are copied from the
# study, the remaining settings are the same constants the objective uses.
best_config = {
    name: study.best_params[name]
    for name in ('n_d', 'n_a', 'n_steps', 'gamma', 'lambda_sparse')
}
best_config.update(
    n_independent=2,
    n_shared=2,
    momentum=0.02,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': study.best_params['learning_rate']},
    scheduler_params={'step_size': 10, 'gamma': 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax',
    device_name='auto',
)

# Evaluate the final model with a 3-fold stratified CV, pooling the
# out-of-fold predictions.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
all_y_true, all_y_pred, all_y_pred_proba = [], [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_scaled, y_train)):
    print(f"Training fold {fold + 1}/3...")

    X_fold_train, y_fold_train = X_train_scaled[train_idx], y_train[train_idx]
    X_fold_val, y_fold_val = X_train_scaled[val_idx], y_train[val_idx]

    best_model = TabNetClassifier(**best_config)
    best_model.fit(
        X_fold_train, y_fold_train,
        eval_set=[(X_fold_val, y_fold_val)],
        max_epochs=50,
        patience=10,
        batch_size=512,
        virtual_batch_size=64
    )

    fold_preds = best_model.predict(X_fold_val)
    fold_pred_probas = best_model.predict_proba(X_fold_val)[:, 1]

    all_y_true += list(y_fold_val)
    all_y_pred += list(fold_preds)
    all_y_pred_proba += list(fold_pred_probas)

# Convert the pooled predictions to arrays and compute the metrics.
all_y_true = np.array(all_y_true)
all_y_pred = np.array(all_y_pred)
all_y_pred_proba = np.array(all_y_pred_proba)

accuracy = accuracy_score(all_y_true, all_y_pred)
precision = precision_score(all_y_true, all_y_pred)
recall = recall_score(all_y_true, all_y_pred)
f1 = f1_score(all_y_true, all_y_pred)
roc_auc = roc_auc_score(all_y_true, all_y_pred_proba)

print("\nModel Performance Metrics:")
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("ROC AUC", roc_auc),
):
    print(f"{metric_label}: {metric_value:.4f}")

# Confusion matrix: raw counts in the cells, row percentages overlaid in green.
conf_matrix = confusion_matrix(all_y_true, all_y_pred)
row_totals = conf_matrix.sum(axis=1).reshape(-1, 1)
conf_matrix_percentage = conf_matrix / row_totals * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
n_rows, n_cols = conf_matrix.shape
for i in range(n_rows):
    for j in range(n_cols):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title(f'Confusion Matrix - Baseline TabNet (All Features)\nBest ROC-AUC: {study.best_value:.4f}')
plt.show()

# Classification report.
print("\nClassification Report:")
print(classification_report(all_y_true, all_y_pred))

# ROC curve with the chance diagonal for reference.
fpr, tpr, _ = roc_curve(all_y_true, all_y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Baseline (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Baseline TabNet (All Features)')
plt.legend(loc='lower right')
plt.show()

# Optimization history: objective value of every trial, in trial order.
trial_numbers = [trial.number for trial in study.trials]
trial_values = [trial.value for trial in study.trials]
plt.figure(figsize=(10, 6))
plt.plot(trial_numbers, trial_values, marker='o')
plt.xlabel('Trial Number')
plt.ylabel('ROC-AUC Score')
plt.title('Optimization History (Baseline)')
plt.show()

# Save the best hyperparameters, the best search score and the final
# cross-validated metrics.
results = {
    'best_params': study.best_params,
    'best_score': study.best_value,
    'metrics': {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    }
}

# `json` is imported here because this cell does not import it anywhere
# else and would otherwise depend on an earlier cell having done so.
import json
with open('tabnet_baseline_results.json', 'w') as f:
    json.dump(results, f, indent=4)

The remaining three balance methods

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import torch
import optuna
import warnings
warnings.filterwarnings('ignore')

def objective(trial, X, y, sampler=None):
    """Optuna objective: mean ROC-AUC of TabNet over a 3-fold stratified CV.

    The folds are drawn from the original data and, when ``sampler`` is
    given, only the training part of each fold is resampled. Resampling the
    whole data set before splitting would put duplicated or synthetic copies
    of training rows into the validation folds and inflate the score.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix (array-like; converted to a numpy array).
        y: Binary target (array-like; converted to a numpy array).
        sampler: Optional object with a ``fit_resample(X, y)`` method used
            to balance each training fold; ``None`` disables resampling.

    Returns:
        The mean of the per-fold validation ROC-AUC scores.
    """
    # Hyperparameter search space.
    param = {
        'n_d': trial.suggest_int('n_d', 8, 32),
        'n_a': trial.suggest_int('n_a', 8, 32),
        'n_steps': trial.suggest_int('n_steps', 3, 5),
        'gamma': trial.suggest_float('gamma', 1.0, 1.5),
        'n_independent': 2,
        'n_shared': 2,
        'momentum': 0.02,
        'lambda_sparse': trial.suggest_float('lambda_sparse', 1e-6, 1e-3, log=True),
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {
            'lr': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)
        },
        'scheduler_params': {
            'step_size': 10,
            'gamma': 0.9
        },
        'scheduler_fn': torch.optim.lr_scheduler.StepLR,
        'mask_type': 'sparsemax'
    }

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    # Positional row indexing below requires numpy arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    for train_idx, val_idx in cv.split(X, y):
        X_train_fold = X[train_idx]
        y_train_fold = y[train_idx]
        X_val_fold = X[val_idx]
        y_val_fold = y[val_idx]

        # Balance the training fold only; the validation fold stays untouched.
        if sampler is not None:
            X_train_fold, y_train_fold = sampler.fit_resample(X_train_fold, y_train_fold)
            X_train_fold = np.asarray(X_train_fold)
            y_train_fold = np.asarray(y_train_fold)

        model = TabNetClassifier(
            device_name='auto',
            **param
        )

        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            max_epochs=50,
            patience=10,
            batch_size=512,
            virtual_batch_size=64
        )

        y_pred = model.predict_proba(X_val_fold)[:, 1]
        score = roc_auc_score(y_val_fold, y_pred)
        scores.append(score)

    return np.mean(scores)

# Prepare the data: re-fit the scaler on all predictors and hold both the
# predictors and the target as numpy arrays.
X_train_scaled = np.array(scaler.fit_transform(X_train))
y_train = np.array(y_train)

# Balancing methods to tune, each seeded with random_state=42.
samplers = {
    sampler_label: sampler_cls(random_state=42)
    for sampler_label, sampler_cls in (
        ('Undersample', RandomUnderSampler),
        ('Oversample', RandomOverSampler),
        ('SMOTE', SMOTE),
    )
}

# Containers for the per-method best parameters and metrics.
best_params = {}
metrics_data = []

# Tune and evaluate TabNet for each balancing method.
for method, sampler in samplers.items():
    print(f"\n=== Optimizing {method} ===")

    # Run the Optuna search for this sampler.
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train_scaled, y_train, sampler),
                  n_trials=15,
                  show_progress_bar=True)

    best_params[method] = study.best_params
    print(f"\nBest parameters for {method}:")
    print(study.best_params)
    print(f"Best ROC-AUC: {study.best_value:.4f}")

    # Configuration of the final model, built from the best trial.
    best_config = {
        'n_d': study.best_params['n_d'],
        'n_a': study.best_params['n_a'],
        'n_steps': study.best_params['n_steps'],
        'gamma': study.best_params['gamma'],
        'n_independent': 2,
        'n_shared': 2,
        'momentum': 0.02,
        'lambda_sparse': study.best_params['lambda_sparse'],
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {'lr': study.best_params['learning_rate']},
        'scheduler_params': {
            'step_size': 10,
            'gamma': 0.9
        },
        'scheduler_fn': torch.optim.lr_scheduler.StepLR,
        'mask_type': 'sparsemax',
        'device_name': 'auto'
    }

    # Evaluate the final model. Folds come from the ORIGINAL data and only
    # the training part of each fold is resampled; resampling before the
    # split would leak duplicated/synthetic rows into the validation folds.
    X_all = np.asarray(X_train_scaled)
    y_all = np.asarray(y_train)

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    all_y_true = []
    all_y_pred = []
    all_y_pred_proba = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X_all, y_all)):
        print(f"Training fold {fold + 1}/{cv.get_n_splits()}...")

        X_fold_val = X_all[val_idx]
        y_fold_val = y_all[val_idx]

        # Apply the sampler to the training rows of this fold.
        X_fold_train, y_fold_train = sampler.fit_resample(X_all[train_idx], y_all[train_idx])
        X_fold_train = np.asarray(X_fold_train)
        y_fold_train = np.asarray(y_fold_train)

        best_model = TabNetClassifier(**best_config)
        best_model.fit(
            X_fold_train, y_fold_train,
            eval_set=[(X_fold_val, y_fold_val)],
            max_epochs=50,
            patience=10,
            batch_size=512,
            virtual_batch_size=64
        )

        fold_preds = best_model.predict(X_fold_val)
        fold_pred_probas = best_model.predict_proba(X_fold_val)[:, 1]

        all_y_true.extend(y_fold_val)
        all_y_pred.extend(fold_preds)
        all_y_pred_proba.extend(fold_pred_probas)

    # Compute the evaluation metrics on the pooled out-of-fold predictions.
    all_y_true = np.array(all_y_true)
    all_y_pred = np.array(all_y_pred)
    all_y_pred_proba = np.array(all_y_pred_proba)

    accuracy = accuracy_score(all_y_true, all_y_pred)
    precision = precision_score(all_y_true, all_y_pred)
    recall = recall_score(all_y_true, all_y_pred)
    f1 = f1_score(all_y_true, all_y_pred)
    roc_auc = roc_auc_score(all_y_true, all_y_pred_proba)

    metrics_data.append({
        'Method': method,
        'Best ROC-AUC': study.best_value,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

    # Draw the confusion matrix: counts in the cells, row percentages in green.
    conf_matrix = confusion_matrix(all_y_true, all_y_pred)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - TabNet ({method})\nBest ROC-AUC: {study.best_value:.4f}')
    plt.show()

    # Draw the ROC curve.
    fpr, tpr, _ = roc_curve(all_y_true, all_y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - TabNet ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # Draw the optimization history (objective value per trial).
    plt.figure(figsize=(10, 6))
    plt.plot([trial.number for trial in study.trials], [trial.value for trial in study.trials], marker='o')
    plt.xlabel('Trial Number')
    plt.ylabel('ROC-AUC Score')
    plt.title(f'Optimization History ({method})')
    plt.show()

# Build the results-comparison table, one row per balancing method.
metrics_df = pd.DataFrame(metrics_data).set_index('Method')
print("\nFinal Results Comparison:")
print(metrics_df)

# Save the best hyperparameters and the metrics.
results = {
    'best_params': best_params,
    'metrics': metrics_df.to_dict()
}

# `json` is imported here because this cell does not import it anywhere
# else and would otherwise depend on an earlier cell having done so.
import json
with open('tabnet_balance_methods_results.json', 'w') as f:
    json.dump(results, f, indent=4)

feature importance top10 4

SMOTE cannot run in this cell, but the other three sampling methods produce results normally and can be used as a reference; SMOTE is run on its own in the cell that follows.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import torch
import optuna
import warnings
warnings.filterwarnings('ignore')

# Restrict the predictors to the previously obtained top-10 features,
# re-fit the scaler on them and keep the column names in a DataFrame.
X_train_top = X_train[top_features]
X_train_scaled_top = pd.DataFrame(
    scaler.fit_transform(X_train_top),
    columns=top_features,
)

# Optuna objective function
def objective(trial, X, y):
    """Optuna objective: mean ROC-AUC of TabNet over a 3-fold stratified CV.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix; a numpy array or a pandas DataFrame.
        y: Binary target; a numpy array or a pandas Series.

    Returns:
        The mean of the per-fold validation ROC-AUC scores.
    """
    # Hyperparameter search space.
    param = {
        'n_d': trial.suggest_int('n_d', 8, 64),
        'n_a': trial.suggest_int('n_a', 8, 64),
        'n_steps': trial.suggest_int('n_steps', 3, 10),
        'gamma': trial.suggest_float('gamma', 1.0, 2.0),
        'n_independent': trial.suggest_int('n_independent', 1, 5),
        'n_shared': trial.suggest_int('n_shared', 1, 5),
        'momentum': trial.suggest_float('momentum', 0.01, 0.4),
        'lambda_sparse': trial.suggest_float('lambda_sparse', 1e-6, 1e-3, log=True),
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {
            'lr': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
        },
        'scheduler_params': {
            'step_size': trial.suggest_int('step_size', 5, 20),
            'gamma': trial.suggest_float('scheduler_gamma', 0.8, 0.95)
        },
        'scheduler_fn': torch.optim.lr_scheduler.StepLR,
        'mask_type': 'sparsemax'
    }

    # The fold indices are positional. On a DataFrame, X[train_idx] would be
    # a column-label lookup, and TabNet rejects pandas input anyway, so work
    # on plain numpy arrays (a no-op when arrays are passed in).
    X = np.asarray(X)
    y = np.asarray(y)

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in cv.split(X, y):
        X_train_fold = X[train_idx]
        y_train_fold = y[train_idx]
        X_val_fold = X[val_idx]
        y_val_fold = y[val_idx]

        model = TabNetClassifier(
            device_name='auto',
            **param
        )

        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            max_epochs=100,
            patience=15,
            batch_size=1024,
            virtual_batch_size=128
        )

        y_pred = model.predict_proba(X_val_fold)[:, 1]
        score = roc_auc_score(y_val_fold, y_pred)
        scores.append(score)

    return np.mean(scores)

# Balancing strategies to compare; None means "train on the data as-is"
samplers = {'No Sampling': None}
samplers['Undersample'] = RandomUnderSampler(random_state=42)
samplers['Oversample'] = RandomOverSampler(random_state=42)
samplers['SMOTE'] = SMOTE(random_state=42)

# Per-method best hyperparameters and evaluation metrics
best_params = {}
metrics_data = []

# Tune and evaluate TabNet once per balancing strategy.
# NOTE(review): resampling is applied to the whole training set before the
# cross-validation split, so validation folds contain resampled rows too.
for method, sampler in samplers.items():
    print(f"\n=== Optimizing {method} ===")

    if sampler is not None:
        X_resampled, y_resampled = (
            np.array(part) for part in sampler.fit_resample(X_train_scaled_top, y_train)
        )
    else:
        X_resampled = X_train_scaled_top.values
        y_resampled = y_train.values

    # Hyperparameter search (20 trials; adjust as needed)
    study = optuna.create_study(direction='maximize')
    study.optimize(
        lambda trial: objective(trial, X_resampled, y_resampled),
        n_trials=20,
        show_progress_bar=True,
    )

    tuned = study.best_params
    best_params[method] = tuned
    print(f"\nBest parameters for {method}:")
    print(tuned)
    print(f"Best ROC-AUC: {study.best_value:.4f}")

    # Rebuild the TabNet configuration from the winning trial
    best_config = {
        name: tuned[name]
        for name in ('n_d', 'n_a', 'n_steps', 'gamma',
                     'n_independent', 'n_shared', 'momentum', 'lambda_sparse')
    }
    best_config.update(
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': tuned['learning_rate']},
        scheduler_params={
            'step_size': tuned['step_size'],
            'gamma': tuned['scheduler_gamma'],
        },
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='sparsemax',
        device_name='auto',
    )

    # One array per fold; concatenated after the loop
    all_y_true = []
    all_y_pred = []
    all_y_pred_proba = []

    # Cross-validated evaluation with the tuned configuration
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X_resampled, y_resampled), start=1):
        print(f"Training fold {fold}/3...")

        X_fold_train, y_fold_train = X_resampled[train_idx], y_resampled[train_idx]
        X_fold_val, y_fold_val = X_resampled[val_idx], y_resampled[val_idx]

        best_model = TabNetClassifier(**best_config)
        best_model.fit(
            X_fold_train, y_fold_train,
            eval_set=[(X_fold_val, y_fold_val)],
            max_epochs=100,
            patience=15,
            batch_size=1024,
            virtual_batch_size=128
        )

        fold_preds = best_model.predict(X_fold_val)
        fold_pred_probas = best_model.predict_proba(X_fold_val)[:, 1]

        all_y_true.append(y_fold_val)
        all_y_pred.append(fold_preds)
        all_y_pred_proba.append(fold_pred_probas)

    # Pool the out-of-fold results into flat arrays
    all_y_true = np.concatenate(all_y_true)
    all_y_pred = np.concatenate(all_y_pred)
    all_y_pred_proba = np.concatenate(all_y_pred_proba)

    # Pooled evaluation metrics
    accuracy = accuracy_score(all_y_true, all_y_pred)
    precision = precision_score(all_y_true, all_y_pred)
    recall = recall_score(all_y_true, all_y_pred)
    f1 = f1_score(all_y_true, all_y_pred)
    roc_auc = roc_auc_score(all_y_true, all_y_pred_proba)

    # Confusion matrix: raw counts in the cells, row percentages overlaid
    conf_matrix = confusion_matrix(all_y_true, all_y_pred)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1, keepdims=True) * 100

    plt.figure(figsize=(8, 6))
    ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i, j in np.ndindex(*conf_matrix.shape):
        ax.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage[i, j]:.1f}%",
                ha='center', va='center', color='green', fontsize=12)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title(f'Confusion Matrix - TabNet ({method})\nBest ROC-AUC: {study.best_value:.4f}')
    plt.show()

    # Classification report
    print("\nClassification Report:")
    print(classification_report(all_y_true, all_y_pred))

    # ROC curve
    fpr, tpr, _ = roc_curve(all_y_true, all_y_pred_proba)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve - Optimized TabNet ({method})')
    ax.legend(loc='lower right')
    plt.show()

    # Record this method's metrics for the final comparison
    metrics_data.append({
        'Balancing Method': method,
        'Best ROC-AUC': study.best_value,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

    # Optimization history: objective value per trial
    history = study.trials
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot([t.number for t in history], [t.value for t in history], marker='o')
    ax.set_xlabel('Trial Number')
    ax.set_ylabel('ROC-AUC Score')
    ax.set_title(f'Optimization History ({method})')
    plt.show()

# Build and display the comparison table, best ROC AUC first
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Results Comparison (Optimized TabNet):")
print(metrics_df)

# Bar chart of all metrics per balancing method.
# DataFrame.plot opens its own figure unless it is given an Axes, so the size
# is passed via `figsize`; a preceding plt.figure(...) call would only leave
# an empty extra figure behind and its size would be ignored.
metrics_df.set_index('Balancing Method')[
    ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'Best ROC-AUC']
].plot(kind='bar', figsize=(12, 6))
plt.title('Comparison of Balancing Methods - Optimized TabNet (Top 10 Features)')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Bundle the best hyperparameters and the metrics table
results = {
    'best_params': best_params,
    'metrics': metrics_df.to_dict()
}

# Write the bundle to disk as JSON
import json
with open('tabnet_optimization_results.json', 'w') as f:
    json.dump(results, f, indent=4)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import torch
import optuna
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Keep only the previously selected top-10 features and standardize them.
# `scaler`, `top_features` and `X_train` come from earlier cells; the scaler
# is re-fit on this column subset.
X_train_top = X_train[top_features]
X_train_scaled_top = pd.DataFrame(
    scaler.fit_transform(X_train_top),
    columns=top_features,
)

def objective(trial, X, y):
    """Score one Optuna trial on a single stratified 80/20 hold-out split.

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        X: feature matrix passed to train_test_split.
        y: target vector; also used for stratification.

    Returns:
        ROC-AUC on the 20% validation split, or 0 if fitting or scoring
        raised an exception.
    """
    hyperparams = dict(
        n_d=trial.suggest_int('n_d', 8, 32),
        n_a=trial.suggest_int('n_a', 8, 32),
        n_steps=trial.suggest_int('n_steps', 3, 5),
        gamma=trial.suggest_float('gamma', 1.0, 1.5),
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=trial.suggest_float('learning_rate', 0.01, 0.1)),
        scheduler_params=dict(step_size=10, gamma=0.9),
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='sparsemax',
        device_name='auto',
    )

    X_fit, X_held, y_fit, y_held = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    net = TabNetClassifier(**hyperparams)

    try:
        # Shorter training, shorter patience and larger batches than the
        # cross-validated variant, to keep each trial cheap.
        net.fit(
            X_fit, y_fit,
            eval_set=[(X_held, y_held)],
            max_epochs=30,
            patience=5,
            batch_size=2048,
            virtual_batch_size=256
        )
        return roc_auc_score(y_held, net.predict_proba(X_held)[:, 1])
    except Exception as e:
        print(f"Trial failed: {e}")
        # Report a very poor score so the study carries on with other trials.
        return 0

# Balancing strategies to compare; None means "train on the data as-is"
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

# Result stores; start_time marks the beginning of the whole comparison
best_params = {}
metrics_data = []
start_time = datetime.now()

# Tune and evaluate TabNet once per balancing strategy
for method, sampler in samplers.items():
    # Per-method clock, so the elapsed-time line below reports this method
    # only rather than the cumulative time since start_time.
    method_start = datetime.now()
    print(f"\n=== Optimizing {method} ===")
    print(f"Start time: {method_start}")

    if sampler is None:
        X_resampled = X_train_scaled_top.values
        y_resampled = y_train.values
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)
        X_resampled = np.array(X_resampled)
        y_resampled = np.array(y_resampled)

    # Hyperparameter search: at most 10 trials or 30 minutes
    study = optuna.create_study(direction='maximize')
    study.optimize(
        lambda trial: objective(trial, X_resampled, y_resampled),
        n_trials=10,
        timeout=1800,
        show_progress_bar=True
    )

    best_params[method] = study.best_params
    print(f"\nBest parameters for {method}:")
    print(study.best_params)
    print(f"Best ROC-AUC: {study.best_value:.4f}")

    # Rebuild the TabNet configuration from the winning trial
    best_config = {
        'n_d': study.best_params['n_d'],
        'n_a': study.best_params['n_a'],
        'n_steps': study.best_params['n_steps'],
        'gamma': study.best_params['gamma'],
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {'lr': study.best_params['learning_rate']},
        'scheduler_params': {'step_size': 10, 'gamma': 0.9},
        'scheduler_fn': torch.optim.lr_scheduler.StepLR,
        'mask_type': 'sparsemax',
        'device_name': 'auto'
    }

    # Hold-out split for the final evaluation.
    # NOTE(review): this uses the same test_size, random_state and
    # stratification on the same data as the split inside `objective`, so the
    # "test" rows are the ones the hyperparameters were selected on.
    X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
        X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
    )

    # Final model: more epochs and patience than during the search
    final_model = TabNetClassifier(**best_config)
    final_model.fit(
        X_train_final, y_train_final,
        eval_set=[(X_test_final, y_test_final)],
        max_epochs=50,
        patience=10,
        batch_size=2048,
        virtual_batch_size=256
    )

    # Predict on the hold-out split
    y_pred = final_model.predict(X_test_final)
    y_pred_proba = final_model.predict_proba(X_test_final)[:, 1]

    # Record this method's metrics for the final comparison
    metrics_data.append({
        'Balancing Method': method,
        'Best ROC-AUC': study.best_value,
        'Accuracy': accuracy_score(y_test_final, y_pred),
        'Precision': precision_score(y_test_final, y_pred),
        'Recall': recall_score(y_test_final, y_pred),
        'F1 Score': f1_score(y_test_final, y_pred),
        'ROC AUC': roc_auc_score(y_test_final, y_pred_proba)
    })

    print(f"Time elapsed for {method}: {datetime.now() - method_start}")

# Build and display the comparison table, best ROC AUC first
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Results Comparison:")
print(metrics_df)

# Bar chart of the metrics per balancing method.
# DataFrame.plot opens its own figure unless it is given an Axes, so the size
# is passed via `figsize`; a preceding plt.figure(...) call would only leave
# an empty extra figure behind and its size would be ignored.
metrics_df.set_index('Balancing Method')[
    ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']
].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Balancing Methods - Optimized TabNet')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Bundle hyperparameters, metrics and the total wall-clock time since start_time
results = {
    'best_params': best_params,
    'metrics': metrics_df.to_dict(),
    'total_time': str(datetime.now() - start_time)
}

# Write the bundle to disk as JSON
import json
with open('tabnet_efficient_optimization_results.json', 'w') as f:
    json.dump(results, f, indent=4)

SMOTE

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import torch
import optuna
import warnings
warnings.filterwarnings('ignore')

# Assumes X_train_scaled_top and y_train have already been prepared.
# If not, run feature selection and standardization first:
# X_train_top = X_train[top_features]
# X_train_scaled_top = scaler.fit_transform(X_train_top)
# X_train_scaled_top = pd.DataFrame(X_train_scaled_top, columns=top_features)

def objective(trial, X, y):
    """Score one Optuna trial by mean 5-fold ROC-AUC of a TabNet classifier.

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        X: feature matrix, indexed positionally with the integer index
            arrays produced by the splitter (the caller in this cell passes
            a NumPy array).
        y: target vector, indexed the same way as X.

    Returns:
        Mean of the per-fold ROC-AUC scores.
    """
    # Draw every tunable value first, in a fixed order.
    n_d = trial.suggest_int('n_d', 8, 64)
    n_a = trial.suggest_int('n_a', 8, 64)
    n_steps = trial.suggest_int('n_steps', 3, 10)
    gamma = trial.suggest_float('gamma', 1.0, 2.0)
    n_independent = trial.suggest_int('n_independent', 1, 5)
    n_shared = trial.suggest_int('n_shared', 1, 5)
    momentum = trial.suggest_float('momentum', 0.01, 0.4)
    lambda_sparse = trial.suggest_float('lambda_sparse', 1e-6, 1e-3, log=True)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
    step_size = trial.suggest_int('step_size', 5, 20)
    scheduler_gamma = trial.suggest_float('scheduler_gamma', 0.8, 0.95)

    hyperparams = dict(
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        n_independent=n_independent,
        n_shared=n_shared,
        momentum=momentum,
        lambda_sparse=lambda_sparse,
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': learning_rate},
        scheduler_params={'step_size': step_size, 'gamma': scheduler_gamma},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='sparsemax',
    )

    def fold_auc(fit_idx, held_idx):
        # Train on one fold split and return ROC-AUC on the held-out part.
        X_held, y_held = X[held_idx], y[held_idx]
        net = TabNetClassifier(device_name='auto', **hyperparams)
        net.fit(
            X[fit_idx], y[fit_idx],
            eval_set=[(X_held, y_held)],
            max_epochs=100,
            patience=20,  # longer patience than the 3-fold variant
            batch_size=1024,
            virtual_batch_size=128
        )
        return roc_auc_score(y_held, net.predict_proba(X_held)[:, 1])

    # Five stratified folds (up from three in the earlier cell)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    return np.mean([fold_auc(fit_idx, held_idx) for fit_idx, held_idx in splitter.split(X, y)])

# Balance the classes with SMOTE.
# NOTE(review): SMOTE is applied before any cross-validation split, so the
# validation folds used below contain synthetic rows as well.
print("Applying SMOTE...")
smote = SMOTE(random_state=42)
X_resampled, y_resampled = (
    np.array(part) for part in smote.fit_resample(X_train_scaled_top, y_train)
)

# Hyperparameter search: 50 trials
print("\nStarting Optuna optimization...")
study = optuna.create_study(direction='maximize')
study.optimize(
    lambda trial: objective(trial, X_resampled, y_resampled),
    n_trials=50,
    show_progress_bar=True,
)

tuned = study.best_params
print(f"\nBest parameters:")
print(tuned)
print(f"Best ROC-AUC: {study.best_value:.4f}")

# Rebuild the TabNet configuration from the winning trial
best_config = {
    name: tuned[name]
    for name in ('n_d', 'n_a', 'n_steps', 'gamma',
                 'n_independent', 'n_shared', 'momentum', 'lambda_sparse')
}
best_config.update(
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': tuned['learning_rate']},
    scheduler_params={
        'step_size': tuned['step_size'],
        'gamma': tuned['scheduler_gamma'],
    },
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax',
    device_name='auto',
)

# One array per fold; concatenated after the loop
all_y_true = []
all_y_pred = []
all_y_pred_proba = []

# 5-fold cross-validated evaluation with the tuned configuration
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(cv.split(X_resampled, y_resampled), start=1):
    print(f"Training fold {fold}/5...")

    X_fold_train, y_fold_train = X_resampled[train_idx], y_resampled[train_idx]
    X_fold_val, y_fold_val = X_resampled[val_idx], y_resampled[val_idx]

    best_model = TabNetClassifier(**best_config)
    best_model.fit(
        X_fold_train, y_fold_train,
        eval_set=[(X_fold_val, y_fold_val)],
        max_epochs=100,
        patience=20,
        batch_size=1024,
        virtual_batch_size=128
    )

    fold_preds = best_model.predict(X_fold_val)
    fold_pred_probas = best_model.predict_proba(X_fold_val)[:, 1]

    all_y_true.append(y_fold_val)
    all_y_pred.append(fold_preds)
    all_y_pred_proba.append(fold_pred_probas)

# Pool the out-of-fold results into flat arrays
all_y_true = np.concatenate(all_y_true)
all_y_pred = np.concatenate(all_y_pred)
all_y_pred_proba = np.concatenate(all_y_pred_proba)

# Pooled evaluation metrics
accuracy = accuracy_score(all_y_true, all_y_pred)
precision = precision_score(all_y_true, all_y_pred)
recall = recall_score(all_y_true, all_y_pred)
f1 = f1_score(all_y_true, all_y_pred)
roc_auc = roc_auc_score(all_y_true, all_y_pred_proba)

# Print the metrics, one per line, to four decimals
print("\nModel Performance Metrics:")
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("ROC AUC", roc_auc),
):
    print(f"{label}: {value:.4f}")

# Confusion matrix: raw counts in the cells, row percentages overlaid
conf_matrix = confusion_matrix(all_y_true, all_y_pred)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1, keepdims=True) * 100

plt.figure(figsize=(8, 6))
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
for i, j in np.ndindex(*conf_matrix.shape):
    ax.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage[i, j]:.1f}%",
            ha='center', va='center', color='green', fontsize=12)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title(f'Confusion Matrix - TabNet with SMOTE\nBest ROC-AUC: {study.best_value:.4f}')
plt.show()

# Classification report
print("\nClassification Report:")
print(classification_report(all_y_true, all_y_pred))

# ROC curve
fpr, tpr, _ = roc_curve(all_y_true, all_y_pred_proba)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'SMOTE (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Optimized TabNet with SMOTE')
ax.legend(loc='lower right')
plt.show()

# Optimization history: objective value per trial
history = study.trials
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot([t.number for t in history], [t.value for t in history], marker='o')
ax.set_xlabel('Trial Number')
ax.set_ylabel('ROC-AUC Score')
ax.set_title('Optimization History (SMOTE)')
plt.show()

# Bundle the winning hyperparameters, the study's best score and the pooled metrics
results = dict(
    best_params=study.best_params,
    best_score=study.best_value,
    metrics=dict(
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1=f1,
        roc_auc=roc_auc,
    ),
)

# Write the bundle to disk as pretty-printed JSON
import json
with open('tabnet_smote_results.json', 'w') as f:
    json.dump(results, f, indent=4)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
import numpy as np
import torch
import optuna
import warnings
warnings.filterwarnings('ignore')

def objective(trial, X, y):
    """Score one Optuna trial by mean 3-fold ROC-AUC, using a reduced search space.

    Only n_d, n_a, n_steps, lambda_sparse and the learning rate are tuned;
    gamma, n_independent, n_shared, momentum and the scheduler settings are
    held fixed to keep the search fast.

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        X: feature matrix, indexed positionally with the integer index
            arrays produced by the splitter (the caller in this cell passes
            a NumPy array).
        y: target vector, indexed the same way as X.

    Returns:
        Mean of the per-fold ROC-AUC scores.
    """
    # Tunable values, drawn in a fixed order
    n_d = trial.suggest_int('n_d', 8, 32)
    n_a = trial.suggest_int('n_a', 8, 32)
    n_steps = trial.suggest_int('n_steps', 3, 5)
    lambda_sparse = trial.suggest_float('lambda_sparse', 1e-6, 1e-3, log=True)
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)

    hyperparams = dict(
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=1.5,
        n_independent=2,
        n_shared=2,
        momentum=0.02,
        lambda_sparse=lambda_sparse,
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': learning_rate},
        scheduler_params={'step_size': 10, 'gamma': 0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='sparsemax',
    )

    def fold_auc(fit_idx, held_idx):
        # Train on one fold split and return ROC-AUC on the held-out part.
        X_held, y_held = X[held_idx], y[held_idx]
        net = TabNetClassifier(device_name='auto', **hyperparams)
        # Fewer epochs, shorter patience and smaller batches than the full search
        net.fit(
            X[fit_idx], y[fit_idx],
            eval_set=[(X_held, y_held)],
            max_epochs=50,
            patience=10,
            batch_size=512,
            virtual_batch_size=64
        )
        return roc_auc_score(y_held, net.predict_proba(X_held)[:, 1])

    # Three stratified folds (down from five)
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    return np.mean([fold_auc(fit_idx, held_idx) for fit_idx, held_idx in splitter.split(X, y)])

# Balance the classes with SMOTE
print("Applying SMOTE...")
smote = SMOTE(random_state=42)
X_resampled, y_resampled = (
    np.array(part) for part in smote.fit_resample(X_train_scaled_top, y_train)
)

# Hyperparameter search: 20 trials
print("\nStarting Optuna optimization...")
study = optuna.create_study(direction='maximize')
study.optimize(
    lambda trial: objective(trial, X_resampled, y_resampled),
    n_trials=20,
    show_progress_bar=True,
)

tuned = study.best_params
print(f"\nBest parameters:")
print(tuned)
print(f"Best ROC-AUC: {study.best_value:.4f}")

# Rebuild the TabNet configuration: tuned values plus the fixed settings
best_config = dict(
    n_d=tuned['n_d'],
    n_a=tuned['n_a'],
    n_steps=tuned['n_steps'],
    gamma=1.5,
    n_independent=2,
    n_shared=2,
    momentum=0.02,
    lambda_sparse=tuned['lambda_sparse'],
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': tuned['learning_rate']},
    scheduler_params={'step_size': 10, 'gamma': 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax',
    device_name='auto',
)

# Train the final model on the full resampled set.
# NOTE(review): eval_set is the training data itself, so the monitored metric
# and the predictions below are all computed on rows the model was fitted on.
final_model = TabNetClassifier(**best_config)
final_model.fit(
    X_resampled, y_resampled,
    eval_set=[(X_resampled, y_resampled)],
    max_epochs=50,
    patience=10,
    batch_size=512,
    virtual_batch_size=64
)

# Predicted labels and positive-class probabilities
y_pred = final_model.predict(X_resampled)
y_pred_proba = final_model.predict_proba(X_resampled)[:, 1]

# Save the winning hyperparameters and the study's best score
results = dict(best_params=study.best_params, best_score=study.best_value)

import json
with open('fast_tabnet_smote_results.json', 'w') as f:
    json.dump(results, f, indent=4)

Best parameters:
{'n_d': 23, 'n_a': 32, 'n_steps': 3, 'lambda_sparse': 2.298427358180666e-06, 'learning_rate': 0.0063223512856359545}

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

# Predict with the final model from the previous cell.
# NOTE(review): X_resampled is the data final_model was trained on, so these
# are training-set metrics.
y_pred = final_model.predict(X_resampled)
y_pred_proba = final_model.predict_proba(X_resampled)[:, 1]

# Evaluation metrics
accuracy = accuracy_score(y_resampled, y_pred)
precision = precision_score(y_resampled, y_pred)
recall = recall_score(y_resampled, y_pred)
f1 = f1_score(y_resampled, y_pred)
roc_auc = roc_auc_score(y_resampled, y_pred_proba)

# Print the metrics, one per line, to four decimals
print("\nModel Performance Metrics:")
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("ROC AUC", roc_auc),
):
    print(f"{label}: {value:.4f}")

# Confusion matrix: raw counts in the cells, row percentages overlaid
conf_matrix = confusion_matrix(y_resampled, y_pred)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1, keepdims=True) * 100

plt.figure(figsize=(8, 6))
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
for i, row in enumerate(conf_matrix_percentage):
    for j, pct in enumerate(row):
        ax.text(j + 0.2, i + 0.2, f"{pct:.1f}%",
                ha='center', va='center', color='green', fontsize=12)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title(f'Confusion Matrix - TabNet with SMOTE\nBest ROC-AUC: {study.best_value:.4f}')
plt.show()

# Classification report
print("\nClassification Report:")
print(classification_report(y_resampled, y_pred))

# ROC curve
fpr, tpr, _ = roc_curve(y_resampled, y_pred_proba)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'SMOTE (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Optimized TabNet with SMOTE')
ax.legend(loc='lower right')
plt.show()

# Optimization history: objective value per trial
history = study.trials
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot([t.number for t in history], [t.value for t in history], marker='o')
ax.set_xlabel('Trial Number')
ax.set_ylabel('ROC-AUC Score')
ax.set_title('Optimization History (SMOTE)')
plt.show()

# Bundle the winning hyperparameters, the study's best score and the metrics
results = dict(
    best_params=study.best_params,
    best_score=study.best_value,
    metrics=dict(
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1=f1,
        roc_auc=roc_auc,
    ),
)

# Write the bundle to disk as pretty-printed JSON
# (`json` is expected to be imported by an earlier cell)
with open('tabnet_smote_visualization_results.json', 'w') as f:
    json.dump(results, f, indent=4)

Boruta feature selection (4 balancing methods)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import torch
import optuna
import warnings
warnings.filterwarnings('ignore')

# Step 1: Boruta feature selection driven by a shallow, class-balanced random forest
print("Starting Boruta feature selection...")
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# max_iter is capped at 50 to keep the selection fast
boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42, max_iter=50)

# Boruta is fitted on plain arrays
X_for_boruta, y_for_boruta = X_train.values, y_train.values
boruta_selector.fit(X_for_boruta, y_for_boruta)

# Map the boolean support mask back to column names
feature_names = X_train.columns.tolist()
boruta_features = [
    name for name, keep in zip(feature_names, boruta_selector.support_) if keep
]

print(f"\nBoruta selected {len(boruta_features)} features:")
print(boruta_features)

# Keep only the selected columns and standardize them
# (`scaler` comes from an earlier cell and is re-fit here)
X_train_boruta = X_train[boruta_features]
X_train_scaled_boruta = pd.DataFrame(
    scaler.fit_transform(X_train_boruta),
    columns=boruta_features,
)

# Simplified Optuna objective for the Boruta-selected features
def objective(trial, X, y):
    """Score one Optuna trial by mean 3-fold ROC-AUC, using a reduced search space.

    n_d, n_a, n_steps, gamma, lambda_sparse and the learning rate are tuned;
    n_independent, n_shared, momentum and the scheduler settings are fixed.

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        X: feature matrix, indexed positionally with the integer index
            arrays produced by the splitter (callers in this cell pass
            NumPy arrays).
        y: target vector, indexed the same way as X.

    Returns:
        Mean of the per-fold ROC-AUC scores.
    """
    # Tunable values, drawn in a fixed order
    n_d = trial.suggest_int('n_d', 8, 32)
    n_a = trial.suggest_int('n_a', 8, 32)
    n_steps = trial.suggest_int('n_steps', 3, 5)
    gamma = trial.suggest_float('gamma', 1.0, 1.5)
    lambda_sparse = trial.suggest_float('lambda_sparse', 1e-6, 1e-3, log=True)
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)

    hyperparams = dict(
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        n_independent=2,
        n_shared=2,
        momentum=0.02,
        lambda_sparse=lambda_sparse,
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': learning_rate},
        scheduler_params={'step_size': 10, 'gamma': 0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='sparsemax',
    )

    def fold_auc(fit_idx, held_idx):
        # Train on one fold split and return ROC-AUC on the held-out part.
        X_held, y_held = X[held_idx], y[held_idx]
        net = TabNetClassifier(device_name='auto', **hyperparams)
        # Reduced epochs, patience and batch sizes to keep each trial cheap
        net.fit(
            X[fit_idx], y[fit_idx],
            eval_set=[(X_held, y_held)],
            max_epochs=50,
            patience=10,
            batch_size=512,
            virtual_batch_size=64
        )
        return roc_auc_score(y_held, net.predict_proba(X_held)[:, 1])

    # Three stratified folds to limit runtime
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    return np.mean([fold_auc(fit_idx, held_idx) for fit_idx, held_idx in splitter.split(X, y)])

# Balancing strategies for this cell; only SMOTE is kept alongside the
# unbalanced baseline to save time
samplers = {'No Sampling': None}
samplers['SMOTE'] = SMOTE(random_state=42)

# Per-method best hyperparameters and evaluation metrics
best_params = {}
metrics_data = []

# Tune and evaluate TabNet once per balancing strategy.
# NOTE(review): resampling is applied to the whole training set before the
# cross-validation split, so validation folds contain resampled rows too.
for method, sampler in samplers.items():
    print(f"\n=== Optimizing {method} ===")

    if sampler is not None:
        X_resampled, y_resampled = (
            np.array(part) for part in sampler.fit_resample(X_train_scaled_boruta, y_train)
        )
    else:
        X_resampled = X_train_scaled_boruta.values
        y_resampled = y_train.values

    # Hyperparameter search: 15 trials
    study = optuna.create_study(direction='maximize')
    study.optimize(
        lambda trial: objective(trial, X_resampled, y_resampled),
        n_trials=15,
        show_progress_bar=True,
    )

    tuned = study.best_params
    best_params[method] = tuned
    print(f"\nBest parameters for {method}:")
    print(tuned)
    print(f"Best ROC-AUC: {study.best_value:.4f}")

    # Rebuild the TabNet configuration: tuned values plus the fixed settings
    best_config = dict(
        n_d=tuned['n_d'],
        n_a=tuned['n_a'],
        n_steps=tuned['n_steps'],
        gamma=tuned['gamma'],
        n_independent=2,
        n_shared=2,
        momentum=0.02,
        lambda_sparse=tuned['lambda_sparse'],
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': tuned['learning_rate']},
        scheduler_params={'step_size': 10, 'gamma': 0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='sparsemax',
        device_name='auto',
    )

    # Cross-validated evaluation with the tuned configuration;
    # one array per fold, concatenated after the loop
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    all_y_true = []
    all_y_pred = []
    all_y_pred_proba = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X_resampled, y_resampled), start=1):
        print(f"Training fold {fold}/3...")

        X_fold_train, y_fold_train = X_resampled[train_idx], y_resampled[train_idx]
        X_fold_val, y_fold_val = X_resampled[val_idx], y_resampled[val_idx]

        best_model = TabNetClassifier(**best_config)
        best_model.fit(
            X_fold_train, y_fold_train,
            eval_set=[(X_fold_val, y_fold_val)],
            max_epochs=50,
            patience=10,
            batch_size=512,
            virtual_batch_size=64
        )

        fold_preds = best_model.predict(X_fold_val)
        fold_pred_probas = best_model.predict_proba(X_fold_val)[:, 1]

        all_y_true.append(y_fold_val)
        all_y_pred.append(fold_preds)
        all_y_pred_proba.append(fold_pred_probas)

    # Pool the out-of-fold results into flat arrays
    all_y_true = np.concatenate(all_y_true)
    all_y_pred = np.concatenate(all_y_pred)
    all_y_pred_proba = np.concatenate(all_y_pred_proba)

    # Pooled evaluation metrics
    accuracy = accuracy_score(all_y_true, all_y_pred)
    precision = precision_score(all_y_true, all_y_pred)
    recall = recall_score(all_y_true, all_y_pred)
    f1 = f1_score(all_y_true, all_y_pred)
    roc_auc = roc_auc_score(all_y_true, all_y_pred_proba)

    # Record this method's metrics for the final comparison
    metrics_data.append({
        'Balancing Method': method,
        'Best ROC-AUC': study.best_value,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

    # Confusion matrix: raw counts in the cells, row percentages overlaid
    conf_matrix = confusion_matrix(all_y_true, all_y_pred)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1, keepdims=True) * 100

    plt.figure(figsize=(8, 6))
    ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i, j in np.ndindex(*conf_matrix.shape):
        ax.text(j + 0.2, i + 0.2, f"{conf_matrix_percentage[i, j]:.1f}%",
                ha='center', va='center', color='green', fontsize=12)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title(f'Confusion Matrix - TabNet with Boruta Features ({method})')
    plt.show()

    # ROC curve
    fpr, tpr, _ = roc_curve(all_y_true, all_y_pred_proba)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve - TabNet with Boruta Features ({method})')
    ax.legend(loc='lower right')
    plt.show()

# Save the Boruta experiment results.
# `json` is imported here because this cell's import header does not include it.
import json

# Build the comparison table from this cell's own `metrics_data`. Without this
# step `metrics_df` would be whatever an earlier cell left behind (or undefined
# in a fresh session), and the export would not match the Boruta runs.
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Results Comparison (TabNet with Boruta Features):")
print(metrics_df)

# Bundle the selected features, best hyperparameters and the metrics table
results = {
    'boruta_features': boruta_features,
    'best_params': best_params,
    'metrics': metrics_df.to_dict()
}

# Write the bundle to disk as pretty-printed JSON
with open('fast_tabnet_boruta_optuna_results.json', 'w') as f:
    json.dump(results, f, indent=4)

No classification report is printed; the results in the Excel sheet were derived from the confusion matrix. The following code does not need to be run to produce them.

Undersampling and Oversampling

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import torch
import optuna
import warnings
warnings.filterwarnings('ignore')

# 定义Optuna目标函数（简化版）
def objective(trial, X, y):
    param = {
        'n_d': trial.suggest_int('n_d', 8, 32),
        'n_a': trial.suggest_int('n_a', 8, 32),
        'n_steps': trial.suggest_int('n_steps', 3, 5),
        'gamma': trial.suggest_float('gamma', 1.0, 1.5),
        'n_independent': 2,
        'n_shared': 2,
        'momentum': 0.02,
        'lambda_sparse': trial.suggest_float('lambda_sparse', 1e-6, 1e-3, log=True),
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {
            'lr': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)
        },
        'scheduler_params': {
            'step_size': 10,
            'gamma': 0.9
        },
        'scheduler_fn': torch.optim.lr_scheduler.StepLR,
        'mask_type': 'sparsemax'
    }

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in cv.split(X, y):
        X_train_fold = X[train_idx]
        y_train_fold = y[train_idx]
        X_val_fold = X[val_idx]
        y_val_fold = y[val_idx]

        model = TabNetClassifier(
            device_name='auto',
            **param
        )

        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            max_epochs=50,
            patience=10,
            batch_size=512,
            virtual_batch_size=64
        )

        y_pred = model.predict_proba(X_val_fold)[:, 1]
        score = roc_auc_score(y_val_fold, y_pred)
        scores.append(score)

    return np.mean(scores)

# 定义采样方法
samplers = {
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42)
}

# 存储结果
best_params = {}
metrics_data = []

# 对两种采样方法进行优化
for method, sampler in samplers.items():
    print(f"\n=== Optimizing {method} ===")

    # 应用采样方法
    X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_boruta, y_train)
    X_resampled = np.array(X_resampled)
    y_resampled = np.array(y_resampled)

    # 运行Optuna优化
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_resampled, y_resampled),
                  n_trials=15,
                  show_progress_bar=True)

    best_params[method] = study.best_params
    print(f"\nBest parameters for {method}:")
    print(study.best_params)
    print(f"Best ROC-AUC: {study.best_value:.4f}")

    # 使用最佳参数训练最终模型
    best_config = {
        'n_d': study.best_params['n_d'],
        'n_a': study.best_params['n_a'],
        'n_steps': study.best_params['n_steps'],
        'gamma': study.best_params['gamma'],
        'n_independent': 2,
        'n_shared': 2,
        'momentum': 0.02,
        'lambda_sparse': study.best_params['lambda_sparse'],
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {'lr': study.best_params['learning_rate']},
        'scheduler_params': {
            'step_size': 10,
            'gamma': 0.9
        },
        'scheduler_fn': torch.optim.lr_scheduler.StepLR,
        'mask_type': 'sparsemax',
        'device_name': 'auto'
    }

    # 评估最终模型
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    all_y_true = []
    all_y_pred = []
    all_y_pred_proba = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X_resampled, y_resampled)):
        print(f"Training fold {fold + 1}/3...")

        X_fold_train = X_resampled[train_idx]
        y_fold_train = y_resampled[train_idx]
        X_fold_val = X_resampled[val_idx]
        y_fold_val = y_resampled[val_idx]

        best_model = TabNetClassifier(**best_config)
        best_model.fit(
            X_fold_train, y_fold_train,
            eval_set=[(X_fold_val, y_fold_val)],
            max_epochs=50,
            patience=10,
            batch_size=512,
            virtual_batch_size=64
        )

        fold_preds = best_model.predict(X_fold_val)
        fold_pred_probas = best_model.predict_proba(X_fold_val)[:, 1]

        all_y_true.extend(y_fold_val)
        all_y_pred.extend(fold_preds)
        all_y_pred_proba.extend(fold_pred_probas)

    # 计算评估指标
    all_y_true = np.array(all_y_true)
    all_y_pred = np.array(all_y_pred)
    all_y_pred_proba = np.array(all_y_pred_proba)

    accuracy = accuracy_score(all_y_true, all_y_pred)
    precision = precision_score(all_y_true, all_y_pred)
    recall = recall_score(all_y_true, all_y_pred)
    f1 = f1_score(all_y_true, all_y_pred)
    roc_auc = roc_auc_score(all_y_true, all_y_pred_proba)

    # 存储评估指标
    metrics_data.append({
        'Balancing Method': method,
        'Best ROC-AUC': study.best_value,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

    # 绘制混淆矩阵
    conf_matrix = confusion_matrix(all_y_true, all_y_pred)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - TabNet with Boruta Features ({method})')
    plt.show()

    # 绘制ROC曲线
    fpr, tpr, _ = roc_curve(all_y_true, all_y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - TabNet with Boruta Features ({method})')
    plt.legend(loc='lower right')
    plt.show()

    # 绘制优化历史
    plt.figure(figsize=(10, 6))
    plt.plot([trial.number for trial in study.trials], [trial.value for trial in study.trials], marker='o')
    plt.xlabel('Trial Number')
    plt.ylabel('ROC-AUC Score')
    plt.title(f'Optimization History ({method})')
    plt.show()

# 创建结果比较表
metrics_df = pd.DataFrame(metrics_data).sort_values(by='ROC AUC', ascending=False)
print("\nFinal Results Comparison:")
print(metrics_df)

# 保存结果
results = {
    'best_params': best_params,
    'metrics': metrics_df.to_dict()
}

with open('tabnet_under_over_sampling_results.json', 'w') as f:
    json.dump(results, f, indent=4)

### test set

Baseline: 4 sampling methods

In [None]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_curve
)
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings('ignore')

print("\nPreparing test data...")

# Feature columns: every selected column except the target.
feature_columns = [col for col in selected_columns if col != 'is_canceled']

# Training data: the combined H1 + H2 training splits. The scaler is fitted on
# the training features only and reused unchanged for both test sets.
scaler = StandardScaler()
train_df = pd.concat([H1_train, H2_train])
X_train = train_df[feature_columns]
X_train_scaled = scaler.fit_transform(X_train)
y_train = train_df['is_canceled'].values

# H2 test set.
X_H2_test = H2_test[feature_columns]
X_H2_test_scaled = scaler.transform(X_H2_test)
y_H2_test = H2_test['is_canceled'].values

# H1 test set.
X_H1_test = H1_test[feature_columns]
X_H1_test_scaled = scaler.transform(X_H1_test)
y_H1_test = H1_test['is_canceled'].values

# Confirm the data shapes.
print(f"Training features: {feature_columns}")
print(f"Number of features: {len(feature_columns)}")
print(f"Training set shape: {X_train_scaled.shape}")
print(f"H2 test set shape: {X_H2_test_scaled.shape}")
print(f"H1 test set shape: {X_H1_test_scaled.shape}")


def _baseline_config(n_d, n_a, n_steps, gamma, lambda_sparse, lr):
    """Return TabNetClassifier kwargs; only the tuned values differ per method."""
    return {
        'n_d': n_d,
        'n_a': n_a,
        'n_steps': n_steps,
        'gamma': gamma,
        'lambda_sparse': lambda_sparse,
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {'lr': lr},
        'scheduler_params': {'step_size': 10, 'gamma': 0.9},
        'scheduler_fn': torch.optim.lr_scheduler.StepLR,
        'mask_type': 'sparsemax',
        'device_name': 'auto'
    }


# Best configuration per sampling method (unchanged tuned values).
best_configs = {
    'No Sampling': _baseline_config(
        28, 27, 3, 1.3091649976992965, 9.421983411532464e-06, 0.01880243119491898
    ),
    'Oversample': _baseline_config(
        24, 25, 4, 1.2008839910132765, 0.00033250842281105973, 0.015659252876444294
    ),
    'Undersample': _baseline_config(
        24, 17, 3, 1.1198251412675357, 4.621721552688596e-06, 0.02327241095705265
    ),
    'SMOTE': _baseline_config(
        26, 15, 3, 1.127935814907823, 0.00010489631802618515, 0.026034337812760464
    )
}

results_h1 = []
results_h2 = []

samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

# Train one model per method and evaluate it on both test sets.
for method, config in best_configs.items():
    print(f"\n{'='*50}")
    print(f"Training and evaluating {method} model")
    print('='*50)

    # Resample the training data only; the test sets are never resampled.
    sampler = samplers[method]
    if sampler is None:
        X_resampled = X_train_scaled
        y_resampled = y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled, y_train)
        X_resampled = np.array(X_resampled)
        y_resampled = np.array(y_resampled)

    print(f"Training data shape after {method}: {X_resampled.shape}")

    # Train the model.
    # NOTE(review): no eval_set is passed, so `patience` presumably has no
    # effect and training runs for all max_epochs - confirm against the
    # pytorch-tabnet documentation.
    model = TabNetClassifier(**config)
    model.fit(
        X_resampled, y_resampled,
        max_epochs=50,
        patience=10,
        batch_size=512,
        virtual_batch_size=64
    )

    # Evaluate on the H2 test set.
    y_pred_h2 = model.predict(X_H2_test_scaled)
    y_pred_proba_h2 = model.predict_proba(X_H2_test_scaled)[:, 1]

    results_h2.append({
        'Method': method,
        'Accuracy': accuracy_score(y_H2_test, y_pred_h2),
        'Precision': precision_score(y_H2_test, y_pred_h2),
        'Recall': recall_score(y_H2_test, y_pred_h2),
        'F1 Score': f1_score(y_H2_test, y_pred_h2),
        'ROC AUC': roc_auc_score(y_H2_test, y_pred_proba_h2)
    })

    # Evaluate on the H1 test set. This step was previously elided, which left
    # `results_h1` empty and `results_df_h1` undefined when it was printed.
    y_pred_h1 = model.predict(X_H1_test_scaled)
    y_pred_proba_h1 = model.predict_proba(X_H1_test_scaled)[:, 1]

    results_h1.append({
        'Method': method,
        'Accuracy': accuracy_score(y_H1_test, y_pred_h1),
        'Precision': precision_score(y_H1_test, y_pred_h1),
        'Recall': recall_score(y_H1_test, y_pred_h1),
        'F1 Score': f1_score(y_H1_test, y_pred_h1),
        'ROC AUC': roc_auc_score(y_H1_test, y_pred_proba_h1)
    })

# Build the summary tables once all methods have been evaluated.
results_df_h2 = pd.DataFrame(results_h2).set_index('Method')
results_df_h1 = pd.DataFrame(results_h1).set_index('Method')

print("\nH2 Test Set Results Summary:")
print("="*80)
print(results_df_h2)

print("\nH1 Test Set Results Summary:")
print("="*80)
print(results_df_h1)

# The result-saving part is unchanged and omitted from this cell.

In [None]:
# Sanity check: report the shape of every scaled feature matrix and label
# vector prepared for training and for the two test sets.
_shape_report = (
    ("X_train_scaled", X_train_scaled),
    ("y_train", y_train),
    ("X_H2_test_scaled", X_H2_test_scaled),
    ("y_H2_test", y_H2_test),
    ("X_H1_test_scaled", X_H1_test_scaled),
    ("y_H1_test", y_H1_test),
)
for _label, _array in _shape_report:
    print(f"{_label} shape: {_array.shape}")

update version

In [None]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
import torch
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings('ignore')

import json

print("\nPreparing test data...")

# Feature columns: every selected column except the target.
feature_columns = [col for col in selected_columns if col != 'is_canceled']

# Test data. Relies on `scaler`, `X_train`, `y_train`, `H1_test` and `H2_test`
# left in the notebook by earlier cells; `scaler` is only applied here, never
# re-fitted.
X_H2_test_scaled = scaler.transform(H2_test[feature_columns])
y_H2_test = np.array(H2_test['is_canceled'])

X_H1_test_scaled = scaler.transform(H1_test[feature_columns])
y_H1_test = np.array(H1_test['is_canceled'])

# Training data. `np.asarray` accepts a pandas object as well as an array, so
# the cell can be re-run after `y_train` has already been converted.
X_train_scaled = scaler.transform(X_train[feature_columns])
y_train = np.asarray(y_train)


def _final_config(n_d, n_a, n_steps, gamma, lambda_sparse, lr):
    """Return TabNetClassifier kwargs; only the tuned values differ per method.

    The key order matches the original literal dictionaries so the JSON dump
    of `best_configs` below keeps the same layout.
    """
    return {
        'n_d': n_d,
        'n_a': n_a,
        'n_steps': n_steps,
        'gamma': gamma,
        'lambda_sparse': lambda_sparse,
        'n_independent': 2,
        'n_shared': 2,
        'momentum': 0.02,
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {'lr': lr},
        'scheduler_params': {'step_size': 10, 'gamma': 0.9},
        'scheduler_fn': torch.optim.lr_scheduler.StepLR,
        'mask_type': 'sparsemax',
        'device_name': 'auto'
    }


# Best configuration per sampling method.
best_configs = {
    'No Sampling': _final_config(
        28, 27, 3, 1.3091649976992965, 9.421983411532464e-06, 0.01880243119491898
    ),
    'Oversample': _final_config(
        24, 25, 4, 1.2008839910132765, 0.00033250842281105973, 0.015659252876444294
    ),
    'Undersample': _final_config(
        24, 17, 3, 1.1198251412675357, 4.621721552688596e-06, 0.02327241095705265
    ),
    'SMOTE': _final_config(
        26, 15, 3, 1.127935814907823, 0.00010489631802618515, 0.026034337812760464
    )
}

# Sampling methods, keyed like `best_configs`.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

results_h1 = []
results_h2 = []

# Train one model per sampling method and evaluate it on both test sets.
for method, config in best_configs.items():
    print(f"\n{'='*50}")
    print(f"Training and evaluating {method} model")
    print('='*50)

    # Resample the training data only; the test sets are never resampled.
    sampler = samplers[method]
    if sampler is None:
        X_resampled = X_train_scaled
        y_resampled = y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled, y_train)
        X_resampled = np.array(X_resampled)
        y_resampled = np.array(y_resampled)

    # Train the model.
    model = TabNetClassifier(**config)
    model.fit(
        X_resampled, y_resampled,
        max_epochs=50,
        patience=10,
        batch_size=512,
        virtual_batch_size=64
    )

    # Evaluate on the H2 test set.
    y_pred_h2 = model.predict(X_H2_test_scaled)
    y_pred_proba_h2 = model.predict_proba(X_H2_test_scaled)[:, 1]

    results_h2.append({
        'Method': method,
        'Accuracy': accuracy_score(y_H2_test, y_pred_h2),
        'Precision': precision_score(y_H2_test, y_pred_h2),
        'Recall': recall_score(y_H2_test, y_pred_h2),
        'F1 Score': f1_score(y_H2_test, y_pred_h2),
        'ROC AUC': roc_auc_score(y_H2_test, y_pred_proba_h2)
    })

    print(f"\nH2 Test Set Results for {method}:")
    print(classification_report(y_H2_test, y_pred_h2))

    # Evaluate on the H1 test set.
    y_pred_h1 = model.predict(X_H1_test_scaled)
    y_pred_proba_h1 = model.predict_proba(X_H1_test_scaled)[:, 1]

    results_h1.append({
        'Method': method,
        'Accuracy': accuracy_score(y_H1_test, y_pred_h1),
        'Precision': precision_score(y_H1_test, y_pred_h1),
        'Recall': recall_score(y_H1_test, y_pred_h1),
        'F1 Score': f1_score(y_H1_test, y_pred_h1),
        'ROC AUC': roc_auc_score(y_H1_test, y_pred_proba_h1)
    })

    print(f"\nH1 Test Set Results for {method}:")
    print(classification_report(y_H1_test, y_pred_h1))

# Summary tables, one row per sampling method.
results_df_h2 = pd.DataFrame(results_h2).set_index('Method')
results_df_h1 = pd.DataFrame(results_h1).set_index('Method')

# Show the results.
print("\nH2 Test Set Results Summary:")
print("="*80)
print(results_df_h2)

print("\nH1 Test Set Results Summary:")
print("="*80)
print(results_df_h1)

# Bar charts comparing the methods on each test set.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

results_df_h2.plot(kind='bar', ax=ax1)
ax1.set_title('H2 Test Set Performance')
ax1.set_xticklabels(results_df_h2.index, rotation=45)
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

results_df_h1.plot(kind='bar', ax=ax2)
ax2.set_title('H1 Test Set Performance')
ax2.set_xticklabels(results_df_h1.index, rotation=45)
ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

# Persist the results. `features_used` records the model inputs, i.e. the
# selected columns without the `is_canceled` target. Classes (optimizer,
# scheduler) are stored as strings because they are not JSON-serializable.
test_results = {
    'features_used': feature_columns,
    'H1_test_results': results_df_h1.to_dict(),
    'H2_test_results': results_df_h2.to_dict(),
    'best_configs': {k: {kk: str(vv) if isinstance(vv, type) else vv
                        for kk, vv in v.items()}
                    for k, v in best_configs.items()}
}

with open('tabnet_final_test_results.json', 'w') as f:
    json.dump(test_results, f, indent=4)

Feature importance top 10, 4 sampling methods (H2 + H1 test sets)

In [None]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
import torch

import json

import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

print("\nUsing Top 10 Features:", top_features)

# Test data, converted to NumPy arrays.
# NOTE(review): `scaler` comes from an earlier cell and is only applied here.
# This presumably requires that it was last fitted on exactly the
# `top_features` columns - confirm before running the cells out of order.
X_H2_test_scaled = scaler.transform(H2_test[top_features])
y_H2_test = H2_test['is_canceled'].values

X_H1_test_scaled = scaler.transform(H1_test[top_features])
y_H1_test = H1_test['is_canceled'].values

# Training data. `np.asarray` accepts a pandas Series as well as an array, so
# the cell no longer fails with AttributeError when `y_train` has already
# been converted (e.g. by an earlier cell or by a previous run of this one).
X_train_scaled_top = scaler.transform(X_train[top_features])
y_train = np.asarray(y_train)


def _top10_config(n_d, n_a, n_steps, gamma, lr):
    """Return TabNetClassifier kwargs; only the tuned values differ per method.

    The key order matches the original literal dictionaries so the JSON dump
    of `best_configs` below keeps the same layout.
    """
    return {
        'n_d': n_d,
        'n_a': n_a,
        'n_steps': n_steps,
        'gamma': gamma,
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {'lr': lr},
        'scheduler_params': {'step_size': 10, 'gamma': 0.9},
        'scheduler_fn': torch.optim.lr_scheduler.StepLR,
        'mask_type': 'sparsemax',
        'device_name': 'auto'
    }


# Best configuration per sampling method.
best_configs = {
    'SMOTE': _top10_config(24, 13, 5, 1.199946860351268, 0.03975153101157348),
    'Oversample': _top10_config(28, 25, 4, 1.1337063206604574, 0.014560530996164776),
    'Undersample': _top10_config(30, 25, 5, 1.0982928910742795, 0.05171330246308425),
    'No Sampling': _top10_config(27, 15, 4, 1.0020143247588822, 0.021460813045738256)
}

results_h1 = []
results_h2 = []

samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

# Train one model per sampling method and evaluate it on both test sets.
for method, config in best_configs.items():
    print(f"\n{'='*50}")
    print(f"Training and evaluating {method} model")
    print('='*50)

    # Resample the training data only; the test sets are never resampled.
    sampler = samplers[method]
    if sampler is None:
        X_resampled = X_train_scaled_top
        y_resampled = y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_top, y_train)
        X_resampled = np.array(X_resampled)
        y_resampled = np.array(y_resampled)

    # Train the model.
    model = TabNetClassifier(**config)
    model.fit(
        X_resampled, y_resampled,
        max_epochs=50,
        patience=10,
        batch_size=2048,
        virtual_batch_size=256
    )

    # Evaluate on the H2 test set.
    y_pred_h2 = model.predict(X_H2_test_scaled)
    y_pred_proba_h2 = model.predict_proba(X_H2_test_scaled)[:, 1]

    results_h2.append({
        'Method': method,
        'Accuracy': accuracy_score(y_H2_test, y_pred_h2),
        'Precision': precision_score(y_H2_test, y_pred_h2),
        'Recall': recall_score(y_H2_test, y_pred_h2),
        'F1 Score': f1_score(y_H2_test, y_pred_h2),
        'ROC AUC': roc_auc_score(y_H2_test, y_pred_proba_h2)
    })

    print(f"\nH2 Test Set Results for {method}:")
    print(classification_report(y_H2_test, y_pred_h2))

    # Evaluate on the H1 test set.
    y_pred_h1 = model.predict(X_H1_test_scaled)
    y_pred_proba_h1 = model.predict_proba(X_H1_test_scaled)[:, 1]

    results_h1.append({
        'Method': method,
        'Accuracy': accuracy_score(y_H1_test, y_pred_h1),
        'Precision': precision_score(y_H1_test, y_pred_h1),
        'Recall': recall_score(y_H1_test, y_pred_h1),
        'F1 Score': f1_score(y_H1_test, y_pred_h1),
        'ROC AUC': roc_auc_score(y_H1_test, y_pred_proba_h1)
    })

    print(f"\nH1 Test Set Results for {method}:")
    print(classification_report(y_H1_test, y_pred_h1))

# Summary tables, one row per sampling method.
results_df_h2 = pd.DataFrame(results_h2).set_index('Method')
results_df_h1 = pd.DataFrame(results_h1).set_index('Method')

# Show the results.
print("\nH2 Test Set Results Summary:")
print("="*80)
print(results_df_h2)

print("\nH1 Test Set Results Summary:")
print("="*80)
print(results_df_h1)

# Bar charts comparing the methods on each test set.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

results_df_h2.plot(kind='bar', ax=ax1)
ax1.set_title('H2 Test Set Performance (Top 10 Features)')
ax1.set_xticklabels(results_df_h2.index, rotation=45)
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

results_df_h1.plot(kind='bar', ax=ax2)
ax2.set_title('H1 Test Set Performance (Top 10 Features)')
ax2.set_xticklabels(results_df_h1.index, rotation=45)
ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

# Persist the results. `list(...)` keeps the feature names JSON-serializable
# even if they are held in a pandas Index / NumPy array; classes (optimizer,
# scheduler) are stored as strings.
test_results = {
    'top_features': list(top_features),
    'H1_test_results': results_df_h1.to_dict(),
    'H2_test_results': results_df_h2.to_dict(),
    'best_configs': {k: {kk: str(vv) if isinstance(vv, type) else vv
                        for kk, vv in v.items()}
                    for k, v in best_configs.items()}
}

with open('tabnet_test_results_top10.json', 'w') as f:
    json.dump(test_results, f, indent=4)

### cross-dataset evaluation: H1 test set

Previous test set

BORUTA 4

In [None]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
import torch
import matplotlib.pyplot as plt

import json

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Test data, converted to NumPy arrays.
# NOTE(review): `scaler` comes from an earlier cell and is only applied here.
# This presumably requires that it was last fitted on exactly the
# `boruta_features` columns - confirm before running the cells out of order.
X_H2_test_scaled = scaler.transform(H2_test[boruta_features])
y_H2_test = H2_test['is_canceled'].values

X_H1_test_scaled = scaler.transform(H1_test[boruta_features])
y_H1_test = H1_test['is_canceled'].values

# Training data. `np.asarray` accepts a pandas Series as well as an array, so
# the cell no longer fails with AttributeError when `y_train` has already
# been converted (e.g. by an earlier cell or by a previous run of this one).
X_train_scaled_boruta = scaler.transform(X_train[boruta_features])
y_train = np.asarray(y_train)


def _boruta_config(n_d, n_a, n_steps, gamma, lambda_sparse, lr):
    """Return TabNetClassifier kwargs; only the tuned values differ per method.

    The key order matches the original literal dictionaries so the JSON dump
    of `best_configs` below keeps the same layout.
    """
    return {
        'n_d': n_d,
        'n_a': n_a,
        'n_steps': n_steps,
        'gamma': gamma,
        'lambda_sparse': lambda_sparse,
        'n_independent': 2,
        'n_shared': 2,
        'momentum': 0.02,
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {'lr': lr},
        'scheduler_params': {'step_size': 10, 'gamma': 0.9},
        'scheduler_fn': torch.optim.lr_scheduler.StepLR,
        'mask_type': 'sparsemax',
        'device_name': 'auto'
    }


# Best parameters per sampling method.
best_configs = {
    'Oversample': _boruta_config(
        10, 31, 3, 1.197029011602726, 0.00010791353833278552, 0.014912435726843684
    ),
    'Undersample': _boruta_config(
        32, 23, 3, 1.2038363939243872, 3.4596259800340263e-05, 0.013128614771897024
    ),
    'SMOTE': _boruta_config(
        17, 32, 3, 1.0130948574947876, 0.00017931836416446544, 0.01603917398477004
    ),
    'No Sampling': _boruta_config(
        19, 23, 4, 1.4992406257748137, 1.0000732710913913e-06, 0.0897224699282855
    )
}

# Sampling methods, keyed like `best_configs`.
samplers = {
    'No Sampling': None,
    'Undersample': RandomUnderSampler(random_state=42),
    'Oversample': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42)
}

results_h1 = []
results_h2 = []

# Train one model per sampling method and evaluate it on both test sets.
for method, config in best_configs.items():
    print(f"\n{'='*50}")
    print(f"Training and evaluating {method} model")
    print('='*50)

    # Resample the training data only; the test sets are never resampled.
    sampler = samplers[method]
    if sampler is None:
        X_resampled = X_train_scaled_boruta
        y_resampled = y_train
    else:
        X_resampled, y_resampled = sampler.fit_resample(X_train_scaled_boruta, y_train)
        X_resampled = np.array(X_resampled)
        y_resampled = np.array(y_resampled)

    # Train the model.
    model = TabNetClassifier(**config)
    model.fit(
        X_resampled, y_resampled,
        max_epochs=50,
        patience=10,
        batch_size=512,
        virtual_batch_size=64
    )

    # Evaluate on the H2 test set.
    y_pred_h2 = model.predict(X_H2_test_scaled)
    y_pred_proba_h2 = model.predict_proba(X_H2_test_scaled)[:, 1]

    results_h2.append({
        'Method': method,
        'Accuracy': accuracy_score(y_H2_test, y_pred_h2),
        'Precision': precision_score(y_H2_test, y_pred_h2),
        'Recall': recall_score(y_H2_test, y_pred_h2),
        'F1 Score': f1_score(y_H2_test, y_pred_h2),
        'ROC AUC': roc_auc_score(y_H2_test, y_pred_proba_h2)
    })

    print(f"\nH2 Test Set Results for {method}:")
    print(classification_report(y_H2_test, y_pred_h2))

    # Evaluate on the H1 test set.
    y_pred_h1 = model.predict(X_H1_test_scaled)
    y_pred_proba_h1 = model.predict_proba(X_H1_test_scaled)[:, 1]

    results_h1.append({
        'Method': method,
        'Accuracy': accuracy_score(y_H1_test, y_pred_h1),
        'Precision': precision_score(y_H1_test, y_pred_h1),
        'Recall': recall_score(y_H1_test, y_pred_h1),
        'F1 Score': f1_score(y_H1_test, y_pred_h1),
        'ROC AUC': roc_auc_score(y_H1_test, y_pred_proba_h1)
    })

    print(f"\nH1 Test Set Results for {method}:")
    print(classification_report(y_H1_test, y_pred_h1))

# Summary tables, one row per sampling method.
results_df_h2 = pd.DataFrame(results_h2).set_index('Method')
results_df_h1 = pd.DataFrame(results_h1).set_index('Method')

# Show the results.
print("\nH2 Test Set Results Summary:")
print("="*80)
print(results_df_h2)

print("\nH1 Test Set Results Summary:")
print("="*80)
print(results_df_h1)

# Bar charts comparing the methods on each test set.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

results_df_h2.plot(kind='bar', ax=ax1)
ax1.set_title('H2 Test Set Performance (Boruta Features)')
ax1.set_xticklabels(results_df_h2.index, rotation=45)
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

results_df_h1.plot(kind='bar', ax=ax2)
ax2.set_title('H1 Test Set Performance (Boruta Features)')
ax2.set_xticklabels(results_df_h1.index, rotation=45)
ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

# Persist the results. `list(...)` keeps the feature names JSON-serializable
# even if they are held in a pandas Index / NumPy array; classes (optimizer,
# scheduler) are stored as strings.
test_results = {
    'boruta_features': list(boruta_features),
    'H1_test_results': results_df_h1.to_dict(),
    'H2_test_results': results_df_h2.to_dict(),
    'best_configs': {k: {kk: str(vv) if isinstance(vv, type) else vv
                        for kk, vv in v.items()}
                    for k, v in best_configs.items()}
}

with open('tabnet_test_results_boruta.json', 'w') as f:
    json.dump(test_results, f, indent=4)

The classification report was not output separately; its values were extracted from the text output and organized — see the Excel table.



# H1+H2

## 1. Logistic Regression





### baseline model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Baseline logistic regression on the combined dataset:
#   1. out-of-fold predictions from a stratified 3-fold CV,
#   2. confusion matrix, classification report and ROC curves per fold,
#   3. a final fit on all training rows to rank features by |coefficient|.

# Features / target are taken from the combined frames (no H1/H2 concatenation here)
X_train = combined_train[selected_columns].drop('is_canceled', axis=1)
y_train = combined_train['is_canceled']

X_test = combined_test[selected_columns].drop('is_canceled', axis=1)
y_test = combined_test['is_canceled']

# Standardize the features; the scaler statistics come from the training frame
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)

# Estimator used for the cross-validated predictions and the per-fold ROC curves
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)

# Shuffled, seeded stratified splitter: every call to split() yields the same 3 folds
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(
    log_reg_model, X_train_scaled, y_train, cv=cv, method='predict'
)
y_pred_proba_cv = cross_val_predict(
    log_reg_model, X_train_scaled, y_train, cv=cv, method='predict_proba'
)[:, 1]

# Confusion matrix as raw counts and as row-wise percentages (share of each true class)
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1, keepdims=True) * 100

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            annot_kws={"size": 14}, ax=cm_ax)

# Overlay the percentage next to the count in every cell
for i, j in np.ndindex(*conf_matrix.shape):
    percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
    cm_ax.text(j + 0.2, i + 0.2, percentage_text,
               ha='center', va='center', color='green', fontsize=15)

cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Combined Dataset Confusion Matrix with Counts and Percentages')
plt.show()

# Text summary of the out-of-fold predictions
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

overall_roc_auc = roc_auc_score(y_train, y_pred_proba_cv)
print(f"\nOverall ROC AUC Score: {overall_roc_auc:.4f}")

# One ROC curve per fold: refit on the fold's training part, score its held-out part
roc_fig, roc_ax = plt.subplots(figsize=(12, 8))

for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_scaled, y_train), start=1):
    y_fold_true = y_train.iloc[test_idx]
    log_reg_model.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = log_reg_model.predict_proba(X_train_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_fold_true, y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_fold_true, y_pred_proba_fold)
    roc_ax.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve for Combined Dataset (3-fold CV)')
roc_ax.legend(loc='lower right')
plt.show()

# Basic facts about the training data
print("\nDataset Information:")
print(f"Total Training Samples: {len(X_train)}")
print(f"Number of Features: {X_train.shape[1]}")
print("Class Distribution in Training Set:")
print(y_train.value_counts(normalize=True).round(4) * 100)

# Final model fitted on every training row; its coefficients drive the ranking below
final_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
final_model.fit(X_train_scaled, y_train)

# Importance = absolute coefficient (features are standardized, so magnitudes are comparable)
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': np.abs(final_model.coef_[0]),
})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

fi_fig, fi_ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature',
            palette='viridis', ax=fi_ax)
fi_ax.set_title('Feature Importance in Logistic Regression Model')
fi_ax.set_xlabel('Absolute Coefficient Value')
fi_ax.set_ylabel('Feature')
fi_fig.tight_layout()
plt.show()

In [None]:
# Inspect the column names of `train_data` (defined in an earlier cell).
# NOTE(review): the surrounding cells work on `combined_train` — confirm that
# `train_data` is the frame you mean to inspect here.
print(train_data.columns)


In [None]:
# Show the column list used to slice combined_train / combined_test in this
# section (it includes the 'is_canceled' target, which the cells drop from X).
print(selected_columns)


### different feature combinations

coefficient

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Coefficient-based feature selection for logistic regression:
#   1. fit a model on all standardized features and rank them by |coefficient|,
#   2. keep the 10 highest-ranked features,
#   3. evaluate a model restricted to those features with stratified 3-fold CV,
#   4. compare its out-of-fold ROC AUC with that of the full-feature model.
# NOTE: the top-10 list is chosen using every training row, so the CV score of
# the reduced model is not free of selection bias.

# Use combined dataset
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']

# Standardize data so that coefficient magnitudes are comparable across features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Initial model for feature selection (all features)
initial_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
initial_model.fit(X_train_scaled, y_train)

# Rank features by absolute coefficient; the signed value is kept for reporting
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': initial_model.coef_[0],
    'Absolute_Coefficient': np.abs(initial_model.coef_[0])
}).sort_values(by='Absolute_Coefficient', ascending=False)

# Select top 10 features
top10_rows = feature_importance.head(10)
top_features = top10_rows['Feature'].tolist()

print("Selected Top 10 Features:")
for idx, (feature, coef) in enumerate(zip(top10_rows['Feature'], top10_rows['Coefficient']), 1):
    print(f"{idx}. {feature}: {coef:.4f}")

# Prepare training data with top features.
# NOTE: this refits the same `scaler` object, so from here on it holds the
# statistics of the 10 selected columns only.
X_train_top = X_train[top_features]
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Create new logistic regression model
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)

# 3-fold cross-validation (shuffled and seeded, so every use sees the same folds)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(log_reg_model, X_train_scaled_top, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(log_reg_model, X_train_scaled_top, y_train, cv=cv, method='predict_proba')[:, 1]

# Out-of-fold probabilities of the FULL-feature model on the same folds.
# These are needed for the comparison printed at the end of the cell; the
# previous version reused the top-10 probabilities there, so both printed
# AUC values were always identical.
full_model_proba_cv = cross_val_predict(
    LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42),
    X_train_scaled, y_train, cv=cv, method='predict_proba'
)[:, 1]
full_model_auc = roc_auc_score(y_train, full_model_proba_cv)

# Plot confusion matrix (counts plus row-wise percentages)
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Top 10 Features Model)')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

overall_auc = roc_auc_score(y_train, y_pred_proba_cv)
print(f"\nOverall ROC AUC Score: {overall_auc:.4f}")

# Plot ROC curves: refit on each fold's training part, score its held-out part
plt.figure(figsize=(12, 8))

for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_scaled_top, y_train), start=1):
    log_reg_model.fit(X_train_scaled_top[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = log_reg_model.predict_proba(X_train_scaled_top[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Top 10 Features Model (3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Train final model on all training rows, top-10 features only
final_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
final_model.fit(X_train_scaled_top, y_train)

# Get final coefficients (coef_ follows the column order of top_features)
final_coefficients = pd.DataFrame({
    'Feature': top_features,
    'Coefficient': final_model.coef_[0],
    'Absolute_Coefficient': np.abs(final_model.coef_[0])
}).sort_values(by='Absolute_Coefficient', ascending=False)

# Visualize coefficients
plt.figure(figsize=(12, 6))
sns.barplot(data=final_coefficients, x='Absolute_Coefficient', y='Feature', palette='viridis')
plt.title('Feature Coefficients in Top 10 Features Model')
plt.xlabel('Absolute Coefficient Value')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Print coefficients with direction (explicit '+' for positive values)
print("\nFeature Coefficients (with direction):")
for idx, (feature, coef) in enumerate(zip(final_coefficients['Feature'],
                                        final_coefficients['Coefficient']), 1):
    sign = '+' if coef > 0 else ''
    print(f"{idx}. {feature}: {sign}{coef:.4f}")

# Print model information
print("\nModel and Dataset Information:")
print(f"Number of features used: {len(top_features)}")
print(f"Total training samples: {len(X_train)}")
print("\nClass Distribution:")
print(y_train.value_counts(normalize=True).round(4) * 100)

# Compare performance: reduced model vs. full-feature model, same CV folds
print("\nPerformance Comparison:")
print(f"Top 10 Features Model ROC AUC: {overall_auc:.4f}")
print(f"Full Feature Model ROC AUC: {full_model_auc:.4f}")

shap

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import shap
# Imported here so the cell does not depend on an earlier cell having done it
import warnings
warnings.filterwarnings('ignore')

# SHAP-based feature selection for logistic regression:
#   1. fit a model on all standardized features and compute SHAP values,
#   2. keep the 10 features with the largest mean |SHAP value|,
#   3. evaluate a model restricted to those features with stratified 3-fold CV,
#   4. explain the final reduced model with SHAP summary / dependence plots.

# Use combined dataset
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']

# Standardize data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Initial model for SHAP values (all features)
initial_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
initial_model.fit(X_train_scaled, y_train)

# Calculate SHAP values; the scaled training matrix doubles as background data
explainer = shap.LinearExplainer(initial_model, X_train_scaled)
shap_values = explainer.shap_values(X_train_scaled)

# Create feature importance DataFrame: importance = mean |SHAP| over all rows
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': np.mean(np.abs(shap_values), axis=0)
}).sort_values(by='Importance', ascending=False)

# Visualize SHAP importance
plt.figure(figsize=(12, 6))
shap.summary_plot(shap_values, X_train_scaled, feature_names=X_train.columns, show=False)
plt.title('SHAP Feature Importance')
plt.tight_layout()
plt.show()

# Select top 10 features
top10_rows = feature_importance.head(10)
top_features = top10_rows['Feature'].tolist()
print("\nSelected Top 10 Features by SHAP:")
for idx, (feature, importance) in enumerate(zip(top10_rows['Feature'], top10_rows['Importance']), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

# Prepare selected features data.
# NOTE: this refits the same `scaler` object, so from here on it holds the
# statistics of the 10 selected columns only.
X_train_top = X_train[top_features]
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Create logistic regression for the reduced feature set
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)

# 3-fold cross-validation (shuffled and seeded, so every use sees the same folds)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(log_reg_model, X_train_scaled_top, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(log_reg_model, X_train_scaled_top, y_train, cv=cv, method='predict_proba')[:, 1]

# Plot confusion matrix (counts plus row-wise percentages)
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (SHAP Top 10 Features)')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

overall_auc = roc_auc_score(y_train, y_pred_proba_cv)
print(f"\nOverall ROC AUC Score: {overall_auc:.4f}")

# Plot ROC curves: refit on each fold's training part, score its held-out part
plt.figure(figsize=(12, 8))

for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_scaled_top, y_train), start=1):
    log_reg_model.fit(X_train_scaled_top[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = log_reg_model.predict_proba(X_train_scaled_top[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves (SHAP Top 10 Features)')
plt.legend(loc='lower right')
plt.show()

# Train final model on all training rows, selected features only
final_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
final_model.fit(X_train_scaled_top, y_train)

# Get feature coefficients (coef_ follows the column order of top_features)
feature_coefficients = pd.DataFrame({
    'Feature': top_features,
    'Coefficient': final_model.coef_[0],
    'Absolute_Coefficient': np.abs(final_model.coef_[0])
}).sort_values(by='Absolute_Coefficient', ascending=False)

# Plot coefficients
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_coefficients, x='Absolute_Coefficient', y='Feature', palette='viridis')
plt.title('Feature Coefficients (SHAP Top 10 Features)')
plt.xlabel('Absolute Coefficient Value')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

print("\nModel and Dataset Information:")
print(f"Number of original features: {X_train.shape[1]}")
print(f"Number of selected features: {len(top_features)}")
print(f"Total training samples: {len(X_train)}")
print("\nClass Distribution:")
print(y_train.value_counts(normalize=True).round(4) * 100)

# Coefficients with direction (explicit '+' for positive values)
print("\nFeature Coefficients (with direction):")
for idx, (feature, coef) in enumerate(zip(feature_coefficients['Feature'],
                                        feature_coefficients['Coefficient']), 1):
    sign = '+' if coef > 0 else ''
    print(f"{idx}. {feature}: {sign}{coef:.4f}")

# SHAP analysis for final model
final_explainer = shap.LinearExplainer(final_model, X_train_scaled_top)
final_shap_values = final_explainer.shap_values(X_train_scaled_top)

plt.figure(figsize=(12, 8))
shap.summary_plot(final_shap_values, X_train_scaled_top,
                 feature_names=top_features, show=False)
plt.title('SHAP Summary Plot for Selected Features')
plt.tight_layout()
plt.show()

# SHAP dependence plots for the three highest-ranked features.
# The axes are passed in explicitly: without `ax`, dependence_plot opens its
# own figure, which left the figure created here empty and ignored its size.
for feature_idx, feature in enumerate(top_features[:3]):
    dep_fig, dep_ax = plt.subplots(figsize=(10, 6))
    shap.dependence_plot(feature_idx, final_shap_values, X_train_scaled_top,
                        feature_names=top_features, ax=dep_ax, show=False)
    dep_ax.set_title(f'SHAP Dependence Plot for {feature}')
    dep_fig.tight_layout()
    plt.show()

boruta

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Boruta feature selection followed by logistic regression:
#   1. run BorutaPy (random-forest based) on the standardized features,
#   2. keep the features Boruta marks as supported,
#   3. evaluate a logistic regression on them with stratified 3-fold CV,
#   4. fit a final model and report its coefficients.

# Features / target from the combined dataset
X_train = combined_train[selected_columns].drop('is_canceled', axis=1)
y_train = combined_train['is_canceled']

scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)

# Random forest used by Boruta as its importance estimator
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)

print("Running Boruta feature selection...")
boruta_selector.fit(X_train_scaled, y_train)

# Boolean mask over the columns of X_train -> names of the retained features
selected_feat_mask = boruta_selector.support_
selected_features = X_train.columns[selected_feat_mask].tolist()

print("\nSelected Features by Boruta:")
for idx, feature in enumerate(selected_features, 1):
    print(f"{idx}. {feature}")

# Ranking table; 'Importance' inverts the rank so that a larger value means
# a better rank. Only the selected rows are kept.
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Ranking': boruta_selector.ranking_,
    'Selected': selected_feat_mask,
})
worst_rank = importance_df['Ranking'].max()
importance_df['Importance'] = worst_rank - importance_df['Ranking'] + 1
importance_df = (
    importance_df.loc[importance_df['Selected']]
    .sort_values(by='Importance', ascending=False)
)

# Re-standardize using only the selected columns (this refits `scaler`)
X_train_selected = X_train[selected_features]
X_train_scaled_selected = scaler.fit(X_train_selected).transform(X_train_selected)

# Logistic regression evaluated on the selected features
log_reg_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(
    log_reg_model, X_train_scaled_selected, y_train, cv=cv, method='predict'
)
y_pred_proba_cv = cross_val_predict(
    log_reg_model, X_train_scaled_selected, y_train, cv=cv, method='predict_proba'
)[:, 1]

# Confusion matrix as counts and as row-wise percentages
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1, keepdims=True) * 100

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            annot_kws={"size": 14}, ax=cm_ax)
for i, j in np.ndindex(*conf_matrix.shape):
    percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
    cm_ax.text(j + 0.2, i + 0.2, percentage_text,
               ha='center', va='center', color='green', fontsize=15)

cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Confusion Matrix (Boruta Selected Features)')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

overall_auc = roc_auc_score(y_train, y_pred_proba_cv)
print(f"\nOverall ROC AUC Score: {overall_auc:.4f}")

# ROC curve per fold: refit on the fold's training part, score its held-out part
roc_fig, roc_ax = plt.subplots(figsize=(12, 8))

for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_scaled_selected, y_train), start=1):
    y_fold_true = y_train.iloc[test_idx]
    log_reg_model.fit(X_train_scaled_selected[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = log_reg_model.predict_proba(X_train_scaled_selected[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_fold_true, y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_fold_true, y_pred_proba_fold)
    roc_ax.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves (Boruta Selected Features)')
roc_ax.legend(loc='lower right')
plt.show()

# Final model on all training rows, Boruta-selected features only
final_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
final_model.fit(X_train_scaled_selected, y_train)

# Signed and absolute coefficients, largest magnitude first
feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Coefficient': final_model.coef_[0],
    'Absolute_Coefficient': np.abs(final_model.coef_[0]),
})
feature_importance = feature_importance.sort_values(by='Absolute_Coefficient', ascending=False)

fi_fig, fi_ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Absolute_Coefficient', y='Feature',
            palette='viridis', ax=fi_ax)
fi_ax.set_title('Feature Importance in Logistic Regression (Boruta Selected Features)')
fi_ax.set_xlabel('Absolute Coefficient Value')
fi_ax.set_ylabel('Feature')
fi_fig.tight_layout()
plt.show()

print("\nModel and Dataset Information:")
print(f"Number of original features: {X_train.shape[1]}")
print(f"Number of selected features: {len(selected_features)}")
print(f"Total training samples: {len(X_train)}")
print("\nClass Distribution:")
print(y_train.value_counts(normalize=True).round(4) * 100)

# Coefficients with direction (explicit '+' for positive values)
print("\nFeature Coefficients (with direction):")
coef_pairs = zip(feature_importance['Feature'], feature_importance['Coefficient'])
for idx, (feature, coef) in enumerate(coef_pairs, 1):
    sign = '+' if coef > 0 else ''
    print(f"{idx}. {feature}: {sign}{coef:.4f}")

### hyperparameter tuning

Baseline

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import optuna
import warnings
warnings.filterwarnings('ignore')

# Hyperparameter tuning of logistic regression on all features:
#   1. Optuna searches C / penalty / solver, maximizing mean 3-fold ROC AUC,
#   2. the best configuration is evaluated with a shuffled stratified 3-fold CV,
#   3. a final model is fitted on all rows to rank features by |coefficient|.

# Use combined dataset
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

def objective(trial):
    """Return the mean 3-fold ROC AUC of one sampled logistic-regression config.

    Args:
        trial: the Optuna trial used to sample C (log scale, 1e-4..1e4),
            the penalty ('l1' or 'l2') and the solver ('liblinear' or 'saga').

    Returns:
        Mean of the `roc_auc` scores returned by `cross_val_score` with cv=3.
    """
    params = {
        'C': trial.suggest_float('C', 1e-4, 1e4, log=True),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
        'max_iter': 2000,
        'random_state': 42
    }

    model = LogisticRegression(**params)
    scores = cross_val_score(model, X_train_scaled, y_train, cv=3, scoring='roc_auc')
    return scores.mean()

print("Optimizing hyperparameters...")
# Seed the sampler so the search is reproducible, in line with the fixed
# random_state=42 used by every model and splitter in this notebook.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\nBest parameters found:")
print(study.best_params)
print(f"Best ROC-AUC score: {study.best_value:.4f}")

# Work on a copy so the fixed settings added here never leak into the study's
# own parameter dict (same pattern as the other tuning cells).
best_params = study.best_params.copy()
best_params['max_iter'] = 2000
best_params['random_state'] = 42
final_model = LogisticRegression(**best_params)

# Out-of-fold predictions of the tuned model (shuffled, seeded splitter)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(final_model, X_train_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(final_model, X_train_scaled, y_train, cv=cv, method='predict_proba')[:, 1]

# Confusion matrix as counts and as row-wise percentages
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title(f'Confusion Matrix (Optimized Model)\nBest ROC-AUC: {study.best_value:.4f}')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Objective value of every trial, in the order the trials were run
plt.figure(figsize=(10, 6))
optimization_history = np.array([t.value for t in study.trials])
plt.plot(optimization_history)
plt.title('Optimization History')
plt.xlabel('Trial')
plt.ylabel('ROC-AUC Score')
plt.show()

# ROC curve per fold: refit on the fold's training part, score its held-out part
plt.figure(figsize=(12, 8))

for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_scaled, y_train), start=1):
    final_model.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = final_model.predict_proba(X_train_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Optimized Model (3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Refit on all training rows; the loop above left the model fitted on one fold only
final_model.fit(X_train_scaled, y_train)

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': np.abs(final_model.coef_[0])
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in Optimized Logistic Regression Model')
plt.xlabel('Absolute Coefficient Value')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# plot_param_importances returns a Plotly figure, not a matplotlib one, so it
# has to be titled and shown through the Plotly API. The previous plt.title /
# plt.show calls discarded it and rendered an empty matplotlib figure instead.
param_importance_fig = optuna.visualization.plot_param_importances(study)
param_importance_fig.update_layout(title='Hyperparameter Importance')
param_importance_fig.show()

print("\nDataset Information:")
print(f"Total Training Samples: {len(X_train)}")
print(f"Number of Features: {X_train.shape[1]}")
print("\nClass Distribution in Training Set:")
print(y_train.value_counts(normalize=True).round(4) * 100)

# Tuned values only (C, penalty, solver) — without the fixed max_iter / random_state
best_logistic_params = study.best_params
print("\nBest Hyperparameters for future use:")
print(best_logistic_params)

coefficient

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import optuna
import warnings
warnings.filterwarnings('ignore')

# Hyperparameter tuning of logistic regression on the top-10 coefficient features:
#   1. rank all standardized features by |coefficient| and keep the top 10,
#   2. Optuna searches C / penalty / solver, maximizing mean 3-fold ROC AUC,
#   3. the best configuration is evaluated with a shuffled stratified 3-fold CV,
#   4. a final model is fitted on all rows and its coefficients are reported.

# Use combined dataset
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Untuned model on all features, used only to rank them
initial_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
initial_model.fit(X_train_scaled, y_train)

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': initial_model.coef_[0],
    'Absolute_Coefficient': np.abs(initial_model.coef_[0])
}).sort_values(by='Absolute_Coefficient', ascending=False)

top10_rows = feature_importance.head(10)
top_features = top10_rows['Feature'].tolist()

print("Selected Top 10 Features:")
for idx, (feature, coef) in enumerate(zip(top10_rows['Feature'], top10_rows['Coefficient']), 1):
    print(f"{idx}. {feature}: {coef:.4f}")

# NOTE: this refits the same `scaler` object, so from here on it holds the
# statistics of the 10 selected columns only.
X_train_top = X_train[top_features]
X_train_scaled_top = scaler.fit_transform(X_train_top)

def objective(trial):
    """Return the mean 3-fold ROC AUC of one sampled config on the top-10 features.

    Args:
        trial: the Optuna trial used to sample C (log scale, 1e-4..1e4),
            the penalty ('l1' or 'l2') and the solver ('liblinear' or 'saga').

    Returns:
        Mean of the `roc_auc` scores returned by `cross_val_score` with cv=3.
    """
    params = {
        'C': trial.suggest_float('C', 1e-4, 1e4, log=True),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
        'max_iter': 2000,
        'random_state': 42
    }
    model = LogisticRegression(**params)
    scores = cross_val_score(model, X_train_scaled_top, y_train, cv=3, scoring='roc_auc')
    return scores.mean()

print("\nOptimizing hyperparameters...")
# Seed the sampler so the search is reproducible, in line with the fixed
# random_state=42 used by every model and splitter in this notebook.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\nBest parameters found:")
print(study.best_params)
print(f"Best ROC-AUC score: {study.best_value:.4f}")

# Tuned values plus the fixed settings that were not part of the search
best_params = study.best_params.copy()
best_params['max_iter'] = 2000
best_params['random_state'] = 42
best_model = LogisticRegression(**best_params)

# Out-of-fold predictions of the tuned model (shuffled, seeded splitter)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_model, X_train_scaled_top, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(best_model, X_train_scaled_top, y_train, cv=cv, method='predict_proba')[:, 1]

# Confusion matrix as counts and as row-wise percentages
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title(f'Confusion Matrix (Optimized Top 10 Features Model)\nBest ROC-AUC: {study.best_value:.4f}')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Objective value of every trial, in the order the trials were run
plt.figure(figsize=(10, 6))
optimization_history = np.array([t.value for t in study.trials])
plt.plot(optimization_history)
plt.title('Optimization History')
plt.xlabel('Trial')
plt.ylabel('ROC-AUC Score')
plt.show()

# ROC curve per fold: refit on the fold's training part, score its held-out part
plt.figure(figsize=(12, 8))

for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_scaled_top, y_train), start=1):
    best_model.fit(X_train_scaled_top[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = best_model.predict_proba(X_train_scaled_top[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Optimized Top 10 Features Model (3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Fresh estimator fitted on all training rows (best_model stays fitted on one fold)
final_model = LogisticRegression(**best_params)
final_model.fit(X_train_scaled_top, y_train)

# coef_ follows the column order of top_features
final_coefficients = pd.DataFrame({
    'Feature': top_features,
    'Coefficient': final_model.coef_[0],
    'Absolute_Coefficient': np.abs(final_model.coef_[0])
}).sort_values(by='Absolute_Coefficient', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=final_coefficients, x='Absolute_Coefficient', y='Feature', palette='viridis')
plt.title('Feature Coefficients in Optimized Top 10 Features Model')
plt.xlabel('Absolute Coefficient Value')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# plot_param_importances returns a Plotly figure, not a matplotlib one, so it
# has to be titled and shown through the Plotly API. The previous plt.title /
# plt.show calls discarded it and rendered an empty matplotlib figure instead.
param_importance_fig = optuna.visualization.plot_param_importances(study)
param_importance_fig.update_layout(title='Hyperparameter Importance')
param_importance_fig.show()

# Coefficients with direction (explicit '+' for positive values)
print("\nFeature Coefficients (with direction):")
for idx, (feature, coef) in enumerate(zip(final_coefficients['Feature'],
                                        final_coefficients['Coefficient']), 1):
    sign = '+' if coef > 0 else ''
    print(f"{idx}. {feature}: {sign}{coef:.4f}")

print("\nModel and Dataset Information:")
print(f"Number of features used: {len(top_features)}")
print(f"Total training samples: {len(X_train)}")
print("\nClass Distribution:")
print(y_train.value_counts(normalize=True).round(4) * 100)

shap

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import shap
import optuna
import warnings
warnings.filterwarnings('ignore')

# Use combined dataset
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Get SHAP feature importance
initial_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
initial_model.fit(X_train_scaled, y_train)

explainer = shap.LinearExplainer(initial_model, X_train_scaled)
shap_values = explainer.shap_values(X_train_scaled)

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': np.mean(np.abs(shap_values), axis=0)
}).sort_values(by='Importance', ascending=False)

top_features = feature_importance.head(10)['Feature'].tolist()
print("\nSelected Top 10 Features by SHAP:")
for idx, (feature, importance) in enumerate(zip(feature_importance.head(10)['Feature'],
                                              feature_importance.head(10)['Importance']), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

X_train_top = X_train[top_features]
X_train_scaled_top = scaler.fit_transform(X_train_top)

def objective(trial):
    """Optuna objective: mean 3-fold ROC-AUC of a logistic regression.

    Samples ``C`` (log-uniform in [1e-4, 1e4]), the penalty and the solver
    from ``trial``, then cross-validates the resulting model on the
    module-level ``X_train_scaled_top`` / ``y_train``.
    """
    c_value = trial.suggest_float('C', 1e-4, 1e4, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    model = LogisticRegression(
        C=c_value,
        penalty=penalty,
        solver=solver,
        max_iter=2000,
        random_state=42,
    )
    fold_scores = cross_val_score(model, X_train_scaled_top, y_train, cv=3, scoring='roc_auc')
    return fold_scores.mean()

print("\nOptimizing hyperparameters...")
# Seed the sampler so repeated runs of this cell explore the same trials and
# report the same best parameters (the models themselves already use seed 42).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\nBest parameters found:")
print(study.best_params)
print(f"Best ROC-AUC score: {study.best_value:.4f}")

# Add the fixed (non-tuned) settings that the objective also used.
best_params = study.best_params.copy()
best_params['max_iter'] = 2000
best_params['random_state'] = 42
best_model = LogisticRegression(**best_params)

# Out-of-fold predictions from a shuffled, stratified 3-fold split.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_model, X_train_scaled_top, y_train, cv=cv, method='predict')
oof_probabilities = cross_val_predict(
    best_model, X_train_scaled_top, y_train, cv=cv, method='predict_proba'
)
y_pred_proba_cv = oof_probabilities[:, 1]

# Raw counts plus each cell's share of its row total, in percent.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
row_totals = conf_matrix.sum(axis=1).reshape(-1, 1)
conf_matrix_percentage = conf_matrix / row_totals * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the percentage next to the count annotation of every cell.
for (i, j), share in np.ndenumerate(conf_matrix_percentage):
    plt.text(j + 0.2, i + 0.2, f"{share:.1f}%", ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title(f'Confusion Matrix (Optimized Model)\nBest ROC-AUC: {study.best_value:.4f}')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Objective value of every trial, plotted in trial order.
plt.figure(figsize=(10, 6))
optimization_history = np.array([t.value for t in study.trials])
plt.plot(optimization_history)
plt.title('Optimization History')
plt.xlabel('Trial')
plt.ylabel('ROC-AUC Score')
plt.show()

# One ROC curve per CV fold: refit on the training part of the fold and
# score its held-out part.
plt.figure(figsize=(12, 8))

# enumerate(..., start=1) numbers the folds, replacing a hand-maintained counter.
for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_scaled_top, y_train), start=1):
    best_model.fit(X_train_scaled_top[train_idx], y_train.iloc[train_idx])
    y_fold_true = y_train.iloc[test_idx]
    y_pred_proba_fold = best_model.predict_proba(X_train_scaled_top[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_fold_true, y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_fold_true, y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

# Diagonal reference line for a classifier that guesses at random.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Optimized Model (3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Refit the tuned configuration on the full (top-feature) training set.
final_model = LogisticRegression(**best_params)
final_model.fit(X_train_scaled_top, y_train)

# Signed and absolute coefficients per feature, largest magnitude first.
fitted_coefs = final_model.coef_[0]
feature_coefficients = pd.DataFrame(
    {
        'Feature': top_features,
        'Coefficient': fitted_coefs,
        'Absolute_Coefficient': np.abs(fitted_coefs),
    }
).sort_values(by='Absolute_Coefficient', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(
    data=feature_coefficients,
    x='Absolute_Coefficient',
    y='Feature',
    palette='viridis',
)
plt.title('Feature Coefficients in Optimized Model')
plt.xlabel('Absolute Coefficient Value')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# plot_param_importances returns a Plotly figure, so the title is set and the
# figure is displayed through the Plotly API; pyplot calls would only open an
# unrelated, empty Matplotlib figure.
param_importance_fig = optuna.visualization.plot_param_importances(study)
param_importance_fig.update_layout(title='Hyperparameter Importance')
param_importance_fig.show()

# Explain the final model on the same scaled top-feature matrix it was fit on.
final_explainer = shap.LinearExplainer(final_model, X_train_scaled_top)
final_shap_values = final_explainer.shap_values(X_train_scaled_top)

plt.figure(figsize=(12, 8))
# summary_plot resizes the current figure unless plot_size is given, so the
# intended 12x8 size is passed explicitly; show=False lets a title be added.
shap.summary_plot(final_shap_values, X_train_scaled_top,
                 feature_names=top_features, show=False, plot_size=(12, 8))
plt.title('SHAP Summary Plot for Optimized Model')
plt.tight_layout()
plt.show()

print("\nFeature Coefficients in Optimized Model (with direction):")
# Rows are already ordered by absolute coefficient; strictly positive values
# get an explicit '+' prefix.
ranked_pairs = zip(feature_coefficients['Feature'], feature_coefficients['Coefficient'])
for idx, (feature, coef) in enumerate(ranked_pairs, 1):
    sign = '+' if coef > 0 else ''
    print(f"{idx}. {feature}: {sign}{coef:.4f}")

print("\nModel and Dataset Information:")
print(f"Number of features used: {len(top_features)}")
print(f"Total training samples: {len(X_train)}")

# Relative class frequencies of the target, in percent.
print("\nClass Distribution:")
class_share_percent = y_train.value_counts(normalize=True).round(4) * 100
print(class_share_percent)

# Keep the tuned (search-space) parameters under a dedicated name.
best_lr_params = study.best_params
print("\nBest Hyperparameters for future use:")
print(best_lr_params)

Boruta feature selection

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import optuna
import warnings
warnings.filterwarnings('ignore')

# Target and candidate features both come from the combined training frame.
y_train = combined_train['is_canceled']
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Boruta wraps a random forest and marks the features it accepts in support_.
print("Running Boruta feature selection...")
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)
boruta_selector.fit(X_train_scaled, y_train)

# Translate the boolean support mask back into column names.
selected_feat_mask = boruta_selector.support_
selected_features = X_train.columns[selected_feat_mask].tolist()

print("\nSelected Features by Boruta:")
for idx, feature in enumerate(selected_features, 1):
    print(f"{idx}. {feature}")

# The same scaler object is re-fitted on the reduced feature set.
X_train_selected = X_train[selected_features]
X_train_scaled_selected = scaler.fit_transform(X_train_selected)

def objective(trial):
    """Optuna objective: mean 3-fold ROC-AUC of a logistic regression.

    Samples ``C`` (log-uniform in [1e-4, 1e4]), the penalty and the solver
    from ``trial``, then cross-validates the resulting model on the
    module-level ``X_train_scaled_selected`` / ``y_train``.
    """
    c_value = trial.suggest_float('C', 1e-4, 1e4, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    model = LogisticRegression(
        C=c_value,
        penalty=penalty,
        solver=solver,
        max_iter=2000,
        random_state=42,
    )
    fold_scores = cross_val_score(model, X_train_scaled_selected, y_train, cv=3, scoring='roc_auc')
    return fold_scores.mean()

print("\nOptimizing hyperparameters...")
# Seed the sampler so repeated runs of this cell explore the same trials and
# report the same best parameters (the models themselves already use seed 42).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\nBest parameters found:")
print(study.best_params)
print(f"Best ROC-AUC score: {study.best_value:.4f}")

# Add the fixed (non-tuned) settings that the objective also used.
best_params = study.best_params.copy()
best_params['max_iter'] = 2000
best_params['random_state'] = 42
best_model = LogisticRegression(**best_params)

# Out-of-fold predictions from a shuffled, stratified 3-fold split.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_model, X_train_scaled_selected, y_train, cv=cv, method='predict')
oof_probabilities = cross_val_predict(
    best_model, X_train_scaled_selected, y_train, cv=cv, method='predict_proba'
)
y_pred_proba_cv = oof_probabilities[:, 1]

# Raw counts plus each cell's share of its row total, in percent.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
row_totals = conf_matrix.sum(axis=1).reshape(-1, 1)
conf_matrix_percentage = conf_matrix / row_totals * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the percentage next to the count annotation of every cell.
for (i, j), share in np.ndenumerate(conf_matrix_percentage):
    plt.text(j + 0.2, i + 0.2, f"{share:.1f}%", ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title(f'Confusion Matrix (Optimized Model)\nBest ROC-AUC: {study.best_value:.4f}')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Objective value of every trial, plotted in trial order.
plt.figure(figsize=(10, 6))
optimization_history = np.array([t.value for t in study.trials])
plt.plot(optimization_history)
plt.title('Optimization History')
plt.xlabel('Trial')
plt.ylabel('ROC-AUC Score')
plt.show()

# One ROC curve per CV fold: refit on the training part of the fold and
# score its held-out part.
plt.figure(figsize=(12, 8))

# enumerate(..., start=1) numbers the folds, replacing a hand-maintained counter.
for fold_count, (train_idx, test_idx) in enumerate(cv.split(X_train_scaled_selected, y_train), start=1):
    best_model.fit(X_train_scaled_selected[train_idx], y_train.iloc[train_idx])
    y_fold_true = y_train.iloc[test_idx]
    y_pred_proba_fold = best_model.predict_proba(X_train_scaled_selected[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_fold_true, y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_fold_true, y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

# Diagonal reference line for a classifier that guesses at random.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Optimized Model (3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Refit the tuned configuration on the full (Boruta-selected) training set.
final_model = LogisticRegression(**best_params)
final_model.fit(X_train_scaled_selected, y_train)

# Signed and absolute coefficients per feature, largest magnitude first.
fitted_coefs = final_model.coef_[0]
feature_importance = pd.DataFrame(
    {
        'Feature': selected_features,
        'Coefficient': fitted_coefs,
        'Absolute_Coefficient': np.abs(fitted_coefs),
    }
).sort_values(by='Absolute_Coefficient', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(
    data=feature_importance,
    x='Absolute_Coefficient',
    y='Feature',
    palette='viridis',
)
plt.title('Feature Coefficients in Optimized Model')
plt.xlabel('Absolute Coefficient Value')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# plot_param_importances returns a Plotly figure, so the title is set and the
# figure is displayed through the Plotly API; pyplot calls would only open an
# unrelated, empty Matplotlib figure.
param_importance_fig = optuna.visualization.plot_param_importances(study)
param_importance_fig.update_layout(title='Hyperparameter Importance')
param_importance_fig.show()

# Size of the feature space before and after Boruta, and the sample count.
print("\nModel and Dataset Information:")
print(f"Number of original features: {X_train.shape[1]}")
print(f"Number of selected features: {len(selected_features)}")
print(f"Total training samples: {len(X_train)}")

print("\nFeature Coefficients (with direction):")
# Rows are already ordered by absolute coefficient; strictly positive values
# get an explicit '+' prefix.
ranked_pairs = zip(feature_importance['Feature'], feature_importance['Coefficient'])
for idx, (feature, coef) in enumerate(ranked_pairs, 1):
    sign = '+' if coef > 0 else ''
    print(f"{idx}. {feature}: {sign}{coef:.4f}")

# Keep the tuned (search-space) parameters under a dedicated name.
best_lr_params = study.best_params
print("\nBest Hyperparameters for future use:")
print(best_lr_params)

### test set

Baseline model (all selected features)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def evaluate_and_visualize(y_true, y_pred, y_pred_proba, dataset_name):
    """Plot a confusion matrix and ROC curve, print a report, return metrics.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        y_pred_proba: scores passed to ``roc_auc_score`` / ``roc_curve``
            (the callers in this cell pass the class-1 probability).
        dataset_name: label of the evaluated subset, used in titles and output.

    Returns:
        dict with keys 'Dataset', 'Accuracy', 'ROC AUC', 'Confusion Matrix'
        and 'Classification Report' (the report as text).
    """
    accuracy = accuracy_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred_proba)
    # Build the text report once; it is both printed and returned.
    report = classification_report(y_true, y_pred)

    # Raw counts plus each cell's share of its row total, in percent.
    conf_matrix = confusion_matrix(y_true, y_pred)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Overlay the percentage next to the count annotation of every cell.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - {dataset_name}\nROC-AUC: {roc_auc:.4f}')
    plt.show()

    plt.figure(figsize=(8, 6))
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line for a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {dataset_name}')
    plt.legend(loc='lower right')
    plt.show()

    print(f"\nClassification Report - {dataset_name}")
    print(report)

    return {
        'Dataset': dataset_name,
        'Accuracy': accuracy,
        'ROC AUC': roc_auc,
        'Confusion Matrix': conf_matrix,
        'Classification Report': report
    }

# Targets and feature matrices for the combined train and test frames.
y_train = combined_train['is_canceled']
y_test = combined_test['is_canceled']
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
X_test = combined_test[selected_columns].drop(columns=['is_canceled'])

# Fit the scaler on the training data only and reuse it for the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameters are hard-coded here (presumably copied from an earlier
# tuning run - verify against that run's output).
best_params = dict(
    C=7666.758151745492,
    penalty='l2',
    solver='liblinear',
    max_iter=2000,
    random_state=42,
)

best_model = LogisticRegression(**best_params)
best_model.fit(X_train_scaled, y_train)

# Hard labels and class-1 probabilities for the whole test set.
y_pred = best_model.predict(X_test_scaled)
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

print("\nEvaluating on Combined Test Set:")
combined_results = evaluate_and_visualize(y_test, y_pred, y_pred_proba, 'Combined Test Set')

# Evaluate the same fitted model separately on the rows of each hotel,
# selected through the encoded 'hotel' column of the test frame.
# NOTE(review): the dataset article names the resort hotel H1 and the city
# hotel H2 - confirm that the H1/H2 labels used below are the intended ones.
mask_h1 = combined_test['hotel'] == 0  # City Hotel
X_test_h1 = X_test_scaled[mask_h1]
y_pred_h1 = best_model.predict(X_test_h1)
y_pred_proba_h1 = best_model.predict_proba(X_test_h1)[:, 1]
print("\nEvaluating on H1 Test Set:")
h1_results = evaluate_and_visualize(y_test[mask_h1], y_pred_h1, y_pred_proba_h1, 'H1 Test Set')

mask_h2 = combined_test['hotel'] == 1  # Resort Hotel
X_test_h2 = X_test_scaled[mask_h2]
y_pred_h2 = best_model.predict(X_test_h2)
y_pred_proba_h2 = best_model.predict_proba(X_test_h2)[:, 1]
print("\nEvaluating on H2 Test Set:")
h2_results = evaluate_and_visualize(y_test[mask_h2], y_pred_h2, y_pred_proba_h2, 'H2 Test Set')

# Rank features by the magnitude of their logistic-regression coefficient.
coef_magnitudes = np.abs(best_model.coef_[0])
feature_importance = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': coef_magnitudes}
).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(
    data=feature_importance,
    x='Importance',
    y='Feature',
    palette='viridis',
)
plt.title('Feature Importance in Optimized Logistic Regression Model')
plt.xlabel('Absolute Coefficient Value')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Sizes of the test subsets and the class balance of the test target.
print("\nTest Set Information:")
print(f"Total Test Samples: {len(X_test)}")
print(f"H1 Test Samples: {sum(mask_h1)}")
print(f"H2 Test Samples: {sum(mask_h2)}")
print("\nClass Distribution in Test Set:")
test_class_share_percent = y_test.value_counts(normalize=True).round(4) * 100
print(test_class_share_percent)

# One entry per evaluated subset: (label, true labels, predictions, scores).
evaluation_subsets = [
    ('Combined Test Set', y_test, y_pred, y_pred_proba),
    ('H1 Test Set', y_test[mask_h1], y_pred_h1, y_pred_proba_h1),
    ('H2 Test Set', y_test[mask_h2], y_pred_h2, y_pred_proba_h2),
]

# ROC-AUC uses the scores, accuracy the hard predictions.
performance_comparison = pd.DataFrame({
    'Dataset': [label for label, _, _, _ in evaluation_subsets],
    'ROC AUC': [roc_auc_score(truth, proba) for _, truth, _, proba in evaluation_subsets],
    'Accuracy': [accuracy_score(truth, pred) for _, truth, pred, _ in evaluation_subsets],
})

print("\nPerformance Comparison:")
print(performance_comparison.round(4))

# Long format (one row per dataset/metric pair) for a grouped bar chart.
plt.figure(figsize=(10, 6))
performance_metrics = performance_comparison.melt(
    id_vars=['Dataset'],
    var_name='Metric',
    value_name='Score',
)
sns.barplot(data=performance_metrics, x='Dataset', y='Score', hue='Metric')
plt.title('Performance Comparison Across Test Sets')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# One entry per evaluated subset: (label, true labels, predictions, scores).
evaluation_subsets = [
    ('Combined Test Set', y_test, y_pred, y_pred_proba),
    ('H1 Test Set', y_test[mask_h1], y_pred_h1, y_pred_proba_h1),
    ('H2 Test Set', y_test[mask_h2], y_pred_h2, y_pred_proba_h2),
]

# Metrics computed from hard predictions, in output-column order.
label_scorers = [
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
]

# ROC-AUC is the only metric computed from the scores instead.
performance_comparison = pd.DataFrame({
    'Dataset': [label for label, _, _, _ in evaluation_subsets],
    'ROC AUC': [roc_auc_score(truth, proba) for _, truth, _, proba in evaluation_subsets],
    **{
        column: [scorer(truth, pred) for _, truth, pred, _ in evaluation_subsets]
        for column, scorer in label_scorers
    },
})

print("\nDetailed Performance Comparison:")
print(performance_comparison.round(4))

# Grouped bars: one group per dataset, one bar per metric inside each group.
plt.figure(figsize=(15, 8))
bar_width = 0.15
metrics = ['ROC AUC', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
x = np.arange(len(performance_comparison['Dataset']))

for i, metric in enumerate(metrics):
    # Shift every metric by one bar width so the bars sit side by side.
    bar_positions = x + i * bar_width
    plt.bar(bar_positions, performance_comparison[metric], bar_width, label=metric, alpha=0.8)

plt.xlabel('Dataset')
plt.ylabel('Score')
plt.title('Complete Performance Comparison Across Test Sets')
# Put the dataset label under the middle (third of five) bar of its group.
group_centers = x + bar_width * 2
plt.xticks(group_centers, performance_comparison['Dataset'], rotation=45)
plt.legend(loc='lower right')
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Absolute scores: datasets as rows, metrics as columns.
metrics_df = performance_comparison.set_index('Dataset')
plt.figure(figsize=(12, 6))
sns.heatmap(metrics_df, annot=True, cmap='YlOrRd', fmt='.4f', cbar_kws={'label': 'Score'})
plt.title('Performance Metrics Heatmap')
plt.tight_layout()
plt.show()

# Relative scores: each metric column divided by its best value, in percent.
column_best = metrics_df.max()
relative_performance = metrics_df.div(column_best) * 100
plt.figure(figsize=(12, 6))
sns.heatmap(
    relative_performance,
    annot=True,
    cmap='YlOrRd',
    fmt='.1f',
    cbar_kws={'label': 'Relative Performance (%)'},
)
plt.title('Relative Performance Metrics Heatmap (%)')
plt.tight_layout()
plt.show()

# One bar chart per metric on a flattened 2x3 grid, one color per metric.
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
axes = axes.ravel()
colors = sns.color_palette('husl', n_colors=len(metrics))

for ax, metric, color in zip(axes, metrics, colors):
    sns.barplot(data=performance_comparison, x='Dataset', y=metric, ax=ax, color=color)
    ax.set_title(f'{metric} Comparison')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
    ax.grid(True, axis='y', linestyle='--', alpha=0.7)

# The grid has more cells than there are metrics; drop the unused axes.
for unused_ax in axes[len(metrics):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Column-wise summary statistics over the three evaluated datasets.
print("\nDetailed Statistics:")
metric_range = metrics_df.max() - metrics_df.min()
for heading, statistic in (
    ("\nMean Performance Across Datasets:", metrics_df.mean()),
    ("\nStandard Deviation of Performance:", metrics_df.std()),
    ("\nRange of Performance (Max - Min):", metric_range),
):
    print(heading)
    print(statistic.round(4))

# Relative change of each per-hotel score versus the combined test set.
print("\nPerformance Difference from Combined Test Set (%):")
baseline = metrics_df.loc['Combined Test Set']
for dataset in ['H1 Test Set', 'H2 Test Set']:
    relative_change = (metrics_df.loc[dataset] - baseline) / baseline * 100
    diff_percentage = relative_change.round(2)
    print(f"\n{dataset}:")
    for metric, value in diff_percentage.items():
        print(f"{metric}: {value:+.2f}%")

Coefficient-based feature selection (top 10 features by absolute coefficient)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

def evaluate_and_visualize(y_true, y_pred, y_pred_proba, method_name, dataset_name):
    """Plot a confusion matrix and ROC curve, print a report, return metrics.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        y_pred_proba: scores passed to ``roc_auc_score`` / ``roc_curve``
            (the callers in this cell pass the class-1 probability).
        method_name: label of the modelling approach, shown in titles.
        dataset_name: label of the evaluated subset, shown in titles.

    Returns:
        dict with keys 'Dataset', 'Accuracy', 'ROC AUC' and 'Confusion Matrix'.
    """
    summary = {
        'Dataset': dataset_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'ROC AUC': roc_auc_score(y_true, y_pred_proba),
        'Confusion Matrix': confusion_matrix(y_true, y_pred)
    }
    roc_auc = summary['ROC AUC']
    conf_matrix = summary['Confusion Matrix']

    # Each cell's share of its row total, in percent.
    row_totals = conf_matrix.sum(axis=1).reshape(-1, 1)
    conf_matrix_percentage = conf_matrix / row_totals * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Overlay the percentage next to the count annotation of every cell.
    for (i, j), share in np.ndenumerate(conf_matrix_percentage):
        plt.text(j + 0.2, i + 0.2, f"{share:.1f}%", ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - {method_name}\n{dataset_name} (ROC-AUC: {roc_auc:.4f})')
    plt.show()

    plt.figure(figsize=(8, 6))
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line for a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {method_name}\n{dataset_name}')
    plt.legend(loc='lower right')
    plt.show()

    print(f"\nClassification Report - {method_name} - {dataset_name}")
    print(classification_report(y_true, y_pred))

    return summary

# Targets and feature matrices for the combined train and test frames.
y_train = combined_train['is_canceled']
y_test = combined_test['is_canceled']
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
X_test = combined_test[selected_columns].drop(columns=['is_canceled'])

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# A default logistic regression on all features provides the ranking.
initial_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
initial_model.fit(X_train_scaled, y_train)

initial_coefs = initial_model.coef_[0]
feature_importance = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Coefficient': initial_coefs,
        'Absolute_Coefficient': np.abs(initial_coefs),
    }
).sort_values(by='Absolute_Coefficient', ascending=False)

# Keep the ten features with the largest absolute coefficient.
top_features = feature_importance.head(10)['Feature'].tolist()

X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

# Re-fit the same scaler on the reduced training set, then apply it to test.
X_train_scaled_top = scaler.fit_transform(X_train_top)
X_test_scaled_top = scaler.transform(X_test_top)

# Hyperparameters are hard-coded here (presumably copied from an earlier
# tuning run - verify against that run's output).
best_params = dict(
    C=38.31866597407751,
    penalty='l2',
    solver='liblinear',
    max_iter=2000,
    random_state=42,
)

best_model = LogisticRegression(**best_params)
best_model.fit(X_train_scaled_top, y_train)

# Hard labels and class-1 probabilities for the whole test set.
y_pred = best_model.predict(X_test_scaled_top)
y_pred_proba = best_model.predict_proba(X_test_scaled_top)[:, 1]
combined_results = evaluate_and_visualize(
    y_test, y_pred, y_pred_proba, 'Optimized Top 10 Model', 'Combined Test Set'
)

# Evaluate the same fitted model separately on the rows of each hotel,
# selected through the encoded 'hotel' column of the test frame.
# NOTE(review): the dataset article names the resort hotel H1 and the city
# hotel H2 - confirm that the H1/H2 labels used below are the intended ones.
mask_h1 = combined_test['hotel'] == 0  # City Hotel
X_test_h1 = X_test_scaled_top[mask_h1]
y_pred_h1 = best_model.predict(X_test_h1)
y_pred_proba_h1 = best_model.predict_proba(X_test_h1)[:, 1]
h1_results = evaluate_and_visualize(
    y_test[mask_h1], y_pred_h1, y_pred_proba_h1, 'Optimized Top 10 Model', 'H1 Test Set'
)

mask_h2 = combined_test['hotel'] == 1  # Resort Hotel
X_test_h2 = X_test_scaled_top[mask_h2]
y_pred_h2 = best_model.predict(X_test_h2)
y_pred_proba_h2 = best_model.predict_proba(X_test_h2)[:, 1]
h2_results = evaluate_and_visualize(
    y_test[mask_h2], y_pred_h2, y_pred_proba_h2, 'Optimized Top 10 Model', 'H2 Test Set'
)

# One entry per evaluated subset: (label, true labels, predictions, scores).
evaluation_subsets = [
    ('Combined Test Set', y_test, y_pred, y_pred_proba),
    ('H1 Test Set', y_test[mask_h1], y_pred_h1, y_pred_proba_h1),
    ('H2 Test Set', y_test[mask_h2], y_pred_h2, y_pred_proba_h2),
]

# One row per subset; ROC-AUC uses the scores, accuracy the hard predictions.
results_df = pd.DataFrame([
    {
        'Dataset': label,
        'ROC AUC': roc_auc_score(truth, proba),
        'Accuracy': accuracy_score(truth, pred),
    }
    for label, truth, pred, proba in evaluation_subsets
])

print("\nPerformance Summary:")
print(results_df.round(4))

# Long format (one row per dataset/metric pair) for a grouped bar chart.
plt.figure(figsize=(10, 6))
metrics = ['ROC AUC', 'Accuracy']
results_melted = results_df.melt(
    id_vars=['Dataset'],
    value_vars=metrics,
    var_name='Metric',
    value_name='Score',
)

sns.barplot(x='Dataset', y='Score', hue='Metric', data=results_melted)
plt.title('Performance Comparison Across Test Sets')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print("\nFeature Coefficients in Final Model:")
# Signed and absolute coefficients per feature, largest magnitude first.
final_coefs = best_model.coef_[0]
final_coefficients = pd.DataFrame(
    {
        'Feature': top_features,
        'Coefficient': final_coefs,
        'Absolute_Coefficient': np.abs(final_coefs),
    }
).sort_values(by='Absolute_Coefficient', ascending=False)

for feature_name, coefficient in zip(final_coefficients['Feature'], final_coefficients['Coefficient']):
    print(f"{feature_name}: {coefficient:.4f}")

plt.figure(figsize=(12, 6))
sns.barplot(
    data=final_coefficients,
    x='Absolute_Coefficient',
    y='Feature',
    palette='viridis',
)
plt.title('Feature Importance in Final Model')
plt.xlabel('Absolute Coefficient Value')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                           roc_auc_score, roc_curve, precision_score, recall_score, f1_score)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

def evaluate_and_visualize(y_true, y_pred, y_pred_proba, method_name, dataset_name):
    """Plot a confusion matrix and ROC curve and print a classification report.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        y_pred_proba: scores passed to ``roc_auc_score`` / ``roc_curve``.
        method_name: label of the modelling approach, shown in titles.
        dataset_name: label of the evaluated subset, shown in titles.

    Returns:
        None; the function only plots and prints.
    """
    # The unused accuracy computation was dropped; only the AUC is displayed.
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    # Raw counts plus each cell's share of its row total, in percent.
    conf_matrix = confusion_matrix(y_true, y_pred)
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Overlay the percentage next to the count annotation of every cell.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - {method_name}\n{dataset_name}')
    plt.show()

    plt.figure(figsize=(8, 6))
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line for a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {method_name}\n{dataset_name}')
    plt.legend(loc='lower right')
    plt.show()

    print(f"\nClassification Report - {method_name} - {dataset_name}")
    print(classification_report(y_true, y_pred))

# One entry per evaluated subset: (label, true labels, predictions, scores).
evaluation_subsets = [
    ('Combined Test Set', y_test, y_pred, y_pred_proba),
    ('H1 Test Set', y_test[mask_h1], y_pred_h1, y_pred_proba_h1),
    ('H2 Test Set', y_test[mask_h2], y_pred_h2, y_pred_proba_h2),
]

# Metrics computed from hard predictions, in output-column order.
label_scorers = [
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
]

# ROC-AUC is the only metric computed from the scores instead.
performance_comparison = pd.DataFrame({
    'Dataset': [label for label, _, _, _ in evaluation_subsets],
    'ROC AUC': [roc_auc_score(truth, proba) for _, truth, _, proba in evaluation_subsets],
    **{
        column: [scorer(truth, pred) for _, truth, pred, _ in evaluation_subsets]
        for column, scorer in label_scorers
    },
})

print("\nDetailed Performance Comparison:")
print(performance_comparison.round(4))

# Grouped bars: one group per dataset, one bar per metric inside each group.
plt.figure(figsize=(15, 8))
bar_width = 0.15
metrics = ['ROC AUC', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
x = np.arange(len(performance_comparison['Dataset']))

for i, metric in enumerate(metrics):
    # Shift every metric by one bar width so the bars sit side by side.
    bar_positions = x + i * bar_width
    plt.bar(bar_positions, performance_comparison[metric], bar_width, label=metric, alpha=0.8)

plt.xlabel('Dataset')
plt.ylabel('Score')
plt.title('Complete Performance Comparison Across Test Sets')
# Put the dataset label under the middle (third of five) bar of its group.
group_centers = x + bar_width * 2
plt.xticks(group_centers, performance_comparison['Dataset'], rotation=45)
plt.legend(loc='lower right')
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Absolute scores: datasets as rows, metrics as columns.
metrics_df = performance_comparison.set_index('Dataset')
plt.figure(figsize=(12, 6))
sns.heatmap(metrics_df, annot=True, cmap='YlOrRd', fmt='.4f', cbar_kws={'label': 'Score'})
plt.title('Performance Metrics Heatmap')
plt.tight_layout()
plt.show()

# Relative scores: each metric column divided by its best value, in percent.
column_best = metrics_df.max()
relative_performance = metrics_df.div(column_best) * 100
plt.figure(figsize=(12, 6))
sns.heatmap(
    relative_performance,
    annot=True,
    cmap='YlOrRd',
    fmt='.1f',
    cbar_kws={'label': 'Relative Performance (%)'},
)
plt.title('Relative Performance Metrics Heatmap (%)')
plt.tight_layout()
plt.show()

# One bar chart per metric on a flattened 2x3 grid, one color per metric.
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
axes = axes.ravel()
colors = sns.color_palette('husl', n_colors=len(metrics))

for ax, metric, color in zip(axes, metrics, colors):
    sns.barplot(data=performance_comparison, x='Dataset', y=metric, ax=ax, color=color)
    ax.set_title(f'{metric} Comparison')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
    ax.grid(True, axis='y', linestyle='--', alpha=0.7)

# The grid has more cells than there are metrics; drop the unused axes.
for unused_ax in axes[len(metrics):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Column-wise summary statistics over the three evaluated datasets.
print("\nDetailed Statistics:")
metric_range = metrics_df.max() - metrics_df.min()
for heading, statistic in (
    ("\nMean Performance Across Datasets:", metrics_df.mean()),
    ("\nStandard Deviation of Performance:", metrics_df.std()),
    ("\nRange of Performance (Max - Min):", metric_range),
):
    print(heading)
    print(statistic.round(4))

# Relative change of each per-hotel score versus the combined test set.
print("\nPerformance Difference from Combined Test Set (%):")
baseline = metrics_df.loc['Combined Test Set']
for dataset in ['H1 Test Set', 'H2 Test Set']:
    relative_change = (metrics_df.loc[dataset] - baseline) / baseline * 100
    diff_percentage = relative_change.round(2)
    print(f"\n{dataset}:")
    for metric, value in diff_percentage.items():
        print(f"{metric}: {value:+.2f}%")

SHAP-based feature selection (top 10 features by mean |SHAP| value)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                           roc_auc_score, roc_curve, precision_score, recall_score, f1_score)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import shap
import warnings
warnings.filterwarnings('ignore')

def evaluate_and_visualize(y_true, y_pred, y_pred_proba, dataset_name):
    """Plot a confusion matrix and a ROC curve, then print a classification report.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        y_pred_proba: Positive-class scores, passed to ``roc_auc_score`` and
            ``roc_curve``.
        dataset_name: Text appended to both plot titles and the report header.

    Returns:
        None. The function only draws figures and prints.
    """
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    conf_matrix = confusion_matrix(y_true, y_pred)
    # Row-normalise to percentages. Rows without any true samples stay at 0
    # instead of turning into NaN through a division by zero.
    row_totals = conf_matrix.sum(axis=1, keepdims=True)
    conf_matrix_percentage = np.divide(
        conf_matrix,
        row_totals,
        out=np.zeros(conf_matrix.shape, dtype=float),
        where=row_totals != 0,
    ) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Overlay the row percentage, offset from the count annotation in each cell.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix\n{dataset_name}')
    plt.show()

    plt.figure(figsize=(8, 6))
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve\n{dataset_name}')
    plt.legend(loc='lower right')
    plt.show()

    print(f"\nClassification Report - {dataset_name}")
    print(classification_report(y_true, y_pred))

# Split features and target for the combined train and test sets.
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']
X_test = combined_test[selected_columns].drop(columns=['is_canceled'])
y_test = combined_test['is_canceled']

# Standardise every candidate feature and fit a preliminary model that is only
# used to rank features with SHAP.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

initial_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
initial_model.fit(X_train_scaled, y_train)

explainer = shap.LinearExplainer(initial_model, X_train_scaled)
shap_values = explainer.shap_values(X_train_scaled)

# Rank features by mean absolute SHAP value and keep the ten strongest.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': np.mean(np.abs(shap_values), axis=0)
}).sort_values(by='Importance', ascending=False)

top_features = feature_importance.head(10)['Feature'].tolist()

X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

# The same scaler object is refit here, so from this point on it only knows
# the selected columns.
X_train_scaled_top = scaler.fit_transform(X_train_top)
X_test_scaled_top = scaler.transform(X_test_top)

# NOTE(review): presumably the result of an earlier hyper-parameter search —
# confirm the origin of these values.
best_params = {
    'C': 5221.17572683451,
    'penalty': 'l2',
    'solver': 'liblinear',
    'max_iter': 2000,
    'random_state': 42
}

best_model = LogisticRegression(**best_params)
best_model.fit(X_train_scaled_top, y_train)

# Row masks for the two hotels (hotel == 0 is reported as H1, hotel == 1 as H2).
mask_h1 = combined_test['hotel'] == 0
mask_h2 = combined_test['hotel'] == 1

y_pred = best_model.predict(X_test_scaled_top)
y_pred_proba = best_model.predict_proba(X_test_scaled_top)[:, 1]

# The model scores each row independently, so the per-hotel predictions are
# row subsets of the full-test predictions; slice them instead of calling the
# model again. The masks are converted to NumPy so the selection is explicitly
# positional.
h1_rows = mask_h1.to_numpy()
h2_rows = mask_h2.to_numpy()

y_pred_h1 = y_pred[h1_rows]
y_pred_proba_h1 = y_pred_proba[h1_rows]

y_pred_h2 = y_pred[h2_rows]
y_pred_proba_h2 = y_pred_proba[h2_rows]

# Score the same fitted model on the full test set and on each hotel's subset.
# Each entry is (display name, true labels, predicted labels, predicted scores).
evaluation_sets = [
    ('Combined Test Set', y_test, y_pred, y_pred_proba),
    ('H1 Test Set', y_test[mask_h1], y_pred_h1, y_pred_proba_h1),
    ('H2 Test Set', y_test[mask_h2], y_pred_h2, y_pred_proba_h2),
]

# ROC AUC is scored on probabilities; the remaining metrics use hard labels.
label_scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

comparison_columns = {
    'Dataset': [set_name for set_name, _, _, _ in evaluation_sets],
    'ROC AUC': [roc_auc_score(truth, scores) for _, truth, _, scores in evaluation_sets],
}
for metric_name, scorer in label_scorers.items():
    comparison_columns[metric_name] = [
        scorer(truth, labels) for _, truth, labels, _ in evaluation_sets
    ]

performance_comparison = pd.DataFrame(comparison_columns)

print("\nDetailed Performance Comparison:")
print(performance_comparison.round(4))

# Grouped bar chart: one cluster per dataset, one bar per metric.
metrics = ['ROC AUC', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
bar_width = 0.15
x = np.arange(len(performance_comparison['Dataset']))

plt.figure(figsize=(15, 8))
for offset, metric_name in enumerate(metrics):
    plt.bar(
        x + offset * bar_width,
        performance_comparison[metric_name],
        bar_width,
        label=metric_name,
        alpha=0.8,
    )

plt.xlabel('Dataset')
plt.ylabel('Score')
plt.title('Complete Performance Comparison Across Test Sets')
# Centre the tick under the middle (third) of the five bars.
plt.xticks(x + bar_width * 2, performance_comparison['Dataset'], rotation=45)
plt.legend(loc='lower right')
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Heatmap of the raw scores, datasets as rows.
metrics_df = performance_comparison.set_index('Dataset')
plt.figure(figsize=(12, 6))
sns.heatmap(metrics_df, annot=True, cmap='YlOrRd', fmt='.4f', cbar_kws={'label': 'Score'})
plt.title('Performance Metrics Heatmap')
plt.tight_layout()
plt.show()

# Heatmap of each score as a percentage of the best score in its column.
best_per_metric = metrics_df.max()
relative_performance = metrics_df.div(best_per_metric) * 100
plt.figure(figsize=(12, 6))
sns.heatmap(
    relative_performance,
    annot=True,
    cmap='YlOrRd',
    fmt='.1f',
    cbar_kws={'label': 'Relative Performance (%)'},
)
plt.title('Relative Performance Metrics Heatmap (%)')
plt.tight_layout()
plt.show()

# One panel per metric on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
axes = axes.ravel()
colors = sns.color_palette('husl', n_colors=len(metrics))

for panel, metric_name, bar_color in zip(axes, metrics, colors):
    sns.barplot(data=performance_comparison, x='Dataset', y=metric_name, ax=panel,
                color=bar_color)
    panel.set_title(f'{metric_name} Comparison')
    panel.set_xticklabels(panel.get_xticklabels(), rotation=45)
    panel.grid(True, axis='y', linestyle='--', alpha=0.7)

# The grid has more panels than metrics; remove the unused ones.
for unused_panel in axes[len(metrics):]:
    fig.delaxes(unused_panel)

plt.tight_layout()
plt.show()

# Spread of every metric across the three evaluation sets.
print("\nDetailed Statistics:")
summary_sections = (
    ("\nMean Performance Across Datasets:", metrics_df.mean()),
    ("\nStandard Deviation of Performance:", metrics_df.std()),
    ("\nRange of Performance (Max - Min):", metrics_df.max() - metrics_df.min()),
)
for heading, summary in summary_sections:
    print(heading)
    print(summary.round(4))

# Relative change of each hotel's scores against the combined test set.
print("\nPerformance Difference from Combined Test Set (%):")
baseline = metrics_df.loc['Combined Test Set']
relative_change = (
    metrics_df.loc[['H1 Test Set', 'H2 Test Set']]
    .sub(baseline)
    .div(baseline)
    .mul(100)
    .round(2)
)
for dataset, diff_percentage in relative_change.iterrows():
    print(f"\n{dataset}:")
    print("\n".join(f"{metric}: {value:+.2f}%" for metric, value in diff_percentage.items()))

boruta

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                           roc_auc_score, roc_curve, precision_score, recall_score, f1_score)
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def evaluate_and_visualize(y_true, y_pred, y_pred_proba, dataset_name):
    """Plot a confusion matrix and a ROC curve, then print a classification report.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        y_pred_proba: Positive-class scores, passed to ``roc_auc_score`` and
            ``roc_curve``.
        dataset_name: Text appended to both plot titles and the report header.

    Returns:
        None. The function only draws figures and prints.
    """
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    conf_matrix = confusion_matrix(y_true, y_pred)
    # Row-normalise to percentages. Rows without any true samples stay at 0
    # instead of turning into NaN through a division by zero.
    row_totals = conf_matrix.sum(axis=1, keepdims=True)
    conf_matrix_percentage = np.divide(
        conf_matrix,
        row_totals,
        out=np.zeros(conf_matrix.shape, dtype=float),
        where=row_totals != 0,
    ) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    # Overlay the row percentage, offset from the count annotation in each cell.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix\n{dataset_name}')
    plt.show()

    plt.figure(figsize=(8, 6))
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve\n{dataset_name}')
    plt.legend(loc='lower right')
    plt.show()

    print(f"\nClassification Report - {dataset_name}")
    print(classification_report(y_true, y_pred))

# Split features and target for the combined train and test sets.
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']
X_test = combined_test[selected_columns].drop(columns=['is_canceled'])
y_test = combined_test['is_canceled']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Boruta feature selection driven by a random forest.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)
# The BorutaPy README documents NumPy-array input, so the target is passed as
# an array rather than a pandas Series (X_train_scaled is already an array).
boruta_selector.fit(X_train_scaled, y_train.to_numpy())

# Keep only the columns flagged in the selector's boolean support mask.
selected_features = X_train.columns[boruta_selector.support_].tolist()

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# The same scaler object is refit here, so from this point on it only knows
# the selected columns.
X_train_scaled_selected = scaler.fit_transform(X_train_selected)
X_test_scaled_selected = scaler.transform(X_test_selected)

# NOTE(review): presumably the result of an earlier hyper-parameter search —
# confirm the origin of these values.
best_params = {
    'C': 18.0836808047728,
    'penalty': 'l1',
    'solver': 'liblinear',
    'max_iter': 2000,
    'random_state': 42
}

best_model = LogisticRegression(**best_params)
best_model.fit(X_train_scaled_selected, y_train)

# Row masks for the two hotels (hotel == 0 is reported as H1, hotel == 1 as H2).
mask_h1 = combined_test['hotel'] == 0
mask_h2 = combined_test['hotel'] == 1

y_pred = best_model.predict(X_test_scaled_selected)
y_pred_proba = best_model.predict_proba(X_test_scaled_selected)[:, 1]

# The model scores each row independently, so the per-hotel predictions are
# row subsets of the full-test predictions; slice them instead of calling the
# model again. The masks are converted to NumPy so the selection is explicitly
# positional.
h1_rows = mask_h1.to_numpy()
h2_rows = mask_h2.to_numpy()

y_pred_h1 = y_pred[h1_rows]
y_pred_proba_h1 = y_pred_proba[h1_rows]

y_pred_h2 = y_pred[h2_rows]
y_pred_proba_h2 = y_pred_proba[h2_rows]

# Score the same fitted model on the full test set and on each hotel's subset.
# Each entry is (display name, true labels, predicted labels, predicted scores).
evaluation_sets = [
    ('Combined Test Set', y_test, y_pred, y_pred_proba),
    ('H1 Test Set', y_test[mask_h1], y_pred_h1, y_pred_proba_h1),
    ('H2 Test Set', y_test[mask_h2], y_pred_h2, y_pred_proba_h2),
]

# ROC AUC is scored on probabilities; the remaining metrics use hard labels.
label_scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

comparison_columns = {
    'Dataset': [set_name for set_name, _, _, _ in evaluation_sets],
    'ROC AUC': [roc_auc_score(truth, scores) for _, truth, _, scores in evaluation_sets],
}
for metric_name, scorer in label_scorers.items():
    comparison_columns[metric_name] = [
        scorer(truth, labels) for _, truth, labels, _ in evaluation_sets
    ]

performance_comparison = pd.DataFrame(comparison_columns)

print("\nDetailed Performance Comparison:")
print(performance_comparison.round(4))

# Grouped bar chart: one cluster per dataset, one bar per metric.
metrics = ['ROC AUC', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
bar_width = 0.15
x = np.arange(len(performance_comparison['Dataset']))

plt.figure(figsize=(15, 8))
for offset, metric_name in enumerate(metrics):
    plt.bar(
        x + offset * bar_width,
        performance_comparison[metric_name],
        bar_width,
        label=metric_name,
        alpha=0.8,
    )

plt.xlabel('Dataset')
plt.ylabel('Score')
plt.title('Complete Performance Comparison Across Test Sets')
# Centre the tick under the middle (third) of the five bars.
plt.xticks(x + bar_width * 2, performance_comparison['Dataset'], rotation=45)
plt.legend(loc='lower right')
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Heatmap of the raw scores, datasets as rows.
metrics_df = performance_comparison.set_index('Dataset')
plt.figure(figsize=(12, 6))
sns.heatmap(metrics_df, annot=True, cmap='YlOrRd', fmt='.4f', cbar_kws={'label': 'Score'})
plt.title('Performance Metrics Heatmap')
plt.tight_layout()
plt.show()

# Heatmap of each score as a percentage of the best score in its column.
best_per_metric = metrics_df.max()
relative_performance = metrics_df.div(best_per_metric) * 100
plt.figure(figsize=(12, 6))
sns.heatmap(
    relative_performance,
    annot=True,
    cmap='YlOrRd',
    fmt='.1f',
    cbar_kws={'label': 'Relative Performance (%)'},
)
plt.title('Relative Performance Metrics Heatmap (%)')
plt.tight_layout()
plt.show()

# One panel per metric on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
axes = axes.ravel()
colors = sns.color_palette('husl', n_colors=len(metrics))

for panel, metric_name, bar_color in zip(axes, metrics, colors):
    sns.barplot(data=performance_comparison, x='Dataset', y=metric_name, ax=panel,
                color=bar_color)
    panel.set_title(f'{metric_name} Comparison')
    panel.set_xticklabels(panel.get_xticklabels(), rotation=45)
    panel.grid(True, axis='y', linestyle='--', alpha=0.7)

# The grid has more panels than metrics; remove the unused ones.
for unused_panel in axes[len(metrics):]:
    fig.delaxes(unused_panel)

plt.tight_layout()
plt.show()

# Spread of every metric across the three evaluation sets.
print("\nDetailed Statistics:")
summary_sections = (
    ("\nMean Performance Across Datasets:", metrics_df.mean()),
    ("\nStandard Deviation of Performance:", metrics_df.std()),
    ("\nRange of Performance (Max - Min):", metrics_df.max() - metrics_df.min()),
)
for heading, summary in summary_sections:
    print(heading)
    print(summary.round(4))

# Relative change of each hotel's scores against the combined test set.
print("\nPerformance Difference from Combined Test Set (%):")
baseline = metrics_df.loc['Combined Test Set']
relative_change = (
    metrics_df.loc[['H1 Test Set', 'H2 Test Set']]
    .sub(baseline)
    .div(baseline)
    .mul(100)
    .round(2)
)
for dataset, diff_percentage in relative_change.iterrows():
    print(f"\n{dataset}:")
    print("\n".join(f"{metric}: {value:+.2f}%" for metric, value in diff_percentage.items()))

## 2. Random Forest





### baseline model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Prepare data with combined dataset
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']

# NOTE(review): the scaler is fit on the whole training set before the folds
# are split, so each validation fold has influenced the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Create and evaluate random forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Fit every fold exactly once and collect out-of-fold labels, probabilities
# and ROC curves in the same pass. The previous version trained the forest
# nine times (two cross_val_predict calls plus a refit loop) for these results.
y_pred_cv = np.empty_like(np.asarray(y_train))
y_pred_proba_cv = np.empty(len(y_train), dtype=float)
fold_roc_curves = []
for train_idx, test_idx in cv.split(X_train_scaled, y_train):
    rf_model.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
    fold_proba = rf_model.predict_proba(X_train_scaled[test_idx])
    # Same decision rule as RandomForestClassifier.predict: the class with
    # the highest averaged probability.
    y_pred_cv[test_idx] = rf_model.classes_[fold_proba.argmax(axis=1)]
    y_pred_proba_fold = fold_proba[:, 1]
    y_pred_proba_cv[test_idx] = y_pred_proba_fold
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    fold_roc_curves.append((fpr, tpr, roc_auc_fold))

conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the row percentage, offset from the count annotation in each cell.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Combined Dataset Confusion Matrix (Random Forest)')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))
print(f"\nOverall ROC AUC Score: {roc_auc_score(y_train, y_pred_proba_cv):.4f}")

# One ROC curve per validation fold, drawn from the curves collected above.
plt.figure(figsize=(12, 8))
for fold_count, (fpr, tpr, roc_auc_fold) in enumerate(fold_roc_curves, start=1):
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Combined Dataset (Random Forest)')
plt.legend(loc='lower right')
plt.show()

# Basic facts about the training data.
print("\nDataset Information:")
print(f"Total Training Samples: {len(X_train)}")
print(f"Number of Features: {X_train.shape[1]}")
print("Class Distribution in Training Set:")
print(y_train.value_counts(normalize=True).round(4) * 100)

# Fit the forest on the full training set for the importance analysis.
final_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
final_model.fit(X_train_scaled, y_train)

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': final_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in Random Forest Model')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

print("\nModel Parameters:")
print("n_estimators: 100")
print("random_state: 42")
print("n_jobs: -1 (using all available cores)")

print("\nTop 10 Most Important Features:")
for idx, (feature, importance) in enumerate(zip(feature_importance['Feature'][:10],
                                              feature_importance['Importance'][:10]), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

# Depth of every fitted tree in the ensemble.
tree_depths = [tree.get_depth() for tree in final_model.estimators_]

plt.figure(figsize=(10, 6))
plt.hist(tree_depths, bins=20, edgecolor='black')
plt.title('Distribution of Tree Depths in Random Forest')
plt.xlabel('Tree Depth')
plt.ylabel('Count')
plt.show()

print("\nTree Depth Statistics:")
print(f"Mean Depth: {np.mean(tree_depths):.2f}")
print(f"Max Depth: {np.max(tree_depths)}")
print(f"Min Depth: {np.min(tree_depths)}")

# Running total of the importances, most important feature first.
importance_cum = np.cumsum(feature_importance['Importance'])
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(importance_cum) + 1), importance_cum)
plt.title('Cumulative Feature Importance')
plt.xlabel('Number of Features')
plt.ylabel('Cumulative Importance')
plt.grid(True)
plt.show()

# Position of the first feature at which the running total reaches 0.9.
# The Series keeps the original (unsorted) index labels, so the search runs
# on the underlying array to stay positional. The figure is a share of total
# feature importance, not of explained variance, and is reported as such.
n_features_90 = int(np.argmax(importance_cum.to_numpy() >= 0.9)) + 1
print(f"\nNumber of features needed to reach 90% of cumulative importance: {n_features_90}")

### different feature combinations

feature importance

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Features and target from the combined training set.
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Rank every feature with a preliminary forest trained on all columns.
# NOTE(review): the ranking uses the full training set, and that same data is
# cross-validated afterwards, so the CV scores may be optimistic.
initial_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
initial_model.fit(X_train_scaled, y_train)

feature_importance = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Importance': initial_model.feature_importances_,
    }
).sort_values(by='Importance', ascending=False)

ten_best = feature_importance.head(10)
top_features = ten_best['Feature'].tolist()

print("Selected Top 10 Features:")
for rank, ranked in enumerate(ten_best.itertuples(index=False), 1):
    print(f"{rank}. {ranked.Feature}: {ranked.Importance:.4f}")

# Restrict the data to the selected columns. The scaler is refit, so from
# here on it only knows these columns.
X_train_top = X_train[top_features]
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Create and evaluate model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Fit every fold exactly once and collect out-of-fold labels, probabilities
# and ROC curves in the same pass. The previous version trained the forest
# nine times (two cross_val_predict calls plus a refit loop) for these results.
y_pred_cv = np.empty_like(np.asarray(y_train))
y_pred_proba_cv = np.empty(len(y_train), dtype=float)
fold_roc_curves = []
for train_idx, test_idx in cv.split(X_train_scaled_top, y_train):
    rf_model.fit(X_train_scaled_top[train_idx], y_train.iloc[train_idx])
    fold_proba = rf_model.predict_proba(X_train_scaled_top[test_idx])
    # Same decision rule as RandomForestClassifier.predict: the class with
    # the highest averaged probability.
    y_pred_cv[test_idx] = rf_model.classes_[fold_proba.argmax(axis=1)]
    y_pred_proba_fold = fold_proba[:, 1]
    y_pred_proba_cv[test_idx] = y_pred_proba_fold
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    fold_roc_curves.append((fpr, tpr, roc_auc_fold))

# Plot confusion matrix
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the row percentage, offset from the count annotation in each cell.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Top 10 Features)')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

overall_auc = roc_auc_score(y_train, y_pred_proba_cv)
print(f"\nOverall ROC AUC Score: {overall_auc:.4f}")

# Plot ROC curves, one per validation fold, from the curves collected above.
plt.figure(figsize=(12, 8))
for fold_count, (fpr, tpr, roc_auc_fold) in enumerate(fold_roc_curves, start=1):
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves (Top 10 Features)')
plt.legend(loc='lower right')
plt.show()

# Fit the reduced (top-10) forest on the full training set.
final_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
final_model.fit(X_train_scaled_top, y_train)

# Depth of every fitted tree in the ensemble.
tree_depths = [estimator.get_depth() for estimator in final_model.estimators_]
plt.figure(figsize=(10, 6))
plt.hist(tree_depths, bins=20, edgecolor='black')
plt.title('Distribution of Tree Depths (Top 10 Features Model)')
plt.xlabel('Tree Depth')
plt.ylabel('Count')
plt.show()

# Importances inside the reduced model, most important first.
new_feature_importance = pd.DataFrame(
    {
        'Feature': top_features,
        'Importance': final_model.feature_importances_,
    }
).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=new_feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in Top 10 Features Model')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Running total of the importances in ranked order.
importance_cum = np.cumsum(new_feature_importance['Importance'])
ranked_count = len(importance_cum)
plt.figure(figsize=(10, 6))
plt.plot(range(1, ranked_count + 1), importance_cum)
plt.title('Cumulative Feature Importance (Top 10 Features)')
plt.xlabel('Number of Features')
plt.ylabel('Cumulative Importance')
plt.grid(True)
plt.show()

# Text summary; each report section is emitted with a single print call.
print("\n".join([
    "\nModel Information:",
    f"Number of original features: {X_train.shape[1]}",
    f"Number of selected features: {len(top_features)}",
    f"Total training samples: {len(X_train)}",
]))

print("\n".join([
    "\nTree Depth Statistics:",
    f"Mean Depth: {np.mean(tree_depths):.2f}",
    f"Max Depth: {max(tree_depths)}",
    f"Min Depth: {min(tree_depths)}",
]))

print("\nFeature Importance Ranking in Top 10 Features Model:")
for rank, ranked in enumerate(new_feature_importance.itertuples(index=False), 1):
    print(f"{rank}. {ranked.Feature}: {ranked.Importance:.4f}")

print("\nPerformance Comparison:")
print(f"Top 10 Features Model ROC AUC: {overall_auc:.4f}")

# Only a variable assignment happens here; nothing is written to disk.
selected_features = top_features
print("\nSelected features saved for future use.")

shap

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import shap
import warnings
warnings.filterwarnings('ignore')

# Concatenate the H1 and H2 splits (test_data is built but not used in this cell).
train_data = pd.concat([H1_train, H2_train], axis=0)
test_data = pd.concat([H1_test, H2_test], axis=0)

# Features and target
X_train = train_data[selected_columns].drop(columns=['is_canceled'])
y_train = train_data['is_canceled']

# Standardise the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Preliminary forest that is only used to compute SHAP values
initial_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
initial_model.fit(X_train_scaled, y_train)

# SHAP values for every training row
explainer = shap.TreeExplainer(initial_model)
shap_values = explainer.shap_values(X_train_scaled)

# Keep the class-1 attributions only. Older shap releases return one array
# per class in a list; newer ones return a single array of shape
# (samples, features, classes). Without the second branch the importance
# table below receives a 2-D column and cannot be built.
if isinstance(shap_values, list):
    shap_values = shap_values[1]
elif getattr(shap_values, 'ndim', 2) == 3:
    shap_values = shap_values[:, :, 1]

# Mean absolute SHAP value per feature
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': np.mean(np.abs(shap_values), axis=0)
}).sort_values(by='Importance', ascending=False)

# Keep the ten highest-ranked features
top_features = feature_importance.head(10)['Feature'].tolist()

print("Selected Top 10 Features by SHAP:")
for idx, (feature, importance) in enumerate(zip(feature_importance.head(10)['Feature'],
                                              feature_importance.head(10)['Importance']), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

# SHAP summary plot over all features
plt.figure(figsize=(12, 6))
shap.summary_plot(shap_values, X_train_scaled, feature_names=X_train.columns, show=False)
plt.title('SHAP Feature Importance')
plt.tight_layout()
plt.show()

# Restrict the data to the selected columns. The scaler is refit, so from
# here on it only knows these columns.
X_train_top = X_train[top_features]
X_train_scaled_top = scaler.fit_transform(X_train_top)

# Random forest evaluated on the SHAP-selected features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# 3-fold stratified cross-validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Fit every fold exactly once and collect out-of-fold labels, probabilities
# and ROC curves in the same pass. The previous version trained the forest
# nine times (two cross_val_predict calls plus a refit loop) for these results.
y_pred_cv = np.empty_like(np.asarray(y_train))
y_pred_proba_cv = np.empty(len(y_train), dtype=float)
fold_roc_curves = []
for train_idx, test_idx in cv.split(X_train_scaled_top, y_train):
    rf_model.fit(X_train_scaled_top[train_idx], y_train.iloc[train_idx])
    fold_proba = rf_model.predict_proba(X_train_scaled_top[test_idx])
    # Same decision rule as RandomForestClassifier.predict: the class with
    # the highest averaged probability.
    y_pred_cv[test_idx] = rf_model.classes_[fold_proba.argmax(axis=1)]
    y_pred_proba_fold = fold_proba[:, 1]
    y_pred_proba_cv[test_idx] = y_pred_proba_fold
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    fold_roc_curves.append((fpr, tpr, roc_auc_fold))

# Confusion matrix with row percentages
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

# Draw the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the row percentage, offset from the count annotation in each cell.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (SHAP Top 10 Features)')
plt.show()

# Classification report
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Overall ROC AUC on the out-of-fold probabilities
overall_auc = roc_auc_score(y_train, y_pred_proba_cv)
print(f"\nOverall ROC AUC Score: {overall_auc:.4f}")

# ROC curves, one per validation fold, from the curves collected above
plt.figure(figsize=(12, 8))
for fold_count, (fpr, tpr, roc_auc_fold) in enumerate(fold_roc_curves, start=1):
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves (SHAP Top 10 Features)')
plt.legend(loc='lower right')
plt.show()

# Train the final model on the SHAP-selected features
final_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
final_model.fit(X_train_scaled_top, y_train)

# Depth of every fitted tree in the ensemble
tree_depths = [tree.get_depth() for tree in final_model.estimators_]

plt.figure(figsize=(10, 6))
plt.hist(tree_depths, bins=20, edgecolor='black')
plt.title('Distribution of Tree Depths')
plt.xlabel('Tree Depth')
plt.ylabel('Count')
plt.show()

# Random-forest importances of the selected features, most important first
rf_feature_importance = pd.DataFrame({
    'Feature': top_features,
    'Importance': final_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Bar chart of the random-forest importances
plt.figure(figsize=(12, 6))
sns.barplot(data=rf_feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in Random Forest (SHAP Selected Features)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# SHAP values of the final model
final_explainer = shap.TreeExplainer(final_model)
final_shap_values = final_explainer.shap_values(X_train_scaled_top)

# Keep the class-1 attributions only. Older shap releases return one array
# per class in a list; newer ones return a single array of shape
# (samples, features, classes).
if isinstance(final_shap_values, list):
    final_shap_values = final_shap_values[1]
elif getattr(final_shap_values, 'ndim', 2) == 3:
    final_shap_values = final_shap_values[:, :, 1]

plt.figure(figsize=(12, 8))
shap.summary_plot(final_shap_values, X_train_scaled_top,
                 feature_names=top_features, show=False)
plt.title('SHAP Summary Plot for Selected Features')
plt.tight_layout()
plt.show()

# Running total of the random-forest importances in ranked order
importance_cum = np.cumsum(rf_feature_importance['Importance'])
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(importance_cum) + 1), importance_cum)
plt.title('Cumulative Feature Importance')
plt.xlabel('Number of Features')
plt.ylabel('Cumulative Importance')
plt.grid(True)
plt.show()

# Model summary
print("\nModel Information:")
print(f"Number of selected features: {len(top_features)}")
print(f"Total training samples: {len(X_train)}")

print("\nTree Depth Statistics:")
print(f"Mean Depth: {np.mean(tree_depths):.2f}")
print(f"Max Depth: {np.max(tree_depths)}")
print(f"Min Depth: {np.min(tree_depths)}")

# Ranked feature importances
print("\nFeature Importance Ranking:")
for idx, (feature, importance) in enumerate(zip(rf_feature_importance['Feature'],
                                              rf_feature_importance['Importance']), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

# Keep the selected feature list in a variable (nothing is written to disk)
selected_features_list = top_features
print("\nSelected features saved for future use.")

In [None]:
# Revised SHAP computation (the rest of the analysis is unchanged)
import shap
import numpy as np

# Draw a random subset of training rows to explain. The size is capped at the
# number of available rows, because sampling without replacement raises when
# more rows are requested than exist.
np.random.seed(42)
n_samples = min(1000, len(X_train_scaled))  # adjust the sample size as needed
sample_indices = np.random.choice(len(X_train_scaled), n_samples, replace=False)
X_train_sample = X_train_scaled[sample_indices]

# Background data for the explainer; 100 rows keeps the computation fast
background = shap.sample(X_train_scaled, 100)

# Create and train the initial model
initial_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
initial_model.fit(X_train_scaled, y_train)

# SHAP values against the background sample (interventional perturbation,
# probability output), computed only for the sampled rows
explainer = shap.TreeExplainer(
    initial_model,
    data=background,
    feature_perturbation='interventional',
    model_output='probability'
)
shap_values = explainer.shap_values(X_train_sample)

# Keep the class-1 attributions only. Older shap releases return one array
# per class in a list; newer ones return a single array of shape
# (samples, features, classes). Without the second branch the importance
# table below receives a 2-D column and cannot be built.
if isinstance(shap_values, list):
    shap_values = shap_values[1]
elif getattr(shap_values, 'ndim', 2) == 3:
    shap_values = shap_values[:, :, 1]

# Mean absolute SHAP value per feature
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': np.mean(np.abs(shap_values), axis=0)
}).sort_values(by='Importance', ascending=False)

# Keep the ten highest-ranked features
top_features = feature_importance.head(10)['Feature'].tolist()

print("Selected Top 10 Features by SHAP:")
for idx, (feature, importance) in enumerate(zip(feature_importance.head(10)['Feature'],
                                              feature_importance.head(10)['Importance']), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

# SHAP summary plot, based on the sampled rows only
plt.figure(figsize=(12, 6))
shap.summary_plot(
    shap_values,
    X_train_sample,
    feature_names=X_train.columns,
    show=False,
    max_display=10  # show only the ten highest-ranked features
)
plt.title('SHAP Feature Importance (Based on Sample)')
plt.tight_layout()
plt.show()

In [None]:
import shap
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Sample up to 1000 instances from the training data (reproducibly).
np.random.seed(42)
# Cap the sample size so np.random.choice(..., replace=False) cannot fail on a
# training set with fewer than 1000 rows.
n_samples = min(1000, len(X_train_scaled))
sample_indices = np.random.choice(len(X_train_scaled), n_samples, replace=False)
X_train_sample = X_train_scaled[sample_indices]

# Use 100 background samples as the explainer's reference data.
background = shap.sample(X_train_scaled, 100)

# Create and train the initial model
initial_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
initial_model.fit(X_train_scaled, y_train)

# Compute SHAP values using the TreeExplainer
explainer = shap.TreeExplainer(
    initial_model,
    data=background,
    feature_perturbation='interventional',
    model_output='probability'
)
shap_values = explainer.shap_values(X_train_sample)

# Reduce to a 2-D (rows, features) array for class 1. Depending on the SHAP
# version, a binary classifier yields either a list with one array per class
# or a single 3-D array (rows, features, classes). Without handling the 3-D
# form the importance column below would be 2-D and the DataFrame would fail.
if isinstance(shap_values, list):
    shap_values = shap_values[1]
else:
    shap_values = np.asarray(shap_values)
    if shap_values.ndim == 3:
        shap_values = shap_values[:, :, 1]

# Calculate feature importance as the mean absolute SHAP value per feature.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': np.mean(np.abs(shap_values), axis=0)
}).sort_values(by='Importance', ascending=False)

# Select the top 10 features
top_features = feature_importance.head(10)['Feature'].tolist()
print("Selected Top 10 Features by SHAP:")
for idx, (feature, importance) in enumerate(zip(feature_importance.head(10)['Feature'],
                                              feature_importance.head(10)['Importance']), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

# Visualize the SHAP feature importance
plt.figure(figsize=(12, 6))
shap.summary_plot(
    shap_values,
    X_train_sample,
    feature_names=X_train.columns,
    show=False,
    max_display=10
)
plt.title('SHAP Feature Importance (Based on Sample)')
plt.tight_layout()
plt.show()

In [None]:
import shap
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Assume X_train_scaled and y_train are already prepared.
# Explain the first 1000 rows (fewer if the training set is smaller).
X_train_sample = np.asarray(X_train_scaled[:1000])

# Flatten any higher-dimensional features so each row is a 1-D vector, but keep
# the sample as one 2-D array: SHAP documents array/DataFrame input, not a
# Python list of per-row arrays.
X_train_sample = X_train_sample.reshape(len(X_train_sample), -1)

# Create and train the initial model
initial_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
initial_model.fit(X_train_scaled, y_train)

# Compute SHAP values using the TreeExplainer
explainer = shap.TreeExplainer(initial_model)
shap_values = explainer.shap_values(X_train_sample)

# Reduce to a 2-D (rows, features) array for class 1. Depending on the SHAP
# version, a binary classifier yields either a list with one array per class
# or a single 3-D array (rows, features, classes). Averaging the raw output
# over axis 0 would otherwise not give one importance value per feature.
if isinstance(shap_values, list):
    shap_values = shap_values[1]
else:
    shap_values = np.asarray(shap_values)
    if shap_values.ndim == 3:
        shap_values = shap_values[:, :, 1]

# Calculate feature importance as the mean absolute SHAP value per feature.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': np.mean(np.abs(shap_values), axis=0)
}).sort_values(by='Importance', ascending=False)

# Select the top 10 features
top_features = feature_importance.head(10)['Feature'].tolist()
print("Selected Top 10 Features by SHAP:")
for idx, (feature, importance) in enumerate(zip(feature_importance.head(10)['Feature'],
                                              feature_importance.head(10)['Importance']), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

# Visualize the SHAP feature importance
shap.summary_plot(shap_values, X_train_sample, feature_names=X_train.columns)

boruta

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from boruta import BorutaPy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Prepare data: features are the selected columns minus the target column.
# (drop() raises if 'is_canceled' is not among selected_columns.)
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']

# Standardize all candidate features; the result is a NumPy array.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Boruta feature selection driven by a random forest.
print("Running Boruta feature selection...")
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
boruta_selector = BorutaPy(
    rf,
    n_estimators='auto',
    verbose=2,
    random_state=42
)

boruta_selector.fit(X_train_scaled, y_train)

# Boolean mask over the columns of X_train marking the features Boruta kept.
selected_feat_mask = boruta_selector.support_
selected_features = X_train.columns[selected_feat_mask].tolist()

print("\nSelected Features by Boruta:")
for idx, feature in enumerate(selected_features, 1):
    print(f"{idx}. {feature}")

# Turn the Boruta ranking into a score where a lower rank gives a higher
# 'Importance', then keep only the selected features.
# NOTE(review): importance_df is not referenced again within this cell.
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Ranking': boruta_selector.ranking_,
    'Selected': selected_feat_mask
})
importance_df['Importance'] = max(importance_df['Ranking']) - importance_df['Ranking'] + 1
importance_df = importance_df[importance_df['Selected']].sort_values(by='Importance', ascending=False)

# Restrict to the selected features; this refits the same scaler object on the
# reduced column set.
X_train_selected = X_train[selected_features]
X_train_scaled_selected = scaler.fit_transform(X_train_selected)

# Evaluate a default random forest on the Boruta-selected features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Out-of-fold predictions: one pass for hard labels, a second pass for the
# class-1 probabilities (same splitter, so the folds are identical).
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(rf_model, X_train_scaled_selected, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(rf_model, X_train_scaled_selected, y_train, cv=cv, method='predict_proba')[:, 1]

# Counts plus row-normalized percentages (each row of the matrix sums to 100%).
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the row percentage on every cell, offset from the count annotation.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix (Boruta Selected Features)')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))
print(f"\nOverall ROC AUC Score: {roc_auc_score(y_train, y_pred_proba_cv):.4f}")

# One ROC curve per fold: refit on each training split and score its test split.
plt.figure(figsize=(12, 8))
fold_count = 1

for train_idx, test_idx in cv.split(X_train_scaled_selected, y_train):
    rf_model.fit(X_train_scaled_selected[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = rf_model.predict_proba(X_train_scaled_selected[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves (Boruta Selected Features)')
plt.legend(loc='lower right')
plt.show()

# Fit the final model on all training rows using the selected features.
final_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
final_model.fit(X_train_scaled_selected, y_train)

# Forest feature importances, largest first.
feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': final_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance (Boruta Selected Features)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Depth of every tree in the fitted forest.
tree_depths = [tree.get_depth() for tree in final_model.estimators_]

plt.figure(figsize=(10, 6))
plt.hist(tree_depths, bins=20, edgecolor='black')
plt.title('Distribution of Tree Depths')
plt.xlabel('Tree Depth')
plt.ylabel('Count')
plt.show()

# Running total of importance, accumulated from the most important feature down.
importance_cum = np.cumsum(feature_importance['Importance'])
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(importance_cum) + 1), importance_cum)
plt.title('Cumulative Feature Importance')
plt.xlabel('Number of Features')
plt.ylabel('Cumulative Importance')
plt.grid(True)
plt.show()

print("\nModel Information:")
print(f"Number of selected features: {len(selected_features)}")
print(f"Total training samples: {len(X_train)}")
print("\nClass Distribution:")
# Class shares expressed as percentages.
print(y_train.value_counts(normalize=True).round(4) * 100)

print("\nTree Depth Statistics:")
print(f"Mean Depth: {np.mean(tree_depths):.2f}")
print(f"Max Depth: {np.max(tree_depths)}")
print(f"Min Depth: {np.min(tree_depths)}")

print("\nFeature Importance Ranking:")
for idx, (feature, importance) in enumerate(zip(feature_importance['Feature'],
                                              feature_importance['Importance']), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

# Position of the first cumulative value >= 0.9, converted to a 1-based count.
# NOTE(review): the message below says "variance", but the quantity computed
# here is cumulative feature importance.
n_features_90 = np.argmax(importance_cum >= 0.9) + 1
print(f"\nNumber of features needed to explain 90% of variance: {n_features_90}")

# Keep the selected feature names in a variable (nothing is written to disk).
selected_features_list = selected_features
print("\nSelected features saved for future use.")

### hyperparameter tuning

baseline model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import optuna
import warnings
warnings.filterwarnings('ignore')

# Prepare data: features are the selected columns minus the target column.
# (drop() raises if 'is_canceled' is not among selected_columns.)
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']

# Standardize every feature; the result is a NumPy array.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

def objective(trial):
    """Return the mean 3-fold ROC-AUC of a random forest for one Optuna trial.

    The forest hyperparameters are sampled from ``trial``; ``random_state`` and
    ``n_jobs`` are fixed. The model is scored on the module-level
    ``X_train_scaled`` / ``y_train``.
    """
    # Keyword arguments are evaluated left to right, so the parameters are
    # requested from the trial in the order written here.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        criterion=trial.suggest_categorical('criterion', ['gini', 'entropy']),
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
    )
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **sampled)
    fold_aucs = cross_val_score(forest, X_train_scaled, y_train, cv=3, scoring='roc_auc')
    return fold_aucs.mean()

print("Optimizing hyperparameters...")
# Seed the sampler so the search is repeatable, in line with the fixed
# random_state=42 used for the models and CV splitter in this cell. Without a
# seed, the "best" parameters change from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\nBest parameters found:")
print(study.best_params)
print(f"Best ROC-AUC score: {study.best_value:.4f}")

# Rebuild the best configuration; random_state and n_jobs are set explicitly
# here, as they are constants in objective() rather than suggested parameters.
best_params = study.best_params.copy()
best_params['random_state'] = 42
best_params['n_jobs'] = -1
best_model = RandomForestClassifier(**best_params)

# Out-of-fold predictions with a shuffled, seeded stratified 3-fold split.
# NOTE(review): y_pred_proba_cv is not referenced again within this cell.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_model, X_train_scaled, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(best_model, X_train_scaled, y_train, cv=cv, method='predict_proba')[:, 1]

# Counts plus row-normalized percentages (each row of the matrix sums to 100%).
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the row percentage on every cell, offset from the count annotation.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# The ROC-AUC in the title is the study's best objective value (cv=3 inside
# objective()), not a score computed from the predictions plotted here.
plt.title(f'Confusion Matrix (Optimized Random Forest)\nBest ROC-AUC: {study.best_value:.4f}')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Objective value of every trial, in trial order.
plt.figure(figsize=(10, 6))
optimization_history = np.array([t.value for t in study.trials])
plt.plot(optimization_history)
plt.title('Optimization History')
plt.xlabel('Trial')
plt.ylabel('ROC-AUC Score')
plt.show()

# One ROC curve per fold: refit on each training split and score its test split.
plt.figure(figsize=(12, 8))
fold_count = 1

for train_idx, test_idx in cv.split(X_train_scaled, y_train):
    best_model.fit(X_train_scaled[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = best_model.predict_proba(X_train_scaled[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Optimized Random Forest (3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Fit the tuned configuration on all training rows.
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_scaled, y_train)

# Forest feature importances, largest first.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': final_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in Optimized Random Forest')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# optuna.visualization.plot_param_importances returns a Plotly figure, so it is
# titled and displayed through Plotly. Calling plt.title/plt.show instead only
# renders an empty Matplotlib figure and the importance chart is never shown.
param_importance_fig = optuna.visualization.plot_param_importances(study)
param_importance_fig.update_layout(title='Hyperparameter Importance')
param_importance_fig.show()

# Depth of every tree in the fitted forest.
tree_depths = [tree.get_depth() for tree in final_model.estimators_]

plt.figure(figsize=(10, 6))
plt.hist(tree_depths, bins=20, edgecolor='black')
plt.title('Distribution of Tree Depths in Optimized Random Forest')
plt.xlabel('Tree Depth')
plt.ylabel('Count')
plt.show()

# Running total of importance, accumulated from the most important feature down.
importance_cum = np.cumsum(feature_importance['Importance'])
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(importance_cum) + 1), importance_cum)
plt.title('Cumulative Feature Importance')
plt.xlabel('Number of Features')
plt.ylabel('Cumulative Importance')
plt.grid(True)
plt.show()

print("\nOptimized Model Information:")
print(f"Number of features: {X_train.shape[1]}")
print(f"Total training samples: {len(X_train)}")
print("\nTree Depth Statistics:")
print(f"Mean Depth: {np.mean(tree_depths):.2f}")
print(f"Max Depth: {np.max(tree_depths)}")
print(f"Min Depth: {np.min(tree_depths)}")

# Position of the first cumulative value >= 0.9, converted to a 1-based count.
# NOTE(review): the message below says "variance", but the quantity computed
# here is cumulative feature importance.
n_features_90 = np.argmax(importance_cum >= 0.9) + 1
print(f"\nNumber of features needed to explain 90% of variance: {n_features_90}")

print("\nTop 10 Most Important Features:")
for idx, (feature, importance) in enumerate(zip(feature_importance['Feature'][:10],
                                              feature_importance['Importance'][:10]), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

# Keep the tuned parameters in a variable; this is study.best_params as-is, so
# it does not include the random_state / n_jobs entries added to best_params.
best_rf_params = study.best_params
print("\nBest Hyperparameters for future use:")
print(best_rf_params)

feature importance

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import optuna
import warnings
warnings.filterwarnings('ignore')

# Prepare data: features are the selected columns minus the target column.
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']

# Standardize every feature; the result is a NumPy array.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit a default random forest on all features to rank them by importance.
initial_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
initial_model.fit(X_train_scaled, y_train)

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': initial_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Keep the 10 features with the highest importance.
top_features = feature_importance.head(10)['Feature'].tolist()

print("Selected Top 10 Features:")
for idx, (feature, importance) in enumerate(zip(feature_importance.head(10)['Feature'],
                                              feature_importance.head(10)['Importance']), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

# Restrict to the top features; this refits the same scaler object on the
# reduced column set.
X_train_top = X_train[top_features]
X_train_scaled_top = scaler.fit_transform(X_train_top)

def objective(trial):
    """Return the mean 3-fold ROC-AUC of a random forest for one Optuna trial.

    The forest hyperparameters are sampled from ``trial``; ``random_state`` and
    ``n_jobs`` are fixed. The model is scored on the module-level
    ``X_train_scaled_top`` / ``y_train``.
    """
    # Keyword arguments are evaluated left to right, so the parameters are
    # requested from the trial in the order written here.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        criterion=trial.suggest_categorical('criterion', ['gini', 'entropy']),
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
    )
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **sampled)
    fold_aucs = cross_val_score(forest, X_train_scaled_top, y_train, cv=3, scoring='roc_auc')
    return fold_aucs.mean()

print("\nOptimizing hyperparameters...")
# Seed the sampler so the search is repeatable, in line with the fixed
# random_state=42 used for the models and CV splitter in this cell. Without a
# seed, the "best" parameters change from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\nBest parameters found:")
print(study.best_params)
print(f"Best ROC-AUC score: {study.best_value:.4f}")

# Rebuild the best configuration; random_state and n_jobs are set explicitly
# here, as they are constants in objective() rather than suggested parameters.
best_params = study.best_params.copy()
best_params['random_state'] = 42
best_params['n_jobs'] = -1
best_model = RandomForestClassifier(**best_params)

# Out-of-fold predictions with a shuffled, seeded stratified 3-fold split.
# NOTE(review): y_pred_proba_cv is not referenced again within this cell.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_model, X_train_scaled_top, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(best_model, X_train_scaled_top, y_train, cv=cv, method='predict_proba')[:, 1]

# Counts plus row-normalized percentages (each row of the matrix sums to 100%).
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the row percentage on every cell, offset from the count annotation.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# The ROC-AUC in the title is the study's best objective value (cv=3 inside
# objective()), not a score computed from the predictions plotted here.
plt.title(f'Confusion Matrix (Optimized Model)\nBest ROC-AUC: {study.best_value:.4f}')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Objective value of every trial, in trial order.
plt.figure(figsize=(10, 6))
optimization_history = np.array([t.value for t in study.trials])
plt.plot(optimization_history)
plt.title('Optimization History')
plt.xlabel('Trial')
plt.ylabel('ROC-AUC Score')
plt.show()

# One ROC curve per fold: refit on each training split and score its test split.
plt.figure(figsize=(12, 8))
fold_count = 1

for train_idx, test_idx in cv.split(X_train_scaled_top, y_train):
    best_model.fit(X_train_scaled_top[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = best_model.predict_proba(X_train_scaled_top[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Optimized Model (3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Fit the tuned configuration on all training rows (top features only).
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_scaled_top, y_train)

# Depth of every tree in the fitted forest.
tree_depths = [tree.get_depth() for tree in final_model.estimators_]

plt.figure(figsize=(10, 6))
plt.hist(tree_depths, bins=20, edgecolor='black')
plt.title('Distribution of Tree Depths in Optimized Model')
plt.xlabel('Tree Depth')
plt.ylabel('Count')
plt.show()

# Importances of the tuned model; this rebinds feature_importance, replacing
# the ranking that was used earlier in the cell to pick top_features.
feature_importance = pd.DataFrame({
    'Feature': top_features,
    'Importance': final_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in Optimized Model')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# optuna.visualization.plot_param_importances returns a Plotly figure, so it is
# titled and displayed through Plotly. Calling plt.title/plt.show instead only
# renders an empty Matplotlib figure and the importance chart is never shown.
param_importance_fig = optuna.visualization.plot_param_importances(study)
param_importance_fig.update_layout(title='Hyperparameter Importance')
param_importance_fig.show()

# Running total of importance, accumulated from the most important feature down.
importance_cum = np.cumsum(feature_importance['Importance'])
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(importance_cum) + 1), importance_cum)
plt.title('Cumulative Feature Importance')
plt.xlabel('Number of Features')
plt.ylabel('Cumulative Importance')
plt.grid(True)
plt.show()

print("\nModel Information:")
print(f"Number of selected features: {len(top_features)}")
print(f"Total training samples: {len(X_train)}")
print("\nTree Depth Statistics:")
print(f"Mean Depth: {np.mean(tree_depths):.2f}")
print(f"Max Depth: {np.max(tree_depths)}")
print(f"Min Depth: {np.min(tree_depths)}")

print("\nFeature Importance Ranking in Optimized Model:")
for idx, (feature, importance) in enumerate(zip(feature_importance['Feature'],
                                              feature_importance['Importance']), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

# Keep the tuned parameters in a variable; this is study.best_params as-is, so
# it does not include the random_state / n_jobs entries added to best_params.
best_rf_params = study.best_params
print("\nBest Hyperparameters for future use:")
print(best_rf_params)

shap

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
from boruta import BorutaPy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import optuna
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): this cell sits under the "shap" heading, but the code below
# selects features with Boruta (same steps as the cell under the "Boruta"
# heading) -- confirm whether SHAP-based selection was intended here.

# Prepare data: features are the selected columns minus the target column.
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']

# Standardize all candidate features; the result is a NumPy array.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Boruta feature selection driven by a random forest.
print("Running Boruta feature selection...")
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
boruta_selector = BorutaPy(
    rf,
    n_estimators='auto',
    verbose=2,
    random_state=42
)

boruta_selector.fit(X_train_scaled, y_train)

# Boolean mask over the columns of X_train marking the features Boruta kept.
selected_feat_mask = boruta_selector.support_
selected_features = X_train.columns[selected_feat_mask].tolist()

print("\nSelected Features by Boruta:")
for idx, feature in enumerate(selected_features, 1):
    print(f"{idx}. {feature}")

# Restrict to the selected features; this refits the same scaler object on the
# reduced column set.
X_train_selected = X_train[selected_features]
X_train_scaled_selected = scaler.fit_transform(X_train_selected)

def objective(trial):
    """Return the mean 3-fold ROC-AUC of a random forest for one Optuna trial.

    The forest hyperparameters are sampled from ``trial``; ``random_state`` and
    ``n_jobs`` are fixed. The model is scored on the module-level
    ``X_train_scaled_selected`` / ``y_train``.
    """
    # Keyword arguments are evaluated left to right, so the parameters are
    # requested from the trial in the order written here.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        criterion=trial.suggest_categorical('criterion', ['gini', 'entropy']),
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
    )
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **sampled)
    fold_aucs = cross_val_score(forest, X_train_scaled_selected, y_train, cv=3, scoring='roc_auc')
    return fold_aucs.mean()

print("\nOptimizing hyperparameters...")
# Seed the sampler so the search is repeatable, in line with the fixed
# random_state=42 used for the models and CV splitter in this cell. Without a
# seed, the "best" parameters change from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\nBest parameters found:")
print(study.best_params)
print(f"Best ROC-AUC score: {study.best_value:.4f}")

# Rebuild the best configuration; random_state and n_jobs are set explicitly
# here, as they are constants in objective() rather than suggested parameters.
best_params = study.best_params.copy()
best_params['random_state'] = 42
best_params['n_jobs'] = -1
best_model = RandomForestClassifier(**best_params)

# Out-of-fold predictions with a shuffled, seeded stratified 3-fold split.
# NOTE(review): y_pred_proba_cv is not referenced again within this cell.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_model, X_train_scaled_selected, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(best_model, X_train_scaled_selected, y_train, cv=cv, method='predict_proba')[:, 1]

# Counts plus row-normalized percentages (each row of the matrix sums to 100%).
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the row percentage on every cell, offset from the count annotation.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# The ROC-AUC in the title is the study's best objective value (cv=3 inside
# objective()), not a score computed from the predictions plotted here.
plt.title(f'Confusion Matrix (Optimized Model)\nBest ROC-AUC: {study.best_value:.4f}')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Objective value of every trial, in trial order.
plt.figure(figsize=(10, 6))
optimization_history = np.array([t.value for t in study.trials])
plt.plot(optimization_history)
plt.title('Optimization History')
plt.xlabel('Trial')
plt.ylabel('ROC-AUC Score')
plt.show()

# One ROC curve per fold: refit on each training split and score its test split.
plt.figure(figsize=(12, 8))
fold_count = 1

for train_idx, test_idx in cv.split(X_train_scaled_selected, y_train):
    best_model.fit(X_train_scaled_selected[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = best_model.predict_proba(X_train_scaled_selected[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Optimized Model (3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Fit the tuned configuration on all training rows (selected features only).
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_scaled_selected, y_train)

# Depth of every tree in the fitted forest.
tree_depths = [tree.get_depth() for tree in final_model.estimators_]

plt.figure(figsize=(10, 6))
plt.hist(tree_depths, bins=20, edgecolor='black')
plt.title('Distribution of Tree Depths in Optimized Model')
plt.xlabel('Tree Depth')
plt.ylabel('Count')
plt.show()

# Forest feature importances of the tuned model, largest first.
feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': final_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in Optimized Model')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# optuna.visualization.plot_param_importances returns a Plotly figure, so it is
# titled and displayed through Plotly. Calling plt.title/plt.show instead only
# renders an empty Matplotlib figure and the importance chart is never shown.
param_importance_fig = optuna.visualization.plot_param_importances(study)
param_importance_fig.update_layout(title='Hyperparameter Importance')
param_importance_fig.show()

# Running total of importance, accumulated from the most important feature down.
importance_cum = np.cumsum(feature_importance['Importance'])
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(importance_cum) + 1), importance_cum)
plt.title('Cumulative Feature Importance')
plt.xlabel('Number of Features')
plt.ylabel('Cumulative Importance')
plt.grid(True)
plt.show()

print("\nModel Information:")
print(f"Number of selected features: {len(selected_features)}")
print(f"Total training samples: {len(X_train)}")

print("\nTree Depth Statistics:")
print(f"Mean Depth: {np.mean(tree_depths):.2f}")
print(f"Max Depth: {np.max(tree_depths)}")
print(f"Min Depth: {np.min(tree_depths)}")

print("\nFeature Importance Ranking:")
for idx, (feature, importance) in enumerate(zip(feature_importance['Feature'],
                                              feature_importance['Importance']), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

# Keep the tuned parameters in a variable; this is study.best_params as-is, so
# it does not include the random_state / n_jobs entries added to best_params.
best_rf_params = study.best_params
print("\nBest Hyperparameters for future use:")
print(best_rf_params)

Boruta

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict, StratifiedKFold, cross_val_score
from boruta import BorutaPy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import optuna
import warnings
warnings.filterwarnings('ignore')

# Prepare data: features are the selected columns minus the target column.
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']

# Standardize all candidate features; the result is a NumPy array.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Boruta feature selection driven by a random forest.
print("Running Boruta feature selection...")
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
boruta_selector = BorutaPy(
    rf,
    n_estimators='auto',
    verbose=2,
    random_state=42
)

boruta_selector.fit(X_train_scaled, y_train)

# Boolean mask over the columns of X_train marking the features Boruta kept.
selected_feat_mask = boruta_selector.support_
selected_features = X_train.columns[selected_feat_mask].tolist()

print("\nSelected Features by Boruta:")
for idx, feature in enumerate(selected_features, 1):
    print(f"{idx}. {feature}")

# Restrict to the selected features; this refits the same scaler object on the
# reduced column set.
X_train_selected = X_train[selected_features]
X_train_scaled_selected = scaler.fit_transform(X_train_selected)

def objective(trial):
    """Return the mean 3-fold ROC-AUC of a random forest for one Optuna trial.

    The forest hyperparameters are sampled from ``trial``; ``random_state`` and
    ``n_jobs`` are fixed. The model is scored on the module-level
    ``X_train_scaled_selected`` / ``y_train``.
    """
    # Keyword arguments are evaluated left to right, so the parameters are
    # requested from the trial in the order written here.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        criterion=trial.suggest_categorical('criterion', ['gini', 'entropy']),
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
    )
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **sampled)
    fold_aucs = cross_val_score(forest, X_train_scaled_selected, y_train, cv=3, scoring='roc_auc')
    return fold_aucs.mean()

print("\nOptimizing hyperparameters...")
# Seed the sampler so the search is repeatable, in line with the fixed
# random_state=42 used for the models and CV splitter in this cell. Without a
# seed, the "best" parameters change from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\nBest parameters found:")
print(study.best_params)
print(f"Best ROC-AUC score: {study.best_value:.4f}")

# Rebuild the best configuration; random_state and n_jobs are set explicitly
# here, as they are constants in objective() rather than suggested parameters.
best_params = study.best_params.copy()
best_params['random_state'] = 42
best_params['n_jobs'] = -1
best_model = RandomForestClassifier(**best_params)

# Out-of-fold predictions with a shuffled, seeded stratified 3-fold split.
# NOTE(review): y_pred_proba_cv is not referenced again within this cell.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_model, X_train_scaled_selected, y_train, cv=cv, method='predict')
y_pred_proba_cv = cross_val_predict(best_model, X_train_scaled_selected, y_train, cv=cv, method='predict_proba')[:, 1]

# Counts plus row-normalized percentages (each row of the matrix sums to 100%).
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
# Overlay the row percentage on every cell, offset from the count annotation.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# The ROC-AUC in the title is the study's best objective value (cv=3 inside
# objective()), not a score computed from the predictions plotted here.
plt.title(f'Confusion Matrix (Optimized Model)\nBest ROC-AUC: {study.best_value:.4f}')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

# Objective value of every trial, in trial order.
plt.figure(figsize=(10, 6))
optimization_history = np.array([t.value for t in study.trials])
plt.plot(optimization_history)
plt.title('Optimization History')
plt.xlabel('Trial')
plt.ylabel('ROC-AUC Score')
plt.show()

# One ROC curve per fold: refit on each training split and score its test split.
plt.figure(figsize=(12, 8))
fold_count = 1

for train_idx, test_idx in cv.split(X_train_scaled_selected, y_train):
    best_model.fit(X_train_scaled_selected[train_idx], y_train.iloc[train_idx])
    y_pred_proba_fold = best_model.predict_proba(X_train_scaled_selected[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_pred_proba_fold)
    roc_auc_fold = roc_auc_score(y_train.iloc[test_idx], y_pred_proba_fold)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Optimized Model (3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Fit the tuned configuration on all training rows (selected features only).
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train_scaled_selected, y_train)

# Depth of every tree in the fitted forest.
tree_depths = [tree.get_depth() for tree in final_model.estimators_]

plt.figure(figsize=(10, 6))
plt.hist(tree_depths, bins=20, edgecolor='black')
plt.title('Distribution of Tree Depths in Optimized Model')
plt.xlabel('Tree Depth')
plt.ylabel('Count')
plt.show()

# Forest feature importances of the tuned model, largest first.
feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': final_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in Optimized Model')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# optuna.visualization.plot_param_importances returns a Plotly figure, so it is
# titled and displayed through Plotly. Calling plt.title/plt.show instead only
# renders an empty Matplotlib figure and the importance chart is never shown.
param_importance_fig = optuna.visualization.plot_param_importances(study)
param_importance_fig.update_layout(title='Hyperparameter Importance')
param_importance_fig.show()

# Running total of importance, accumulated from the most important feature down.
importance_cum = np.cumsum(feature_importance['Importance'])
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(importance_cum) + 1), importance_cum)
plt.title('Cumulative Feature Importance')
plt.xlabel('Number of Features')
plt.ylabel('Cumulative Importance')
plt.grid(True)
plt.show()

print("\nModel Information:")
print(f"Number of selected features: {len(selected_features)}")
print(f"Total training samples: {len(X_train)}")

print("\nTree Depth Statistics:")
print(f"Mean Depth: {np.mean(tree_depths):.2f}")
print(f"Max Depth: {np.max(tree_depths)}")
print(f"Min Depth: {np.min(tree_depths)}")

print("\nFeature Importance Ranking:")
for idx, (feature, importance) in enumerate(zip(feature_importance['Feature'],
                                              feature_importance['Importance']), 1):
    print(f"{idx}. {feature}: {importance:.4f}")

# Keep the tuned parameters in a variable; this is study.best_params as-is, so
# it does not include the random_state / n_jobs entries added to best_params.
best_rf_params = study.best_params
print("\nBest Hyperparameters for future use:")
print(best_rf_params)

### test set

baseline model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                           roc_auc_score, roc_curve, precision_score, recall_score, f1_score)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def evaluate_and_visualize(y_true, y_pred, y_pred_proba, dataset_name):
    """Plot a confusion matrix and ROC curve, then print a classification report.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        y_pred_proba: Positive-class scores used for the ROC curve and its AUC.
        dataset_name: Text added to both plot titles and the report header.

    Returns:
        None. Two figures are shown and the report is printed.
    """
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    conf_matrix = confusion_matrix(y_true, y_pred)
    # Express each row as percentages of that true class. A class that occurs
    # only in y_pred has an all-zero row, so guard the division instead of
    # annotating the plot with nan%.
    row_totals = conf_matrix.sum(axis=1, keepdims=True)
    conf_matrix_percentage = np.divide(
        conf_matrix,
        row_totals,
        out=np.zeros(conf_matrix.shape, dtype=float),
        where=row_totals != 0,
    ) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            # Drawn up-left of the cell centre, where the heatmap puts the count.
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix\n{dataset_name}')
    plt.show()

    plt.figure(figsize=(8, 6))
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line for a classifier with no discriminative power.
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve\n{dataset_name}')
    plt.legend(loc='lower right')
    plt.show()

    print(f"\nClassification Report - {dataset_name}")
    print(classification_report(y_true, y_pred))

# Data preparation: features are the selected columns minus the target.
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])
y_train = combined_train['is_canceled']
X_test = combined_test[selected_columns].drop(columns=['is_canceled'])
y_test = combined_test['is_canceled']

# Fit the scaler on the training split only and reuse it for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model with best parameters
best_params = {
    'n_estimators': 234,
    'max_depth': 20,
    'min_samples_split': 8,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'criterion': 'gini',
    'bootstrap': False,
    'random_state': 42,
    'n_jobs': -1
}

best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train_scaled, y_train)

# Test set predictions
# Row masks for the two hotels, keyed on the encoded 'hotel' column.
mask_h1 = combined_test['hotel'] == 0
mask_h2 = combined_test['hotel'] == 1

y_pred = best_model.predict(X_test_scaled)
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# The per-hotel predictions are row subsets of the combined predictions, so
# slice them rather than running the forest again on every subset.
y_pred_h1 = y_pred[mask_h1.to_numpy()]
y_pred_proba_h1 = y_pred_proba[mask_h1.to_numpy()]

y_pred_h2 = y_pred[mask_h2.to_numpy()]
y_pred_proba_h2 = y_pred_proba[mask_h2.to_numpy()]

# Performance comparison
# Each entry holds (true labels, predicted labels, positive-class scores) for one
# evaluation subset, in the same order as the 'Dataset' column.
evaluation_subsets = [
    (y_test, y_pred, y_pred_proba),
    (y_test[mask_h1], y_pred_h1, y_pred_proba_h1),
    (y_test[mask_h2], y_pred_h2, y_pred_proba_h2),
]
label_based_metrics = [
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
]

comparison_columns = {
    'Dataset': ['Combined Test Set', 'H1 Test Set', 'H2 Test Set'],
    # ROC AUC is the only metric computed from scores instead of hard labels.
    'ROC AUC': [roc_auc_score(truth, scores) for truth, _, scores in evaluation_subsets],
}
for column_name, score_fn in label_based_metrics:
    comparison_columns[column_name] = [
        score_fn(truth, predicted) for truth, predicted, _ in evaluation_subsets
    ]
performance_comparison = pd.DataFrame(comparison_columns)

print("\nDetailed Performance Comparison:")
print(performance_comparison.round(4))

# Performance visualization
# Grouped bar chart: one group per dataset, one bar per metric.
plt.figure(figsize=(15, 8))
bar_width = 0.15
metrics = ['ROC AUC', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
x = np.arange(len(performance_comparison['Dataset']))

for i, metric in enumerate(metrics):
    # Shift each metric's bars by one bar width so they sit side by side.
    plt.bar(x + i * bar_width,
           performance_comparison[metric],
           bar_width,
           label=metric,
           alpha=0.8)

plt.xlabel('Dataset')
plt.ylabel('Score')
plt.title('Complete Performance Comparison Across Test Sets')
# Tick under the third of the five bars, i.e. the middle of each group.
plt.xticks(x + bar_width * 2, performance_comparison['Dataset'], rotation=45)
plt.legend(loc='lower right')
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Heatmap of the raw metric values (rows: datasets, columns: metrics).
plt.figure(figsize=(12, 6))
metrics_df = performance_comparison.set_index('Dataset')
sns.heatmap(metrics_df, annot=True, cmap='YlOrRd', fmt='.4f', cbar_kws={'label': 'Score'})
plt.title('Performance Metrics Heatmap')
plt.tight_layout()
plt.show()

# Each metric as a percentage of its best value across the three datasets.
relative_performance = metrics_df.div(metrics_df.max()) * 100
plt.figure(figsize=(12, 6))
sns.heatmap(relative_performance, annot=True, cmap='YlOrRd', fmt='.1f',
            cbar_kws={'label': 'Relative Performance (%)'})
plt.title('Relative Performance Metrics Heatmap (%)')
plt.tight_layout()
plt.show()

# One subplot per metric on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
axes = axes.ravel()
colors = sns.color_palette('husl', n_colors=len(metrics))

for i, metric in enumerate(metrics):
    sns.barplot(data=performance_comparison, x='Dataset', y=metric, ax=axes[i],
                color=colors[i])
    axes[i].set_title(f'{metric} Comparison')
    axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45)
    axes[i].grid(True, axis='y', linestyle='--', alpha=0.7)

# Five metrics on six axes: remove the unused trailing subplot(s).
for i in range(len(metrics), len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()

print("\nDetailed Statistics:")
# Column-wise summaries of the metric table, printed under their own headings.
summary_sections = [
    ("\nMean Performance Across Datasets:", metrics_df.mean()),
    ("\nStandard Deviation of Performance:", metrics_df.std()),
    ("\nRange of Performance (Max - Min):", metrics_df.max() - metrics_df.min()),
]
for section_title, section_values in summary_sections:
    print(section_title)
    print(section_values.round(4))

# Relative change of each hotel's metrics against the combined test set.
print("\nPerformance Difference from Combined Test Set (%):")
baseline = metrics_df.loc['Combined Test Set']
for dataset in ['H1 Test Set', 'H2 Test Set']:
    relative_gap = metrics_df.loc[dataset].sub(baseline).div(baseline).mul(100)
    diff_percentage = relative_gap.round(2)
    print(f"\n{dataset}:")
    for metric, value in diff_percentage.items():
        print(f"{metric}: {value:+.2f}%")

feature importance

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                           roc_auc_score, roc_curve, precision_score, recall_score, f1_score)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def evaluate_and_visualize(y_true, y_pred, y_pred_proba, dataset_name):
    """Plot a confusion matrix and ROC curve, then print a classification report.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        y_pred_proba: Positive-class scores used for the ROC curve and its AUC.
        dataset_name: Text added to both plot titles and the report header.

    Returns:
        None. Two figures are shown and the report is printed.
    """
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    # Confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)
    # Express each row as percentages of that true class. A class that occurs
    # only in y_pred has an all-zero row, so guard the division instead of
    # annotating the plot with nan%.
    row_totals = conf_matrix.sum(axis=1, keepdims=True)
    conf_matrix_percentage = np.divide(
        conf_matrix,
        row_totals,
        out=np.zeros(conf_matrix.shape, dtype=float),
        where=row_totals != 0,
    ) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            # Drawn up-left of the cell centre, where the heatmap puts the count.
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix\n{dataset_name}')
    plt.show()

    # ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line for a classifier with no discriminative power.
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve\n{dataset_name}')
    plt.legend(loc='lower right')
    plt.show()

    # Print classification report
    print(f"\nClassification Report - {dataset_name}")
    print(classification_report(y_true, y_pred))

# Prepare training data: H1 rows first, H2 rows stacked underneath.
train_data = pd.concat([H1_train, H2_train], axis=0)
test_data = pd.concat([H1_test, H2_test], axis=0)

# Prepare feature data
X_train = train_data[selected_columns].drop(columns=['is_canceled'])
y_train = train_data['is_canceled']
X_test = test_data[selected_columns].drop(columns=['is_canceled'])
y_test = test_data['is_canceled']

# Get feature importance and select top 10 features
# The ranking forest is fitted on the training split only.
initial_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
initial_model.fit(X_train, y_train)

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': initial_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

top_features = feature_importance.head(10)['Feature'].tolist()

# Restrict both splits to the top-ranked features
X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

# Standardize data (scaler fitted on the training split only)
scaler = StandardScaler()
X_train_scaled_top = scaler.fit_transform(X_train_top)
X_test_scaled_top = scaler.transform(X_test_top)

# Create model with best parameters
best_params = {
    'n_estimators': 214,
    'max_depth': 20,
    'min_samples_split': 6,
    'min_samples_leaf': 4,
    'max_features': 'log2',
    'criterion': 'gini',
    'bootstrap': False,
    'random_state': 42,
    'n_jobs': -1
}

best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train_scaled_top, y_train)

# Evaluate on test sets
# test_data is H1_test followed by H2_test, so each hotel's rows are identified
# by position. Matching on index labels would assign rows to the wrong hotel
# whenever the two frames share labels.
mask_h1 = np.arange(len(test_data)) < len(H1_test)
mask_h2 = ~mask_h1

# Overall test set prediction
y_pred = best_model.predict(X_test_scaled_top)
y_pred_proba = best_model.predict_proba(X_test_scaled_top)[:, 1]

# The per-hotel predictions are row subsets of the combined predictions, so
# slice them rather than running the forest again on every subset.
# H1 test set prediction
y_pred_h1 = y_pred[mask_h1]
y_pred_proba_h1 = y_pred_proba[mask_h1]

# H2 test set prediction
y_pred_h2 = y_pred[mask_h2]
y_pred_proba_h2 = y_pred_proba[mask_h2]

# Create performance comparison table
# Each entry holds (true labels, predicted labels, positive-class scores) for one
# evaluation subset, in the same order as the 'Dataset' column.
evaluation_subsets = [
    (y_test, y_pred, y_pred_proba),
    (y_test[mask_h1], y_pred_h1, y_pred_proba_h1),
    (y_test[mask_h2], y_pred_h2, y_pred_proba_h2),
]
label_based_metrics = [
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
]

comparison_columns = {
    'Dataset': ['Combined Test Set', 'H1 Test Set', 'H2 Test Set'],
    # ROC AUC is the only metric computed from scores instead of hard labels.
    'ROC AUC': [roc_auc_score(truth, scores) for truth, _, scores in evaluation_subsets],
}
for column_name, score_fn in label_based_metrics:
    comparison_columns[column_name] = [
        score_fn(truth, predicted) for truth, predicted, _ in evaluation_subsets
    ]
performance_comparison = pd.DataFrame(comparison_columns)

print("\nDetailed Performance Comparison:")
print(performance_comparison.round(4))

# Plot complete performance comparison
# Long-form copy of the table (one row per dataset/metric pair); the plots below
# read the wide table directly.
performance_metrics = performance_comparison.melt(id_vars=['Dataset'], var_name='Metric', value_name='Score')

# Create grouped bar chart: one group per dataset, one bar per metric.
# Only one figure is opened here so that no empty figure is left behind.
plt.figure(figsize=(15, 8))
bar_width = 0.15
metrics = ['ROC AUC', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
x = np.arange(len(performance_comparison['Dataset']))

for i, metric in enumerate(metrics):
    # Shift each metric's bars by one bar width so they sit side by side.
    plt.bar(x + i * bar_width,
           performance_comparison[metric],
           bar_width,
           label=metric,
           alpha=0.8)

plt.xlabel('Dataset')
plt.ylabel('Score')
plt.title('Complete Performance Comparison Across Test Sets')
# Tick under the third of the five bars, i.e. the middle of each group.
plt.xticks(x + bar_width * 2, performance_comparison['Dataset'], rotation=45)
plt.legend(loc='lower right')
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Create heatmap to visualize performance metrics
plt.figure(figsize=(12, 6))
metrics_df = performance_comparison.set_index('Dataset')
sns.heatmap(metrics_df, annot=True, cmap='YlOrRd', fmt='.4f', cbar_kws={'label': 'Score'})
plt.title('Performance Metrics Heatmap')
plt.tight_layout()
plt.show()

# Calculate relative performance: each metric as a percentage of its best value
relative_performance = metrics_df.div(metrics_df.max()) * 100

plt.figure(figsize=(12, 6))
sns.heatmap(relative_performance, annot=True, cmap='YlOrRd', fmt='.1f',
            cbar_kws={'label': 'Relative Performance (%)'})
plt.title('Relative Performance Metrics Heatmap (%)')
plt.tight_layout()
plt.show()

# Create individual comparison plots for each metric
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
axes = axes.ravel()

colors = sns.color_palette('husl', n_colors=len(metrics))

for i, metric in enumerate(metrics):
    sns.barplot(data=performance_comparison, x='Dataset', y=metric, ax=axes[i],
                color=colors[i])
    axes[i].set_title(f'{metric} Comparison')
    axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45)
    axes[i].grid(True, axis='y', linestyle='--', alpha=0.7)

# Remove unused subplots (five metrics on a six-axes grid)
for i in range(len(metrics), len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()

# Print detailed statistics
print("\nDetailed Statistics:")
# Column-wise summaries of the metric table, printed under their own headings.
summary_sections = [
    ("\nMean Performance Across Datasets:", metrics_df.mean()),
    ("\nStandard Deviation of Performance:", metrics_df.std()),
    ("\nRange of Performance (Max - Min):", metrics_df.max() - metrics_df.min()),
]
for section_title, section_values in summary_sections:
    print(section_title)
    print(section_values.round(4))

# Calculate and display performance difference percentage
print("\nPerformance Difference from Combined Test Set (%):")
baseline = metrics_df.loc['Combined Test Set']
for dataset in ['H1 Test Set', 'H2 Test Set']:
    relative_gap = metrics_df.loc[dataset].sub(baseline).div(baseline).mul(100)
    diff_percentage = relative_gap.round(2)
    print(f"\n{dataset}:")
    for metric, value in diff_percentage.items():
        print(f"{metric}: {value:+.2f}%")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                           roc_auc_score, roc_curve, precision_score, recall_score, f1_score)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def evaluate_and_visualize(y_true, y_pred, y_pred_proba, dataset_name):
    """Plot a confusion matrix and ROC curve, then print a classification report.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        y_pred_proba: Positive-class scores used for the ROC curve and its AUC.
        dataset_name: Text added to both plot titles and the report header.

    Returns:
        None. Two figures are shown and the report is printed.
    """
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    # Confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)
    # Express each row as percentages of that true class. A class that occurs
    # only in y_pred has an all-zero row, so guard the division instead of
    # annotating the plot with nan%.
    row_totals = conf_matrix.sum(axis=1, keepdims=True)
    conf_matrix_percentage = np.divide(
        conf_matrix,
        row_totals,
        out=np.zeros(conf_matrix.shape, dtype=float),
        where=row_totals != 0,
    ) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            # Drawn up-left of the cell centre, where the heatmap puts the count.
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix\n{dataset_name}')
    plt.show()

    # ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line for a classifier with no discriminative power.
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve\n{dataset_name}')
    plt.legend(loc='lower right')
    plt.show()

    # Print classification report
    print(f"\nClassification Report - {dataset_name}")
    print(classification_report(y_true, y_pred))

# Prepare training data: H1 rows first, H2 rows stacked underneath.
train_data = pd.concat([H1_train, H2_train], axis=0)
test_data = pd.concat([H1_test, H2_test], axis=0)

# Prepare feature data
X_train = train_data[selected_columns].drop(columns=['is_canceled'])
y_train = train_data['is_canceled']
X_test = test_data[selected_columns].drop(columns=['is_canceled'])
y_test = test_data['is_canceled']

# Get feature importance and select the top 10 features
# The ranking forest is fitted on the training split only.
initial_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
initial_model.fit(X_train, y_train)

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': initial_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

top_features = feature_importance.head(10)['Feature'].tolist()

# Restrict both splits to the top-ranked features
X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

# Standardize data (scaler fitted on the training split only)
scaler = StandardScaler()
X_train_scaled_top = scaler.fit_transform(X_train_top)
X_test_scaled_top = scaler.transform(X_test_top)

# Create the model with the best parameters
best_params = {
    'n_estimators': 214,
    'max_depth': 20,
    'min_samples_split': 6,
    'min_samples_leaf': 4,
    'max_features': 'log2',
    'criterion': 'gini',
    'bootstrap': False,
    'random_state': 42,
    'n_jobs': -1
}

best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train_scaled_top, y_train)

# Evaluate on the test sets
# test_data is H1_test followed by H2_test, so each hotel's rows are identified
# by position. Matching on index labels would assign rows to the wrong hotel
# whenever the two frames share labels.
mask_h1 = np.arange(len(test_data)) < len(H1_test)
mask_h2 = ~mask_h1

# Overall test set prediction
y_pred = best_model.predict(X_test_scaled_top)
y_pred_proba = best_model.predict_proba(X_test_scaled_top)[:, 1]

# The per-hotel predictions are row subsets of the combined predictions, so
# slice them rather than running the forest again on every subset.
# H1 test set prediction
y_pred_h1 = y_pred[mask_h1]
y_pred_proba_h1 = y_pred_proba[mask_h1]

# H2 test set prediction
y_pred_h2 = y_pred[mask_h2]
y_pred_proba_h2 = y_pred_proba[mask_h2]

# Create the performance comparison table
# Each entry holds (true labels, predicted labels, positive-class scores) for one
# evaluation subset, in the same order as the 'Dataset' column.
evaluation_subsets = [
    (y_test, y_pred, y_pred_proba),
    (y_test[mask_h1], y_pred_h1, y_pred_proba_h1),
    (y_test[mask_h2], y_pred_h2, y_pred_proba_h2),
]
label_based_metrics = [
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
]

comparison_columns = {
    'Dataset': ['Combined Test Set', 'H1 Test Set', 'H2 Test Set'],
    # ROC AUC is the only metric computed from scores instead of hard labels.
    'ROC AUC': [roc_auc_score(truth, scores) for truth, _, scores in evaluation_subsets],
}
for column_name, score_fn in label_based_metrics:
    comparison_columns[column_name] = [
        score_fn(truth, predicted) for truth, predicted, _ in evaluation_subsets
    ]
performance_comparison = pd.DataFrame(comparison_columns)

print("\nDetailed Performance Comparison:")
print(performance_comparison.round(4))

# Plot the complete performance comparison
# Long-form copy of the table (one row per dataset/metric pair); the plots below
# read the wide table directly.
performance_metrics = performance_comparison.melt(id_vars=['Dataset'], var_name='Metric', value_name='Score')

# Create the grouped bar chart: one group per dataset, one bar per metric.
# Only one figure is opened here so that no empty figure is left behind.
plt.figure(figsize=(15, 8))
bar_width = 0.15
metrics = ['ROC AUC', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
x = np.arange(len(performance_comparison['Dataset']))

for i, metric in enumerate(metrics):
    # Shift each metric's bars by one bar width so they sit side by side.
    plt.bar(x + i * bar_width,
           performance_comparison[metric],
           bar_width,
           label=metric,
           alpha=0.8)

plt.xlabel('Dataset')
plt.ylabel('Score')
plt.title('Complete Performance Comparison Across Test Sets')
# Tick under the third of the five bars, i.e. the middle of each group.
plt.xticks(x + bar_width * 2, performance_comparison['Dataset'], rotation=45)
plt.legend(loc='lower right')
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Create a heatmap of the performance metrics
plt.figure(figsize=(12, 6))
metrics_df = performance_comparison.set_index('Dataset')
sns.heatmap(metrics_df, annot=True, cmap='YlOrRd', fmt='.4f', cbar_kws={'label': 'Score'})
plt.title('Performance Metrics Heatmap')
plt.tight_layout()
plt.show()

# Relative performance: each metric as a percentage of its best value
relative_performance = metrics_df.div(metrics_df.max()) * 100

plt.figure(figsize=(12, 6))
sns.heatmap(relative_performance, annot=True, cmap='YlOrRd', fmt='.1f',
            cbar_kws={'label': 'Relative Performance (%)'})
plt.title('Relative Performance Metrics Heatmap (%)')
plt.tight_layout()
plt.show()

# Create an individual comparison plot for each metric
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
axes = axes.ravel()

colors = sns.color_palette('husl', n_colors=len(metrics))

for i, metric in enumerate(metrics):
    sns.barplot(data=performance_comparison, x='Dataset', y=metric, ax=axes[i],
                color=colors[i])
    axes[i].set_title(f'{metric} Comparison')
    axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45)
    axes[i].grid(True, axis='y', linestyle='--', alpha=0.7)

# Remove the unused subplots (five metrics on a six-axes grid)
for i in range(len(metrics), len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()

# Print the detailed statistics
print("\nDetailed Statistics:")
# Column-wise summaries of the metric table, printed under their own headings.
summary_sections = [
    ("\nMean Performance Across Datasets:", metrics_df.mean()),
    ("\nStandard Deviation of Performance:", metrics_df.std()),
    ("\nRange of Performance (Max - Min):", metrics_df.max() - metrics_df.min()),
]
for section_title, section_values in summary_sections:
    print(section_title)
    print(section_values.round(4))

# Calculate and display the performance difference as a percentage
print("\nPerformance Difference from Combined Test Set (%):")
baseline = metrics_df.loc['Combined Test Set']
for dataset in ['H1 Test Set', 'H2 Test Set']:
    relative_gap = metrics_df.loc[dataset].sub(baseline).div(baseline).mul(100)
    diff_percentage = relative_gap.round(2)
    print(f"\n{dataset}:")
    for metric, value in diff_percentage.items():
        print(f"{metric}: {value:+.2f}%")

boruta

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                           roc_auc_score, roc_curve, precision_score, recall_score, f1_score)
from boruta import BorutaPy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def evaluate_and_visualize(y_true, y_pred, y_pred_proba, dataset_name):
    """Plot a confusion matrix and ROC curve, then print a classification report.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        y_pred_proba: Positive-class scores used for the ROC curve and its AUC.
        dataset_name: Text added to both plot titles and the report header.

    Returns:
        None. Two figures are shown and the report is printed.
    """
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    # Confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)
    # Express each row as percentages of that true class. A class that occurs
    # only in y_pred has an all-zero row, so guard the division instead of
    # annotating the plot with nan%.
    row_totals = conf_matrix.sum(axis=1, keepdims=True)
    conf_matrix_percentage = np.divide(
        conf_matrix,
        row_totals,
        out=np.zeros(conf_matrix.shape, dtype=float),
        where=row_totals != 0,
    ) * 100

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
            # Drawn up-left of the cell centre, where the heatmap puts the count.
            plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=12)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix\n{dataset_name}')
    plt.show()

    # ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line for a classifier with no discriminative power.
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve\n{dataset_name}')
    plt.legend(loc='lower right')
    plt.show()

    # Print classification report
    print(f"\nClassification Report - {dataset_name}")
    print(classification_report(y_true, y_pred))

# Prepare training data: H1 rows first, H2 rows stacked underneath.
train_data = pd.concat([H1_train, H2_train], axis=0)
test_data = pd.concat([H1_test, H2_test], axis=0)

# Prepare feature data
X_train = train_data[selected_columns].drop(columns=['is_canceled'])
y_train = train_data['is_canceled']
X_test = test_data[selected_columns].drop(columns=['is_canceled'])
y_test = test_data['is_canceled']

# Standardize data (all candidate features, training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Use Boruta for feature selection
print("Running Boruta feature selection...")
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
boruta_selector = BorutaPy(
    rf,
    n_estimators='auto',
    verbose=2,
    random_state=42
)

# Run Boruta selection
boruta_selector.fit(X_train_scaled, y_train)

# Get selected features: map the boolean support mask back to column names
selected_feat_mask = boruta_selector.support_
selected_features = X_train.columns[selected_feat_mask].tolist()

print("\nSelected Features by Boruta:")
for idx, feature in enumerate(selected_features, 1):
    print(f"{idx}. {feature}")

# Restrict both splits to the selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Standardize data: the scaler is refitted on the reduced training matrix
X_train_scaled_selected = scaler.fit_transform(X_train_selected)
X_test_scaled_selected = scaler.transform(X_test_selected)

# Create model with best parameters
best_params = {
    'n_estimators': 295,
    'max_depth': 20,
    'min_samples_split': 4,
    'min_samples_leaf': 1,
    'max_features': 'log2',
    'criterion': 'entropy',
    'bootstrap': True,
    'random_state': 42,
    'n_jobs': -1
}

best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train_scaled_selected, y_train)

# Evaluate on test sets
# test_data is H1_test followed by H2_test, so each hotel's rows are identified
# by position. Matching on index labels would assign rows to the wrong hotel
# whenever the two frames share labels.
mask_h1 = np.arange(len(test_data)) < len(H1_test)
mask_h2 = ~mask_h1

# Overall test set prediction
y_pred = best_model.predict(X_test_scaled_selected)
y_pred_proba = best_model.predict_proba(X_test_scaled_selected)[:, 1]

# The per-hotel predictions are row subsets of the combined predictions, so
# slice them rather than running the forest again on every subset.
# H1 test set prediction
y_pred_h1 = y_pred[mask_h1]
y_pred_proba_h1 = y_pred_proba[mask_h1]

# H2 test set prediction
y_pred_h2 = y_pred[mask_h2]
y_pred_proba_h2 = y_pred_proba[mask_h2]

# Create performance comparison table
# Each entry holds (true labels, predicted labels, positive-class scores) for one
# evaluation subset, in the same order as the 'Dataset' column.
evaluation_subsets = [
    (y_test, y_pred, y_pred_proba),
    (y_test[mask_h1], y_pred_h1, y_pred_proba_h1),
    (y_test[mask_h2], y_pred_h2, y_pred_proba_h2),
]
label_based_metrics = [
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
]

comparison_columns = {
    'Dataset': ['Combined Test Set', 'H1 Test Set', 'H2 Test Set'],
    # ROC AUC is the only metric computed from scores instead of hard labels.
    'ROC AUC': [roc_auc_score(truth, scores) for truth, _, scores in evaluation_subsets],
}
for column_name, score_fn in label_based_metrics:
    comparison_columns[column_name] = [
        score_fn(truth, predicted) for truth, predicted, _ in evaluation_subsets
    ]
performance_comparison = pd.DataFrame(comparison_columns)

print("\nDetailed Performance Comparison:")
print(performance_comparison.round(4))

# Plot complete performance comparison
# Long-form copy of the table (one row per dataset/metric pair); the plots below
# read the wide table directly.
performance_metrics = performance_comparison.melt(id_vars=['Dataset'], var_name='Metric', value_name='Score')

# Create grouped bar chart: one group per dataset, one bar per metric.
# Only one figure is opened here so that no empty figure is left behind.
plt.figure(figsize=(15, 8))
bar_width = 0.15
metrics = ['ROC AUC', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
x = np.arange(len(performance_comparison['Dataset']))

for i, metric in enumerate(metrics):
    # Shift each metric's bars by one bar width so they sit side by side.
    plt.bar(x + i * bar_width,
           performance_comparison[metric],
           bar_width,
           label=metric,
           alpha=0.8)

plt.xlabel('Dataset')
plt.ylabel('Score')
plt.title('Complete Performance Comparison Across Test Sets')
# Tick under the third of the five bars, i.e. the middle of each group.
plt.xticks(x + bar_width * 2, performance_comparison['Dataset'], rotation=45)
plt.legend(loc='lower right')
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Create heatmap to visualize performance metrics
plt.figure(figsize=(12, 6))
metrics_df = performance_comparison.set_index('Dataset')
sns.heatmap(metrics_df, annot=True, cmap='YlOrRd', fmt='.4f', cbar_kws={'label': 'Score'})
plt.title('Performance Metrics Heatmap')
plt.tight_layout()
plt.show()

# Calculate relative performance: each metric as a percentage of its best value
relative_performance = metrics_df.div(metrics_df.max()) * 100

plt.figure(figsize=(12, 6))
sns.heatmap(relative_performance, annot=True, cmap='YlOrRd', fmt='.1f',
            cbar_kws={'label': 'Relative Performance (%)'})
plt.title('Relative Performance Metrics Heatmap (%)')
plt.tight_layout()
plt.show()

# Create individual comparison plots for each metric
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
axes = axes.ravel()

colors = sns.color_palette('husl', n_colors=len(metrics))

for i, metric in enumerate(metrics):
    sns.barplot(data=performance_comparison, x='Dataset', y=metric, ax=axes[i],
                color=colors[i])
    axes[i].set_title(f'{metric} Comparison')
    axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=45)
    axes[i].grid(True, axis='y', linestyle='--', alpha=0.7)

# Remove unused subplots (five metrics on a six-axes grid)
for i in range(len(metrics), len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()

# Print detailed statistics
print("\nDetailed Statistics:")
# Column-wise summaries of the metric table, printed under their own headings.
summary_sections = [
    ("\nMean Performance Across Datasets:", metrics_df.mean()),
    ("\nStandard Deviation of Performance:", metrics_df.std()),
    ("\nRange of Performance (Max - Min):", metrics_df.max() - metrics_df.min()),
]
for section_title, section_values in summary_sections:
    print(section_title)
    print(section_values.round(4))

# Calculate and display performance difference percentage
print("\nPerformance Difference from Combined Test Set (%):")
baseline = metrics_df.loc['Combined Test Set']
for dataset in ['H1 Test Set', 'H2 Test Set']:
    relative_gap = metrics_df.loc[dataset].sub(baseline).div(baseline).mul(100)
    diff_percentage = relative_gap.round(2)
    print(f"\n{dataset}:")
    for metric, value in diff_percentage.items():
        print(f"{metric}: {value:+.2f}%")

## 3. TabNet




### baseline model

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch

# Data preparation: features are the selected columns minus the target.
y_train = combined_train['is_canceled']
X_train = combined_train[selected_columns].drop(columns=['is_canceled'])

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# TabNet parameters
# Learning-rate schedule: ReduceLROnPlateau configured with the keyword
# arguments below (mode, patience, min_lr, factor).
plateau_scheduler_params = {
    'mode': "min",
    'patience': 5,
    'min_lr': 1e-5,
    'factor': 0.5,
}
tabnet_params = {
    'n_d': 64,
    'n_a': 64,
    'n_steps': 5,
    'gamma': 1.5,
    'n_independent': 2,
    'n_shared': 2,
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': {'lr': 2e-2},
    'scheduler_params': plateau_scheduler_params,
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'mask_type': 'entmax',
    'seed': 42
}

# Cross-validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# Out-of-fold predictions, filled in one held-out fold at a time.
y_pred_cv = np.zeros_like(y_train)
y_pred_proba_cv = np.zeros_like(y_train, dtype=float)

fold_count = 1
plt.figure(figsize=(12, 8))

for train_idx, test_idx in cv.split(X_train_scaled, y_train):
    # A fresh classifier per fold so that no state carries over between folds.
    clf = TabNetClassifier(**tabnet_params)

    X_fold_train = X_train_scaled[train_idx]
    y_fold_train = y_train.iloc[train_idx].values
    X_fold_test = X_train_scaled[test_idx]
    y_fold_test = y_train.iloc[test_idx].values

    # NOTE(review): the held-out fold is also passed as eval_set, so it
    # presumably drives early stopping; the fold scores below may therefore be
    # optimistic — confirm whether a separate validation split is wanted.
    clf.fit(
        X_fold_train, y_fold_train,
        eval_set=[(X_fold_test, y_fold_test)],
        max_epochs=100,
        patience=10,
        batch_size=1024,
        virtual_batch_size=128
    )

    # Score the held-out fold once and reuse the probabilities for the
    # out-of-fold array, the ROC curve and the fold AUC.
    fold_proba = clf.predict_proba(X_fold_test)[:, 1]
    y_pred_cv[test_idx] = clf.predict(X_fold_test)
    y_pred_proba_cv[test_idx] = fold_proba

    fpr, tpr, _ = roc_curve(y_fold_test, fold_proba)
    roc_auc_fold = roc_auc_score(y_fold_test, fold_proba)
    plt.plot(fpr, tpr, label=f'Fold {fold_count} (AUC = {roc_auc_fold:.2f})')
    fold_count += 1

# Diagonal reference line for a classifier with no discriminative power.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Combined Dataset with TabNet (3-fold CV)')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix of the out-of-fold predictions, plus row-wise percentages.
conf_matrix = confusion_matrix(y_train, y_pred_cv)
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_percentage = conf_matrix / row_totals * 100

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})

# Overlay each cell's percentage up-left of the centred count annotation.
for i, j in np.ndindex(conf_matrix.shape):
    percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
    plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=15)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Combined Dataset Confusion Matrix with TabNet')
plt.show()

print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))
overall_auc = roc_auc_score(y_train, y_pred_proba_cv)
print(f"\nOverall ROC AUC Score: {overall_auc:.4f}")

# Fit a single TabNet model on the whole scaled training set.
final_model = TabNetClassifier(**tabnet_params)
# NOTE(review): no eval_set is passed here, so patience=10 presumably has nothing
# to monitor — confirm against the pytorch-tabnet documentation.
final_model.fit(
    X_train_scaled, y_train.values,
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128
)

# Importance of every input feature, most important first.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': final_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the sorted importances.
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance in TabNet Model')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Size of the training data and the class balance in percent.
print("\nDataset Information:")
print(f"Total Training Samples: {len(X_train)}")
print(f"Number of Features: {X_train.shape[1]}")
print(f"Class Distribution in Training Set:")
print(y_train.value_counts(normalize=True).round(4) * 100)

In [None]:
import shap
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

# 1. TabNet built-in feature importance, sorted from most to least important.
feature_importance = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Importance': best_model.feature_importances_,
    }
)
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

_, importance_ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=feature_importance,
    x='Importance',
    y='Feature',
    palette='viridis',
    ax=importance_ax,
)
importance_ax.set_title('TabNet Feature Importance')
importance_ax.set_xlabel('Importance Score')
plt.tight_layout()
plt.show()

# 2. SHAP value analysis
# KernelExplainer is model-agnostic: it only needs a function returning the
# positive-class probability and a small background sample of the training data.
explainer = shap.KernelExplainer(
    lambda x: best_model.predict_proba(x)[:, 1],
    shap.sample(X_train_scaled, 100)
)
# Explain only the first 100 training rows to keep the run time manageable.
shap_values = explainer.shap_values(X_train_scaled[:100])

plt.figure(figsize=(12, 8))
# show=False keeps the figure open so the title below lands on the SHAP plot
# (by default summary_plot displays the figure itself, leaving the title on a
# new, empty figure). plot_size=None stops summary_plot from resizing the
# figure created above.
shap.summary_plot(
    shap_values,
    X_train_scaled[:100],
    feature_names=X_train.columns,
    show=False,
    plot_size=None,
)
plt.title('SHAP Summary Plot')
plt.show()

# 3. Boruta feature selection, driven by a random forest.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
boruta = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)
boruta.fit(X_train_scaled, y_train)

# Collect the Boruta ranking per feature, best (lowest) rank first.
# NOTE(review): BorutaPy appears to assign rank 1 to confirmed features and
# rank 2 to tentative ones, so "Selected" here would include tentative
# features - confirm this is intended.
boruta_results = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Importance': boruta.ranking_,
        'Status': np.where(boruta.ranking_ <= 2, 'Selected', 'Rejected'),
    }
).sort_values('Importance')

# Visualise the ranking: green bars are selected features, red are rejected.
status_to_color = {'Selected': 'green', 'Rejected': 'red'}
colors = boruta_results['Status'].map(status_to_color).tolist()

_, boruta_ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=boruta_results, x='Importance', y='Feature', palette=colors, ax=boruta_ax)
boruta_ax.set_title('Boruta Feature Selection Results')
boruta_ax.set_xlabel('Ranking (lower is better)')
plt.tight_layout()
plt.show()

# Print the names of the selected features.
is_selected = boruta_results['Status'] == 'Selected'
print("\nSelected Features by Boruta:")
print(boruta_results.loc[is_selected, 'Feature'].tolist())

### different feature combinations

In [None]:
# 1. Derive three alternative feature lists.
# Top 10 by TabNet feature importance
importance_features = feature_importance.nlargest(10, 'Importance')['Feature'].tolist()

# Top 10 by mean absolute SHAP value
shap_importance = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Importance': np.mean(np.abs(shap_values), axis=0),
    }
).sort_values('Importance', ascending=False)
shap_features = shap_importance.nlargest(10, 'Importance')['Feature'].tolist()

# Features Boruta marked as selected
selected_rows = boruta_results['Status'] == 'Selected'
boruta_features = boruta_results.loc[selected_rows, 'Feature'].tolist()


# 2. Slice the scaled matrices down to each feature list.
def _take_columns(matrix, frame, names):
    """Return the columns of `matrix` at the positions `names` occupy in `frame.columns`."""
    header = list(frame.columns)
    return matrix[:, [header.index(name) for name in names]]


X_train_importance = _take_columns(X_train_scaled, X_train, importance_features)
X_test_importance = _take_columns(X_test_scaled, X_test, importance_features)

X_train_shap = _take_columns(X_train_scaled, X_train, shap_features)
X_test_shap = _take_columns(X_test_scaled, X_test, shap_features)

X_train_boruta = _take_columns(X_train_scaled, X_train, boruta_features)
X_test_boruta = _take_columns(X_test_scaled, X_test, boruta_features)

# 3. Train one TabNet model per feature list, all sharing `final_params`.
models = {
    label: TabNetClassifier(**final_params)
    for label in ('Feature Importance', 'SHAP', 'Boruta')
}

datasets = {
    'Feature Importance': (X_train_importance, X_test_importance),
    'SHAP': (X_train_shap, X_test_shap),
    'Boruta': (X_train_boruta, X_test_boruta),
}

results = {}

# Fit each model on its own column subset and keep its test-set outputs.
for name in models:
    model = models[name]
    X_train_subset, X_test_subset = datasets[name]

    # The evaluation set passed here is the training data itself.
    model.fit(
        X_train_subset,
        y_train.values,
        eval_set=[(X_train_subset, y_train.values)],
        max_epochs=100,
        batch_size=2048,
    )

    # Hard labels and positive-class probabilities on the test subset.
    y_pred = model.predict(X_test_subset)
    y_pred_proba = model.predict_proba(X_test_subset)[:, 1]

    results[name] = dict(predictions=y_pred, probabilities=y_pred_proba)

# 4. Visual comparison of the three models.
# ROC curves on a shared figure
_, roc_ax = plt.subplots(figsize=(12, 8))
for name, result in results.items():
    scores = result['probabilities']
    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_auc = roc_auc_score(y_test, scores)
    roc_ax.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.3f})')

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves Comparison for Different Feature Selection Methods')
roc_ax.legend(loc='lower right')
plt.show()

# Confusion matrices side by side, one panel per model
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for panel, (name, result) in zip(axes, results.items()):
    conf_matrix = confusion_matrix(y_test, result['predictions'])
    conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1, keepdims=True) * 100

    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=panel)

    # Row-normalised percentage in the corner of each cell.
    for i, j in np.ndindex(*conf_matrix.shape):
        panel.text(
            j + 0.2, i + 0.2,
            f"{conf_matrix_percentage[i, j]:.1f}%",
            ha='center', va='center', color='green', fontsize=9,
        )

    panel.set_title(f'{name} Confusion Matrix')
    panel.set_xlabel('Predicted Label')
    panel.set_ylabel('True Label')

plt.tight_layout()
plt.show()

# Classification report per model
for name, result in results.items():
    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, result['predictions']))

# Number of input features each model was trained on
print("\nNumber of features used in each model:")
for label, feature_list in (
    ('Feature Importance top 10', importance_features),
    ('SHAP top 10', shap_features),
    ('Boruta', boruta_features),
):
    print(f"{label}: {len(feature_list)}")

feature importance

shap

boruta

### hyperparameter tuning

baseline model

In [None]:
import optuna
from pytorch_tabnet.tab_model import TabNetClassifier
import numpy as np
from sklearn.model_selection import cross_val_score

def objective(trial):
    """
    Optuna objective for the baseline TabNet model.

    Samples one hyperparameter configuration from `trial`, fits a
    TabNetClassifier on 80% of the scaled training data and returns the
    ROC AUC measured on the remaining 20%.
    """
    # Search space. The suggest_* calls stay in a fixed order.
    n_d = trial.suggest_categorical('n_d', [8, 16, 32, 64])
    n_a = trial.suggest_categorical('n_a', [8, 16, 32, 64])
    n_steps = trial.suggest_int('n_steps', 3, 5)
    gamma = trial.suggest_float('gamma', 1.0, 1.5)
    learning_rate = trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.005])

    # NOTE(review): pytorch-tabnet appears to step ReduceLROnPlateau with the
    # early-stopping metric (validation AUC by default for classifiers), in
    # which case mode='max' would be expected - confirm against the installed
    # version before changing.
    clf = TabNetClassifier(
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': learning_rate},
        scheduler_params={'mode': 'min', 'patience': 5, 'min_lr': 1e-5, 'factor': 0.5},
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        mask_type='entmax',
        seed=42,
    )

    # A single fixed hold-out split is used instead of cross-validation.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_train_scaled, y_train, test_size=0.2, random_state=42
    )

    clf.fit(
        X_fit,
        y_fit,
        eval_set=[(X_holdout, y_holdout)],
        max_epochs=30,
        patience=5,
        batch_size=2048,
        virtual_batch_size=256,
    )

    holdout_scores = clf.predict_proba(X_holdout)[:, 1]
    return roc_auc_score(y_holdout, holdout_scores)

# Run the hyperparameter search (higher AUC is better).
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

# Report the winning trial.
print("\nBest parameters:", study.best_trial.params)
print("Best AUC score:", study.best_trial.value)

# Rebuild the full TabNet configuration from the best trial: the tuned values
# first, then the settings that were held fixed during the search.
best_params = study.best_trial.params
final_params = {key: best_params[key] for key in ('n_d', 'n_a', 'n_steps', 'gamma')}
final_params.update(
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': best_params['learning_rate']},
    scheduler_params=dict(mode='min', patience=5, min_lr=1e-5, factor=0.5),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    mask_type='entmax',
    seed=42,
)

# Train the final model on the whole training set.
# The evaluation set passed here is the training data itself.
best_model = TabNetClassifier(**final_params)
best_model.fit(
    X_train_scaled,
    y_train.values,
    eval_set=[(X_train_scaled, y_train.values)],
    max_epochs=100,
    batch_size=2048,
)

In [None]:
# Predictions of the tuned model on the data it was trained on.
# These are in-sample predictions, not cross-validated ones; the `_cv`
# variable names are kept unchanged.
y_pred_cv = best_model.predict(X_train_scaled)
y_pred_proba_cv = best_model.predict_proba(X_train_scaled)[:, 1]

# ROC curve
plt.figure(figsize=(12, 8))
fpr, tpr, _ = roc_curve(y_train, y_pred_proba_cv)
roc_auc = roc_auc_score(y_train, y_pred_proba_cv)
plt.plot(fpr, tpr, label=f'TabNet (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The title previously claimed "(3-fold CV)", but no cross-validation is
# performed in this cell, so it now states what is actually plotted.
plt.title('ROC Curve for Combined H1+H2 Dataset with TabNet (training set, in-sample)')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix with row-normalised percentages (each row sums to 100%)
conf_matrix = confusion_matrix(y_train, y_pred_cv)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

plt.figure(figsize=(8, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)

# Overlay the percentage in the corner of every cell.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
        plt.text(j + 0.2, i + 0.2, percentage_text, ha='center', va='center', color='green', fontsize=9)

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Combined H1+H2 Confusion Matrix with TabNet')
plt.show()

# Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(y_train, y_pred_cv))

In [None]:
import optuna
from pytorch_tabnet.tab_model import TabNetClassifier

def get_quick_objective(X_train_data, y_train_data):
    """
    Build an Optuna objective bound to one feature subset.

    The returned function samples a small TabNet configuration, fits it on
    90% of (`X_train_data`, `y_train_data`) and returns the ROC AUC on the
    remaining 10%.
    """
    def objective(trial):
        # Search space. The suggest_* calls stay in a fixed order.
        n_d = trial.suggest_categorical('n_d', [16, 32])
        n_steps = trial.suggest_categorical('n_steps', [3, 4])
        learning_rate = trial.suggest_categorical('learning_rate', [0.01, 0.02])

        X_fit, X_holdout, y_fit, y_holdout = train_test_split(
            X_train_data, y_train_data, test_size=0.1, random_state=42
        )

        # NOTE(review): pytorch-tabnet appears to step ReduceLROnPlateau with
        # the validation metric (AUC by default), in which case mode='max'
        # would be expected - confirm against the installed version.
        clf = TabNetClassifier(
            n_d=n_d,
            n_steps=n_steps,
            optimizer_fn=torch.optim.Adam,
            optimizer_params={'lr': learning_rate},
            scheduler_params={'mode': 'min', 'patience': 4, 'min_lr': 1e-5, 'factor': 0.5},
            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            mask_type='entmax',
        )
        clf.fit(
            X_fit,
            y_fit,
            eval_set=[(X_holdout, y_holdout)],
            max_epochs=20,
            patience=4,
            batch_size=4096,
        )

        holdout_scores = clf.predict_proba(X_holdout)[:, 1]
        return roc_auc_score(y_holdout, holdout_scores)

    return objective

# Tune TabNet separately for each of the three feature subsets.
feature_sets = {
    'Importance': (X_train_importance, X_test_importance),
    'SHAP': (X_train_shap, X_test_shap),
    'Boruta': (X_train_boruta, X_test_boruta),
}

optimized_params = {}
for name, (X_data, _) in feature_sets.items():
    y_data = y_train
    study = optuna.create_study(direction='maximize')
    study.optimize(get_quick_objective(X_data, y_data), n_trials=8)

    # The trial stores the learning rate under 'learning_rate', which is not a
    # TabNetClassifier argument. Spreading best_trial.params straight into the
    # constructor kwargs would pass an unknown keyword and never hand the tuned
    # rate to the optimizer. Move it into optimizer_params['lr'] instead.
    tuned = dict(study.best_trial.params)
    tuned_lr = tuned.pop('learning_rate')

    optimized_params[name] = {
        **tuned,
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {'lr': tuned_lr},
        'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
        'scheduler_params': {'mode': 'min', 'patience': 4, 'min_lr': 1e-5, 'factor': 0.5},
        'mask_type': 'entmax'
    }
    print(f"\n{name} Best Parameters:", study.best_trial.params)
    print(f"Best AUC: {study.best_trial.value:.4f}")

# Train one final model per feature subset with its tuned parameters.
final_models = {}
for name, params in optimized_params.items():
    model = TabNetClassifier(**params)
    # Look up the matching train/test matrices by name.
    X_train_data, X_test_data = feature_sets[name]

    model.fit(X_train_data, y_train.values, max_epochs=100, batch_size=4096)
    final_models[name] = model

Feature importance (Optuna + test set)

In [None]:
# Feature Importance Model
import optuna
from pytorch_tabnet.tab_model import TabNetClassifier

def objective_importance(trial):
    """
    Optuna objective for the model trained on the feature-importance subset.

    Fits a TabNetClassifier on 90% of `X_train_importance` and returns the
    ROC AUC on the remaining 10%.
    """
    # Search space. The suggest_* calls stay in a fixed order.
    n_d = trial.suggest_categorical('n_d', [16, 32])
    n_steps = trial.suggest_categorical('n_steps', [3, 4])
    learning_rate = trial.suggest_categorical('learning_rate', [0.01, 0.02])

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_train_importance, y_train, test_size=0.1, random_state=42
    )

    # NOTE(review): the scheduler is presumably stepped with validation AUC,
    # in which case mode='max' would be expected - confirm.
    clf = TabNetClassifier(
        n_d=n_d,
        n_steps=n_steps,
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': learning_rate},
        scheduler_params={'mode': 'min', 'patience': 4, 'min_lr': 1e-5, 'factor': 0.5},
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        mask_type='entmax',
    )
    clf.fit(
        X_fit,
        y_fit,
        eval_set=[(X_holdout, y_holdout)],
        max_epochs=20,
        patience=4,
        batch_size=4096,
    )

    holdout_scores = clf.predict_proba(X_holdout)[:, 1]
    return roc_auc_score(y_holdout, holdout_scores)

# Search hyperparameters for the feature-importance model.
study_importance = optuna.create_study(direction='maximize')
study_importance.optimize(objective_importance, n_trials=8)

# Turn the best trial into a complete TabNet configuration.
tuned_importance = study_importance.best_trial.params
best_params_importance = dict(
    n_d=tuned_importance['n_d'],
    n_steps=tuned_importance['n_steps'],
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': tuned_importance['learning_rate']},
    scheduler_params={'mode': 'min', 'patience': 4, 'min_lr': 1e-5, 'factor': 0.5},
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    mask_type='entmax',
)

# Fit the final model on the full training subset.
final_model_importance = TabNetClassifier(**best_params_importance)
final_model_importance.fit(
    X_train_importance,
    y_train.values,
    max_epochs=100,
    batch_size=4096,
)

print("Feature Importance Best Parameters:", study_importance.best_trial.params)
print(f"Best AUC: {study_importance.best_trial.value:.4f}")

In [None]:
# Predictions on the training set
y_train_pred = final_model_importance.predict(X_train_importance)
y_train_proba = final_model_importance.predict_proba(X_train_importance)[:, 1]

# Predictions on the test set
y_test_pred = final_model_importance.predict(X_test_importance)
y_test_proba = final_model_importance.predict_proba(X_test_importance)[:, 1]

# ROC curves for both splits on one figure
_, roc_ax = plt.subplots(figsize=(12, 8))
# Training split
fpr_train, tpr_train, _ = roc_curve(y_train, y_train_proba)
roc_auc_train = roc_auc_score(y_train, y_train_proba)
roc_ax.plot(fpr_train, tpr_train, label=f'Train (AUC = {roc_auc_train:.3f})')
# Test split
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_proba)
roc_auc_test = roc_auc_score(y_test, y_test_proba)
roc_ax.plot(fpr_test, tpr_test, label=f'Test (AUC = {roc_auc_test:.3f})')

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve (Feature Importance Model)')
roc_ax.legend(loc='lower right')
plt.show()

# Confusion matrices (counts plus row-normalised percentages) for both splits
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
conf_matrix_percentage_train = conf_matrix_train / conf_matrix_train.sum(axis=1, keepdims=True) * 100
conf_matrix_test = confusion_matrix(y_test, y_test_pred)
conf_matrix_percentage_test = conf_matrix_test / conf_matrix_test.sum(axis=1, keepdims=True) * 100

for cm_title, cm_counts, cm_shares in (
    ('Training Set Confusion Matrix (Feature Importance Model)', conf_matrix_train, conf_matrix_percentage_train),
    ('Test Set Confusion Matrix (Feature Importance Model)', conf_matrix_test, conf_matrix_percentage_test),
):
    _, cm_ax = plt.subplots(figsize=(8, 8))
    sns.heatmap(cm_counts, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
    # Percentage of the true class in the corner of each cell.
    for i, j in np.ndindex(*cm_counts.shape):
        cm_ax.text(
            j + 0.2, i + 0.2,
            f"{cm_shares[i, j]:.1f}%",
            ha='center', va='center', color='green', fontsize=9,
        )
    cm_ax.set_xlabel('Predicted Label')
    cm_ax.set_ylabel('True Label')
    cm_ax.set_title(cm_title)
    plt.show()

# Classification reports: training set first, then test set
for split_label, truth, predicted in (
    ('Training', y_train, y_train_pred),
    ('Test', y_test, y_test_pred),
):
    print(f"\n{split_label} Set Classification Report (Feature Importance Model):")
    print(classification_report(truth, predicted))

# Feature list used by this model
print("\nFeatures used in this model:")
print(importance_features)

# Sample and feature counts
print("\nDataset Information:")
for info_label, info_value in (
    ('Training Set Size', len(X_train_importance)),
    ('Test Set Size', len(X_test_importance)),
    ('Number of Features', len(importance_features)),
):
    print(f"{info_label}: {info_value}")

SHAP (Optuna + test set)

In [None]:
# SHAP Model
def objective_shap(trial):
    """
    Optuna objective for the model trained on the SHAP feature subset.

    Fits a TabNetClassifier on 90% of `X_train_shap` and returns the ROC AUC
    on the remaining 10%.
    """
    # Search space. The suggest_* calls stay in a fixed order.
    n_d = trial.suggest_categorical('n_d', [16, 32])
    n_steps = trial.suggest_categorical('n_steps', [3, 4])
    learning_rate = trial.suggest_categorical('learning_rate', [0.01, 0.02])

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_train_shap, y_train, test_size=0.1, random_state=42
    )

    # NOTE(review): the scheduler is presumably stepped with validation AUC,
    # in which case mode='max' would be expected - confirm.
    clf = TabNetClassifier(
        n_d=n_d,
        n_steps=n_steps,
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': learning_rate},
        scheduler_params={'mode': 'min', 'patience': 4, 'min_lr': 1e-5, 'factor': 0.5},
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        mask_type='entmax',
    )
    clf.fit(
        X_fit,
        y_fit,
        eval_set=[(X_holdout, y_holdout)],
        max_epochs=20,
        patience=4,
        batch_size=4096,
    )

    holdout_scores = clf.predict_proba(X_holdout)[:, 1]
    return roc_auc_score(y_holdout, holdout_scores)

# Search hyperparameters for the SHAP model.
study_shap = optuna.create_study(direction='maximize')
study_shap.optimize(objective_shap, n_trials=8)

# Turn the best trial into a complete TabNet configuration.
tuned_shap = study_shap.best_trial.params
best_params_shap = dict(
    n_d=tuned_shap['n_d'],
    n_steps=tuned_shap['n_steps'],
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': tuned_shap['learning_rate']},
    scheduler_params={'mode': 'min', 'patience': 4, 'min_lr': 1e-5, 'factor': 0.5},
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    mask_type='entmax',
)

# Fit the final model on the full training subset.
final_model_shap = TabNetClassifier(**best_params_shap)
final_model_shap.fit(
    X_train_shap,
    y_train.values,
    max_epochs=100,
    batch_size=4096,
)

print("SHAP Best Parameters:", study_shap.best_trial.params)
print(f"Best AUC: {study_shap.best_trial.value:.4f}")

In [None]:
# Predictions on the training set
y_train_pred = final_model_shap.predict(X_train_shap)
y_train_proba = final_model_shap.predict_proba(X_train_shap)[:, 1]

# Predictions on the test set
y_test_pred = final_model_shap.predict(X_test_shap)
y_test_proba = final_model_shap.predict_proba(X_test_shap)[:, 1]

# ROC curves for both splits on one figure
_, roc_ax = plt.subplots(figsize=(12, 8))
# Training split
fpr_train, tpr_train, _ = roc_curve(y_train, y_train_proba)
roc_auc_train = roc_auc_score(y_train, y_train_proba)
roc_ax.plot(fpr_train, tpr_train, label=f'Train (AUC = {roc_auc_train:.3f})')
# Test split
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_proba)
roc_auc_test = roc_auc_score(y_test, y_test_proba)
roc_ax.plot(fpr_test, tpr_test, label=f'Test (AUC = {roc_auc_test:.3f})')

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve (SHAP Model)')
roc_ax.legend(loc='lower right')
plt.show()

# Confusion matrices (counts plus row-normalised percentages) for both splits
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
conf_matrix_percentage_train = conf_matrix_train / conf_matrix_train.sum(axis=1, keepdims=True) * 100
conf_matrix_test = confusion_matrix(y_test, y_test_pred)
conf_matrix_percentage_test = conf_matrix_test / conf_matrix_test.sum(axis=1, keepdims=True) * 100

for cm_title, cm_counts, cm_shares in (
    ('Training Set Confusion Matrix (SHAP Model)', conf_matrix_train, conf_matrix_percentage_train),
    ('Test Set Confusion Matrix (SHAP Model)', conf_matrix_test, conf_matrix_percentage_test),
):
    _, cm_ax = plt.subplots(figsize=(8, 8))
    sns.heatmap(cm_counts, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
    # Percentage of the true class in the corner of each cell.
    for i, j in np.ndindex(*cm_counts.shape):
        cm_ax.text(
            j + 0.2, i + 0.2,
            f"{cm_shares[i, j]:.1f}%",
            ha='center', va='center', color='green', fontsize=9,
        )
    cm_ax.set_xlabel('Predicted Label')
    cm_ax.set_ylabel('True Label')
    cm_ax.set_title(cm_title)
    plt.show()

# Classification reports: training set first, then test set
for split_label, truth, predicted in (
    ('Training', y_train, y_train_pred),
    ('Test', y_test, y_test_pred),
):
    print(f"\n{split_label} Set Classification Report (SHAP Model):")
    print(classification_report(truth, predicted))

# Feature list used by this model
print("\nFeatures used in this model:")
print(shap_features)

# Sample and feature counts
print("\nDataset Information:")
for info_label, info_value in (
    ('Training Set Size', len(X_train_shap)),
    ('Test Set Size', len(X_test_shap)),
    ('Number of Features', len(shap_features)),
):
    print(f"{info_label}: {info_value}")

Boruta (Optuna + test set)

In [None]:
# Boruta Model
def objective_boruta(trial):
    """
    Optuna objective for the model trained on the Boruta feature subset.

    Fits a TabNetClassifier on 90% of `X_train_boruta` and returns the ROC AUC
    on the remaining 10%.
    """
    # Search space. The suggest_* calls stay in a fixed order.
    n_d = trial.suggest_categorical('n_d', [16, 32])
    n_steps = trial.suggest_categorical('n_steps', [3, 4])
    learning_rate = trial.suggest_categorical('learning_rate', [0.01, 0.02])

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_train_boruta, y_train, test_size=0.1, random_state=42
    )

    # NOTE(review): the scheduler is presumably stepped with validation AUC,
    # in which case mode='max' would be expected - confirm.
    clf = TabNetClassifier(
        n_d=n_d,
        n_steps=n_steps,
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': learning_rate},
        scheduler_params={'mode': 'min', 'patience': 4, 'min_lr': 1e-5, 'factor': 0.5},
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        mask_type='entmax',
    )
    clf.fit(
        X_fit,
        y_fit,
        eval_set=[(X_holdout, y_holdout)],
        max_epochs=20,
        patience=4,
        batch_size=4096,
    )

    holdout_scores = clf.predict_proba(X_holdout)[:, 1]
    return roc_auc_score(y_holdout, holdout_scores)

# Search hyperparameters for the Boruta model.
study_boruta = optuna.create_study(direction='maximize')
study_boruta.optimize(objective_boruta, n_trials=8)

# Turn the best trial into a complete TabNet configuration.
tuned_boruta = study_boruta.best_trial.params
best_params_boruta = dict(
    n_d=tuned_boruta['n_d'],
    n_steps=tuned_boruta['n_steps'],
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': tuned_boruta['learning_rate']},
    scheduler_params={'mode': 'min', 'patience': 4, 'min_lr': 1e-5, 'factor': 0.5},
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    mask_type='entmax',
)

# Fit the final model on the full training subset.
final_model_boruta = TabNetClassifier(**best_params_boruta)
final_model_boruta.fit(
    X_train_boruta,
    y_train.values,
    max_epochs=100,
    batch_size=4096,
)

print("Boruta Best Parameters:", study_boruta.best_trial.params)
print(f"Best AUC: {study_boruta.best_trial.value:.4f}")

In [None]:
# Predictions on the training set
y_train_pred = final_model_boruta.predict(X_train_boruta)
y_train_proba = final_model_boruta.predict_proba(X_train_boruta)[:, 1]

# Predictions on the test set
y_test_pred = final_model_boruta.predict(X_test_boruta)
y_test_proba = final_model_boruta.predict_proba(X_test_boruta)[:, 1]

# ROC curves for both splits on one figure
_, roc_ax = plt.subplots(figsize=(12, 8))
# Training split
fpr_train, tpr_train, _ = roc_curve(y_train, y_train_proba)
roc_auc_train = roc_auc_score(y_train, y_train_proba)
roc_ax.plot(fpr_train, tpr_train, label=f'Train (AUC = {roc_auc_train:.3f})')
# Test split
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_proba)
roc_auc_test = roc_auc_score(y_test, y_test_proba)
roc_ax.plot(fpr_test, tpr_test, label=f'Test (AUC = {roc_auc_test:.3f})')

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve (Boruta Model)')
roc_ax.legend(loc='lower right')
plt.show()

# Confusion matrices (counts plus row-normalised percentages) for both splits
conf_matrix_train = confusion_matrix(y_train, y_train_pred)
conf_matrix_percentage_train = conf_matrix_train / conf_matrix_train.sum(axis=1, keepdims=True) * 100
conf_matrix_test = confusion_matrix(y_test, y_test_pred)
conf_matrix_percentage_test = conf_matrix_test / conf_matrix_test.sum(axis=1, keepdims=True) * 100

for cm_title, cm_counts, cm_shares in (
    ('Training Set Confusion Matrix (Boruta Model)', conf_matrix_train, conf_matrix_percentage_train),
    ('Test Set Confusion Matrix (Boruta Model)', conf_matrix_test, conf_matrix_percentage_test),
):
    _, cm_ax = plt.subplots(figsize=(8, 8))
    sns.heatmap(cm_counts, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
    # Percentage of the true class in the corner of each cell.
    for i, j in np.ndindex(*cm_counts.shape):
        cm_ax.text(
            j + 0.2, i + 0.2,
            f"{cm_shares[i, j]:.1f}%",
            ha='center', va='center', color='green', fontsize=9,
        )
    cm_ax.set_xlabel('Predicted Label')
    cm_ax.set_ylabel('True Label')
    cm_ax.set_title(cm_title)
    plt.show()

# Classification reports: training set first, then test set
for split_label, truth, predicted in (
    ('Training', y_train, y_train_pred),
    ('Test', y_test, y_test_pred),
):
    print(f"\n{split_label} Set Classification Report (Boruta Model):")
    print(classification_report(truth, predicted))

# Feature list used by this model
print("\nFeatures used in this model:")
print(boruta_features)

# Sample and feature counts
print("\nDataset Information:")
for info_label, info_value in (
    ('Training Set Size', len(X_train_boruta)),
    ('Test Set Size', len(X_test_boruta)),
    ('Number of Features', len(boruta_features)),
):
    print(f"{info_label}: {info_value}")

### test set

baseline model

In [None]:
# Prepare the test set: same columns as training, target split off, then
# scaled with the already-fitted scaler.
y_test = test_data['is_canceled']
X_test = test_data[selected_columns].drop(columns=['is_canceled'])
X_test_scaled = scaler.transform(X_test)

# Hard labels and positive-class probabilities on the test set
y_test_pred = best_model.predict(X_test_scaled)
y_test_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# ROC curve
_, roc_ax = plt.subplots(figsize=(12, 8))
fpr, tpr, _ = roc_curve(y_test, y_test_proba)
roc_auc = roc_auc_score(y_test, y_test_proba)
roc_ax.plot(fpr, tpr, label=f'TabNet (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve for Combined H1+H2 Test Dataset with TabNet')
roc_ax.legend(loc='lower right')
plt.show()

# Confusion matrix with row-normalised percentages
conf_matrix = confusion_matrix(y_test, y_test_pred)
conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1, keepdims=True) * 100

_, cm_ax = plt.subplots(figsize=(8, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)

for i, j in np.ndindex(*conf_matrix.shape):
    cm_ax.text(
        j + 0.2, i + 0.2,
        f"{conf_matrix_percentage[i, j]:.1f}%",
        ha='center', va='center', color='green', fontsize=9,
    )

cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Combined H1+H2 Test Set Confusion Matrix with TabNet')
plt.show()

# Classification report
print("\nTest Set Classification Report:")
print(classification_report(y_test, y_test_pred))

# Test-set size and class balance (in percent)
print("\nTest Dataset Information:")
print(f"Total Test Samples: {len(X_test)}")
print("Class Distribution in Test Set:")
test_class_share_percent = y_test.value_counts(normalize=True).round(4) * 100
print(test_class_share_percent)

feature importance

shap

boruta

# error analysis




### H1

Best model: random forest baseline / top-10 features / top-10 SHAP model with Optuna

In [None]:
def analyze_channels(X_test_scaled, y_test, test_data, best_rf):
    """
    Evaluate a fitted classifier separately for each market segment (channel).

    Parameters:
        X_test_scaled: feature matrix, row-aligned with `test_data`.
        y_test: true labels, row-aligned with `test_data`.
        test_data: DataFrame with a 'market_segment' column used for grouping.
        best_rf: fitted classifier exposing `predict` and `predict_proba`.

    Returns:
        dict mapping each channel with at least 50 samples to a dict with
        'accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'conf_matrix'
        and 'sample_size'. 'roc_auc' is NaN for a channel whose true labels
        contain a single class, because AUC is undefined there.
    """
    # Results per channel
    channel_results = {}

    # Fix the label order so every confusion matrix has the same shape, even
    # when a channel contains only one class. Falls back to binary 0/1 labels
    # if the estimator does not expose `classes_`.
    labels = list(getattr(best_rf, 'classes_', (0, 1)))

    for channel in test_data['market_segment'].unique():
        # Rows belonging to this channel
        channel_mask = test_data['market_segment'] == channel
        X_channel = X_test_scaled[channel_mask]
        y_channel = y_test[channel_mask]

        if len(y_channel) < 50:  # skip channels with too few samples
            continue

        # Predictions
        y_pred = best_rf.predict(X_channel)
        y_pred_proba = best_rf.predict_proba(X_channel)[:, 1]

        # roc_auc_score raises ValueError when y_true has a single class;
        # report NaN instead of aborting the whole analysis.
        if np.unique(y_channel).size < 2:
            roc_auc = float('nan')
        else:
            roc_auc = roc_auc_score(y_channel, y_pred_proba)

        # Metrics
        metrics = {
            'accuracy': accuracy_score(y_channel, y_pred),
            'precision': precision_score(y_channel, y_pred),
            'recall': recall_score(y_channel, y_pred),
            'f1': f1_score(y_channel, y_pred),
            'roc_auc': roc_auc,
            'conf_matrix': confusion_matrix(y_channel, y_pred, labels=labels),
            'sample_size': len(y_channel)
        }

        channel_results[channel] = metrics

    return channel_results

def visualize_channel_results(channel_results):
    """
    Plot and print the per-channel metrics produced by `analyze_channels`.

    Draws a grouped bar chart of the metrics and one confusion matrix per
    channel, then prints the metrics table and an error breakdown.

    Parameters:
        channel_results: dict mapping channel name to a metrics dict with keys
            'sample_size', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'
            and 'conf_matrix' (rows = true labels, columns = predictions).

    Returns:
        DataFrame with one row per channel; empty if `channel_results` is empty.
    """
    # 1. Collect the scalar metrics into a table
    columns = ['Channel', 'Sample Size', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC']
    metrics_data = [
        {
            'Channel': channel,
            'Sample Size': metrics['sample_size'],
            'Accuracy': metrics['accuracy'],
            'Precision': metrics['precision'],
            'Recall': metrics['recall'],
            'F1-Score': metrics['f1'],
            'ROC AUC': metrics['roc_auc']
        }
        for channel, metrics in channel_results.items()
    ]
    metrics_df = pd.DataFrame(metrics_data, columns=columns)

    # Nothing to plot when no channel passed the upstream filters.
    if metrics_df.empty:
        print("\nNo channels to report.")
        return metrics_df

    # 2. Grouped bar chart comparing the metrics across channels
    plt.figure(figsize=(12, 6))
    metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC']
    x = np.arange(len(metrics_df['Channel']))
    width = 0.15

    for i, metric in enumerate(metrics_to_plot):
        # Offset so the five bars are centred on each tick.
        plt.bar(x + width*i - width*2,
               metrics_df[metric],
               width,
               label=metric)

    plt.xlabel('Channel')
    plt.ylabel('Score')
    plt.title('Performance Metrics by Channel')
    plt.xticks(x, metrics_df['Channel'], rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    # 3. One confusion matrix per channel
    for channel, metrics in channel_results.items():
        conf_matrix = metrics['conf_matrix']
        # Row-normalised percentages; a row without samples stays at 0 instead
        # of producing a division-by-zero warning and NaN.
        row_totals = conf_matrix.sum(axis=1, keepdims=True)
        conf_matrix_percentage = np.divide(
            conf_matrix,
            row_totals,
            out=np.zeros(conf_matrix.shape, dtype=float),
            where=row_totals != 0,
        ) * 100

        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)

        # Percentage annotation in the corner of each cell
        for i in range(conf_matrix.shape[0]):
            for j in range(conf_matrix.shape[1]):
                percentage_text = f"{conf_matrix_percentage[i, j]:.1f}%"
                plt.text(j + 0.2, i + 0.2, percentage_text,
                        ha='center', va='center', color='green')

        plt.title(f'Confusion Matrix: {channel}\n(n={metrics["sample_size"]})')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.show()

    # 4. Detailed table
    print("\nDetailed Performance by Channel:")
    print(metrics_df.round(4))

    # 5. Error analysis
    print("\nError Analysis by Channel:")
    for channel, metrics in channel_results.items():
        conf_matrix = metrics['conf_matrix']

        print(f"\n{channel} (n={metrics['sample_size']}):")
        if conf_matrix.shape != (2, 2):
            # A non-2x2 matrix cannot be split into TN/FP/FN/TP.
            print("Error rates unavailable: confusion matrix is not 2x2.")
            continue

        (true_negatives, false_positives), (false_negatives, true_positives) = conf_matrix
        total = conf_matrix.sum()
        negatives = true_negatives + false_positives
        positives = false_negatives + true_positives

        # Rates use the standard denominators: FPR = FP / actual negatives,
        # FNR = FN / actual positives. Previously both were divided by the
        # total sample count, which is not a rate in the usual sense.
        false_positive_rate = false_positives / negatives if negatives else float('nan')
        false_negative_rate = false_negatives / positives if positives else float('nan')

        print(f"False Positive Rate: {false_positive_rate*100:.2f}%")
        print(f"False Negative Rate: {false_negative_rate*100:.2f}%")
        print(f"Total Error Rate: {(false_positives + false_negatives)/total*100:.2f}%")

    return metrics_df

# Run the per-channel analysis on the H1 test data with the tuned random forest
channel_results = analyze_channels(X_test_scaled, y_test, H1_test, best_rf)

# Plot and print the results; the returned table holds one row per channel
metrics_df = visualize_channel_results(channel_results)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# 定义渠道映射字典
# Integer code -> market segment name, used to turn encoded values back into
# readable labels.
# NOTE(review): the codes look like an alphabetical label encoding of the
# segment names - confirm against the encoder used upstream.
_CHANNEL_CODE_PAIRS = (
    (6, 'Online TA'),
    (5, 'Offline TA/TO'),
    (4, 'Groups'),
    (3, 'Direct'),
    (2, 'Corporate'),
    (1, 'Complementary'),
    (0, 'Aviation'),
    (7, 'Undefined'),
)
channel_mapping = dict(_CHANNEL_CODE_PAIRS)

def analyze_channels(X_test_scaled, y_test, H1_test, best_rf):
    """
    Evaluate a fitted classifier on the five most frequent booking channels.

    Parameters:
        X_test_scaled: feature matrix, row-aligned with `H1_test`.
        y_test: true labels, row-aligned with `H1_test`.
        H1_test: DataFrame whose 'market_segment' column holds integer codes
            that are translated to names through `channel_mapping`.
        best_rf: fitted classifier exposing `predict` and `predict_proba`.

    Returns:
        dict mapping channel name to a dict with 'accuracy', 'precision',
        'recall', 'f1', 'roc_auc', 'conf_matrix', 'sample_size' and
        'classification_report'. 'roc_auc' is NaN for a channel whose true
        labels contain a single class, because AUC is undefined there.
    """
    # Map the numeric codes back to channel names on a copy, so the caller's
    # DataFrame is left untouched.
    H1_test = H1_test.copy()
    H1_test['market_segment'] = H1_test['market_segment'].map(channel_mapping)

    # The five most common booking channels
    top_channels = H1_test['market_segment'].value_counts().head(5).index
    channel_results = {}

    # Fix the label order so every confusion matrix has the same shape, even
    # when a channel contains only one class. Falls back to binary 0/1 labels
    # if the estimator does not expose `classes_`.
    labels = list(getattr(best_rf, 'classes_', (0, 1)))

    for channel in top_channels:
        # Rows belonging to this channel
        channel_mask = H1_test['market_segment'] == channel
        X_channel = X_test_scaled[channel_mask]
        y_channel = y_test[channel_mask]

        # Predictions
        y_pred = best_rf.predict(X_channel)
        y_pred_proba = best_rf.predict_proba(X_channel)[:, 1]

        # roc_auc_score raises ValueError when y_true has a single class;
        # report NaN instead of aborting the whole analysis.
        if np.unique(y_channel).size < 2:
            roc_auc = float('nan')
        else:
            roc_auc = roc_auc_score(y_channel, y_pred_proba)

        # Metrics
        metrics = {
            'accuracy': accuracy_score(y_channel, y_pred),
            'precision': precision_score(y_channel, y_pred),
            'recall': recall_score(y_channel, y_pred),
            'f1': f1_score(y_channel, y_pred),
            'roc_auc': roc_auc,
            'conf_matrix': confusion_matrix(y_channel, y_pred, labels=labels),
            'sample_size': len(y_channel),
            'classification_report': classification_report(y_channel, y_pred, output_dict=True)
        }

        channel_results[channel] = metrics

    return channel_results

def visualize_results(channel_results):
    """
    Plot and print a per-channel comparison of the model's performance.

    Draws a heatmap of the scalar metrics, a grouped bar chart of the same
    metrics and a 2x3 grid of confusion matrices (unused panels are removed),
    then prints the metric table and an error breakdown for every channel.

    Args:
        channel_results: Mapping of channel name to a metrics dict providing
            'sample_size', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'
            and 'conf_matrix' (a 2-D array indexed as [row, column]).

    Returns:
        pandas.DataFrame with one row per channel and the columns 'Channel',
        'Sample Size', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC'.
    """
    # Collect one table row per channel
    table_rows = [
        {
            'Channel': name,
            'Sample Size': stats['sample_size'],
            'Accuracy': stats['accuracy'],
            'Precision': stats['precision'],
            'Recall': stats['recall'],
            'F1-Score': stats['f1'],
            'ROC AUC': stats['roc_auc']
        }
        for name, stats in channel_results.items()
    ]
    metrics_df = pd.DataFrame(table_rows)

    # 1. Heatmap of the scalar metrics
    plt.figure(figsize=(12, 6))
    score_columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC']
    score_table = metrics_df[['Channel'] + score_columns].set_index('Channel')
    sns.heatmap(score_table, annot=True, fmt='.3f', cmap='YlOrRd')
    plt.title('Performance Metrics by Channel')
    plt.tight_layout()
    plt.show()

    # 2. Grouped bar chart: five bars per channel, centred on the tick
    plt.figure(figsize=(12, 6))
    tick_positions = np.arange(len(metrics_df['Channel']))
    bar_width = 0.15

    for slot, column in enumerate(score_columns):
        plt.bar(tick_positions + bar_width*slot - bar_width*2,
                metrics_df[column],
                bar_width,
                label=column)

    plt.xlabel('Channel')
    plt.ylabel('Score')
    plt.title('Performance Metrics by Channel')
    plt.xticks(tick_positions, metrics_df['Channel'], rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, axis='y', linestyle='--', alpha=0.7)
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()

    # 3. Confusion matrices, one panel per channel
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.ravel()

    for panel_no, (name, stats) in enumerate(channel_results.items()):
        panel = axes[panel_no]
        counts = stats['conf_matrix']
        # Each cell as a percentage of its row total
        row_share = counts / counts.sum(axis=1).reshape(-1, 1) * 100

        sns.heatmap(counts, annot=True, fmt='d', cmap='Blues',
                    cbar=False, ax=panel)
        panel.set_title(f'{name}\n(n={stats["sample_size"]})')
        panel.set_xlabel('Predicted')
        panel.set_ylabel('Actual')

        # Overlay the row percentage next to each count
        for (row, col), share in np.ndenumerate(row_share):
            panel.text(col + 0.2, row + 0.2,
                       f'{share:.1f}%',
                       ha='center', va='center', color='green')

    # Drop the panels that received no channel
    for spare_panel in axes[len(channel_results):]:
        fig.delaxes(spare_panel)

    plt.tight_layout()
    plt.show()

    # 4. Detailed metric table
    print("\nDetailed Performance by Channel:")
    print(metrics_df.round(4))

    # 5. Error breakdown
    # NOTE(review): the printed "rates" are the [0, 1] and [1, 0] cells divided
    # by ALL samples of the channel, not by the actual negatives/positives as in
    # the conventional FPR/FNR — confirm this definition is intended.
    print("\nError Analysis by Channel:")
    for name, stats in channel_results.items():
        counts = stats['conf_matrix']
        n_all = counts.sum()
        n_fp = counts[0, 1]
        n_fn = counts[1, 0]

        print(f"\n{name} (n={stats['sample_size']}):")
        print(f"False Positive Rate: {n_fp/n_all*100:.2f}%")
        print(f"False Negative Rate: {n_fn/n_all*100:.2f}%")
        print(f"Total Error Rate: {(n_fp + n_fn)/n_all*100:.2f}%")

    return metrics_df

# Run the per-channel analysis on the H1 test split with the tuned random forest
channel_results = analyze_channels(X_test_scaled, y_test, H1_test, best_rf)

# Visualize the results; the returned DataFrame holds one row of metrics per channel
metrics_df = visualize_results(channel_results)

### H2

Best model: random forest TOP10 features model with Optuna & Oversampling & SMOTE

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, roc_curve, classification_report,
                           accuracy_score, precision_score, recall_score,
                           f1_score, roc_auc_score)
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler

def analyze_channels_with_sampling(X_train_scaled_top, y_train, H2_train, cv=3):
    """
    Cross-validate a random forest per booking channel under each resampling method.

    For every sampler (SMOTE, random oversampling) and every channel with at
    least 50 rows, the sampler and a RandomForestClassifier are chained in an
    imbalanced-learn pipeline and scored with shuffled stratified k-fold
    cross-validation. The resampling therefore actually influences the model
    and is applied only while fitting on the training part of each fold.

    Args:
        X_train_scaled_top: Training feature matrix; indexed with a boolean mask
            derived from ``H2_train``, so it must be row-aligned with it.
        y_train: Training labels, row-aligned with ``X_train_scaled_top``.
        H2_train: DataFrame with a ``market_segment`` column identifying the channel.
        cv: Number of stratified folds.

    Returns:
        dict mapping method name ('SMOTE', 'Oversample') to a dict that maps
        channel to its metrics: 'accuracy', 'precision', 'recall', 'f1',
        'roc_auc', 'conf_matrix' and 'sample_size'. Channels with fewer than
        50 rows, or on which the sampler cannot be fitted, are omitted.
    """
    # Local import: the cell only imports the samplers from imblearn.
    from imblearn.pipeline import make_pipeline

    # Resampling strategies to compare
    samplers = {
        'SMOTE': SMOTE(random_state=42),
        'Oversample': RandomOverSampler(random_state=42)
    }

    # Results for every method
    all_results = {}
    cv_splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    for method, sampler in samplers.items():
        print(f"\n=== {method} ===")
        # Chain sampler and classifier so that each CV training fold is
        # resampled before the forest is fitted. Resampling the whole training
        # set up front and then cross-validating a bare forest would make both
        # methods score the exact same un-resampled model.
        rf_model = make_pipeline(
            sampler,
            RandomForestClassifier(random_state=42, n_jobs=-1)
        )

        # Evaluate channel by channel
        channel_results = {}
        for channel in H2_train['market_segment'].unique():
            # Rows belonging to this channel
            channel_mask = H2_train['market_segment'] == channel
            X_channel = X_train_scaled_top[channel_mask]
            y_channel = y_train[channel_mask]

            if len(y_channel) < 50:  # skip channels with too few samples
                continue

            # Out-of-fold predictions and probabilities of the second class
            try:
                y_pred = cross_val_predict(rf_model, X_channel, y_channel, cv=cv_splitter)
                y_pred_proba = cross_val_predict(rf_model, X_channel, y_channel,
                                                 cv=cv_splitter, method='predict_proba')[:, 1]
            except ValueError as err:
                # A sampler can refuse a fold, e.g. when the minority class has
                # too few samples for SMOTE's neighbour search; report and skip.
                print(f"Skipping channel {channel} for {method}: {err}")
                continue

            # Per-channel metrics
            metrics = {
                'accuracy': accuracy_score(y_channel, y_pred),
                'precision': precision_score(y_channel, y_pred),
                'recall': recall_score(y_channel, y_pred),
                'f1': f1_score(y_channel, y_pred),
                'roc_auc': roc_auc_score(y_channel, y_pred_proba),
                'conf_matrix': confusion_matrix(y_channel, y_pred),
                'sample_size': len(y_channel)
            }

            channel_results[channel] = metrics

        all_results[method] = channel_results

    return all_results

def visualize_sampling_results(all_results):
    """
    Plot and print the per-channel comparison of the resampling methods.

    Draws one confusion-matrix figure per (method, channel) pair, a grid of
    bar charts (one panel per metric) for the 'SMOTE' and 'Oversample' methods,
    then prints each method's metric table and error breakdown.

    Args:
        all_results: Mapping of method name to a mapping of channel to a metrics
            dict with 'sample_size', 'accuracy', 'precision', 'recall', 'f1',
            'roc_auc' and 'conf_matrix'. The keys 'SMOTE' and 'Oversample' are
            looked up explicitly when printing.

    Returns:
        pandas.DataFrame with one row per (method, channel) pair.
    """
    # 1. Flatten the nested results into one comparison table
    comparison_df = pd.DataFrame([
        {
            'Method': method,
            'Channel': channel,
            'Sample Size': stats['sample_size'],
            'Accuracy': stats['accuracy'],
            'Precision': stats['precision'],
            'Recall': stats['recall'],
            'F1': stats['f1'],
            'ROC AUC': stats['roc_auc']
        }
        for method, per_channel in all_results.items()
        for channel, stats in per_channel.items()
    ])

    # 2. One confusion-matrix figure per method and channel
    for method, per_channel in all_results.items():
        for channel, stats in per_channel.items():
            counts = stats['conf_matrix']
            # Each cell as a percentage of its row total
            row_share = counts / counts.sum(axis=1).reshape(-1, 1) * 100

            plt.figure(figsize=(8, 6))
            sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', cbar=False)
            for (row, col), share in np.ndenumerate(row_share):
                plt.text(col + 0.2, row + 0.2, f"{share:.1f}%",
                         ha='center', va='center', color='green')
            plt.title(f'Confusion Matrix\n{method} - {channel} (n={stats["sample_size"]})')
            plt.xlabel('Predicted Label')
            plt.ylabel('True Label')
            plt.show()

    # 3. Bar charts comparing both methods, one panel per metric
    score_columns = ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC']
    # SMOTE bars are shifted right by one bar width; Oversample bars sit on the tick
    bar_shift = (('SMOTE', 0.35), ('Oversample', 0))

    plt.figure(figsize=(15, 8))
    for panel_no, column in enumerate(score_columns, 1):
        plt.subplot(2, 3, panel_no)
        for method, shift in bar_shift:
            subset = comparison_df[comparison_df['Method'] == method]
            plt.bar(np.arange(len(subset)) + shift,
                    subset[column],
                    width=0.35,
                    label=method)
        plt.title(f'{column} by Channel')
        # Tick labels come from the last method drawn ('Oversample')
        plt.xticks(np.arange(len(subset)), subset['Channel'], rotation=45)
        plt.legend()
    plt.tight_layout()
    plt.show()

    # 4. Detailed tables and error breakdown per method
    # NOTE(review): the printed "rates" are the [0, 1] and [1, 0] cells divided
    # by ALL samples of the channel, not by the actual negatives/positives as in
    # the conventional FPR/FNR — confirm this definition is intended.
    print("\nDetailed Results by Channel:")
    for method in ['SMOTE', 'Oversample']:
        print(f"\n=== {method} ===")
        print(comparison_df[comparison_df['Method'] == method].round(4))

        print(f"\nError Analysis for {method}:")
        for channel, stats in all_results[method].items():
            counts = stats['conf_matrix']
            n_all = counts.sum()
            n_fp = counts[0, 1]
            n_fn = counts[1, 0]

            print(f"\n{channel} (n={stats['sample_size']}):")
            print(f"False Positive Rate: {n_fp/n_all*100:.2f}%")
            print(f"False Negative Rate: {n_fn/n_all*100:.2f}%")
            print(f"Total Error Rate: {(n_fp + n_fn)/n_all*100:.2f}%")

    return comparison_df

# Run the per-channel cross-validated analysis on the H2 training split
all_results = analyze_channels_with_sampling(X_train_scaled_top, y_train, H2_train)

# Visualize the results; the returned DataFrame has one row per (method, channel)
comparison_df = visualize_sampling_results(all_results)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE, RandomOverSampler

# 定义渠道映射
# Integer market_segment codes -> readable channel names; the codes follow the
# alphabetical order of the names (0 = Aviation ... 7 = Undefined).
# NOTE(review): presumably mirrors the label encoding applied earlier — confirm.
channel_mapping = {
    6: 'Online TA',
    5: 'Offline TA/TO',
    4: 'Groups',
    3: 'Direct',
    2: 'Corporate',
    1: 'Complementary',
    0: 'Aviation',
    7: 'Undefined'
}

def analyze_channels_cv(X_train_scaled_top, y_train, H2_train, cv=3):
    """
    Cross-validate a random forest on the five largest channels per resampling method.

    For each sampler (SMOTE, random oversampling) the sampler and a
    RandomForestClassifier are chained in an imbalanced-learn pipeline and
    scored per channel with shuffled stratified k-fold cross-validation, so the
    resampling is applied while fitting on the training part of each fold.

    Args:
        X_train_scaled_top: Training feature matrix; indexed with a boolean mask
            derived from ``H2_train``, so it must be row-aligned with it.
        y_train: Training labels, row-aligned with ``X_train_scaled_top``.
        H2_train: DataFrame with a ``market_segment`` column holding the integer
            codes listed in ``channel_mapping``. A copy is mapped; the caller's
            frame is not modified.
        cv: Number of stratified folds.

    Returns:
        dict mapping method name ('SMOTE', 'Oversample') to a dict that maps
        channel name to 'accuracy', 'roc_auc', 'conf_matrix', 'sample_size' and
        'classification_report' (nested dict).

    Raises:
        ValueError: propagated from the sampler or the metrics when a channel
            cannot be resampled or scored (for example too few minority
            samples for SMOTE in a training fold).
    """
    # Local import: the cell only imports the samplers from imblearn.
    from imblearn.pipeline import make_pipeline

    # Resampling strategies to compare
    samplers = {
        'SMOTE': SMOTE(random_state=42),
        'Oversample': RandomOverSampler(random_state=42)
    }

    # Results for every method
    all_results = {}
    cv_splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # Map the numeric codes back to channel names (on a copy)
    H2_train = H2_train.copy()
    H2_train['market_segment'] = H2_train['market_segment'].map(channel_mapping)

    # Five most common channels; identical for every method, so computed once
    top_channels = H2_train['market_segment'].value_counts().head(5).index

    for method_name, sampler in samplers.items():
        print(f"\nProcessing {method_name}...")
        # Chain sampler and classifier so that each CV training fold is
        # resampled before the forest is fitted. Resampling the whole training
        # set up front and then cross-validating a bare forest would make both
        # methods score the exact same un-resampled model.
        model = make_pipeline(
            sampler,
            RandomForestClassifier(random_state=42, n_jobs=-1)
        )

        channel_results = {}

        for channel in top_channels:
            # Rows belonging to this channel
            channel_mask = H2_train['market_segment'] == channel
            X_channel = X_train_scaled_top[channel_mask]
            y_channel = y_train[channel_mask]

            # Out-of-fold predictions and probabilities of the second class
            y_pred = cross_val_predict(model, X_channel, y_channel, cv=cv_splitter)
            y_pred_proba = cross_val_predict(model, X_channel, y_channel,
                                             cv=cv_splitter, method='predict_proba')[:, 1]

            # Per-channel metrics
            channel_metrics = {
                'accuracy': accuracy_score(y_channel, y_pred),
                'roc_auc': roc_auc_score(y_channel, y_pred_proba),
                'conf_matrix': confusion_matrix(y_channel, y_pred),
                'sample_size': len(y_channel),
                'classification_report': classification_report(y_channel, y_pred, output_dict=True)
            }

            channel_results[channel] = channel_metrics

        all_results[method_name] = channel_results

    return all_results

def visualize_results(all_results):
    """
    Plot and print a comparison of the resampling methods per channel.

    Draws side-by-side metric heatmaps for 'SMOTE' and 'Oversample', one
    grouped bar chart per metric, then prints a pivoted comparison table and an
    error breakdown for every method and channel.

    Args:
        all_results: Mapping of method name to a mapping of channel name to a
            metrics dict with 'accuracy', 'roc_auc', 'conf_matrix',
            'sample_size' and 'classification_report' (whose 'weighted avg'
            entry supplies precision, recall and f1-score).

    Returns:
        pandas.DataFrame with one row per (method, channel) pair.
    """
    # Flatten the nested results into one comparison table
    flat_rows = []
    for method, per_channel in all_results.items():
        for channel, stats in per_channel.items():
            weighted = stats['classification_report']['weighted avg']
            flat_rows.append({
                'Method': method,
                'Channel': channel,
                'Accuracy': stats['accuracy'],
                'ROC AUC': stats['roc_auc'],
                'Precision': weighted['precision'],
                'Recall': weighted['recall'],
                'F1-Score': weighted['f1-score'],
                'Sample Size': stats['sample_size']
            })

    comparison_df = pd.DataFrame(flat_rows)

    # 1. Metric heatmaps, one panel per method
    plt.figure(figsize=(15, 8))
    metrics_to_plot = ['Accuracy', 'ROC AUC', 'Precision', 'Recall', 'F1-Score']

    for panel_no, method in enumerate(['SMOTE', 'Oversample'], start=1):
        plt.subplot(1, 2, panel_no)
        of_method = comparison_df[comparison_df['Method'] == method]
        heat_table = of_method.set_index('Channel')[metrics_to_plot]
        sns.heatmap(heat_table, annot=True, fmt='.3f', cmap='YlOrRd')
        plt.title(f'{method} Performance Metrics by Channel')

    plt.tight_layout()
    plt.show()

    # 2. One grouped bar chart per metric (SMOTE left, Oversample right of each tick)
    channels = comparison_df['Channel'].unique()
    is_smote = comparison_df['Method'] == 'SMOTE'
    is_oversample = comparison_df['Method'] == 'Oversample'
    for metric in metrics_to_plot:
        plt.figure(figsize=(10, 6))
        ticks = np.arange(len(channels))
        bar_width = 0.35

        # Values are taken in table order, which must match the order of `channels`
        smote_scores = comparison_df[is_smote][metric].values
        oversample_scores = comparison_df[is_oversample][metric].values

        plt.bar(ticks - bar_width/2, smote_scores, bar_width, label='SMOTE')
        plt.bar(ticks + bar_width/2, oversample_scores, bar_width, label='Oversample')

        plt.xlabel('Channel')
        plt.ylabel(metric)
        plt.title(f'{metric} Comparison by Channel and Method')
        plt.xticks(ticks, channels, rotation=45)
        plt.legend()
        plt.grid(True, axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()

    # 3. Pivoted comparison table: channels as rows, (metric, method) as columns
    comparison_pivot = comparison_df.pivot(
        index='Channel',
        columns='Method',
        values=['Accuracy', 'ROC AUC', 'Precision', 'Recall', 'F1-Score']
    )
    print("\nDetailed Performance Comparison:")
    print(comparison_pivot.round(4))

    # 4. Error breakdown
    # NOTE(review): the printed "rates" are the [0, 1] and [1, 0] cells divided
    # by ALL samples of the channel, not by the actual negatives/positives as in
    # the conventional FPR/FNR — confirm this definition is intended.
    print("\nError Analysis by Channel and Method:")
    for method, per_channel in all_results.items():
        print(f"\n=== {method} ===")
        for channel, stats in per_channel.items():
            counts = stats['conf_matrix']
            n_all = counts.sum()
            n_fp = counts[0, 1]
            n_fn = counts[1, 0]

            print(f"\n{channel}:")
            print(f"Sample Size: {stats['sample_size']}")
            print(f"False Positive Rate: {n_fp/n_all*100:.2f}%")
            print(f"False Negative Rate: {n_fn/n_all*100:.2f}%")
            print(f"Total Error Rate: {(n_fp + n_fn)/n_all*100:.2f}%")

    return comparison_df

# Run the per-channel cross-validated analysis on the H2 training split
all_results = analyze_channels_cv(X_train_scaled_top, y_train, H2_train)

# Visualize the results; the returned DataFrame has one row per (method, channel)
comparison_df = visualize_results(all_results)

### H1+H2

Best model: random forest TOP10 features model with Optuna

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

def analyze_top_channels(X_test_scaled_top, y_test, test_data, best_model, n_top_channels=5):
    """
    Score a fitted model separately on the most frequent booking channels.

    Args:
        X_test_scaled_top: Test feature matrix; indexed with a boolean mask
            derived from ``test_data``, so it must be row-aligned with it.
        y_test: Test labels, row-aligned with ``X_test_scaled_top``.
        test_data: DataFrame with a ``market_segment`` column identifying the channel.
        best_model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        n_top_channels: How many of the most frequent channels to evaluate.

    Returns:
        dict mapping channel to a dict with 'accuracy', 'roc_auc',
        'conf_matrix', 'sample_size' and 'classification_report' (nested dict).
    """
    segment = test_data['market_segment']
    channel_results = {}

    # Channels ordered by descending row count, truncated to the requested number
    for channel in segment.value_counts().head(n_top_channels).index:
        # Restrict features and labels to this channel's rows
        belongs = segment == channel
        features = X_test_scaled_top[belongs]
        labels = y_test[belongs]

        # Hard predictions and probability of the second class (column 1)
        predicted = best_model.predict(features)
        second_class_proba = best_model.predict_proba(features)[:, 1]

        channel_results[channel] = {
            'accuracy': accuracy_score(labels, predicted),
            'roc_auc': roc_auc_score(labels, second_class_proba),
            'conf_matrix': confusion_matrix(labels, predicted),
            'sample_size': len(labels),
            'classification_report': classification_report(labels, predicted, output_dict=True)
        }

    return channel_results

def visualize_channel_results(channel_results):
    """
    Visualize how the model performs on each booking channel.

    Draws a heatmap of the scalar metrics and a grid of confusion matrices
    (three panels per row, as many rows as needed; panels without a channel
    are removed).

    Args:
        channel_results: Mapping of channel to a metrics dict with 'accuracy',
            'roc_auc', 'conf_matrix', 'sample_size' and 'classification_report'
            (whose 'weighted avg' entry supplies precision, recall, f1-score).

    Returns:
        pandas.DataFrame with one row per channel and the columns 'Channel',
        'Accuracy', 'ROC AUC', 'Sample Size', 'Precision', 'Recall', 'F1-Score'.
    """
    # Assemble the metric table
    channels = list(channel_results.keys())
    metrics_df = pd.DataFrame({
        'Channel': channels,
        'Accuracy': [channel_results[c]['accuracy'] for c in channels],
        'ROC AUC': [channel_results[c]['roc_auc'] for c in channels],
        'Sample Size': [channel_results[c]['sample_size'] for c in channels],
        'Precision': [channel_results[c]['classification_report']['weighted avg']['precision'] for c in channels],
        'Recall': [channel_results[c]['classification_report']['weighted avg']['recall'] for c in channels],
        'F1-Score': [channel_results[c]['classification_report']['weighted avg']['f1-score'] for c in channels]
    })

    # Heatmap of the scalar metrics (sample size is not a score, so it is dropped)
    plt.figure(figsize=(12, 6))
    metrics_heatmap = metrics_df.set_index('Channel').drop('Sample Size', axis=1)
    sns.heatmap(metrics_heatmap, annot=True, fmt='.3f', cmap='YlOrRd')
    plt.title('Performance Metrics by Channel')
    plt.tight_layout()
    plt.show()

    # Confusion-matrix grid sized from the number of channels: a fixed 2x3 grid
    # would fail for more than six channels. Five channels still give 2x3 at
    # figsize (15, 10).
    n_cols = 3
    n_rows = max(1, (len(channels) + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, channel in zip(axes, channels):
        conf_matrix = channel_results[channel]['conf_matrix']
        # Each cell as a percentage of its row total
        conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                    cbar=False, ax=ax)
        ax.set_title(f'{channel}\n(n={channel_results[channel]["sample_size"]})')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')

        # Overlay the row percentage next to each count
        for (i, j), percentage in np.ndenumerate(conf_matrix_percentage):
            ax.text(j + 0.2, i + 0.2,
                    f'{percentage:.1f}%',
                    ha='center', va='center', color='green')

    # Remove panels that received no channel so no empty axes are shown
    for unused_ax in axes[len(channels):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

    return metrics_df

# Run the per-channel analysis on the combined (H1+H2) test split and plot it
channel_results = analyze_top_channels(X_test_scaled_top, y_test, test_data, best_model)
metrics_df = visualize_channel_results(channel_results)

# Print detailed results
print("\nDetailed Channel Performance Summary:")
print("\nSample Sizes:")
for channel in channel_results:
    print(f"{channel}: {channel_results[channel]['sample_size']} samples")

print("\nPerformance Metrics by Channel:")
print(metrics_df.round(4))

# Compute and display the error breakdown per channel.
# NOTE(review): the printed "rates" are the [0, 1] and [1, 0] confusion-matrix
# cells divided by ALL samples of the channel, not by the actual
# negatives/positives as in the conventional FPR/FNR — confirm this is intended.
print("\nError Analysis by Channel:")
for channel in channel_results:
    conf_matrix = channel_results[channel]['conf_matrix']
    total = conf_matrix.sum()
    false_positives = conf_matrix[0, 1]
    false_negatives = conf_matrix[1, 0]

    print(f"\n{channel}:")
    print(f"False Positive Rate: {false_positives/total*100:.2f}%")
    print(f"False Negative Rate: {false_negatives/total*100:.2f}%")
    print(f"Total Error Rate: {(false_positives + false_negatives)/total*100:.2f}%")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# 创建渠道映射字典
# Integer market_segment codes -> readable channel names; the codes follow the
# alphabetical order of the names (0 = Aviation ... 7 = Undefined).
# NOTE(review): presumably mirrors the label encoding applied earlier — confirm.
channel_mapping = {
    6: 'Online TA',
    5: 'Offline TA/TO',
    4: 'Groups',
    3: 'Direct',
    2: 'Corporate',
    1: 'Complementary',
    0: 'Aviation',
    7: 'Undefined'
}

def analyze_top_channels(X_test_scaled_top, y_test, test_data, best_model, n_top_channels=5):
    """
    Score a fitted model on the most frequent booking channels, keyed by channel name.

    The integer ``market_segment`` codes are translated through
    ``channel_mapping`` on a copy of ``test_data`` before the channels are ranked.

    Args:
        X_test_scaled_top: Test feature matrix; indexed with a boolean mask
            derived from ``test_data``, so it must be row-aligned with it.
        y_test: Test labels, row-aligned with ``X_test_scaled_top``.
        test_data: DataFrame with a ``market_segment`` column of integer codes.
            The caller's frame is not modified.
        best_model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        n_top_channels: How many of the most frequent channels to evaluate.

    Returns:
        dict mapping channel name to a dict with 'accuracy', 'roc_auc',
        'conf_matrix', 'sample_size' and 'classification_report' (nested dict).
    """
    # Translate the codes to names on a copy of the frame
    named = test_data.copy()
    named['market_segment'] = named['market_segment'].map(channel_mapping)
    segment_names = named['market_segment']

    # Most frequent channels first
    ranked_channels = segment_names.value_counts().head(n_top_channels).index

    channel_results = {}
    for channel in ranked_channels:
        selector = segment_names == channel
        X_part = X_test_scaled_top[selector]
        y_part = y_test[selector]

        # Hard predictions and probability of the second class (column 1)
        y_hat = best_model.predict(X_part)
        y_score = best_model.predict_proba(X_part)[:, 1]

        channel_results[channel] = {
            'accuracy': accuracy_score(y_part, y_hat),
            'roc_auc': roc_auc_score(y_part, y_score),
            'conf_matrix': confusion_matrix(y_part, y_hat),
            'sample_size': len(y_part),
            'classification_report': classification_report(y_part, y_hat, output_dict=True)
        }

    return channel_results

def visualize_channel_results(channel_results):
    """
    Visualize how the model performs on each booking channel.

    Draws a heatmap of the scalar metrics, a grid of confusion matrices (three
    panels per row, as many rows as needed; panels without a channel are
    removed) and a grouped bar chart comparing the metrics across channels.

    Args:
        channel_results: Mapping of channel to a metrics dict with 'accuracy',
            'roc_auc', 'conf_matrix', 'sample_size' and 'classification_report'
            (whose 'weighted avg' entry supplies precision, recall, f1-score).

    Returns:
        pandas.DataFrame with one row per channel and the columns 'Channel',
        'Accuracy', 'ROC AUC', 'Sample Size', 'Precision', 'Recall', 'F1-Score'.
    """
    # Assemble the metric table
    channels = list(channel_results.keys())
    metrics_df = pd.DataFrame({
        'Channel': channels,
        'Accuracy': [channel_results[c]['accuracy'] for c in channels],
        'ROC AUC': [channel_results[c]['roc_auc'] for c in channels],
        'Sample Size': [channel_results[c]['sample_size'] for c in channels],
        'Precision': [channel_results[c]['classification_report']['weighted avg']['precision'] for c in channels],
        'Recall': [channel_results[c]['classification_report']['weighted avg']['recall'] for c in channels],
        'F1-Score': [channel_results[c]['classification_report']['weighted avg']['f1-score'] for c in channels]
    })

    # 1. Heatmap of the scalar metrics (sample size is not a score, so it is dropped)
    plt.figure(figsize=(12, 6))
    metrics_heatmap = metrics_df.set_index('Channel').drop('Sample Size', axis=1)
    sns.heatmap(metrics_heatmap, annot=True, fmt='.3f', cmap='YlOrRd')
    plt.title('Performance Metrics by Channel')
    plt.tight_layout()
    plt.show()

    # 2. Confusion-matrix grid sized from the number of channels: a fixed 2x3
    # grid would fail for more than six channels. Five channels still give 2x3
    # at figsize (15, 10).
    n_cols = 3
    n_rows = max(1, (len(channels) + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, channel in zip(axes, channels):
        conf_matrix = channel_results[channel]['conf_matrix']
        # Each cell as a percentage of its row total
        conf_matrix_percentage = conf_matrix / conf_matrix.sum(axis=1).reshape(-1, 1) * 100

        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                    cbar=False, ax=ax)
        ax.set_title(f'{channel}\n(n={channel_results[channel]["sample_size"]})')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')

        # Overlay the row percentage next to each count
        for (i, j), percentage in np.ndenumerate(conf_matrix_percentage):
            ax.text(j + 0.2, i + 0.2,
                    f'{percentage:.1f}%',
                    ha='center', va='center', color='green')

    # Remove panels that received no channel so no empty axes are shown
    for unused_ax in axes[len(channels):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

    # 3. Grouped bar chart: one bar per metric, side by side for every channel
    plt.figure(figsize=(12, 6))
    metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC']

    x = np.arange(len(channels))
    width = 0.15

    for multiplier, metric in enumerate(metrics_to_plot):
        plt.bar(x + width * multiplier, metrics_df[metric], width, label=metric)

    plt.xlabel('Channel')
    plt.ylabel('Score')
    plt.title('Performance Metrics Comparison by Channel')
    # Ticks sit under the middle (third) of the five bars
    plt.xticks(x + width * 2, channels, rotation=45)
    plt.legend(loc='upper right')
    plt.grid(True, axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    return metrics_df

# Run the per-channel analysis on the combined (H1+H2) test split and plot it
channel_results = analyze_top_channels(X_test_scaled_top, y_test, test_data, best_model)
metrics_df = visualize_channel_results(channel_results)

# Print detailed results
print("\nDetailed Channel Performance Summary:")
print("\nSample Sizes:")
for channel in channel_results:
    print(f"{channel}: {channel_results[channel]['sample_size']} samples")

print("\nPerformance Metrics by Channel:")
print(metrics_df.round(4))

# Compute and display the error breakdown per channel.
# NOTE(review): the printed "rates" are the [0, 1] and [1, 0] confusion-matrix
# cells divided by ALL samples of the channel, not by the actual
# negatives/positives as in the conventional FPR/FNR — confirm this is intended.
print("\nError Analysis by Channel:")
for channel in channel_results:
    conf_matrix = channel_results[channel]['conf_matrix']
    total = conf_matrix.sum()
    false_positives = conf_matrix[0, 1]
    false_negatives = conf_matrix[1, 0]

    print(f"\n{channel}:")
    print(f"False Positive Rate: {false_positives/total*100:.2f}%")
    print(f"False Negative Rate: {false_negatives/total*100:.2f}%")
    print(f"Total Error Rate: {(false_positives + false_negatives)/total*100:.2f}%")