In [None]:
import functools
import os
import typing
import random
import joblib
import networkx as nx
import numpy as np
import pandas as pd
import dcor
from scipy.stats import pearsonr, ttest_rel
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import balanced_accuracy_score, silhouette_score
from sklearn.model_selection import GridSearchCV, GroupKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, LinearRegression, LassoCV
from sklearn.cluster import DBSCAN
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.utils import check_random_state
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from scipy.special import softmax
from scipy.stats import chi2_contingency, ks_2samp
from scipy.spatial.distance import pdist, squareform
from tqdm.auto import tqdm
from scipy import stats
from scipy.stats import entropy
import statsmodels.api as sm
import pingouin as pg
from pgmpy.estimators import CITests
from scipy.spatial.distance import pdist, squareform
from econml.dml import CausalForestDML, DML
import lightgbm as lgb
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor

In [None]:
# Import the `crunch` package, then rebind the name `crunch` to the object
# returned by `load_notebook()`. Later cells use that object (for example
# `crunch.load_data()`), not the module itself.
import crunch
crunch = crunch.load_notebook()

In [None]:
def graph_nodes_representation(graph, nodelist):
    """
    Return a hashable key describing ``graph`` under a fixed node order.

    Two structurally equivalent nx.Graph/DiGraph objects cannot be used
    interchangeably as dictionary keys, because they do not produce the same
    key (for instance when their nodes were inserted in a different order).

    To get a key that depends only on the structure, the adjacency matrix is
    extracted with the nodes arranged as in ``nodelist`` and then flattened
    into a tuple. Equivalent graphs therefore yield identical tuples.

    Parameters
    ----------
    graph : networkx graph
        The graph to encode.
    nodelist : list
        Node order used for the rows and columns of the adjacency matrix.

    Returns
    -------
    tuple
        The flattened adjacency matrix.
    """
    # Dense adjacency matrix in the requested node order (arrays are not hashable).
    dense_adjacency = nx.adjacency_matrix(graph, nodelist=nodelist).todense()

    # A tuple of the flattened entries is hashable and order-stable.
    return tuple(dense_adjacency.flatten())

def create_graph_label():
    """
    Build the lookup tables from three-node graphs to class names.

    Returns
    -------
    (dict, dict)
        ``graph_label`` maps each reference ``nx.DiGraph`` over the nodes
        "X", "Y" and "v" to its class name. ``adjacency_label`` holds the same
        class names keyed by the hashable adjacency representation of each
        graph (node order "v", "X", "Y"), so that equivalent graphs share a key.
    """
    x_to_y = ("X", "Y")

    # (constructor argument for nx.DiGraph, class name); every graph has X -> Y.
    reference_structures = [
        ([x_to_y, ("v", "X"), ("v", "Y")], "Confounder"),
        ([x_to_y, ("X", "v"), ("Y", "v")], "Collider"),
        ([x_to_y, ("X", "v"), ("v", "Y")], "Mediator"),
        ([x_to_y, ("v", "X")], "Cause of X"),
        ([x_to_y, ("v", "Y")], "Cause of Y"),
        ([x_to_y, ("X", "v")], "Consequence of X"),
        ([x_to_y, ("Y", "v")], "Consequence of Y"),
        # Adjacency-dict form so that the isolated node "v" is present.
        ({"X": ["Y"], "v": []}, "Independent"),
    ]
    graph_label = {
        nx.DiGraph(structure): class_name
        for structure, class_name in reference_structures
    }

    node_order = ["v", "X", "Y"]

    # Same information as graph_label, keyed by a structure-only hashable form.
    adjacency_label = {}
    for reference_graph, class_name in graph_label.items():
        key = graph_nodes_representation(reference_graph, node_order)
        adjacency_label[key] = class_name

    return graph_label, adjacency_label

def get_labels(adjacency_matrix, adjacency_label):
    """
    Map every variable of an adjacency matrix (pd.DataFrame) to its class name.

    For each column other than "X" and "Y", the 3x3 sub-matrix over
    (variable, "X", "Y") is flattened into a tuple and looked up in
    ``adjacency_label``. A sub-matrix that is not a key of ``adjacency_label``
    raises ``KeyError``.

    Returns
    -------
    dict
        ``{variable: label}``.
    """
    labels = {}
    for node in adjacency_matrix.columns.drop(["X", "Y"]):
        triple = [node, "X", "Y"]
        # Flatten the (not hashable) sub-matrix into a tuple compatible with
        # the keys of adjacency_label.
        flat_entries = adjacency_matrix.loc[triple, triple].values.flatten()
        labels[node] = adjacency_label[tuple(flat_entries)]

    return labels

# Build both lookup tables once when this cell runs: `graph_label` is keyed by
# nx.DiGraph objects, `adjacency_label` by flattened adjacency tuples.
graph_label, adjacency_label = create_graph_label()

In [None]:
def apply_mapping(df, mapping):
    """
    Return a relabelled copy of ``df``; the input frame is left untouched.

    Column labels are renamed according to ``mapping``. When ``df`` is square
    (for example a label/adjacency matrix) the row labels are renamed the same
    way, so rows and columns stay aligned.

    ``DataFrame.rename`` looks every label up in ``mapping`` independently, so
    a swap such as ``{"X": "a", "a": "X"}`` is applied in one step. No
    temporary placeholder names are needed, and columns that already happen to
    be called ``_temp_<something>`` are never touched by accident.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to relabel.
    mapping : dict
        ``{old_label: new_label}``; labels absent from the dict are kept.

    Returns
    -------
    pd.DataFrame
        A new frame with the renamed labels.
    """
    renamed = df.rename(columns=mapping)
    if renamed.shape[0] == renamed.shape[1]:  # square, e.g. a label matrix
        renamed = renamed.rename(index=mapping)
    return renamed

def check_duplicate_columns(df):
    """Return whether any column label of ``df`` occurs more than once."""
    repeated_mask = df.columns.duplicated()
    return repeated_mask.any()

def augment_data(X_train, y_train):
    """
    Add one extra sample per training sample by relabelling an existing edge.

    For every sample, a directed edge ``u -> v`` is drawn at random from the
    label matrix and the labels are swapped (``X <-> u``, ``Y <-> v``) in both
    the feature frame and the label matrix. The result is stored under the
    key ``'0' + sample_id``.

    * With at least 4 feature columns only edges that touch neither "X" nor
      "Y" are eligible, and up to 3 draws are made.
    * Otherwise every edge is eligible and a single draw is made.

    If there is no eligible edge, or every draw yields duplicated column
    names, an unchanged copy of the sample is stored instead.

    Parameters
    ----------
    X_train, y_train : dict
        ``{sample_id: DataFrame}`` of features and label matrices.

    Returns
    -------
    (dict, dict)
        Shallow copies of the inputs extended with the new samples.
    """
    augmented_X = X_train.copy()
    augmented_y = y_train.copy()

    for sample_id in X_train.keys():
        features = X_train[sample_id]
        adjacency = y_train[sample_id]
        n_columns = len(list(features.columns))

        # Directed edges (u, v), scanned row by row so the candidate order is stable.
        all_edges = [
            (u, v)
            for u in adjacency.index
            for v in adjacency.columns
            if adjacency.loc[u, v] == 1
        ]

        if n_columns >= 4:
            # Only edges that involve neither X nor Y; allow a few retries.
            candidates = [
                (u, v) for (u, v) in all_edges
                if u not in ['X', 'Y'] and v not in ['X', 'Y']
            ]
            max_draws = 3
        else:
            # Low-dimensional sample: edges touching X or Y are allowed too.
            candidates = all_edges
            max_draws = 1

        chosen = None
        if candidates:
            for _ in range(max_draws):
                u, v = random.choice(candidates)
                relabel = {'X': u, 'Y': v, u: 'X', v: 'Y'}
                # Apply the same relabelling to features and label matrix.
                features_new = apply_mapping(features, relabel)
                adjacency_new = apply_mapping(adjacency, relabel)
                if not check_duplicate_columns(features_new):
                    chosen = (features_new, adjacency_new)
                    break

        if chosen is None:
            # No usable relabelling was found: fall back to plain copies.
            chosen = (features.copy(), adjacency.copy())

        new_sample_id = '0' + sample_id
        augmented_X[new_sample_id], augmented_y[new_sample_id] = chosen

    return augmented_X, augmented_y

In [None]:
# Load the three objects returned by `crunch.load_data()` and print their
# sizes as a quick sanity check.
X_train, y_train, X_test = crunch.load_data()
print(len(X_train), len(y_train), len(X_test))
# Optional augmentation step, currently disabled:
# X_train, y_train = augment_data(X_train, y_train)

In [None]:
def pearson_correlation(dataset):
    """
    Build Pearson-correlation features for every variable other than X and Y.

    For each such variable ``v`` the row holds ``corr(v,X)``, ``corr(v,Y)`` and
    summary statistics (max, min, mean, std, 25% and 75% quantiles) of the
    absolute correlation between ``v`` and every other column. ``corr(X,Y)``
    is repeated on every row.

    Parameters
    ----------
    dataset : pd.DataFrame
        Must contain the columns "X" and "Y" and carry a ``name`` attribute,
        which is copied into the "dataset" column.

    Returns
    -------
    pd.DataFrame
        One row per variable with "dataset" as the first column. NaN values
        (e.g. the correlation with a constant column) are replaced by 0.
    """
    variables = dataset.columns.drop(["X", "Y"])

    # The full correlation matrix is the same for every variable, so it is
    # computed once here instead of once per loop iteration.
    corr_matrix = dataset.corr()

    rows = []
    for variable in variables:
        # |corr| between `variable` and every other column (itself excluded).
        others = corr_matrix.drop([variable], axis="columns").loc[variable].abs()

        rows.append({
            "variable": variable,
            "corr(v,X)": dataset[[variable, "X"]].corr().loc[variable, "X"],
            "corr(v,Y)": dataset[[variable, "Y"]].corr().loc[variable, "Y"],
            "max(corr(v, others))": others.max(),
            "min(corr(v, others))": others.min(),
            "mean(corr(v, others))": others.mean(),
            "std(corr(v, others))": others.std(),
            "25%(corr(v, others))": others.quantile(0.25),
            "75%(corr(v, others))": others.quantile(0.75),
        })

    df = pd.DataFrame(rows)
    df["dataset"] = dataset.name

    df["corr(X,Y)"] = dataset[["X", "Y"]].corr().loc["X", "Y"]

    # The correlation is NaN when a column has zero variance; use 0 instead.
    df = df.fillna(0)

    # Put the "dataset" column first.
    df = df[["dataset"] + [colname for colname in df.columns if colname != "dataset"]]

    return df

def mutual_information(dataset):
    """
    Build mutual-information (MI) features for every variable other than X and Y.

    For each such variable ``v`` the row holds ``MI(v,X)``, ``MI(v,Y)`` and
    summary statistics (max, min, mean, std) of the MI between ``v`` and every
    other column. ``MI(X,Y)`` is repeated on every row.

    ``dataset`` must contain the columns "X" and "Y" and carry a ``name``
    attribute, which is copied into the "dataset" column (placed first).
    """
    def pair_mi(feature, target):
        # MI estimate between one feature column and one target column.
        return mutual_info_regression(dataset[[feature]], dataset[target], discrete_features=False)[0]

    records = []
    for variable in dataset.columns.drop(["X", "Y"]):
        # MI of every other column with `variable`, as a Series for the stats below.
        mi_with_rest = pd.Series(
            mutual_info_regression(dataset.drop(columns=[variable]), dataset[variable])
        )

        records.append({
            "variable": variable,
            "MI(v,X)": pair_mi(variable, "X"),
            "MI(v,Y)": pair_mi(variable, "Y"),
            "max(MI(v, others))": mi_with_rest.max(),
            "min(MI(v, others))": mi_with_rest.min(),
            "mean(MI(v, others))": mi_with_rest.mean(),
            "std(MI(v, others))": mi_with_rest.std(),
        })

    features = pd.DataFrame(records)
    features["dataset"] = dataset.name
    features["MI(X,Y)"] = pair_mi("X", "Y")

    # Put the "dataset" column first.
    ordered = ["dataset"] + [name for name in features.columns if name != "dataset"]
    return features[ordered]

def add_dimension_feature(dataset):
    """
    Build per-variable features describing the size of the dataset.

    Every variable other than "X" and "Y" gets a row with ``dimension`` (the
    number of such variables) and ``square_dimension`` (its square), plus the
    dataset name taken from ``dataset.name``.
    """
    candidates = dataset.columns.drop(["X", "Y"])
    n_candidates = len(candidates)

    features = pd.DataFrame({
        "variable": candidates,
        "dimension": n_candidates,
        "square_dimension": n_candidates * n_candidates,
    })
    return features.assign(dataset=dataset.name)

def spearman_correlation(dataset):
    """
    Build Spearman rank-correlation features for every variable other than X and Y.

    For each such variable ``v`` the row holds ``spearman_corr(v,X)``,
    ``spearman_corr(v,Y)`` and summary statistics (max, min, mean, std) of the
    absolute rank correlation between ``v`` and every other column.
    ``spearman_corr(X,Y)`` is repeated on every row.

    Parameters
    ----------
    dataset : pd.DataFrame
        Must contain the columns "X" and "Y" and carry a ``name`` attribute,
        which is copied into the "dataset" column.

    Returns
    -------
    pd.DataFrame
        One row per variable with "dataset" as the first column; NaN values
        are replaced by 0.
    """
    variables = dataset.columns.drop(["X", "Y"])

    # The full rank-correlation matrix does not depend on the loop variable,
    # so it is computed once instead of once per variable.
    corr_matrix = dataset.corr(method='spearman')

    rows = []
    for variable in variables:
        # |corr| between `variable` and every other column (itself excluded).
        others = corr_matrix.drop([variable], axis="columns").loc[variable].abs()

        rows.append({
            "variable": variable,
            "spearman_corr(v,X)": dataset[[variable, "X"]].corr(method='spearman').loc[variable, "X"],
            "spearman_corr(v,Y)": dataset[[variable, "Y"]].corr(method='spearman').loc[variable, "Y"],
            "max(spearman_corr(v, others))": others.max(),
            "min(spearman_corr(v, others))": others.min(),
            "mean(spearman_corr(v, others))": others.mean(),
            "std(spearman_corr(v, others))": others.std(),
            # TODO: quantiles
        })

    df = pd.DataFrame(rows)
    df["dataset"] = dataset.name

    df["spearman_corr(X,Y)"] = dataset[["X", "Y"]].corr(method='spearman').loc["X", "Y"]

    # The correlation can be NaN (e.g. for a constant column); use 0 instead.
    df = df.fillna(0)

    # Put the "dataset" column first.
    df = df[["dataset"] + [colname for colname in df.columns if colname != "dataset"]]

    return df

def kendall_correlation(dataset):
    """
    Build Kendall's-tau features for every variable other than X and Y.

    For each such variable ``v`` the row holds ``kendall_corr(v,X)``,
    ``kendall_corr(v,Y)`` and summary statistics (max, min, mean, std) of the
    absolute tau between ``v`` and every other column. ``kendall_corr(X,Y)``
    is repeated on every row.

    Parameters
    ----------
    dataset : pd.DataFrame
        Must contain the columns "X" and "Y" and carry a ``name`` attribute,
        which is copied into the "dataset" column.

    Returns
    -------
    pd.DataFrame
        One row per variable with "dataset" as the first column; NaN values
        are replaced by 0.
    """
    variables = dataset.columns.drop(["X", "Y"])

    # The full tau matrix does not depend on the loop variable, so it is
    # computed once instead of once per variable.
    corr_matrix = dataset.corr(method='kendall')

    rows = []
    for variable in variables:
        # |tau| between `variable` and every other column (itself excluded).
        others = corr_matrix.drop([variable], axis="columns").loc[variable].abs()

        rows.append({
            "variable": variable,
            "kendall_corr(v,X)": dataset[[variable, "X"]].corr(method='kendall').loc[variable, "X"],
            "kendall_corr(v,Y)": dataset[[variable, "Y"]].corr(method='kendall').loc[variable, "Y"],
            "max(kendall_corr(v, others))": others.max(),
            "min(kendall_corr(v, others))": others.min(),
            "mean(kendall_corr(v, others))": others.mean(),
            "std(kendall_corr(v, others))": others.std(),
        })

    df = pd.DataFrame(rows)
    df["dataset"] = dataset.name

    df["kendall_corr(X,Y)"] = dataset[["X", "Y"]].corr(method='kendall').loc["X", "Y"]

    # Kendall's tau can be NaN in some cases; use 0 instead.
    df = df.fillna(0)

    # Put the "dataset" column first.
    df = df[["dataset"] + [colname for colname in df.columns if colname != "dataset"]]

    return df

def distance_correlation(dataset):
    """
    Build distance-correlation features for every variable other than X and Y.

    For each such variable ``v`` the row holds ``dcor(v,X)``, ``dcor(v,Y)`` and
    summary statistics (max, min, mean, std, 25% and 75% quantiles) of the
    distance correlation between ``v`` and every other column. ``dcor(X,Y)``
    is repeated on every row.

    Parameters
    ----------
    dataset : pd.DataFrame
        Must contain the columns "X" and "Y" and carry a ``name`` attribute,
        which is copied into the "dataset" column.

    Returns
    -------
    pd.DataFrame
        One row per variable with "dataset" as the first column.
    """
    variables = dataset.columns.drop(["X", "Y"])

    rows = []
    for variable in variables:
        # One evaluation per other column (itself excluded). "X" and "Y" are
        # always among them, so their values are reused below rather than
        # being computed a second time.
        dcor_to_others = pd.Series({
            other: dcor.distance_correlation(dataset[variable], dataset[other])
            for other in dataset.columns.drop([variable])
        })

        rows.append({
            "variable": variable,
            "dcor(v,X)": dcor_to_others["X"],
            "dcor(v,Y)": dcor_to_others["Y"],
            "max(dcor(v, others))": dcor_to_others.max(),
            "min(dcor(v, others))": dcor_to_others.min(),
            "mean(dcor(v, others))": dcor_to_others.mean(),
            "std(dcor(v, others))": dcor_to_others.std(),
            "25%(dcor(v, others))": dcor_to_others.quantile(0.25),
            "75%(dcor(v, others))": dcor_to_others.quantile(0.75),
        })

    df = pd.DataFrame(rows)
    df["dataset"] = dataset.name

    df["dcor(X,Y)"] = dcor.distance_correlation(dataset["X"], dataset["Y"])

    # Put the "dataset" column first.
    df = df[["dataset"] + [colname for colname in df.columns if colname != "dataset"]]

    return df

def Squared_term(dataset, variables):
    """
    Add a ``<name>_squared_term`` column for every name in ``variables``.

    The columns are added to ``dataset`` in place; the same object is returned.
    """
    for name in variables:
        dataset[f'{name}_squared_term'] = dataset[name].pow(2)
    return dataset
    
def Interaction_term(dataset, variables):
    """
    Add a ``<a>_<b>`` product column for every unordered pair in ``variables``.

    Pairs are taken in the order of ``variables`` (each name with every later
    name). The columns are added to ``dataset`` in place; the same object is
    returned.
    """
    for position, left in enumerate(variables):
        for right in variables[position + 1:]:
            dataset[f'{left}_{right}'] = dataset[left] * dataset[right]
    return dataset

def Cos_Sin_term(dataset, variables):
    """
    Add ``<name>_cos_term`` and ``<name>_sin_term`` columns for every name in ``variables``.

    The columns are added to ``dataset`` in place; the same object is returned.
    """
    for name in variables:
        values = dataset[name]
        dataset[f'{name}_cos_term'] = np.cos(values)
        dataset[f'{name}_sin_term'] = np.sin(values)
    return dataset

def linear_regression_feature(dataset):
    """
    Build features from the coefficients of two standardised linear regressions.

    * model1 regresses ``Y`` on ``X``, every other variable ``v``, their
      squares and all pairwise products.
    * model2 regresses ``X`` on every ``v``, their squares, pairwise products
      and the cos/sin of every ``v``.

    All regressors are standardised with ``StandardScaler`` before fitting.

    Parameters
    ----------
    dataset : pd.DataFrame
        Must contain the columns "X" and "Y" and carry a ``name`` attribute,
        which is copied into the "dataset" column. It is not modified.

    Returns
    -------
    pd.DataFrame
        One row per variable with "dataset" first, the selected coefficients
        of both models, and model1's coefficient of ``X`` repeated on every row.
    """
    variables = dataset.columns.drop(["X", "Y"]).tolist()

    # model1: Y ~ X, v, v^2, v_i*v_j, v_i*X
    model1_inputs = ["X"] + variables
    # Work on an explicit copy: the helper functions add columns in place, and
    # doing that on a column subset of `dataset` triggers pandas'
    # SettingWithCopyWarning.
    d1 = Squared_term(dataset[model1_inputs].copy(), model1_inputs)
    d1 = Interaction_term(d1, model1_inputs)  # author's note: 0.4600-0.4648
    model1_features = d1.columns.tolist()
    d1_scaled = StandardScaler().fit_transform(d1)
    # The target is passed as a one-column frame, so coef_ is 2-D: take row 0.
    model1 = LinearRegression().fit(d1_scaled, dataset[["Y"]])
    model1_dict = dict(zip(model1_features, model1.coef_[0].tolist()))

    # model2: X ~ v, v^2, v_i*v_j, cos(v), sin(v)
    d2 = Squared_term(dataset[variables].copy(), variables)
    d2 = Interaction_term(d2, variables)  # author's note: 0.4648-0.4643
    d2 = Cos_Sin_term(d2, variables)
    model2_features = d2.columns.tolist()
    d2_scaled = StandardScaler().fit_transform(d2)
    model2 = LinearRegression().fit(d2_scaled, dataset[["X"]])
    model2_dict = dict(zip(model2_features, model2.coef_[0].tolist()))

    rows = []
    for variable in variables:
        rows.append({
            "variable": variable,
            "v~Y_coefficient": model1_dict[variable],
            "v_squared~Y_coefficient": model1_dict[f"{variable}_squared_term"],
            # cos/sin terms for model1 were tried and lowered the score (author's note).
            "v~X_coefficient": model2_dict[variable],
            "v_squared~X_coefficient": model2_dict[f"{variable}_squared_term"],
            "v_cos~X_coefficient": model2_dict[f"{variable}_cos_term"],
            "v_sin~X_coefficient": model2_dict[f"{variable}_sin_term"],
        })

    df = pd.DataFrame(rows)
    df["dataset"] = dataset.name

    df["X~Y_coefficient"] = model1_dict["X"]

    # Put the "dataset" column first.
    df = df[["dataset"] + [colname for colname in df.columns if colname != "dataset"]]

    return df

def ridge_regression_feature(dataset):
    """
    Build features from the coefficients of two standardised ridge regressions.

    * model1 regresses ``Y`` on ``X``, every other variable ``v`` and their
      squares.
    * model2 regresses ``X`` on every ``v``, their squares and all pairwise
      products.

    For each model the regularisation strength ``alpha`` is selected by a
    5-fold ``GridSearchCV`` over 13 log-spaced values from 1e-6 to 1e6, and
    the coefficients of the best estimator are used.

    Parameters
    ----------
    dataset : pd.DataFrame
        Must contain the columns "X" and "Y" and carry a ``name`` attribute,
        which is copied into the "dataset" column. It is not modified.

    Returns
    -------
    pd.DataFrame
        One row per variable with "dataset" first, the linear coefficient of
        ``v`` in both models, and model1's coefficient of ``X`` repeated on
        every row.
    """
    variables = dataset.columns.drop(["X", "Y"]).tolist()
    # GridSearchCV picks the best alpha from this grid.
    param_grid = {'alpha': np.logspace(-6, 6, 13)}

    # model1: Y ~ X, v (plus squares).
    # Interaction terms were tried here and lowered the score (author's note).
    model1_inputs = ["X"] + variables
    # Work on an explicit copy: the helper functions add columns in place, and
    # doing that on a column subset of `dataset` triggers pandas'
    # SettingWithCopyWarning.
    d1 = Squared_term(dataset[model1_inputs].copy(), model1_inputs)
    model1_features = d1.columns.tolist()
    d1_scaled = StandardScaler().fit_transform(d1)
    model1 = GridSearchCV(Ridge(random_state=42), param_grid, cv=5)
    model1.fit(d1_scaled, dataset["Y"])
    model1_dict = dict(zip(model1_features, model1.best_estimator_.coef_.tolist()))

    # model2: X ~ v (plus squares and pairwise products).
    # cos/sin terms were tried here and lowered the score (author's note).
    d2 = Squared_term(dataset[variables].copy(), variables)
    d2 = Interaction_term(d2, variables)
    model2_features = d2.columns.tolist()
    d2_scaled = StandardScaler().fit_transform(d2)
    model2 = GridSearchCV(Ridge(random_state=42), param_grid, cv=5)
    model2.fit(d2_scaled, dataset["X"])
    model2_dict = dict(zip(model2_features, model2.best_estimator_.coef_.tolist()))

    # Author's notes: exposing the selected alpha values as features
    # (0.4730-0.4727), the squared-term coefficients and the cos/sin
    # coefficients all lowered the score, so only the linear coefficients
    # are emitted.
    rows = []
    for variable in variables:
        rows.append({
            "variable": variable,
            "v~Y_ridge_coefficient": model1_dict[variable],
            "v~X_ridge_coefficient": model2_dict[variable],
        })

    df = pd.DataFrame(rows)
    df["dataset"] = dataset.name

    df["X~Y_ridge_coefficient"] = model1_dict["X"]

    # Put the "dataset" column first.
    df = df[["dataset"] + [colname for colname in df.columns if colname != "dataset"]]

    return df

def cluster_features(dataset):
    """
    Count DBSCAN clusters in the standardised (v, X, Y) space of every variable.

    The whole dataset is standardised once. For every variable ``v`` other
    than "X" and "Y", DBSCAN (``min_samples=5``) is run on the three columns
    (v, X, Y) for each value in ``eps_values`` and the number of clusters is
    stored as ``cluster_count_<eps>``. Points labelled ``-1`` by DBSCAN are
    noise and are not counted as a cluster.

    Parameters
    ----------
    dataset : pd.DataFrame
        Must contain the columns "X" and "Y" and carry a ``name`` attribute,
        which is copied into the "dataset" column.

    Returns
    -------
    pd.DataFrame
        One row per variable with "dataset" as the first column.
    """
    variables = dataset.columns.drop(["X", "Y"]).tolist()
    d_scaled = pd.DataFrame(StandardScaler().fit_transform(dataset), columns=dataset.columns)

    eps_values = [0.3]  # 0.5 and 0.7 are listed as alternatives by the author

    # Author's notes on features that were tried and are no longer computed
    # (score before-after): noise_count 0.4736-0.4740, avg_cluster_size
    # 0.4740-0.4735, density_variation 0.4740-0.4741, feature_importance
    # 0.4741-0.4736, silhouette_score 0.4741-0.4723. cluster_count itself:
    # 0.4730-0.4736.
    results = []
    for variable in variables:
        result = {"variable": variable}
        for eps in eps_values:
            points = d_scaled[[variable, "X", "Y"]]
            cluster_ids = DBSCAN(eps=eps, min_samples=5).fit_predict(points)

            # Membership is tested on the label values themselves: `-1 in
            # series` on a pandas Series would look at the index instead and
            # wrongly count the noise label as a cluster.
            distinct_clusters = set(cluster_ids)
            distinct_clusters.discard(-1)
            result[f"cluster_count_{eps}"] = len(distinct_clusters)
        results.append(result)

    df = pd.DataFrame(results)
    df["dataset"] = dataset.name

    # Put the "dataset" column first.
    df = df[["dataset"] + [colname for colname in df.columns if colname != "dataset"]]

    return df

def conditional_independence_tests(dataset):  # too slow: takes about three hours
    """
    Build conditional-independence features with ``CITests.ci_pillai``.

    Per the author's description, this is a mixed-data residualization based
    conditional independence test [1] that uses an XGBoost estimator to
    compute LS residuals [2] and then runs an association test (Pillai's
    Trace) on the residuals.

    For every variable ``v`` other than "X" and "Y", four tests are run, each
    conditioning on all remaining columns: (v, X), (v, Y), (X, v) and (Y, v).
    The coefficient and p-value of each test become features.

    ``dataset`` must carry a ``name`` attribute, which is copied into the
    "dataset" column (placed first).
    """
    def pillai(first, second, excluded):
        # Condition on every column except the two that are being tested.
        conditioning = dataset.columns.drop(excluded).tolist()
        return CITests.ci_pillai(X=first, Y=second, Z=conditioning, data=dataset, boolean=False)

    rows = []
    for variable in dataset.columns.drop(["X", "Y"]).tolist():
        coef_vx, p_vx = pillai(variable, "X", ["X", variable])  # v-X
        coef_vy, p_vy = pillai(variable, "Y", ["Y", variable])  # v-Y
        coef_xv, p_xv = pillai("X", variable, ["X", variable])  # X-v
        coef_yv, p_yv = pillai("Y", variable, ["Y", variable])  # Y-v
        rows.append({
            "variable": variable,
            "v~X_ci_pillai_coef": coef_vx,
            "v~X_ci_pillai_p_value": p_vx,
            "v~Y_ci_pillai_coef": coef_vy,
            "v~Y_ci_pillai_p_value": p_vy,
            "X~v_ci_pillai_coef": coef_xv,
            "X~v_ci_pillai_p_value": p_xv,
            "Y~v_ci_pillai_coef": coef_yv,
            "Y~v_ci_pillai_p_value": p_yv
        })

    features = pd.DataFrame(rows)
    features["dataset"] = dataset.name

    # Put the "dataset" column first.
    ordered = ["dataset"] + [name for name in features.columns if name != "dataset"]
    return features[ordered]

def dml_estimate(data, Y_var, T_var, X_vars, n_splits=4, use_gpu=False):
    """
    Estimate the effect of ``T_var`` on ``Y_var`` with cross-fitted double
    machine learning.

    For each of ``n_splits`` shuffled folds, two LightGBM regressors are
    fitted on the training part to predict ``Y_var`` and ``T_var`` from
    ``X_vars``. Their out-of-fold prediction errors are collected as
    residuals, and the effect is the slope of a no-intercept linear
    regression of the outcome residuals on the treatment residuals.

    This cell previously defined the function twice. The later definition
    (``device_type='gpu'`` with explicit platform/device ids) was the one in
    effect, so it is the one kept here.

    Parameters
    ----------
    data : pd.DataFrame
        Source of all columns.
    Y_var : str
        Outcome column.
    T_var : str
        Treatment column.
    X_vars : list of str
        Control columns.
    n_splits : int, default 4
        Number of cross-fitting folds.
    use_gpu : bool, default False
        Train LightGBM on the GPU (also sets ``max_bin`` to 63).

    Returns
    -------
    dict
        ``{'theta': estimated effect, 'se': estimated standard error}``.
    """
    # Pull the variables out of the DataFrame.
    Y = data[Y_var].values
    T = data[T_var].values
    X = data[X_vars].values

    # Residual buffers are always float: np.zeros_like(Y) would inherit an
    # integer dtype from an integer column and silently truncate residuals.
    Y_residuals = np.zeros(len(Y), dtype=float)
    T_residuals = np.zeros(len(T), dtype=float)

    # Cross-fitting splitter.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # LightGBM parameters.
    params = {
        'objective': 'regression',
        'learning_rate': 0.1,
        'num_leaves': 31,
        'verbosity': -1,
        'device_type': 'gpu' if use_gpu else 'cpu',
        'gpu_platform_id': 1,
        'gpu_device_id': 0,
        'n_jobs': -1
    }
    if use_gpu:
        # A smaller max_bin is suggested for GPU training (author's note).
        params['max_bin'] = 63

    # Cross-fitting loop: every row gets an out-of-fold residual.
    for train_index, test_index in kf.split(X):
        fold_X_train, fold_X_test = X[train_index], X[test_index]

        # Outcome model.
        Y_model = LGBMRegressor(**params)
        Y_model.fit(fold_X_train, Y[train_index])
        Y_residuals[test_index] = Y[test_index] - Y_model.predict(fold_X_test)

        # Treatment model.
        T_model = LGBMRegressor(**params)
        T_model.fit(fold_X_train, T[train_index])
        T_residuals[test_index] = T[test_index] - T_model.predict(fold_X_test)

    # Regress outcome residuals on treatment residuals (no intercept).
    causal_model = LinearRegression(fit_intercept=False)
    causal_model.fit(T_residuals.reshape(-1, 1), Y_residuals)
    theta = causal_model.coef_[0]

    # Standard error of the slope.
    n = len(Y_residuals)
    residuals = Y_residuals - theta * T_residuals
    sigma2 = np.sum(residuals ** 2) / (n - 1)
    T_residuals_variance = np.var(T_residuals, ddof=1)
    se = np.sqrt(sigma2 / (n * T_residuals_variance))

    return {
        'theta': theta,
        'se': se
    }

def double_machine_learning(dataset):
    """
    Build one double-machine-learning feature per variable.

    For every variable ``v`` other than "X" and "Y", ``dml_estimate`` is
    called with ``v`` as the treatment, "X" as the outcome and all remaining
    columns (the other variables and "Y") as controls. The estimated effect
    is stored as ``v~X_DML_theta``; the standard error is not emitted.

    ``dataset`` must carry a ``name`` attribute, which is copied into the
    "dataset" column (placed first).
    """
    outcome = "X"
    all_columns = dataset.columns.tolist()

    rows = []
    for treatment in dataset.columns.drop(["X", "Y"]):
        # Controls: everything that is neither the outcome nor the treatment.
        controls = [column for column in all_columns if column not in [outcome, treatment]]
        estimate = dml_estimate(dataset, outcome, treatment, controls, n_splits=4, use_gpu=False)
        rows.append({
            "variable": treatment,
            "v~X_DML_theta": estimate['theta'],
        })

    features = pd.DataFrame(rows)
    features["dataset"] = dataset.name

    # Put the "dataset" column first.
    ordered = ["dataset"] + [name for name in features.columns if name != "dataset"]
    return features[ordered]

def conditional_mutual_information(dataset):
    """
    Build "conditional mutual information" features for every variable other
    than X and Y.

    Each feature is the difference of two ``mutual_info_regression`` scores,
    in both cases the score of the first feature column that is passed in.

    NOTE(review): ``mutual_info_regression`` is documented to score each
    feature column separately, so these values look like differences of
    pairwise MI estimates (e.g. MI(v,X) - MI(Y,X)) rather than a true
    conditional MI. Confirm whether that is intended.

    ``dataset`` must carry a ``name`` attribute, which is copied into the
    "dataset" column (placed first).
    """
    def first_feature_mi(feature_columns, target):
        # Score of the first column in `feature_columns` against `target`.
        return mutual_info_regression(dataset[feature_columns], dataset[target], discrete_features=False)[0]

    rows = []
    for variable in dataset.columns.drop(["X", "Y"]):
        # The estimator is stochastic, so the calls are kept in this order.
        # "conditional" MI(v, X | Y)
        vx_given_y = first_feature_mi([variable, "Y"], "X") - first_feature_mi(["Y"], "X")
        # "conditional" MI(v, Y | X)
        vy_given_x = first_feature_mi([variable, "X"], "Y") - first_feature_mi(["X"], "Y")
        # "conditional" MI(X, Y | v)
        xy_given_v = first_feature_mi(["X", variable], "Y") - first_feature_mi([variable], "Y")

        rows.append({
            "variable": variable,
            "conditional_MI(v,X|Y)": vx_given_y,
            "conditional_MI(v,Y|X)": vy_given_x,
            "conditional_MI(X,Y|v)": xy_given_v,
        })

    features = pd.DataFrame(rows)
    features["dataset"] = dataset.name

    # Put the "dataset" column first.
    ordered = ["dataset"] + [name for name in features.columns if name != "dataset"]
    return features[ordered]

def partial_correlation(dataset):
    """
    Partial-correlation features for every candidate variable.

    For each column ``v`` of ``dataset`` other than "X" and "Y" the following
    coefficients (the 'r' value of ``pingouin.partial_corr``) are computed:

    - "partial_corr(v,X|Y)": v vs. X, controlling for Y
    - "partial_corr(v,Y|X)": v vs. Y, controlling for X
    - "partial_corr(X,Y|v)": X vs. Y, controlling for v

    Returns a DataFrame with one row per variable whose first column is
    "dataset" (read from ``dataset.name``), followed by "variable" and the
    three coefficients.
    """

    def pcorr(x, y, covar):
        # First (only) row of pingouin's result table, 'r' column.
        return pg.partial_corr(data=dataset, x=x, y=y, covar=covar)['r'].iloc[0]

    rows = [
        {
            "variable": variable,
            "partial_corr(v,X|Y)": pcorr(variable, 'X', 'Y'),
            "partial_corr(v,Y|X)": pcorr(variable, 'Y', 'X'),
            "partial_corr(X,Y|v)": pcorr('X', 'Y', variable),
        }
        for variable in dataset.columns.drop(["X", "Y"])
    ]

    result = pd.DataFrame(rows)
    result["dataset"] = dataset.name

    # Put the dataset identifier first, keep the remaining column order.
    other_columns = [colname for colname in result.columns if colname != "dataset"]
    return result[["dataset"] + other_columns]



In [6]:
def label(adjacency_matrix):
    """
    Produce the class label of every variable of one graph.

    ``adjacency_matrix`` is the graph as a DataFrame whose columns include
    "X" and "Y"; its ``name`` attribute is used as the dataset identifier.
    The labels come from ``get_labels`` using the lookup built by
    ``create_graph_label``.

    Returns a DataFrame with the columns "dataset", "variable" and "label",
    one row per column of ``adjacency_matrix`` other than "X" and "Y".
    """

    # Only the label lookup is needed; the first returned value is unused.
    _, adjacency_label = create_graph_label()
    labels = get_labels(adjacency_matrix, adjacency_label)

    variables = adjacency_matrix.columns.drop(["X", "Y"])
    variable_labels = [labels[variable] for variable in variables]

    result = pd.DataFrame({"variable": variables, "label": variable_labels})
    result["dataset"] = adjacency_matrix.name

    # Put the dataset identifier first, keep the remaining column order.
    other_columns = [colname for colname in result.columns if colname != "dataset"]
    return result[["dataset"] + other_columns]


In [None]:
def create_some_columns(names_datasets, function):
    """
    Apply one feature function to every dataset, sequentially.

    Parameters
    ----------
    names_datasets : mapping of dataset name -> pandas.DataFrame
        The datasets (or graphs) to process, iterated in mapping order.
    function : callable
        Called as ``function(dataset)`` and expected to return a DataFrame.
        It may read the dataset name from ``dataset.name``, which is set
        here right before the call.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all per-dataset results, with the
        "dataset" column set to the dataset name and a fresh 0..n-1 index.

    Raises
    ------
    NotImplementedError
        When ``function`` raises ``ValueError`` for a dataset. The original
        error is chained as ``__cause__`` and named in the message.
    """

    frames = []
    for name, dataset in tqdm(names_datasets.items()):
        # Side effect on the caller's object: the feature functions look up
        # the dataset name through this attribute.
        dataset.name = name

        try:
            df_dataset = function(dataset)
        except ValueError as e:
            print(name, e)
            function_name = getattr(function, "__name__", repr(function))
            raise NotImplementedError(
                f"{function_name} failed on dataset {name!r}: {e}"
            ) from e

        df_dataset["dataset"] = name
        frames.append(df_dataset)

    return pd.concat(frames, axis="index").reset_index(drop=True)

In [None]:
def create_some_columns_parallel(names_datasets, function, n_jobs=-1):
    """
    Apply one feature function to every dataset through ``joblib.Parallel``.

    Same contract as ``create_some_columns``: ``names_datasets`` maps a
    dataset name to its DataFrame, ``function(dataset)`` returns a DataFrame
    and may read ``dataset.name``. ``n_jobs`` is forwarded to
    ``joblib.Parallel``.

    Returns the row-wise concatenation of all results with the "dataset"
    column set to the dataset name and a fresh 0..n-1 index.
    """

    def run_one(name, dataset, function):
        # Expose the name to the feature function, then tag its output.
        dataset.name = name
        result = function(dataset)
        result["dataset"] = name
        return result

    tasks = (
        joblib.delayed(run_one)(name, dataset, function)
        for name, dataset in tqdm(names_datasets.items())
    )
    frames = joblib.Parallel(n_jobs=n_jobs)(tasks)

    return pd.concat(frames, axis="index").reset_index(drop=True)

In [None]:
def create_all_columns(functions_names_datasets, n_jobs=-1, create_dimension_feature=False):
    """
    Compute several feature sets and merge them into one table.

    Parameters
    ----------
    functions_names_datasets : dict
        ``{function: names_datasets, ...}`` where ``names_datasets`` maps a
        dataset name to the object (dataset or graph) passed to ``function``.
    n_jobs : int, default -1
        ``1`` runs every function sequentially (``create_some_columns``);
        any other value uses ``create_some_columns_parallel`` with that
        ``n_jobs``.
    create_dimension_feature : bool, default False
        When true, ``add_dimension_feature`` is additionally applied
        (sequentially) to the datasets of the first mapping entry and its
        output is merged in first.

    Returns
    -------
    pandas.DataFrame
        Inner merge of all feature sets on ["dataset", "variable"]. For an
        empty ``functions_names_datasets`` an empty DataFrame holding only
        those two key columns is returned.
    """

    # Nothing to compute: return an empty, correctly keyed frame instead of
    # failing inside functools.reduce / the first-entry lookup below.
    if not functions_names_datasets:
        return pd.DataFrame(columns=["dataset", "variable"])

    feature_sets = []
    if create_dimension_feature:
        first_names_datasets = next(iter(functions_names_datasets.values()))
        feature_sets.append(create_some_columns(first_names_datasets, add_dimension_feature))

    for function, names_datasets in functions_names_datasets.items():
        print(f"set: {function.__name__}")

        if n_jobs != 1:
            feature_set = create_some_columns_parallel(names_datasets, function, n_jobs=n_jobs)
        else:
            feature_set = create_some_columns(names_datasets, function)

        feature_sets.append(feature_set)

    # Merge all feature sets into a single dataframe:
    return functools.reduce(
        lambda left, right: pd.merge(left, right, on=["dataset", "variable"]),
        feature_sets,
    )

In [None]:
# Build the base training table: one row per (dataset, variable) holding the
# Pearson and mutual-information features, the dimension feature and the label.
# NOTE(review): X_train / y_train are presumably the name -> DataFrame dicts
# of datasets and graphs at this point -- confirm where they are loaded.
names_datasets_train = X_train
names_graphs_train = y_train
print(f"Creating X_y_group_train from {len(names_datasets_train)} datasets and graphs")
X_y_group_train = create_all_columns(
    {
        pearson_correlation: names_datasets_train,
        mutual_information: names_datasets_train,  # enabled; comment this line out to greatly reduce computation
        label: names_graphs_train,
    },
    n_jobs=-1,
    create_dimension_feature=True,
)

In [None]:
# Compute additional feature sets and append them column-wise to the base table.
names_datasets_train = X_train
names_graphs_train = y_train
X_y_group_train_additional = create_all_columns(
    {
        spearman_correlation: names_datasets_train,
        kendall_correlation: names_datasets_train,
        distance_correlation: names_datasets_train,
        conditional_mutual_information: names_datasets_train,
        linear_regression_feature: names_datasets_train,
    },
    n_jobs=-1,
)


# NOTE(review): concat(axis=1) pairs rows by index, not by
# (dataset, variable); this assumes both tables list rows in the same order.
X_y_group_train = pd.concat([X_y_group_train, X_y_group_train_additional], axis=1)
# Drop duplicated columns (the second "dataset"/"variable" pair, keeping the first occurrence)
X_y_group_train = X_y_group_train.loc[:,~X_y_group_train.columns.duplicated()]
# Squared versions of the mutual-information features.
X_y_group_train['MI(v,X)^2'] = X_y_group_train['MI(v,X)'] ** 2
X_y_group_train['MI(v,Y)^2'] = X_y_group_train['MI(v,Y)'] ** 2
X_y_group_train['MI(X,Y)^2'] = X_y_group_train['MI(X,Y)'] ** 2
X_y_group_train['max(MI(v, others))^2'] = X_y_group_train['max(MI(v, others))'] ** 2
X_y_group_train['min(MI(v, others))^2'] = X_y_group_train['min(MI(v, others))'] ** 2

In [None]:
# Load a previously saved feature table from disk. This replaces whatever
# X_y_group_train was built by the cells above.
X_y_group_train = pd.read_csv('./mid_data/X_y_group_train_updated_v8.7_线性回归v7&岭回归v2.csv')
print(X_y_group_train.shape)
print(X_y_group_train.columns)

In [None]:
# Optional, expensive feature sets: un-comment the entries to (re)compute.
# names_datasets_train = X_train
# names_graphs_train = y_train
additional_feature_functions = {
    # linear_regression_feature: names_datasets_train,
    # ridge_regression_feature: names_datasets_train,
    # conditional_independence_tests: names_datasets_train,  # too slow
    # double_machine_learning: names_datasets_train,
}
# With every entry commented out there is nothing to merge; calling
# create_all_columns on an empty mapping would fail, so skip it.
if additional_feature_functions:
    X_y_group_train_additional = create_all_columns(
        additional_feature_functions,
        n_jobs=-1,
    )
    X_y_group_train = pd.concat([X_y_group_train, X_y_group_train_additional], axis=1)
print('X_y_group_train.shape', X_y_group_train.shape)
# Drop duplicated columns (keeps the first occurrence of each name)
X_y_group_train = X_y_group_train.loc[:,~X_y_group_train.columns.duplicated()]
print('去重后X_y_group_train.shape', X_y_group_train.shape)

In [None]:
# X_y_group_train.to_csv('./mid_data/X_y_group_train_updated_v8.7_线性回归v7&岭回归v2.csv', index=False)

In [None]:
# Compute the distance-correlation features and append them column-wise.
# NOTE(review): a later cell rebinds X_train / y_train to the model matrices,
# so this cell presumably has to run before that one -- confirm run order.
names_datasets_train = X_train
names_graphs_train = y_train
X_y_group_train_additional = create_all_columns(
    {
        distance_correlation: names_datasets_train,
    },
    n_jobs=-1,
)
# NOTE(review): concat(axis=1) pairs rows by index, not by (dataset, variable).
X_y_group_train = pd.concat([X_y_group_train, X_y_group_train_additional], axis=1)
print('X_y_group_train.shape', X_y_group_train.shape)
# Drop duplicated columns (keeps the first occurrence of each name)
X_y_group_train = X_y_group_train.loc[:,~X_y_group_train.columns.duplicated()]
print('去重后X_y_group_train.shape', X_y_group_train.shape)

In [11]:
def remove_outliers(df, columns, threshold=3):
    """
    Return a copy of ``df`` without the rows flagged as outliers.

    A row is flagged when the largest absolute z-score among ``columns``
    exceeds ``threshold``; z-scores are computed per column over all rows.
    A row whose z-scores contain NaN is kept, because the comparison with
    ``threshold`` is then false.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.
    columns : list of str
        Numeric columns inspected for outliers. An empty list removes
        nothing.
    threshold : float, default 3
        Absolute z-score above which a row is dropped.

    Returns
    -------
    pandas.DataFrame
        Copy of the retained rows with the same columns as ``df``.
    """
    if len(columns) == 0:
        return df.copy()

    # Work on a plain array so no helper column is ever written to ``df``.
    z_scores = np.abs(stats.zscore(df[columns].to_numpy(dtype=float), axis=0))
    is_outlier = np.max(z_scores, axis=1) > threshold
    return df.loc[~is_outlier].copy()

In [12]:
# Outlier removal is intentionally disabled (no longer needed):
# X_y_group_train = remove_outliers(X_y_group_train, ['corr(v,X)', 'corr(v,Y)', 'ttest(v,X)', 'ttest(v,Y)'])
print("Adding numeric labels y")
# Use the same fixed class order as train() / infer(). A freshly fitted
# LabelEncoder sorts the classes alphabetically, which gives different integer
# codes than the ones infer() decodes with.
le = LabelEncoder()
le.classes_ = np.array([
    'Cause of X', 'Consequence of X', 'Confounder', 'Collider',
    'Mediator', 'Independent', 'Cause of Y', 'Consequence of Y',
])
X_y_group_train["y"] = le.transform(X_y_group_train["label"])
# reordering columns:
X_y_group_train = X_y_group_train[["dataset", "variable"] + X_y_group_train.columns.drop(["dataset", "variable", "label", "y"]).tolist() + ["label", "y"]]

numeric_columns = X_y_group_train.select_dtypes(include=[np.number]).columns

# Fill missing values of the numeric columns with the column mean.
# NOTE(review): the means are taken over all rows, i.e. before any
# cross-validation split -- confirm this is acceptable.
X_y_group_train[numeric_columns] = X_y_group_train[numeric_columns].fillna(X_y_group_train[numeric_columns].mean())

# t-test based features are excluded from the model inputs.
blacklist = ["ttest(v,X)", "pvalue(ttest(v,X))<=0.05", "ttest(v,Y)", "pvalue(ttest(v,Y))<=0.05", "ttest(X,Y)", "pvalue(ttest(X,Y))<=0.05"]
columns_to_drop = [col for col in blacklist if col in X_y_group_train.columns]
X_y_group_train = X_y_group_train.drop(columns=columns_to_drop)

display(X_y_group_train)

print("Extracting X_train, y_train, and group")
# NOTE: from here on X_train / y_train are the model matrices, no longer the
# raw inputs they were bound to before this cell.
X_train = X_y_group_train.drop(["variable", "dataset", "label", "y"], axis="columns")
y_train = X_y_group_train["y"]
group_train = X_y_group_train["dataset"]

In [13]:
# Baseline random forest, scored by 4-fold cross-validation grouped by dataset
# (group_train), using balanced accuracy.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=13,
    n_jobs=-1,
    class_weight="balanced",
    random_state=42
)
display(model)

cv = GroupKFold(n_splits=4)
results = cross_val_score(
    model,
    X_train,
    y_train,
    groups=group_train,
    cv=cv,
    verbose=True,
    scoring="balanced_accuracy"
)

# `results` holds one score per fold; only the mean is reported.
print(f"multiclass balanced accuracy: mean={results.mean()}")

In [None]:
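# Notes: the previous score should have been 4728 -- double-check later.
# Success, 4753
# Next: record the result for the new slow feature set.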

In [None]:
# Compute the Spearman-correlation features and append them column-wise.
# Relies on names_datasets_train as bound by an earlier cell.
X_y_group_train_additional = create_all_columns(
    {
        spearman_correlation: names_datasets_train,
    },
    n_jobs=-1,
)
# NOTE(review): concat(axis=1) pairs rows by index, not by (dataset, variable).
X_y_group_train = pd.concat([X_y_group_train, X_y_group_train_additional], axis=1)
print('X_y_group_train.shape', X_y_group_train.shape)
# Drop duplicated columns (keeps the first occurrence of each name)
X_y_group_train = X_y_group_train.loc[:,~X_y_group_train.columns.duplicated()]
print('去重后X_y_group_train.shape', X_y_group_train.shape)

In [None]:
print("Adding numeric labels y")
# Use the same fixed class order as train() / infer(). A freshly fitted
# LabelEncoder sorts the classes alphabetically, which gives different integer
# codes than the ones infer() decodes with.
le = LabelEncoder()
le.classes_ = np.array([
    'Cause of X', 'Consequence of X', 'Confounder', 'Collider',
    'Mediator', 'Independent', 'Cause of Y', 'Consequence of Y',
])
X_y_group_train["y"] = le.transform(X_y_group_train["label"])
# reordering columns:
X_y_group_train = X_y_group_train[["dataset", "variable"] + X_y_group_train.columns.drop(["dataset", "variable", "label", "y"]).tolist() + ["label", "y"]]


# t-test based features are excluded from the model inputs.
blacklist = ["ttest(v,X)", "pvalue(ttest(v,X))<=0.05", "ttest(v,Y)", "pvalue(ttest(v,Y))<=0.05", "ttest(X,Y)", "pvalue(ttest(X,Y))<=0.05"]
columns_to_drop = [col for col in blacklist if col in X_y_group_train.columns]
X_y_group_train = X_y_group_train.drop(columns=columns_to_drop)

display(X_y_group_train)

print("Extracting X_train, y_train, and group")
# X_train / y_train are (re)bound to the model matrices here.
X_train = X_y_group_train.drop(["variable", "dataset", "label", "y"], axis="columns")
y_train = X_y_group_train["y"]
group_train = X_y_group_train["dataset"]

In [None]:
# Same random-forest configuration and grouped 4-fold cross-validation as the
# earlier baseline cell, re-run on the updated feature matrix.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=13,
    n_jobs=-1,
    class_weight="balanced",
    random_state=42
)
display(model)

cv = GroupKFold(n_splits=4)
results = cross_val_score(
    model,
    X_train,
    y_train,
    groups=group_train,
    cv=cv,
    verbose=True,
    scoring="balanced_accuracy"
)

# `results` holds one score per fold; only the mean is reported.
print(f"multiclass balanced accuracy: mean={results.mean()}")

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import balanced_accuracy_score

# Uses `model`, `X_train`, `y_train` and `group_train` from the cells above.

cv = GroupKFold(n_splits=4)

# Materialise the folds once so that the predictions, the per-fold scores and
# the final refit all work on exactly the same splits.
folds = list(cv.split(X_train, y_train, groups=group_train))

# Out-of-fold predictions for every row.
y_pred = cross_val_predict(model, X_train, y_train, cv=folds, n_jobs=-1)

# Balanced accuracy of each fold. Rows are selected by position (.iloc):
# the split indices are positions, not index labels.
fold_scores = [
    balanced_accuracy_score(y_train.iloc[val_index], y_pred[val_index])
    for _, val_index in folds
]

# Index of the best-scoring fold.
best_fold_index = fold_scores.index(max(fold_scores))

# Refit a fresh model on the training part of the best fold.
train_index, val_index = folds[best_fold_index]
X_train_best, X_val_best = X_train.iloc[train_index], X_train.iloc[val_index]
y_train_best, y_val_best = y_train.iloc[train_index], y_train.iloc[val_index]

best_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=13,
    n_jobs=-1,
    class_weight="balanced",
    random_state=42
)
best_model.fit(X_train_best, y_train_best)

# Save the refitted model.
joblib.dump(best_model, 'best_random_forest_model.joblib')

print(f"Best fold balanced accuracy: {max(fold_scores)}")
print(f"Average balanced accuracy: {sum(fold_scores) / len(fold_scores)}")
print("Best model saved as 'best_random_forest_model.joblib'")

In [None]:
# Re-print the mean of `results`, the per-fold scores from the last cross_val_score cell.
print(f"multiclass balanced accuracy: mean={results.mean()}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Hold out 20% of the rows for a per-dimension accuracy check. The split is
# stored under new names so that X_train / y_train stay row-aligned with
# group_train for the cells that follow.
# NOTE(review): this is a random row split, so rows of one dataset can land
# on both sides -- confirm that is acceptable for this diagnostic.
X_fit, X_holdout, y_fit, y_holdout = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Fit the model
model.fit(X_fit, y_fit)

# Get predictions
y_pred = model.predict(X_holdout)
y_true = y_holdout.to_numpy()

# Calculate accuracy for each dimension
dimensions = sorted(X_holdout['dimension'].unique())
accuracies = []

for dim in dimensions:
    mask = (X_holdout['dimension'] == dim).to_numpy()
    accuracies.append(accuracy_score(y_true[mask], y_pred[mask]))

# Visualize the results
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(dimensions, accuracies, marker='o')
ax.set_xlabel('Dimension')
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy vs Dimension')
ax.grid(True)
ax.set_xticks(dimensions)  # Ensure all dimensions are shown on x-axis
fig.tight_layout()
plt.show()

# Print accuracies
print("Accuracies for each dimension:")
for dim, acc in zip(dimensions, accuracies):
    print(f"Dimension {dim}: Accuracy = {acc:.4f}")

# Calculate and print overall accuracy
overall_accuracy = accuracy_score(y_true, y_pred)
print(f"\nOverall Accuracy: {overall_accuracy:.4f}")

# Get feature importances
importances = model.feature_importances_
feature_importances = pd.DataFrame({'feature': X_fit.columns, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)

# Display top 20 most important features
print("\nTop 20 most important features:")
print(feature_importances.head(20))

# Save the model (fitted on the 80% split above)
joblib.dump(model, './resources/random_forest_model_v1.joblib')

print("\nModel has been saved.")

In [None]:
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import re
# Assumes X_train and y_train have already been prepared by the cells above.


def clean_feature_names(X):
    """
    Return ``X`` with sanitised column names; ``X`` itself is not modified.

    Every character that is not a word character, whitespace or "-" becomes
    "_", and a name starting with a digit is prefixed with "f_".
    """
    def clean_name(name):
        name = re.sub(r'[^\w\s-]', '_', name)
        # name[:1] keeps this safe for an empty column name.
        if name[:1].isdigit():
            name = 'f_' + name
        return name

    return X.rename(columns=clean_name)


X_train = clean_feature_names(X_train)

# Stratified 80/20 split. Stored under new names so that X_train / y_train
# stay row-aligned with group_train for the cells that follow.
X_tab_train, X_tab_val, y_tab_train, y_tab_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Create the TabNet model
tabnet_model = TabNetClassifier(
    n_d=64,  # width of the decision step
    n_a=64,  # width of the attention step
    n_steps=5,  # number of decision steps
    gamma=1.5,  # sparsity control parameter
    n_independent=2,  # independent layers per step
    n_shared=2,  # shared layers
    cat_idxs=[],  # indices of categorical features
    cat_dims=[],  # number of unique values per categorical feature
    cat_emb_dim=[],  # embedding size per categorical feature
    lambda_sparse=1e-3,  # sparsity regularisation weight
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params=dict(
        mode="min", patience=5, min_lr=1e-5, factor=0.5
    ),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    mask_type="entmax",  # "sparsemax" or "entmax"
    device_name='cuda' if torch.cuda.is_available() else 'cpu'
)

# Train the model
tabnet_model.fit(
    X_train=X_tab_train.values,
    y_train=y_tab_train.values,
    eval_set=[(X_tab_val.values, y_tab_val.values)],
    eval_name=['val'],
    eval_metric=['balanced_accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=256,
    num_workers=0,
    weights=1,
    drop_last=False
)

# Report the training outcome
print(f"Best epoch: {tabnet_model.best_epoch}")
print(f"Best validation balanced accuracy: {tabnet_model.best_cost}")

# Evaluate on the training and validation parts
y_train_pred = tabnet_model.predict(X_tab_train.values)
y_val_pred = tabnet_model.predict(X_tab_val.values)

train_accuracy = balanced_accuracy_score(y_tab_train, y_train_pred)
val_accuracy = balanced_accuracy_score(y_tab_val, y_val_pred)

print(f"Train balanced accuracy: {train_accuracy:.4f}")
print(f"Validation balanced accuracy: {val_accuracy:.4f}")

# Saving the model is currently disabled:
# joblib.dump(tabnet_model, './resources/tabnet_model_v0.joblib')

In [None]:
import re

# NOTE(review): clean_feature_names is also defined in the TabNet cell; this
# definition replaces that one when this cell runs.
def clean_feature_names(X):
    """
    Sanitise the column names of ``X`` in place and return ``X``.

    Every character that is not a word character, whitespace or "-" becomes
    "_", and a name starting with a digit is prefixed with "f_".
    """
    # Helper that cleans a single feature name
    def clean_name(name):
        # Replace special characters
        name = re.sub(r'[^\w\s-]', '_', name)
        # Make sure the name does not start with a digit
        if name[0].isdigit():
            name = 'f_' + name
        return name

    X.columns = [clean_name(col) for col in X.columns]
    return X

# Clean the feature names
X_train = clean_feature_names(X_train)


from lightgbm import LGBMClassifier
import lightgbm as lgb
# NOTE(review): the GPU device and platform/device ids are hard-coded --
# confirm they match the machine this runs on.
model = LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=6,
    num_leaves=29,
    min_child_samples=20,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',
    device='gpu',
    gpu_platform_id=1,
    gpu_device_id=0,
)
display(model)

from sklearn.model_selection import GroupKFold
from sklearn.metrics import balanced_accuracy_score
import numpy as np



# 4-fold cross-validation grouped by dataset (group_train); X_train, y_train
# and group_train must therefore have the same number of rows.
cv = GroupKFold(n_splits=4)
train_scores = []
val_scores = []

for train_idx, val_idx in cv.split(X_train, y_train, groups=group_train):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # Log every iteration; stop after 10 rounds without improvement.
    callbacks = [lgb.log_evaluation(period=1), lgb.early_stopping(stopping_rounds=10)]
    
    # NOTE(review): early stopping monitors the same validation fold that is
    # scored below, so the validation scores may be optimistic.
    model.fit(X_train_fold, y_train_fold, callbacks=callbacks, eval_set=[(X_val_fold, y_val_fold)])

    
    y_train_pred = model.predict(X_train_fold)
    y_val_pred = model.predict(X_val_fold)
    
    train_score = balanced_accuracy_score(y_train_fold, y_train_pred)
    val_score = balanced_accuracy_score(y_val_fold, y_val_pred)
    
    train_scores.append(train_score)
    val_scores.append(val_score)
    
    print(f"Fold train balanced accuracy: {train_score:.4f}")
    print(f"Fold validation balanced accuracy: {val_score:.4f}")
    print("---")

print(f"Average train balanced accuracy: {np.mean(train_scores):.4f} ± {np.std(train_scores):.4f}")
print(f"Average validation balanced accuracy: {np.mean(val_scores):.4f} ± {np.std(val_scores):.4f}")

In [None]:
# Uncomment what you need!
def train(
    X_train: typing.Dict[str, pd.DataFrame],
    y_train: typing.Dict[str, pd.DataFrame],
    # number_of_features: int,
    model_directory_path: str,
    # id_column_name: str,
    # prediction_column_name: str,
    # has_gpu: bool,
) -> None:
    """
    Training entry point. Currently a no-op: it returns immediately.

    The statements after the early ``return`` are unreachable. They would
    build (or load from a parquet cache) the feature table, encode the
    labels, fit a random forest and dump it to ``model_directory_path``.

    Parameters
    ----------
    X_train : dict of str -> DataFrame
        Training datasets by name.
    y_train : dict of str -> DataFrame
        Training graphs by name.
    model_directory_path : str
        Directory the model file would be written to.
    """
    # 1) Create the variable embedding:
    # NOTE(review): this unconditional return disables everything below;
    # presumably deliberate, so that a pre-trained model is used -- confirm.
    return
    X_y_group_train_pathname = "/tmp/X_y_group_train.parquet"
    try:
        print(f"Loading {X_y_group_train_pathname}")
        X_y_group_train = pd.read_parquet(X_y_group_train_pathname)
    except FileNotFoundError:
        print("Creating X_y_group_train")
        names_datasets_train = X_train
        names_graphs_train = y_train
        X_y_group_train = create_all_columns(
            {
                pearson_correlation: names_datasets_train,
                ttest: names_datasets_train,
                mutual_information: names_datasets_train,  # comment this line to greatly reduce computation
                label: names_graphs_train,
                spearman_correlation: names_datasets_train,
                kendall_correlation: names_datasets_train,
                distance_correlation: names_datasets_train,
                conditional_mutual_information: names_datasets_train,
            },
            n_jobs=-1,
        )
        X_y_group_train = remove_outliers(X_y_group_train, ['corr(v,X)', 'corr(v,Y)', 'ttest(v,X)', 'ttest(v,Y)'])
        # 2) Massage X_y_group_train to prepare what is needed by the model:
        print("Adding numeric labels y") # sklearn wants numeric labels
        # Fixed class order: the integer code of a label is its position here.
        le = LabelEncoder()
        le.classes_ = np.array([
            'Cause of X', 'Consequence of X', 'Confounder', 'Collider',
            'Mediator', 'Independent', 'Cause of Y', 'Consequence of Y',
        ])
        X_y_group_train["y"] = le.transform(X_y_group_train["label"])

        # reordering columns:
        X_y_group_train = X_y_group_train[["dataset", "variable"] + X_y_group_train.columns.drop(["dataset", "variable", "label", "y"]).tolist() + ["label", "y"]]
        display(X_y_group_train)

        print("Saving X_y_group_train")
        os.makedirs(os.path.dirname(X_y_group_train_pathname), exist_ok=True)
        X_y_group_train.to_parquet(X_y_group_train_pathname)

    print("Extracting X_train, y_train, and group")
    X_train = X_y_group_train.drop(["variable", "dataset", "label", "y"], axis="columns")
    y_train = X_y_group_train["y"]
    group_train = X_y_group_train["dataset"]

    # Model:
    model = RandomForestClassifier(n_estimators=100, max_depth=11, n_jobs=-1, class_weight="balanced")
    model.fit(X_train, y_train)

    # NOTE(review): this writes "model.joblib", while infer() loads
    # "random_forest_model_v0.joblib" -- reconcile before re-enabling training.
    joblib.dump(
        model,
        os.path.join(model_directory_path, "model.joblib")
    )


In [None]:
def create_submission(X_y_pred_test):
    """
    Turn per-variable predicted labels into the flat submission mapping.

    For each dataset, an adjacency matrix over ["X", "Y"] + variables is
    built (rows are parents, columns are children). The edge X -> Y is
    always set; every variable adds the edges implied by its predicted
    label. "Independent" and any unrecognised label add no edge.

    Parameters
    ----------
    X_y_pred_test : pandas.DataFrame
        Must contain the columns "dataset", "variable" and
        "label_predicted".

    Returns
    -------
    dict
        ``{f"{dataset}_{parent}_{child}": 0 or 1}`` with one entry for every
        ordered pair of nodes of every dataset.
    """

    # Edges (parent, child) implied by each label; None stands for the
    # variable being labelled.
    V = None
    label_edges = {
        "Cause of X": [(V, "X")],
        "Cause of Y": [(V, "Y")],
        "Consequence of X": [("X", V)],
        "Consequence of Y": [("Y", V)],
        "Confounder": [(V, "X"), (V, "Y")],
        "Collider": [("X", V), ("Y", V)],
        "Mediator": [("X", V), (V, "Y")],
        "Independent": [],
    }

    submission = {}
    for name, prediction in tqdm(X_y_pred_test.groupby("dataset"), delay=10):
        variables = prediction["variable"].tolist()
        predicted_labels = prediction["label_predicted"].tolist()
        variables_all = ["X", "Y"] + variables

        adjacency_matrix = pd.DataFrame(0, index=variables_all, columns=variables_all)
        adjacency_matrix.index.name = "parent"
        adjacency_matrix.loc["X", "Y"] = 1

        for v, predicted_label in zip(variables, predicted_labels):
            for parent, child in label_edges.get(predicted_label, []):
                adjacency_matrix.loc[
                    v if parent is V else parent,
                    v if child is V else child,
                ] = 1

        # Read the matrix once as an array instead of one .loc per cell.
        values = adjacency_matrix.to_numpy()
        for row, i in enumerate(variables_all):
            for col, j in enumerate(variables_all):
                submission[f'{name}_{i}_{j}'] = int(values[row, col])

    return submission


# Uncomment what you need!
def infer(
    X_test: typing.Dict[str, pd.DataFrame],
    # number_of_features: int,
    model_directory_path: str,
    id_column_name: str,
    prediction_column_name: str,
    # has_gpu: bool,
    # has_trained: bool,
) -> pd.DataFrame:
    """
    Inference entry point: predict the graph of every test dataset.

    Loads the saved classifier, computes the feature table for the datasets
    in ``X_test``, predicts one label per variable and converts the labels
    into the flat submission format.

    Parameters
    ----------
    X_test : dict of str -> DataFrame
        Test datasets by name.
    model_directory_path : str
        Directory containing "random_forest_model_v0.joblib".
    id_column_name, prediction_column_name : str
        Names of the two columns of the returned DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``create_submission``'s mapping.

    Raises
    ------
    ValueError
        When the model records its training feature names and some of them
        are not produced by the feature functions used here.
    """
    # joblib.load unpickles the file: only load model files you trust.
    model = joblib.load(os.path.join(model_directory_path, "random_forest_model_v0.joblib"))

    # Features are computed from the test datasets passed in. No ground-truth
    # graph exists at inference time, so `label` is not part of the set.
    # NOTE(review): this list must match the features the saved model was
    # trained on -- confirm against the training notebook cells.
    names_datasets_test = X_test
    X_group_test = create_all_columns(
        {
            pearson_correlation: names_datasets_test,
            ttest: names_datasets_test,
            mutual_information: names_datasets_test,  # comment this line to greatly reduce computation
            spearman_correlation: names_datasets_test,
            kendall_correlation: names_datasets_test,
            distance_correlation: names_datasets_test,
            conditional_mutual_information: names_datasets_test,
        },
        n_jobs=-1,
    )

    features = X_group_test.drop(columns=["dataset", "variable"])

    # When the model recorded its training columns, feed exactly those, in
    # the same order, and fail early with the list of what is missing.
    expected_features = getattr(model, "feature_names_in_", None)
    if expected_features is not None:
        missing = [name for name in expected_features if name not in features.columns]
        if missing:
            raise ValueError(f"Features expected by the model are missing: {missing}")
        features = features[list(expected_features)]

    y_predicted = model.predict(features)
    X_y_pred_test = X_group_test
    X_y_pred_test["y_predicted"] = y_predicted

    # Fixed class order: the integer code of a label is its position here.
    le = LabelEncoder()
    le.classes_ = np.array([
        'Cause of X', 'Consequence of X', 'Confounder', 'Collider',
        'Mediator', 'Independent', 'Cause of Y', 'Consequence of Y',
    ])

    X_y_pred_test["label_predicted"] = le.inverse_transform(y_predicted)

    submission = create_submission(X_y_pred_test)

    return pd.DataFrame(
        submission.items(),
        columns=[
            id_column_name,
            prediction_column_name
        ]
    )

In [None]:
# Run the crunch local test with the determinism check turned off.
# NOTE(review): presumably this calls train() and infer() defined above -- confirm.
crunch.test(
    no_determinism_check=True
)

print("Download this notebook and submit it to the platform: https://hub.crunchdao.com/competitions/causality-discovery/submit/via/notebook")