In [1]:
import joblib
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier as cab
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
from sklearn import metrics
# from gensim.models import Word2Vec

import tarfile

def compress_csv_to_tar_gz(csv_files, output_filename):
    # 创建 tar.gz 文件
    with tarfile.open(output_filename, "w:gz") as tar:
        for csv_file in csv_files:
            # 确保文件存在
            if os.path.isfile(csv_file):
                tar.add(csv_file, arcname=os.path.basename(csv_file))
            else:
                print(f"文件 {csv_file} 不存在，跳过。")
                
def reduce_mem_usage(df, only_fp64=False, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    if only_fp64==True:
        numerics = [ 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


def cv_model(clf, train_x, train_y, test_x, clf_name, train_y_2, sd):
    folds = 5
    seed = sd
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    # 初始化 GroupKFold，设置折数为 5
#     kf = GroupKFold(n_splits=5)
    nclass =  len(np.unique(train_y))
    train = np.zeros((train_x.shape[0],nclass ))  # 为多分类任务初始化
    test = np.zeros((test_x.shape[0], nclass ))    # 为多分类任务初始化

    cv_scores = []


    model_lst = []
    # 进行五折交叉验证
#     for i, (train_index, valid_index) in enumerate(kf.split(train_x, groups=train_y_2)):
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y[valid_index]
#         sample_weight = y_train_sample_weight.iloc[train_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',  # multiclassova
                'metric': 'multiclassova',  # 使用多分类的评价指标
                'num_class': len(np.unique(train_y)),  # 类别数量
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2022,
                'n_jobs': -1,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 5000, valid_sets=[train_matrix, valid_matrix], 
                              categorical_feature=[], verbose_eval=500, early_stopping_rounds=500,
                            # feval=WeightedF1Metric,  # 使用自定义 F1 评分函数
                             )
            val_pred_proba = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred_proba = model.predict(test_x, num_iteration=model.best_iteration)


        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)
            params = {
                'booster': 'gbtree',
                'objective': 'multi:softprob',  # 修改为多分类
                'num_class': len(np.unique(train_y)),  # 类别数量
                'eval_metric': 'mlogloss',  # 使用多分类的评价指标
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.3
                ,
                'tree_method': 'exact',
                'seed': 2020,
                'n_jobs': -1,
                "silent": True,
                'tree_method': 'gpu_hist',      # 使用 GPU 加速
                'predictor': 'gpu_predictor',
            }
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(params, train_matrix, 9000, evals=watchlist, 
                              verbose_eval=500, early_stopping_rounds=500)
            val_pred_proba = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred_proba = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

#             print(val_pred_proba[:10])
#             print(val_pred_proba[:10, :10])

        elif clf_name == "cab":
            params = {
                'learning_rate': 0.3,
                'l2_leaf_reg': 10,
                'od_type': 'Iter',
                'od_wait': 70,
                'bootstrap_type': 'Bernoulli',
                'random_seed': 11251,
                'depth': 5,
                'task_type': 'GPU',  # 启用 GPU
                'loss_function': 'MultiClassOneVsAll',  # 修改为多分类
                
            }
            model = clf(iterations=9000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      use_best_model=True, verbose=500,
                      cat_features = [],
#                       sample_weight=sample_weight  # 添加样本权重
#                       custom_metric=[f1_metric]  # 使用自定义多分类 F1 作为评估指标
                     )
            
            # 获取概率预测
            val_pred_proba = model.predict_proba(val_x)
            # test_pred_proba = model.predict_proba(test_x)
            
            
            # 定义批大小
            batch_size = 2000000  # 根据您的内存情况进行调整

            # 计算总的样本数
            num_samples = test_x.shape[0]

            # 初始化一个空的数组来存储预测结果
            test_pred_proba = np.empty((num_samples, nclass ))  # 假设 model.classes_ 返回类别数

            # 分批次进行预测
            for start in tqdm(range(0, num_samples, batch_size) ):
                end = min(start + batch_size, num_samples)  # 确保不超出边界
                print(start, end)

                # 进行预测并直接存储到预测结果数组中
                test_pred_proba[start:end] = model.predict_proba(test_x[start:end] )
                
            


        # 获取预测标签
        val_pred = np.argmax(val_pred_proba, axis=1)
#             test_labels = np.argmax(test_pred_proba, axis=1)

        # 对于多分类，val_pred 和 test_pred 是类别的索引
        train[valid_index] = val_pred_proba
        test += test_pred_proba / kf.n_splits
        


        # 计算 F1 分数（可以根据需要选择其他多分类指标）
        f1 = f1_score(val_y, val_pred, average='micro')  # 使用加权平均 F1 分数
        cv_scores.append(f1)

#         print(val_y.iloc[:10])
#         print(val_pred_proba[:10,:10])

        print(cv_scores)
        model_lst.append(model)
        
        del test_pred_proba, val_pred_proba, val_pred
        [gc.collect() for _ in range(5)]

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    
    
    
    return train, test, cv_scores, np.mean(cv_scores), np.std(cv_scores), model_lst

def lgb_model(x_train, y_train, x_test, train_y_2, sd):
    lgb_train, lgb_test, cv_scores, cv_scores_mean, cv_scores_std, model_lst = cv_model(lgb, x_train, y_train, x_test, "lgb", train_y_2, sd)
    return lgb_train, lgb_test, cv_scores, cv_scores_mean, cv_scores_std, model_lst
def cab_model(x_train, y_train, x_test, train_y_2, sd):
    cab_train, cab_test, cv_scores, cv_scores_mean, cv_scores_std, model_lst = cv_model(cab, x_train, y_train, x_test, "cab", train_y_2, sd)
    return cab_train, cab_test, cv_scores, cv_scores_mean, cv_scores_std, model_lst
def xgb_model(x_train, y_train, x_test, train_y_2, sd):
    xgb_train, xgb_test, cv_scores, cv_scores_mean, cv_scores_std, model_lst = cv_model(xgb, x_train, y_train, x_test, "xgb", train_y_2, sd)
    return xgb_train, xgb_test, cv_scores, cv_scores_mean, cv_scores_std, model_lst

In [2]:
1

1

In [3]:
root_dir = '../data_fea_sub/v13'
file_lst_train = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'train' in filename][:]

train_df = pd.DataFrame()
for file in tqdm(file_lst_train):
#     print('{}/{}'.format(root_dir, file))
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    train_df = pd.concat([train_df, df_], axis=0)

root_dir = '../data_fea_sub/v13'
file_lst_test = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'test' in filename][:]


test_df = pd.DataFrame()
for file in tqdm(file_lst_test):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    test_df = pd.concat([test_df, df_], axis=0)

train_df.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in train_df.columns.tolist()[3:]]
test_df.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in test_df.columns.tolist()[3:]]

train_df.shape, test_df.shape

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 10.31it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.00it/s]


((1038169, 42), (10135596, 42))

In [4]:
root_dir = '../data_fea_sub/v13_2'
file_lst_train = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'train' in filename][:]

train_df_2 = pd.DataFrame()
for file in tqdm(file_lst_train):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    train_df_2 = pd.concat([train_df_2, df_], axis=0)

root_dir = '../data_fea_sub/v13_2'
file_lst_test = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'test' in filename][:]


test_df_2 = pd.DataFrame()
for file in tqdm(file_lst_test):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    test_df_2 = pd.concat([test_df_2, df_], axis=0)

train_df_2.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in train_df_2.columns.tolist()[3:]]
test_df_2.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in test_df_2.columns.tolist()[3:]]

print(train_df_2.shape, test_df_2.shape)

train_df = train_df.merge(train_df_2, on = ['filename', 'dt_rk', 't_cut'], how='left')
test_df = test_df.merge(test_df_2, on = ['filename', 'dt_rk', 't_cut'], how='left')

del train_df_2, test_df_2
[gc.collect() for _ in range(5)]

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.09it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:10<00:00,  1.01s/it]


(1038087, 41) (10121312, 41)


[0, 0, 0, 0, 0]

In [5]:
root_dir = '../data_fea_sub/v13_3'
file_lst_train = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'train' in filename][:]

train_df_2 = pd.DataFrame()
for file in tqdm(file_lst_train):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    train_df_2 = pd.concat([train_df_2, df_], axis=0)

root_dir = '../data_fea_sub/v13_3'
file_lst_test = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'test' in filename][:]


test_df_2 = pd.DataFrame()
for file in tqdm(file_lst_test):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    test_df_2 = pd.concat([test_df_2, df_], axis=0)

train_df_2.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in train_df_2.columns.tolist()[3:]]
test_df_2.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in test_df_2.columns.tolist()[3:]]

print(train_df_2.shape, test_df_2.shape)

train_df = train_df.merge(train_df_2, on = ['filename', 'dt_rk', 't_cut'], how='left')
test_df = test_df.merge(test_df_2, on = ['filename', 'dt_rk', 't_cut'], how='left')

del train_df_2, test_df_2
[gc.collect() for _ in range(5)]

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  7.61it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.17s/it]


(1038060, 51) (10116842, 51)


[9, 0, 0, 0, 0]

In [6]:
root_dir = '../data_fea_sub/v13_4'
file_lst_train = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'train' in filename][:]

train_df_2 = pd.DataFrame()
for file in tqdm(file_lst_train):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    train_df_2 = pd.concat([train_df_2, df_], axis=0)

root_dir = '../data_fea_sub/v13_4'
file_lst_test = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'test' in filename][:]


test_df_2 = pd.DataFrame()
for file in tqdm(file_lst_test):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    test_df_2 = pd.concat([test_df_2, df_], axis=0)

train_df_2.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in train_df_2.columns.tolist()[3:]]
test_df_2.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in test_df_2.columns.tolist()[3:]]

print(train_df_2.shape, test_df_2.shape)

train_df_2 = train_df_2.iloc[:, :40]
test_df_2 = test_df_2.iloc[:, :40]

train_df = train_df.merge(train_df_2, on = ['filename', 'dt_rk', 't_cut'], how='left')
test_df = test_df.merge(test_df_2, on = ['filename', 'dt_rk', 't_cut'], how='left')

del train_df_2, test_df_2
[gc.collect() for _ in range(5)]

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 10.28it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.13it/s]


(1038033, 39) (10112887, 39)


[0, 0, 0, 0, 0]

In [7]:
# train_df = reduce_mem_usage(train_df)
# test_df = reduce_mem_usage(test_df)

In [8]:
# train_df['t_cut'].value_counts()

In [9]:
train_label = pd.read_csv('../data/train_y_v0.1.0.csv')
cols_label = train_label.columns.tolist()[1:]

# 将指定列拼接成新的一列
train_label['label_combined'] = train_label[cols_label].apply(lambda row: ', '.join(row.values.astype(str)), axis=1)

# train_label.head()

In [10]:
label_combined_unique = train_label['label_combined'].value_counts().index.tolist()
dict_str2num= {k: v for k, v in zip(label_combined_unique, range(len(label_combined_unique)))}
dict_num2str= {k: v for k, v in zip(range(len(label_combined_unique)), label_combined_unique)}

len(label_combined_unique)

91

In [11]:
train_label['label_combined_num'] = train_label['label_combined'].apply(lambda x: dict_str2num[x])

In [12]:
train_df = train_df.merge(train_label, on='filename')

In [13]:
train_df.shape, test_df.shape

((1038169, 260), (10135596, 164))

In [14]:
# train_df['fea_count'] = train_df.groupby('filename')['filename'].transform('size')
# test_df['fea_count'] = test_df.groupby('filename')['filename'].transform('size')

In [15]:
cols_drop = ['fea_t_max','fea_a_mean', 'fea_dt_p2p'
             , 'fea_dt_unicnt','fea_dt_skew','fea_dt_std', 'fea_dt_min', 'fea_dt_max'
            ,'fea_dt_count'
            ,'fea_t_min','fea_v_count','fea_a_count'
            ,'fea_t_p2p','fea_dt_Q50'
             
             
             , 'fea_a_2_unicnt', 'fea_a_3_unicnt', 'fea_a_4_unicnt'
             , 'fea_a_2_max', 'fea_a_3_max', 'fea_a_4_max'
             , 'fea_a_Q50', 'fea_a_Q50', 'fea_a_Q50'
#              ,'fea_dt_rk'             
            ]

train_df = train_df.drop(cols_drop, axis=1)
test_df = test_df.drop(cols_drop, axis=1)

In [16]:
col_label = 'label_combined_num'
# 训练数据/测试数据准备
features = [f for f in train_df.columns if 'fea_' in f and 'dv/dt' not in f
            and '4rd' not in f and '5rd' not in f and 'fea_t_p2p_freq' not in f and 'fft_' not in f
           and 'pos_' not in f and 'neg_' not in f
            
            and 'a_7' not in f
            
            and 'da_4' not in f 
            and 'da_5' not in f 
            and 'd2a' not in f
           ]
print('->'*10, '去除唯一值前: ', len(features), features)
cols_single = []
for col in tqdm(features):
    unicnt = train_df[col].nunique()
    if unicnt==1:
        cols_single.append(col)
print('->'*10, '唯一值特征: ', len(cols_single), cols_single)
features = [f for f in features if f not in cols_single]
print('->'*10, '去除唯一值后: ', len(features), features)

# for col in features:
#     num_5, num_95 = train_df[col].quantile(0.05), train_df[col].quantile(0.95)
#     train_df[col] = train_df[col].apply(lambda x: num_5 if x<num_5 else num_95 if x>num_95 else x)

            
train = train_df.reset_index(drop=True).copy()
# train[col_label] = train[col_label].apply(lambda x: 1 if x>-1 else -1)
# train = train[train[col_label]!=0].reset_index(drop=True)
test = test_df.reset_index(drop=True)
# train[col_label] = train[col_label].apply(lambda x: 1 if x==1 else 0)

x_train = train[features]
x_test = test[features]

y_train = train[col_label]

->->->->->->->->->-> 去除唯一值前:  116 ['fea_v_mean', 'fea_v_std', 'fea_v_max', 'fea_v_min', 'fea_v_p2p', 'fea_v_unicnt', 'fea_v_Q25', 'fea_v_Q50', 'fea_v_Q75', 'fea_v_skew', 'fea_v_kurt', 'fea_a_std', 'fea_a_max', 'fea_a_min', 'fea_a_p2p', 'fea_a_unicnt', 'fea_a_Q25', 'fea_a_Q75', 'fea_a_skew', 'fea_a_kurt', 'fea_dt_mean', 'fea_dt_Q25', 'fea_dt_Q75', 'fea_dt_kurt', 'fea_a_2_sum', 'fea_a_2_mean', 'fea_a_2_std', 'fea_a_2_min', 'fea_a_2_p2p', 'fea_a_2_Q25', 'fea_a_2_Q50', 'fea_a_2_Q75', 'fea_a_2_skew', 'fea_a_2_kurt', 'fea_a_3_sum', 'fea_a_3_mean', 'fea_a_3_std', 'fea_a_3_min', 'fea_a_3_p2p', 'fea_a_3_Q25', 'fea_a_3_Q50', 'fea_a_3_Q75', 'fea_a_3_skew', 'fea_a_3_kurt', 'fea_a_4_sum', 'fea_a_4_mean', 'fea_a_4_std', 'fea_a_4_min', 'fea_a_4_p2p', 'fea_a_4_Q25', 'fea_a_4_Q50', 'fea_a_4_Q75', 'fea_a_4_skew', 'fea_a_4_kurt', 'fea_v_sum', 'fea_a_sum', 'fea_a_5_sum', 'fea_a_5_mean', 'fea_a_5_std', 'fea_a_5_max', 'fea_a_5_min', 'fea_a_5_p2p', 'fea_a_5_unicnt', 'fea_a_5_Q25', 'fea_a_5_Q50', 'fea_a_5_Q75

100%|████████████████████████████████████████████████████████████████████████████████| 116/116 [00:03<00:00, 29.09it/s]


->->->->->->->->->-> 唯一值特征:  0 []
->->->->->->->->->-> 去除唯一值后:  116 ['fea_v_mean', 'fea_v_std', 'fea_v_max', 'fea_v_min', 'fea_v_p2p', 'fea_v_unicnt', 'fea_v_Q25', 'fea_v_Q50', 'fea_v_Q75', 'fea_v_skew', 'fea_v_kurt', 'fea_a_std', 'fea_a_max', 'fea_a_min', 'fea_a_p2p', 'fea_a_unicnt', 'fea_a_Q25', 'fea_a_Q75', 'fea_a_skew', 'fea_a_kurt', 'fea_dt_mean', 'fea_dt_Q25', 'fea_dt_Q75', 'fea_dt_kurt', 'fea_a_2_sum', 'fea_a_2_mean', 'fea_a_2_std', 'fea_a_2_min', 'fea_a_2_p2p', 'fea_a_2_Q25', 'fea_a_2_Q50', 'fea_a_2_Q75', 'fea_a_2_skew', 'fea_a_2_kurt', 'fea_a_3_sum', 'fea_a_3_mean', 'fea_a_3_std', 'fea_a_3_min', 'fea_a_3_p2p', 'fea_a_3_Q25', 'fea_a_3_Q50', 'fea_a_3_Q75', 'fea_a_3_skew', 'fea_a_3_kurt', 'fea_a_4_sum', 'fea_a_4_mean', 'fea_a_4_std', 'fea_a_4_min', 'fea_a_4_p2p', 'fea_a_4_Q25', 'fea_a_4_Q50', 'fea_a_4_Q75', 'fea_a_4_skew', 'fea_a_4_kurt', 'fea_v_sum', 'fea_a_sum', 'fea_a_5_sum', 'fea_a_5_mean', 'fea_a_5_std', 'fea_a_5_max', 'fea_a_5_min', 'fea_a_5_p2p', 'fea_a_5_unicnt', 'fea_a_5

In [17]:
# train['filename'].nunique(), test['filename'].nunique()

In [18]:
x_train.columns

Index(['fea_v_mean', 'fea_v_std', 'fea_v_max', 'fea_v_min', 'fea_v_p2p',
       'fea_v_unicnt', 'fea_v_Q25', 'fea_v_Q50', 'fea_v_Q75', 'fea_v_skew',
       ...
       'fea_a_6_std', 'fea_a_6_max', 'fea_a_6_min', 'fea_a_6_p2p',
       'fea_a_6_unicnt', 'fea_a_6_Q25', 'fea_a_6_Q50', 'fea_a_6_Q75',
       'fea_a_6_skew', 'fea_a_6_kurt'],
      dtype='object', length=116)

In [19]:
sd = 1999110
cab_train, cab_test, cv_scores, cv_scores_mean, cv_scores_std, cab_model_lst = cab_model(x_train, y_train, x_test, train_df['filename'].values, sd)


************************************ 1 ************************************
0:	learn: 0.4461818	test: 0.4461863	best: 0.4461863 (0)	total: 151ms	remaining: 22m 42s
500:	learn: 0.0128407	test: 0.0130008	best: 0.0130008 (500)	total: 56.9s	remaining: 16m 5s
1000:	learn: 0.0119959	test: 0.0122678	best: 0.0122678 (1000)	total: 1m 54s	remaining: 15m 14s
1500:	learn: 0.0115839	test: 0.0119553	best: 0.0119553 (1500)	total: 2m 52s	remaining: 14m 19s
2000:	learn: 0.0113162	test: 0.0117834	best: 0.0117834 (2000)	total: 3m 49s	remaining: 13m 22s
2500:	learn: 0.0111157	test: 0.0116731	best: 0.0116731 (2500)	total: 4m 47s	remaining: 12m 26s
3000:	learn: 0.0109563	test: 0.0115939	best: 0.0115939 (3000)	total: 5m 44s	remaining: 11m 28s
3500:	learn: 0.0108312	test: 0.0115362	best: 0.0115362 (3500)	total: 6m 41s	remaining: 10m 30s
4000:	learn: 0.0107181	test: 0.0114921	best: 0.0114921 (4000)	total: 7m 37s	remaining: 9m 32s
4500:	learn: 0.0106234	test: 0.0114561	best: 0.0114561 (4500)	total: 8m 34s	remai

  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

0 2000000


 17%|██████████████                                                                      | 1/6 [01:32<07:42, 92.58s/it]

2000000 4000000


 33%|████████████████████████████                                                        | 2/6 [03:06<06:13, 93.40s/it]

4000000 6000000


 50%|██████████████████████████████████████████                                          | 3/6 [04:39<04:39, 93.19s/it]

6000000 8000000


 67%|████████████████████████████████████████████████████████                            | 4/6 [06:11<03:05, 92.74s/it]

8000000 10000000


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [07:45<01:33, 93.05s/it]

10000000 10135596


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [07:51<00:00, 78.57s/it]


[0.7535952685976284]
************************************ 2 ************************************
0:	learn: 0.4461834	test: 0.4461751	best: 0.4461751 (0)	total: 157ms	remaining: 23m 34s
500:	learn: 0.0128397	test: 0.0130297	best: 0.0130297 (500)	total: 57.4s	remaining: 16m 13s
1000:	learn: 0.0119737	test: 0.0122825	best: 0.0122825 (1000)	total: 1m 55s	remaining: 15m 19s
1500:	learn: 0.0115791	test: 0.0119879	best: 0.0119879 (1500)	total: 2m 52s	remaining: 14m 22s
2000:	learn: 0.0113186	test: 0.0118182	best: 0.0118182 (2000)	total: 3m 50s	remaining: 13m 24s
2500:	learn: 0.0111187	test: 0.0117022	best: 0.0117021 (2499)	total: 4m 47s	remaining: 12m 26s
3000:	learn: 0.0109583	test: 0.0116223	best: 0.0116223 (3000)	total: 5m 44s	remaining: 11m 29s
3500:	learn: 0.0108195	test: 0.0115633	best: 0.0115633 (3500)	total: 6m 42s	remaining: 10m 31s
4000:	learn: 0.0107116	test: 0.0115191	best: 0.0115190 (3998)	total: 7m 39s	remaining: 9m 33s
4500:	learn: 0.0106109	test: 0.0114781	best: 0.0114781 (449

  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

0 2000000


 17%|██████████████                                                                      | 1/6 [01:33<07:45, 93.06s/it]

2000000 4000000


 33%|████████████████████████████                                                        | 2/6 [03:08<06:16, 94.24s/it]

4000000 6000000


 50%|██████████████████████████████████████████                                          | 3/6 [04:40<04:40, 93.35s/it]

6000000 8000000


 67%|████████████████████████████████████████████████████████                            | 4/6 [06:12<03:06, 93.03s/it]

8000000 10000000


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [07:51<01:34, 94.98s/it]

10000000 10135596


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [07:57<00:00, 79.64s/it]


[0.7535952685976284, 0.7530654902376298]
************************************ 3 ************************************
0:	learn: 0.4461827	test: 0.4461827	best: 0.4461827 (0)	total: 153ms	remaining: 22m 59s
500:	learn: 0.0128497	test: 0.0130503	best: 0.0130503 (500)	total: 56.7s	remaining: 16m 1s
1000:	learn: 0.0119838	test: 0.0123020	best: 0.0123020 (1000)	total: 1m 54s	remaining: 15m 14s
1500:	learn: 0.0115808	test: 0.0119950	best: 0.0119950 (1500)	total: 2m 52s	remaining: 14m 20s
2000:	learn: 0.0113162	test: 0.0118257	best: 0.0118257 (2000)	total: 3m 49s	remaining: 13m 24s
2500:	learn: 0.0111158	test: 0.0117157	best: 0.0117157 (2500)	total: 4m 47s	remaining: 12m 26s
3000:	learn: 0.0109645	test: 0.0116433	best: 0.0116433 (3000)	total: 5m 43s	remaining: 11m 27s
3500:	learn: 0.0108300	test: 0.0115769	best: 0.0115769 (3500)	total: 6m 40s	remaining: 10m 29s
4000:	learn: 0.0107190	test: 0.0115325	best: 0.0115325 (4000)	total: 7m 37s	remaining: 9m 31s
4500:	learn: 0.0106165	test: 0.0114996	b

  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

0 2000000


 17%|██████████████                                                                      | 1/6 [01:32<07:41, 92.22s/it]

2000000 4000000


 33%|████████████████████████████                                                        | 2/6 [03:05<06:11, 92.95s/it]

4000000 6000000


 50%|██████████████████████████████████████████                                          | 3/6 [04:37<04:37, 92.47s/it]

6000000 8000000


 67%|████████████████████████████████████████████████████████                            | 4/6 [06:12<03:06, 93.25s/it]

8000000 10000000


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [07:53<01:36, 96.08s/it]

10000000 10135596


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [07:59<00:00, 79.93s/it]


[0.7535952685976284, 0.7530654902376298, 0.753084754905266]
************************************ 4 ************************************
0:	learn: 0.4461819	test: 0.4461908	best: 0.4461908 (0)	total: 155ms	remaining: 23m 10s
500:	learn: 0.0128964	test: 0.0130226	best: 0.0130226 (500)	total: 57.1s	remaining: 16m 7s
1000:	learn: 0.0120136	test: 0.0122615	best: 0.0122615 (1000)	total: 1m 54s	remaining: 15m 18s
1500:	learn: 0.0115941	test: 0.0119447	best: 0.0119447 (1500)	total: 2m 52s	remaining: 14m 22s
2000:	learn: 0.0113205	test: 0.0117700	best: 0.0117700 (2000)	total: 3m 50s	remaining: 13m 25s
2500:	learn: 0.0111253	test: 0.0116596	best: 0.0116596 (2500)	total: 4m 47s	remaining: 12m 27s
3000:	learn: 0.0109603	test: 0.0115716	best: 0.0115716 (2999)	total: 5m 44s	remaining: 11m 29s
3500:	learn: 0.0108337	test: 0.0115159	best: 0.0115159 (3500)	total: 6m 41s	remaining: 10m 30s
4000:	learn: 0.0107230	test: 0.0114689	best: 0.0114689 (4000)	total: 7m 38s	remaining: 9m 33s
4500:	learn: 0.010622

  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

0 2000000


 17%|██████████████                                                                      | 1/6 [01:25<07:05, 85.16s/it]

2000000 4000000


 33%|████████████████████████████                                                        | 2/6 [02:53<05:48, 87.18s/it]

4000000 6000000


 50%|██████████████████████████████████████████                                          | 3/6 [04:20<04:20, 86.99s/it]

6000000 8000000


 67%|████████████████████████████████████████████████████████                            | 4/6 [05:48<02:54, 87.31s/it]

8000000 10000000


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [07:20<01:29, 89.08s/it]

10000000 10135596


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [07:26<00:00, 74.41s/it]


[0.7535952685976284, 0.7530654902376298, 0.753084754905266, 0.7539564811158097]
************************************ 5 ************************************
0:	learn: 0.4461842	test: 0.4461823	best: 0.4461823 (0)	total: 157ms	remaining: 23m 31s
500:	learn: 0.0128593	test: 0.0130221	best: 0.0130221 (500)	total: 57.2s	remaining: 16m 10s
1000:	learn: 0.0119929	test: 0.0122701	best: 0.0122701 (1000)	total: 1m 54s	remaining: 15m 16s
1500:	learn: 0.0115779	test: 0.0119569	best: 0.0119569 (1500)	total: 2m 52s	remaining: 14m 21s
2000:	learn: 0.0113116	test: 0.0117849	best: 0.0117849 (2000)	total: 3m 50s	remaining: 13m 25s
2500:	learn: 0.0111048	test: 0.0116708	best: 0.0116708 (2500)	total: 4m 48s	remaining: 12m 29s
3000:	learn: 0.0109488	test: 0.0115922	best: 0.0115922 (2997)	total: 5m 45s	remaining: 11m 30s
3500:	learn: 0.0108173	test: 0.0115328	best: 0.0115328 (3500)	total: 6m 42s	remaining: 10m 32s
4000:	learn: 0.0107036	test: 0.0114891	best: 0.0114891 (4000)	total: 7m 40s	remaining: 9m 34s


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

0 2000000


 17%|██████████████                                                                      | 1/6 [01:34<07:51, 94.33s/it]

2000000 4000000


 33%|████████████████████████████                                                        | 2/6 [03:07<06:13, 93.48s/it]

4000000 6000000


 50%|█████████████████████████████████████████▌                                         | 3/6 [05:05<05:14, 104.75s/it]

6000000 8000000


 67%|███████████████████████████████████████████████████████▎                           | 4/6 [07:51<04:17, 128.93s/it]

8000000 10000000


 83%|█████████████████████████████████████████████████████████████████████▏             | 5/6 [09:25<01:56, 116.26s/it]

10000000 10135596


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [09:31<00:00, 95.26s/it]


[0.7535952685976284, 0.7530654902376298, 0.753084754905266, 0.7539564811158097, 0.7540227227849138]
cab_score_list: [0.7535952685976284, 0.7530654902376298, 0.753084754905266, 0.7539564811158097, 0.7540227227849138]
cab_score_mean: 0.7535449435282496
cab_score_std: 0.00041032129929872936


In [None]:
#  save model
# for i in tqdm(range(len(cab_model_lst))):
#     joblib.dump(cab_model_lst[i], '../model/sd1999110_feacut_10drop_v13_42_cab_{}.pkl'.format(i))

# =============***************
# infer way1- full infer
# if you just want infer same result as me, please just load model and infer 
# =============***************
# test = np.zeros((x_test.shape[0], len(np.unique(y_train))))    # 为多分类任务初始化
# for i in tqdm(range(5)):
#     model = joblib.load( '../model/sd1999110_feacut_10drop_v13_42_cab_{}.pkl'.format(i))
#     print('load model {}'.format(i))
#     test_pred_proba = model.predict_proba(x_test)
#     test += test_pred_proba / 5


In [29]:
# del train_df, train,  test, x_train, y_train, x_test
# [gc.collect() for i in range(5)]

[28, 0, 0, 0, 0]

In [None]:
1

In [31]:
try:
    del combined_df, max_values_, mean_values_, combined_df_mge
    [gc.collect() for _ in range(5)]
except:
    print('no need del')

In [20]:
try:
    del combined_df, max_values_, mean_values_, combined_df_mge
    [gc.collect() for _ in range(5)]
except:
    print('no need del')
    
# 合并概率矩阵和 file 列
combined_df = pd.concat([test_df[['filename']].reset_index(drop=True), pd.DataFrame(1.0*cab_test )], axis=1)
# combined_df = pd.concat([test_df[['filename']].reset_index(drop=True), pd.DataFrame( 0.6*cab_test_lst[1] + 0.4*cab_test_lst[0] )], axis=1)

# 根据 'file' 列分组并计算 n 列的最大值
max_values_ = combined_df.groupby('filename').max().reset_index()
mean_values_ = combined_df.groupby('filename').mean().reset_index()

combined_df_mge = pd.concat([max_values_, mean_values_], axis=0)

no need del


In [21]:
mge_values = combined_df_mge.groupby('filename').mean().reset_index()

# mge_values.iloc[:, 3] = mge_values.iloc[:, 3]*0.85
print(mge_values.shape)
test_df_out = mge_values.copy()
print(mge_values.shape)

(315720, 92)
(315720, 92)


In [22]:
col_label = 'label_combined_num'
test_df_out[col_label] = np.argmax(mge_values.iloc[:, 1:].values, axis=1)
test_df_out['prob'] = np.max(mge_values.iloc[:, 1:].values, axis=1)
print(test_df_out[col_label].value_counts().head())

file = 'v51_sd1999110_feacut_10drop_merge_v13_cab_5fold_5mean5max_42'
test_df_out['label_combined'] = test_df_out['label_combined_num'].apply(lambda x: dict_num2str[x])
for i in tqdm(range(len(cols_label))):
    test_df_out[cols_label[i]] = test_df_out['label_combined'].apply(lambda x: int(x.split(',')[i])/2+0.5 )
print(test_df_out['Zone_Air_Temperature_Sensor'].value_counts())

for col in tqdm(cols_label):
    test_df_out[col] = test_df_out[col].apply(lambda x: round(x,4))
cols_out = ['filename'] + cols_label
test_df_out[cols_out].to_csv('../res/{}.csv'.format(file), index=False)

0    72859
2    38054
1    36966
9    14002
5    13633
Name: label_combined_num, dtype: int64


100%|██████████████████████████████████████████████████████████████████████████████████| 94/94 [01:40<00:00,  1.07s/it]


0.0    303009
1.0     11130
0.5      1581
Name: Zone_Air_Temperature_Sensor, dtype: int64


100%|██████████████████████████████████████████████████████████████████████████████████| 94/94 [00:18<00:00,  5.12it/s]


In [23]:
# 示例 CSV 文件列表
csv_files = ['../res/{}.csv'.format(file)]  # 替换为你的 CSV 文件名
output_filename = '../tar/{}.tar.gz'.format(file)

# 调用函数
compress_csv_to_tar_gz(csv_files, output_filename)

print(f"已将 CSV 文件压缩为 {output_filename}")

已将 CSV 文件压缩为 ../tar/v51_sd1999110_feacut_10drop_merge_v13_cab_5fold_5mean5max_42.tar.gz
