In [1]:
import joblib
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier as cab
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
from sklearn import metrics
# from gensim.models import Word2Vec

import tarfile

def compress_csv_to_tar_gz(csv_files, output_filename):
    # 创建 tar.gz 文件
    with tarfile.open(output_filename, "w:gz") as tar:
        for csv_file in csv_files:
            # 确保文件存在
            if os.path.isfile(csv_file):
                tar.add(csv_file, arcname=os.path.basename(csv_file))
            else:
                print(f"文件 {csv_file} 不存在，跳过。")
                
def reduce_mem_usage(df, only_fp64=False, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    if only_fp64==True:
        numerics = [ 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


def cv_model(clf, train_x, train_y, test_x, clf_name, train_y_2, sd):
    folds = 5
    seed = sd
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    # 初始化 GroupKFold，设置折数为 5
#     kf = GroupKFold(n_splits=5)
    nclass =  len(np.unique(train_y))
    train = np.zeros((train_x.shape[0],nclass ))  # 为多分类任务初始化
    test = np.zeros((test_x.shape[0], nclass ))    # 为多分类任务初始化

    cv_scores = []


    model_lst = []
    # 进行五折交叉验证
#     for i, (train_index, valid_index) in enumerate(kf.split(train_x, groups=train_y_2)):
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y[valid_index]
#         sample_weight = y_train_sample_weight.iloc[train_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',  # multiclassova
                'metric': 'multiclassova',  # 使用多分类的评价指标
                'num_class': len(np.unique(train_y)),  # 类别数量
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2022,
                'n_jobs': -1,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 5000, valid_sets=[train_matrix, valid_matrix], 
                              categorical_feature=[], verbose_eval=500, early_stopping_rounds=500,
                            # feval=WeightedF1Metric,  # 使用自定义 F1 评分函数
                             )
            val_pred_proba = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred_proba = model.predict(test_x, num_iteration=model.best_iteration)


        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)
            params = {
                'booster': 'gbtree',
                'objective': 'multi:softprob',  # 修改为多分类
                'num_class': len(np.unique(train_y)),  # 类别数量
                'eval_metric': 'mlogloss',  # 使用多分类的评价指标
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.3
                ,
                'tree_method': 'exact',
                'seed': 2020,
                'n_jobs': -1,
                "silent": True,
                'tree_method': 'gpu_hist',      # 使用 GPU 加速
                'predictor': 'gpu_predictor',
            }
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(params, train_matrix, 9000, evals=watchlist, 
                              verbose_eval=500, early_stopping_rounds=500)
            val_pred_proba = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred_proba = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

#             print(val_pred_proba[:10])
#             print(val_pred_proba[:10, :10])

        elif clf_name == "cab":
            params = {
                'learning_rate': 0.3,
                'l2_leaf_reg': 10,
                'od_type': 'Iter',
                'od_wait': 70,
                'bootstrap_type': 'Bernoulli',
                'random_seed': 11251,
                'depth': 5,
                'task_type': 'GPU',  # 启用 GPU
                'loss_function': 'MultiClassOneVsAll',  # 修改为多分类
                
            }
            model = clf(iterations=9000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      use_best_model=True, verbose=500,
                      cat_features = [],
#                       sample_weight=sample_weight  # 添加样本权重
#                       custom_metric=[f1_metric]  # 使用自定义多分类 F1 作为评估指标
                     )
            
            # 获取概率预测
            val_pred_proba = model.predict_proba(val_x)
            # test_pred_proba = model.predict_proba(test_x)
            
            
            # 定义批大小
            batch_size = 2000000  # 根据您的内存情况进行调整

            # 计算总的样本数
            num_samples = test_x.shape[0]

            # 初始化一个空的数组来存储预测结果
            test_pred_proba = np.empty((num_samples, nclass ))  # 假设 model.classes_ 返回类别数

            # 分批次进行预测
            for start in tqdm(range(0, num_samples, batch_size) ):
                end = min(start + batch_size, num_samples)  # 确保不超出边界
                print(start, end)

                # 进行预测并直接存储到预测结果数组中
                test_pred_proba[start:end] = model.predict_proba(test_x[start:end] )
                
            


        # 获取预测标签
        val_pred = np.argmax(val_pred_proba, axis=1)
#             test_labels = np.argmax(test_pred_proba, axis=1)

        # 对于多分类，val_pred 和 test_pred 是类别的索引
        train[valid_index] = val_pred_proba
        test += test_pred_proba / kf.n_splits
        


        # 计算 F1 分数（可以根据需要选择其他多分类指标）
        f1 = f1_score(val_y, val_pred, average='micro')  # 使用加权平均 F1 分数
        cv_scores.append(f1)
        


#         print(val_y.iloc[:10])
#         print(val_pred_proba[:10,:10])

        print(cv_scores)
        model_lst.append(model)
        
        del test_pred_proba, val_pred_proba, val_pred
        [gc.collect() for _ in range(5)]
        
        if f1<0.7472:
            break

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    
    
    
    return train, test, cv_scores, np.mean(cv_scores), np.std(cv_scores), model_lst

def lgb_model(x_train, y_train, x_test, train_y_2, sd):
    lgb_train, lgb_test, cv_scores, cv_scores_mean, cv_scores_std, model_lst = cv_model(lgb, x_train, y_train, x_test, "lgb", train_y_2, sd)
    return lgb_train, lgb_test, cv_scores, cv_scores_mean, cv_scores_std, model_lst
def cab_model(x_train, y_train, x_test, train_y_2, sd):
    cab_train, cab_test, cv_scores, cv_scores_mean, cv_scores_std, model_lst = cv_model(cab, x_train, y_train, x_test, "cab", train_y_2, sd)
    return cab_train, cab_test, cv_scores, cv_scores_mean, cv_scores_std, model_lst
def xgb_model(x_train, y_train, x_test, train_y_2, sd):
    xgb_train, xgb_test, cv_scores, cv_scores_mean, cv_scores_std, model_lst = cv_model(xgb, x_train, y_train, x_test, "xgb", train_y_2, sd)
    return xgb_train, xgb_test, cv_scores, cv_scores_mean, cv_scores_std, model_lst

In [2]:
root_dir = '../data_fea_sub/v13'
file_lst_train = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'train' in filename][:]

train_df = pd.DataFrame()
for file in tqdm(file_lst_train):
#     print('{}/{}'.format(root_dir, file))
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    train_df = pd.concat([train_df, df_], axis=0)

root_dir = '../data_fea_sub/v13'
file_lst_test = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'test' in filename][:]


test_df = pd.DataFrame()
for file in tqdm(file_lst_test):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    test_df = pd.concat([test_df, df_], axis=0)

train_df.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in train_df.columns.tolist()[3:]]
test_df.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in test_df.columns.tolist()[3:]]

train_df.shape, test_df.shape

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 11.39it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.01it/s]


((1038169, 42), (10135596, 42))

In [3]:
root_dir = '../data_fea_sub/v13_2'
file_lst_train = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'train' in filename][:]

train_df_2 = pd.DataFrame()
for file in tqdm(file_lst_train):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    train_df_2 = pd.concat([train_df_2, df_], axis=0)

root_dir = '../data_fea_sub/v13_2'
file_lst_test = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'test' in filename][:]


test_df_2 = pd.DataFrame()
for file in tqdm(file_lst_test):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    test_df_2 = pd.concat([test_df_2, df_], axis=0)

train_df_2.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in train_df_2.columns.tolist()[3:]]
test_df_2.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in test_df_2.columns.tolist()[3:]]

print(train_df_2.shape, test_df_2.shape)

train_df = train_df.merge(train_df_2, on = ['filename', 'dt_rk', 't_cut'], how='left')
test_df = test_df.merge(test_df_2, on = ['filename', 'dt_rk', 't_cut'], how='left')

del train_df_2, test_df_2
[gc.collect() for _ in range(5)]

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.28it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.02it/s]


(1038087, 41) (10121312, 41)


[0, 0, 0, 0, 0]

In [4]:
root_dir = '../data_fea_sub/v13_3'
file_lst_train = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'train' in filename][:]

train_df_2 = pd.DataFrame()
for file in tqdm(file_lst_train):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    train_df_2 = pd.concat([train_df_2, df_], axis=0)

root_dir = '../data_fea_sub/v13_3'
file_lst_test = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'test' in filename][:]


test_df_2 = pd.DataFrame()
for file in tqdm(file_lst_test):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    test_df_2 = pd.concat([test_df_2, df_], axis=0)

train_df_2.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in train_df_2.columns.tolist()[3:]]
test_df_2.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in test_df_2.columns.tolist()[3:]]

print(train_df_2.shape, test_df_2.shape)

train_df = train_df.merge(train_df_2, on = ['filename', 'dt_rk', 't_cut'], how='left')
test_df = test_df.merge(test_df_2, on = ['filename', 'dt_rk', 't_cut'], how='left')

del train_df_2, test_df_2
[gc.collect() for _ in range(5)]

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  7.21it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00,  1.21s/it]


(1038060, 51) (10116842, 51)


[9, 0, 0, 0, 0]

In [5]:
root_dir = '../data_fea_sub/v13_4'
file_lst_train = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'train' in filename][:]

train_df_2 = pd.DataFrame()
for file in tqdm(file_lst_train):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    train_df_2 = pd.concat([train_df_2, df_], axis=0)

root_dir = '../data_fea_sub/v13_4'
file_lst_test = [filename for filename in os.listdir(root_dir) if filename.endswith('.pkl') and 'test' in filename][:]


test_df_2 = pd.DataFrame()
for file in tqdm(file_lst_test):
    df_ = pd.read_pickle('{}/{}'.format(root_dir, file))
    test_df_2 = pd.concat([test_df_2, df_], axis=0)

train_df_2.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in train_df_2.columns.tolist()[3:]]
test_df_2.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in test_df_2.columns.tolist()[3:]]

print(train_df_2.shape, test_df_2.shape)

train_df_2 = train_df_2.iloc[:, :40]
test_df_2 = test_df_2.iloc[:, :40]

train_df = train_df.merge(train_df_2, on = ['filename', 'dt_rk', 't_cut'], how='left')
test_df = test_df.merge(test_df_2, on = ['filename', 'dt_rk', 't_cut'], how='left')

del train_df_2, test_df_2
[gc.collect() for _ in range(5)]

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 10.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.14it/s]


(1038033, 39) (10112887, 39)


[0, 0, 0, 0, 0]

In [6]:
test_df.shape

(10135596, 164)

In [7]:
# train_df['t_cut'].value_counts()

In [8]:
train_label = pd.read_csv('../data/train_y_v0.1.0.csv')
cols_label = train_label.columns.tolist()[1:]

# 将指定列拼接成新的一列
train_label['label_combined'] = train_label[cols_label].apply(lambda row: ', '.join(row.values.astype(str)), axis=1)

# train_label.head()

In [9]:
label_combined_unique = train_label['label_combined'].value_counts().index.tolist()
dict_str2num= {k: v for k, v in zip(label_combined_unique, range(len(label_combined_unique)))}
dict_num2str= {k: v for k, v in zip(range(len(label_combined_unique)), label_combined_unique)}

len(label_combined_unique)

91

In [10]:
train_label['label_combined_num'] = train_label['label_combined'].apply(lambda x: dict_str2num[x])

In [11]:
train_df = train_df.merge(train_label, on='filename')

In [12]:
train_df.shape, test_df.shape

((1038169, 260), (10135596, 164))

In [13]:
# train_df['fea_count'] = train_df.groupby('filename')['filename'].transform('size')
# test_df['fea_count'] = test_df.groupby('filename')['filename'].transform('size')

In [14]:
cols_drop = ['fea_t_max','fea_a_mean', 'fea_dt_p2p'
             , 'fea_dt_unicnt','fea_dt_skew','fea_dt_std', 'fea_dt_min', 'fea_dt_max'
            ,'fea_dt_count'
            ,'fea_t_min','fea_v_count','fea_a_count'
            ,'fea_t_p2p','fea_dt_Q50'
             
             
             , 'fea_a_2_unicnt', 'fea_a_3_unicnt', 'fea_a_4_unicnt'
             , 'fea_a_2_max', 'fea_a_3_max', 'fea_a_4_max'
             , 'fea_a_Q50', 'fea_a_Q50', 'fea_a_Q50'
#              ,'fea_dt_rk'             
            ]

train_df = train_df.drop(cols_drop, axis=1)
test_df = test_df.drop(cols_drop, axis=1)

In [15]:
col_label = 'label_combined_num'
# 训练数据/测试数据准备
features = ['fea_v_mean', 'fea_v_std', 'fea_v_max', 'fea_v_min', 'fea_v_p2p', 'fea_v_unicnt', 'fea_v_Q25', 'fea_v_Q50', 'fea_v_Q75', 'fea_v_skew', 'fea_v_kurt', 'fea_a_std', 'fea_a_max', 'fea_a_min', 'fea_a_p2p', 'fea_a_unicnt', 'fea_a_Q25', 'fea_a_Q75', 'fea_a_skew', 'fea_a_kurt', 'fea_dt_mean', 'fea_dt_Q25', 'fea_dt_Q75', 'fea_dt_kurt', 'fea_a_2_sum', 'fea_a_2_mean', 'fea_a_2_std', 'fea_a_2_min', 'fea_a_2_p2p', 'fea_a_2_Q25', 'fea_a_2_Q50', 'fea_a_2_Q75', 'fea_a_2_skew', 'fea_a_2_kurt', 'fea_a_3_sum', 'fea_a_3_mean', 'fea_a_3_std', 'fea_a_3_min', 'fea_a_3_p2p', 'fea_a_3_Q25', 'fea_a_3_Q50', 'fea_a_3_Q75', 'fea_a_3_skew', 'fea_a_3_kurt', 'fea_a_4_sum', 'fea_a_4_mean', 'fea_a_4_std', 'fea_a_4_min', 'fea_a_4_p2p', 'fea_a_4_Q25', 'fea_a_4_Q50', 'fea_a_4_Q75', 'fea_a_4_skew', 'fea_a_4_kurt', 'fea_v_sum', 'fea_a_sum', 'fea_a_5_sum', 'fea_a_5_mean', 'fea_a_5_std', 'fea_a_5_max', 'fea_a_5_min', 'fea_a_5_p2p', 'fea_a_5_unicnt', 'fea_a_5_Q25', 'fea_a_5_Q50', 'fea_a_5_Q75', 'fea_a_5_skew', 'fea_a_5_kurt', 'fea_da_1_sum', 'fea_da_1_mean', 'fea_da_1_std', 'fea_da_1_max', 'fea_da_1_min', 'fea_da_1_p2p', 'fea_da_1_unicnt', 'fea_da_1_Q25', 'fea_da_1_Q50', 'fea_da_1_Q75', 'fea_da_1_skew', 'fea_da_1_kurt', 'fea_da_2_sum', 'fea_da_2_mean', 'fea_da_2_std', 'fea_da_2_max', 'fea_da_2_min', 'fea_da_2_p2p', 'fea_da_2_unicnt', 'fea_da_2_Q25', 'fea_da_2_Q50', 'fea_da_2_Q75', 'fea_da_2_skew', 'fea_da_2_kurt', 'fea_da_3_sum', 'fea_da_3_mean', 'fea_da_3_std'
            , 'fea_da_3_max', 'fea_da_3_min', 'fea_da_3_p2p', 'fea_da_3_unicnt', 'fea_da_3_Q25', 'fea_da_3_Q50', 'fea_da_3_Q75', 'fea_da_3_skew', 'fea_da_3_kurt', 'fea_a_6_sum', 'fea_a_6_mean', 'fea_a_6_std', 'fea_a_6_max', 'fea_a_6_min', 'fea_a_6_p2p', 'fea_a_6_unicnt', 'fea_a_6_Q25', 'fea_a_6_Q50', 'fea_a_6_Q75', 'fea_a_6_skew', 'fea_a_6_kurt']
print('->'*10, '去除唯一值前: ', len(features), features)
cols_single = []
for col in tqdm(features):
    unicnt = train_df[col].nunique()
    if unicnt==1:
        cols_single.append(col)
print('->'*10, '唯一值特征: ', len(cols_single), cols_single)
features = [f for f in features if f not in cols_single]
print('->'*10, '去除唯一值后: ', len(features), features)

# for col in features:
#     num_5, num_95 = train_df[col].quantile(0.05), train_df[col].quantile(0.95)
#     train_df[col] = train_df[col].apply(lambda x: num_5 if x<num_5 else num_95 if x>num_95 else x)

            
# train = train_df.reset_index(drop=True).copy()
# train[col_label] = train[col_label].apply(lambda x: 1 if x>-1 else -1)
# train = train[train[col_label]!=0].reset_index(drop=True)
# test = test_df.reset_index(drop=True)
# train[col_label] = train[col_label].apply(lambda x: 1 if x==1 else 0)

# x_train = train[features]
x_test = test_df[features].reset_index(drop=True)

y_train = train_df[col_label].reset_index(drop=True)

->->->->->->->->->-> 去除唯一值前:  116 ['fea_v_mean', 'fea_v_std', 'fea_v_max', 'fea_v_min', 'fea_v_p2p', 'fea_v_unicnt', 'fea_v_Q25', 'fea_v_Q50', 'fea_v_Q75', 'fea_v_skew', 'fea_v_kurt', 'fea_a_std', 'fea_a_max', 'fea_a_min', 'fea_a_p2p', 'fea_a_unicnt', 'fea_a_Q25', 'fea_a_Q75', 'fea_a_skew', 'fea_a_kurt', 'fea_dt_mean', 'fea_dt_Q25', 'fea_dt_Q75', 'fea_dt_kurt', 'fea_a_2_sum', 'fea_a_2_mean', 'fea_a_2_std', 'fea_a_2_min', 'fea_a_2_p2p', 'fea_a_2_Q25', 'fea_a_2_Q50', 'fea_a_2_Q75', 'fea_a_2_skew', 'fea_a_2_kurt', 'fea_a_3_sum', 'fea_a_3_mean', 'fea_a_3_std', 'fea_a_3_min', 'fea_a_3_p2p', 'fea_a_3_Q25', 'fea_a_3_Q50', 'fea_a_3_Q75', 'fea_a_3_skew', 'fea_a_3_kurt', 'fea_a_4_sum', 'fea_a_4_mean', 'fea_a_4_std', 'fea_a_4_min', 'fea_a_4_p2p', 'fea_a_4_Q25', 'fea_a_4_Q50', 'fea_a_4_Q75', 'fea_a_4_skew', 'fea_a_4_kurt', 'fea_v_sum', 'fea_a_sum', 'fea_a_5_sum', 'fea_a_5_mean', 'fea_a_5_std', 'fea_a_5_max', 'fea_a_5_min', 'fea_a_5_p2p', 'fea_a_5_unicnt', 'fea_a_5_Q25', 'fea_a_5_Q50', 'fea_a_5_Q75

100%|████████████████████████████████████████████████████████████████████████████████| 116/116 [00:04<00:00, 28.88it/s]


->->->->->->->->->-> 唯一值特征:  0 []
->->->->->->->->->-> 去除唯一值后:  116 ['fea_v_mean', 'fea_v_std', 'fea_v_max', 'fea_v_min', 'fea_v_p2p', 'fea_v_unicnt', 'fea_v_Q25', 'fea_v_Q50', 'fea_v_Q75', 'fea_v_skew', 'fea_v_kurt', 'fea_a_std', 'fea_a_max', 'fea_a_min', 'fea_a_p2p', 'fea_a_unicnt', 'fea_a_Q25', 'fea_a_Q75', 'fea_a_skew', 'fea_a_kurt', 'fea_dt_mean', 'fea_dt_Q25', 'fea_dt_Q75', 'fea_dt_kurt', 'fea_a_2_sum', 'fea_a_2_mean', 'fea_a_2_std', 'fea_a_2_min', 'fea_a_2_p2p', 'fea_a_2_Q25', 'fea_a_2_Q50', 'fea_a_2_Q75', 'fea_a_2_skew', 'fea_a_2_kurt', 'fea_a_3_sum', 'fea_a_3_mean', 'fea_a_3_std', 'fea_a_3_min', 'fea_a_3_p2p', 'fea_a_3_Q25', 'fea_a_3_Q50', 'fea_a_3_Q75', 'fea_a_3_skew', 'fea_a_3_kurt', 'fea_a_4_sum', 'fea_a_4_mean', 'fea_a_4_std', 'fea_a_4_min', 'fea_a_4_p2p', 'fea_a_4_Q25', 'fea_a_4_Q50', 'fea_a_4_Q75', 'fea_a_4_skew', 'fea_a_4_kurt', 'fea_v_sum', 'fea_a_sum', 'fea_a_5_sum', 'fea_a_5_mean', 'fea_a_5_std', 'fea_a_5_max', 'fea_a_5_min', 'fea_a_5_p2p', 'fea_a_5_unicnt', 'fea_a_5

In [16]:
test_df_info = test_df[['filename']].reset_index(drop=True)

In [17]:
del train_df, test_df
[gc.collect() for i in range(5)]

[40, 0, 0, 0, 0]

In [18]:
test_pred_proba_lst = []
nclass =  len(np.unique(y_train))
for i in range(5):
    model = joblib.load( '../model/sd1999110_feacut_10drop_v13_42_cab_{}.pkl'.format(i))
    print('load model {}'.format(i))
    # 定义批大小
    batch_size = 2000000  # 根据您的内存情况进行调整

    # 计算总的样本数
    num_samples = x_test.shape[0]

    # 初始化一个空的数组来存储预测结果
    test_pred_proba = np.empty((num_samples, nclass ))  # 假设 model.classes_ 返回类别数

    # 分批次进行预测
    for start in tqdm(range(0, num_samples, batch_size) ):
        end = min(start + batch_size, num_samples)  # 确保不超出边界
        print(start, end)

        # 进行预测并直接存储到预测结果数组中
        test_pred_proba[start:end] = model.predict_proba(x_test[start:end] )
    test_pred_proba_lst.append(test_pred_proba)
    
    del test_pred_proba
    [gc.collect() for _ in range(5)]

load model 0


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

0 2000000


 17%|██████████████                                                                      | 1/6 [01:31<07:39, 91.94s/it]

2000000 4000000


 33%|████████████████████████████                                                        | 2/6 [03:04<06:09, 92.29s/it]

4000000 6000000


 50%|██████████████████████████████████████████                                          | 3/6 [04:37<04:37, 92.50s/it]

6000000 8000000


 67%|████████████████████████████████████████████████████████                            | 4/6 [06:10<03:05, 92.71s/it]

8000000 10000000


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [07:43<01:32, 92.87s/it]

10000000 10135596


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [07:49<00:00, 78.28s/it]


load model 1


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

0 2000000


 17%|██████████████                                                                      | 1/6 [01:33<07:45, 93.00s/it]

2000000 4000000


 33%|████████████████████████████                                                        | 2/6 [03:05<06:10, 92.66s/it]

4000000 6000000


 50%|██████████████████████████████████████████                                          | 3/6 [04:38<04:38, 92.88s/it]

6000000 8000000


 67%|████████████████████████████████████████████████████████                            | 4/6 [06:13<03:07, 93.71s/it]

8000000 10000000


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [07:47<01:33, 93.73s/it]

10000000 10135596


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [07:53<00:00, 78.94s/it]


load model 2


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

0 2000000


 17%|██████████████                                                                      | 1/6 [01:31<07:39, 91.81s/it]

2000000 4000000


 33%|████████████████████████████                                                        | 2/6 [03:03<06:05, 91.47s/it]

4000000 6000000


 50%|██████████████████████████████████████████                                          | 3/6 [04:36<04:37, 92.35s/it]

6000000 8000000


 67%|████████████████████████████████████████████████████████                            | 4/6 [06:08<03:04, 92.36s/it]

8000000 10000000


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [07:40<01:32, 92.29s/it]

10000000 10135596


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [07:47<00:00, 77.85s/it]


load model 3


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

0 2000000


 17%|██████████████                                                                      | 1/6 [01:31<07:36, 91.35s/it]

2000000 4000000


 33%|████████████████████████████                                                        | 2/6 [02:57<05:53, 88.40s/it]

4000000 6000000


 50%|██████████████████████████████████████████                                          | 3/6 [04:24<04:22, 87.46s/it]

6000000 8000000


 67%|████████████████████████████████████████████████████████                            | 4/6 [05:49<02:53, 86.80s/it]

8000000 10000000


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [07:15<01:26, 86.23s/it]

10000000 10135596


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [07:20<00:00, 73.46s/it]


load model 4


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

0 2000000


 17%|██████████████                                                                      | 1/6 [01:32<07:44, 92.91s/it]

2000000 4000000


 33%|████████████████████████████                                                        | 2/6 [03:05<06:10, 92.67s/it]

4000000 6000000


 50%|██████████████████████████████████████████                                          | 3/6 [04:38<04:38, 92.73s/it]

6000000 8000000


 67%|████████████████████████████████████████████████████████                            | 4/6 [06:10<03:05, 92.61s/it]

8000000 10000000


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [07:42<01:32, 92.48s/it]

10000000 10135596


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [07:49<00:00, 78.19s/it]


In [19]:
del x_test
[gc.collect() for _ in range(5)]

[20, 0, 0, 0, 0]

In [None]:
m = test_pred_proba_lst[0].shape[0]
m

10135596

In [56]:
try:
    del test
    [gc.collect() for _ in range(5)]
except:
    print('no need del')
    
test = np.zeros((m, nclass))    # 为多分类任务初始化
for i in tqdm([0,1,2,3,4]):
    if i in [1]:
        test += test_pred_proba_lst[i] *0.4
    else:
        test += test_pred_proba_lst[i] *0.2
        
    [gc.collect() for _ in range(5)]

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:37<00:00, 43.44s/it]


In [57]:
try:
    del combined_df, max_values_, mean_values_, combined_df_mge
    [gc.collect() for _ in range(5)]
except:
    print('no need del')
    
# 合并概率矩阵和 file 列
combined_df = pd.concat([test_df_info, pd.DataFrame(1.0*test )], axis=1)
# combined_df = pd.concat([test_df[['filename']].reset_index(drop=True), pd.DataFrame( 0.6*cab_test_lst[1] + 0.4*cab_test_lst[0] )], axis=1)

# 根据 'file' 列分组并计算 n 列的最大值
max_values_ = combined_df.groupby('filename').max().reset_index()
mean_values_ = combined_df.groupby('filename').mean().reset_index()

combined_df_mge = pd.concat([max_values_, mean_values_], axis=0)

In [58]:
mge_values = combined_df_mge.groupby('filename').mean().reset_index()

# mge_values.iloc[:, 3] = mge_values.iloc[:, 3]*0.85
print(mge_values.shape)
test_df_out = mge_values.copy()
print(mge_values.shape)

(315720, 92)
(315720, 92)


In [59]:
col_label = 'label_combined_num'
test_df_out[col_label] = np.argmax(mge_values.iloc[:, 1:].values, axis=1)
test_df_out['prob'] = np.max(mge_values.iloc[:, 1:].values, axis=1)
print(test_df_out[col_label].value_counts().head())

file = 'v51_sd1999110_feacut_10drop_merge_v13_cab_5fold_5mean5max_42_stack1bigger'
test_df_out['label_combined'] = test_df_out['label_combined_num'].apply(lambda x: dict_num2str[x])
for i in tqdm(range(len(cols_label))):
    test_df_out[cols_label[i]] = test_df_out['label_combined'].apply(lambda x: int(x.split(',')[i])/2+0.5 )
print(test_df_out['Zone_Air_Temperature_Sensor'].value_counts())

for col in tqdm(cols_label):
    test_df_out[col] = test_df_out[col].apply(lambda x: round(x,4))
cols_out = ['filename'] + cols_label
test_df_out[cols_out].to_csv('../res/{}.csv'.format(file), index=False)

0    72653
2    38222
1    36581
9    14371
5    13641
Name: label_combined_num, dtype: int64


100%|██████████████████████████████████████████████████████████████████████████████████| 94/94 [01:37<00:00,  1.04s/it]


0.0    303052
1.0     11082
0.5      1586
Name: Zone_Air_Temperature_Sensor, dtype: int64


100%|██████████████████████████████████████████████████████████████████████████████████| 94/94 [00:18<00:00,  5.14it/s]


In [60]:
# 示例 CSV 文件列表
csv_files = ['../res/{}.csv'.format(file)]  # 替换为你的 CSV 文件名
output_filename = '../tar/{}.tar.gz'.format(file)

# 调用函数
compress_csv_to_tar_gz(csv_files, output_filename)

print(f"已将 CSV 文件压缩为 {output_filename}")

已将 CSV 文件压缩为 ../tar/v51_sd7_feacut_10drop_merge_v13_cab_5fold_5mean5max_42_stack1bigger.tar.gz


In [None]:
# 0.616	0.574	0.692	0.615	0.989
# v51_sd7_feacut_10drop_merge_v13_cab_5fold_5mean5max_42_stack1bigger
# for i in tqdm([0,1,2,3,4]):
#     if i in [1]:
#         test += test_pred_proba_lst[i] *0.4
#     else:
#         test += test_pred_proba_lst[i] *0.2