In [1]:
import joblib
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier as cab
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
from sklearn import metrics
# from gensim.models import Word2Vec

import tarfile

def compress_csv_to_tar_gz(csv_files, output_filename):
    """Bundle the given files into one gzip-compressed tar archive.

    Every path that is an existing regular file is stored under its base name
    only (directory components are dropped). Paths that are not files are
    reported on stdout and skipped; the archive is still written.

    Args:
        csv_files: Iterable of file paths to add to the archive.
        output_filename: Path of the ``.tar.gz`` file to create.
    """
    with tarfile.open(output_filename, "w:gz") as archive:
        for path in csv_files:
            if not os.path.isfile(path):
                # A missing input is reported but is not fatal.
                print(f"文件 {path} 不存在，跳过。")
                continue
            archive.add(path, arcname=os.path.basename(path))
                
def reduce_mem_usage(df, only_fp64=False, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that fits.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range contains the column's min and max. Float columns are cast to
    float16, then float32, falling back to float64. Range checks are
    inclusive, so a column whose extreme equals a dtype limit (e.g. max 127)
    still gets that dtype.

    Note that only the value *range* is checked: casting floats to
    float16/float32 can lose precision.

    Args:
        df: DataFrame to shrink. It is modified in place and also returned.
        only_fp64: When true, only ``float64`` columns are considered.
        verbose: When true, print the final size and the percentage saved.

    Returns:
        The same ``df`` object with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    if only_fp64:
        numerics = ['float64']

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Narrowest integer dtype first; an empty column (NaN min/max)
            # matches nothing and is left untouched.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a 0/0 percentage for a frame that occupies no memory.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


def cv_model(clf, train_x, train_y, test_x, clf_name, train_y_2, sd):
    """Train one gradient-boosting backend with 5-fold stratified CV (multiclass).

    For every fold a model is fitted on the training split, using the
    validation split for early stopping / best-iteration selection. The
    validation rows' class probabilities are written into an out-of-fold
    matrix, and the test-set probabilities are averaged over the folds.

    Args:
        clf: Backend handle matching ``clf_name``: an object exposing
            ``Dataset``/``train`` for "lgb", ``DMatrix``/``train`` for "xgb",
            or a classifier class for "cab".
        train_x: Training features; must support ``.iloc`` and ``.shape``.
        train_y: Class labels aligned row-by-row with ``train_x``.
        test_x: Test features, scored by every fold's model.
        clf_name: One of "lgb", "xgb" or "cab".
        train_y_2: Unused. Only the commented-out GroupKFold split refers to it.
        sd: Random seed for the fold shuffle. The model seeds themselves are
            fixed inside the per-backend parameter dicts.

    Returns:
        Tuple ``(train, test, cv_scores, mean, std, model_lst)``: out-of-fold
        probabilities, fold-averaged test probabilities, per-fold micro-F1
        scores, their mean and standard deviation, and the fitted models.

    Raises:
        ValueError: If ``clf_name`` is not a supported backend.
    """
    if clf_name not in ("lgb", "xgb", "cab"):
        # Previously this fell through to an UnboundLocalError after the
        # first fold had been split.
        raise ValueError("unsupported clf_name: {!r}".format(clf_name))

    folds = 5
    seed = sd
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    # Group-aware alternative that was tried and disabled:
    #     kf = GroupKFold(n_splits=5)

    # Index the labels positionally. `train_y[train_index]` on a pandas Series
    # is label-based and only lines up with `train_x.iloc[...]` when the
    # Series happens to carry a default 0..n-1 index.
    y_values = np.asarray(train_y)

    # NOTE(review): the probability matrices assume labels are 0..K-1 and that
    # each fold's model emits one column per class — confirm for rare classes.
    n_classes = len(np.unique(y_values))
    train = np.zeros((train_x.shape[0], n_classes))  # out-of-fold probabilities
    test = np.zeros((test_x.shape[0], n_classes))    # fold-averaged test probabilities

    cv_scores = []
    model_lst = []

    #     for i, (train_index, valid_index) in enumerate(kf.split(train_x, groups=train_y_2)):
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, y_values)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y = train_x.iloc[train_index], y_values[train_index]
        val_x, val_y = train_x.iloc[valid_index], y_values[valid_index]
        #         sample_weight = y_train_sample_weight.iloc[train_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',  # multiclassova
                'metric': 'multiclassova',
                'num_class': n_classes,
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2022,
                'n_jobs': -1,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 5000, valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[], verbose_eval=500, early_stopping_rounds=500,
                              # feval=WeightedF1Metric,  # custom F1 eval function (disabled)
                              )
            val_pred_proba = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred_proba = model.predict(test_x, num_iteration=model.best_iteration)

        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)
            params = {
                'booster': 'gbtree',
                'objective': 'multi:softprob',
                'num_class': n_classes,
                'eval_metric': 'mlogloss',
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.3,
                'seed': 2020,
                'n_jobs': -1,
                "silent": True,
                # The dict used to list 'tree_method' twice ('exact', then
                # 'gpu_hist'); only the last entry ever took effect.
                'tree_method': 'gpu_hist',      # GPU training
                'predictor': 'gpu_predictor',
            }
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(params, train_matrix, 8000, evals=watchlist,
                              verbose_eval=500, early_stopping_rounds=500)
            val_pred_proba = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred_proba = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

        else:  # clf_name == "cab"
            params = {
                'learning_rate': 0.3,
                'l2_leaf_reg': 10,
                'od_type': 'Iter',
                'od_wait': 70,
                'bootstrap_type': 'Bernoulli',
                'random_seed': 11251,
                'depth': 5,
                'task_type': 'GPU',  # GPU training
                'loss_function': 'MultiClassOneVsAll',
            }
            model = clf(iterations=8000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      use_best_model=True, verbose=500,
                      cat_features=[],
                      #                       sample_weight=sample_weight
                      #                       custom_metric=[f1_metric]
                      )

            val_pred_proba = model.predict_proba(val_x)
            test_pred_proba = model.predict_proba(test_x)

        # Hard labels are only needed for scoring; probabilities are what is stored.
        val_pred = np.argmax(val_pred_proba, axis=1)

        train[valid_index] = val_pred_proba
        test += test_pred_proba / kf.n_splits

        # Micro-averaged F1 (not weighted, despite the earlier comment).
        f1 = f1_score(val_y, val_pred, average='micro')
        cv_scores.append(f1)

        print(cv_scores)
        model_lst.append(model)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))

    return train, test, cv_scores, np.mean(cv_scores), np.std(cv_scores), model_lst

def lgb_model(x_train, y_train, x_test, train_y_2, sd):
    """Run ``cv_model`` with the LightGBM backend and hand back its six-tuple unchanged."""
    return cv_model(lgb, x_train, y_train, x_test, "lgb", train_y_2, sd)
def cab_model(x_train, y_train, x_test, train_y_2, sd):
    """Run ``cv_model`` with the CatBoost backend and hand back its six-tuple unchanged."""
    return cv_model(cab, x_train, y_train, x_test, "cab", train_y_2, sd)
def xgb_model(x_train, y_train, x_test, train_y_2, sd):
    """Run ``cv_model`` with the XGBoost backend and hand back its six-tuple unchanged."""
    return cv_model(xgb, x_train, y_train, x_test, "xgb", train_y_2, sd)

# ============= V5特征已经完美复现V4最佳结果

In [None]:
def _load_feature_frames(root_dir, keyword):
    """Read every ``.pkl`` in ``root_dir`` whose name contains ``keyword``.

    The file names are sorted so the row order, and therefore the seeded
    cross-validation split downstream, is the same on every run
    (``os.listdir`` order is arbitrary). All frames are concatenated in one
    call instead of growing a DataFrame inside the loop.

    Returns:
        Tuple ``(file_names, frame)``; ``frame`` is empty when nothing matched.
    """
    file_names = sorted(
        filename for filename in os.listdir(root_dir)
        if filename.endswith('.pkl') and keyword in filename
    )
    frames = [pd.read_pickle('{}/{}'.format(root_dir, filename)) for filename in tqdm(file_names)]
    combined = pd.concat(frames, axis=0) if frames else pd.DataFrame()
    return file_names, combined


# NOTE(review): train features come from v14 but test features from v13 —
# confirm the two feature versions are meant to differ.
root_dir = '../data_fea_sub/v14'
file_lst_train, train_df = _load_feature_frames(root_dir, 'train')

root_dir = '../data_fea_sub/v13'
file_lst_test, test_df = _load_feature_frames(root_dir, 'test')

# The first three columns are renamed to keys; every remaining column gets a
# 'fea_' prefix so feature columns can be selected by name later.
train_df.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in train_df.columns.tolist()[3:]]
test_df.columns = ['filename', 'dt_rk', 't_cut'] + ['fea_'+f for f in test_df.columns.tolist()[3:]]

# train_df, test_df = train_df_v7, test_df_v7

# train_df=train_df[train_df['t_cut'].isin([2,])].reset_index(drop=True)
# test_df=test_df[test_df['t_cut'].isin([2,])].reset_index(drop=True)

train_df.shape, test_df.shape

100%|██████████| 6/6 [00:01<00:00,  5.45it/s]
 67%|██████▋   | 4/6 [00:05<00:02,  1.31s/it]

In [None]:
# train_df = reduce_mem_usage(train_df)
# test_df = reduce_mem_usage(test_df)

In [4]:
# Sanity check: how the test rows are distributed over the `t_cut` values.
test_df['t_cut'].value_counts()

0    5063112
Name: t_cut, dtype: int64

In [5]:
# train_df[train_df['filename'] =='train_X0.pkl']

In [6]:
# Load the training labels; every column after the first is treated as a label column.
train_label = pd.read_csv('../data/train_y_v0.1.0.csv')
cols_label = list(train_label.columns[1:])

# Collapse each row's label values into one ', '-separated string so that a
# whole label combination can be handled as a single class.
train_label['label_combined'] = [
    ', '.join(row_values.astype(str))
    for row_values in train_label[cols_label].values
]

# train_label.head()

In [7]:
# Distinct label combinations, in the order value_counts() returns them.
label_combined_unique = train_label['label_combined'].value_counts().index.tolist()

# Two-way mapping between a combination string and its integer class id.
dict_str2num = {label: class_id for class_id, label in enumerate(label_combined_unique)}
dict_num2str = dict(enumerate(label_combined_unique))

len(label_combined_unique)

91

In [8]:
# Encode each combination string as its integer class id (every value is a
# key of dict_str2num, because the dict was built from this same column).
train_label['label_combined_num'] = train_label['label_combined'].map(dict_str2num)

In [9]:
# Attach the labels to every feature row of the same file. Inner join: rows
# whose filename has no label entry are dropped.
train_df = pd.merge(train_df, train_label, how='inner', on='filename')

In [10]:
# Shapes after the label merge (train gains the label columns, test does not).
train_df.shape, test_df.shape

((515401, 138), (5063112, 42))

In [11]:
# train_df['fea_count'] = train_df.groupby('filename')['filename'].transform('size')
# test_df['fea_count'] = test_df.groupby('filename')['filename'].transform('size')

In [12]:
# Feature columns removed from both frames before modelling.
cols_drop = [
    'fea_t_max', 'fea_a_mean', 'fea_dt_p2p',
    'fea_dt_unicnt', 'fea_dt_skew', 'fea_dt_std', 'fea_dt_min', 'fea_dt_max',
    'fea_dt_count',
    'fea_t_min', 'fea_v_count', 'fea_a_count',
    'fea_t_p2p', 'fea_dt_Q50',
    # 'fea_dt_rk',
]

train_df = train_df.drop(columns=cols_drop)
test_df = test_df.drop(columns=cols_drop)

In [13]:
col_label = 'label_combined_num'

# Candidate features: every 'fea_' column whose name contains none of the
# excluded substrings.
excluded_tokens = ('dv/dt', '4rd', '5rd', 'fea_t_p2p_freq', 'fft_', 'pos_', 'neg_')
features = [
    name for name in train_df.columns
    if 'fea_' in name and not any(token in name for token in excluded_tokens)
]
print('->'*10, '去除唯一值前: ', len(features), features)

# Constant columns (a single distinct value in train) carry no signal.
cols_single = [name for name in tqdm(features) if train_df[name].nunique() == 1]
print('->'*10, '唯一值特征: ', len(cols_single), cols_single)

features = [name for name in features if name not in cols_single]
print('->'*10, '去除唯一值后: ', len(features), features)

# Disabled experiment: clip every feature to its 5th/95th percentile.
# for col in features:
#     num_5, num_95 = train_df[col].quantile(0.05), train_df[col].quantile(0.95)
#     train_df[col] = train_df[col].apply(lambda x: num_5 if x<num_5 else num_95 if x>num_95 else x)

# Fresh 0..n-1 indexes so positional and label-based row access agree.
train = train_df.reset_index(drop=True).copy()
# train[col_label] = train[col_label].apply(lambda x: 1 if x>-1 else -1)
# train = train[train[col_label]!=0].reset_index(drop=True)
test = test_df.reset_index(drop=True)
# train[col_label] = train[col_label].apply(lambda x: 1 if x==1 else 0)

x_train, x_test = train[features], test[features]
y_train = train[col_label]

100%|██████████| 25/25 [00:00<00:00, 130.26it/s]

->->->->->->->->->-> 去除唯一值前:  25 ['fea_v_mean', 'fea_v_std', 'fea_v_max', 'fea_v_min', 'fea_v_p2p', 'fea_v_unicnt', 'fea_v_Q25', 'fea_v_Q50', 'fea_v_Q75', 'fea_v_skew', 'fea_v_kurt', 'fea_a_std', 'fea_a_max', 'fea_a_min', 'fea_a_p2p', 'fea_a_unicnt', 'fea_a_Q25', 'fea_a_Q50', 'fea_a_Q75', 'fea_a_skew', 'fea_a_kurt', 'fea_dt_mean', 'fea_dt_Q25', 'fea_dt_Q75', 'fea_dt_kurt']
->->->->->->->->->-> 唯一值特征:  0 []
->->->->->->->->->-> 去除唯一值后:  25 ['fea_v_mean', 'fea_v_std', 'fea_v_max', 'fea_v_min', 'fea_v_p2p', 'fea_v_unicnt', 'fea_v_Q25', 'fea_v_Q50', 'fea_v_Q75', 'fea_v_skew', 'fea_v_kurt', 'fea_a_std', 'fea_a_max', 'fea_a_min', 'fea_a_p2p', 'fea_a_unicnt', 'fea_a_Q25', 'fea_a_Q50', 'fea_a_Q75', 'fea_a_skew', 'fea_a_kurt', 'fea_dt_mean', 'fea_dt_Q25', 'fea_dt_Q75', 'fea_dt_kurt']





In [14]:
# Number of distinct source files in train and in test.
train['filename'].nunique(), test['filename'].nunique()

(15918, 157860)

In [15]:
# x_train.to_csv('../data_tmp/x_train.csv', index=False)
# x_test.to_csv('../data_tmp/x_test.csv', index=False)

# y_train.to_csv('../data_tmp/y_train.csv', index=False)

In [16]:
# cols = ['fea_v_mean', 'fea_v_std', 'fea_v_max', 'fea_v_min', 'fea_v_p2p',
#        'fea_v_unicnt', 'fea_v_Q25', 'fea_v_Q75', 'fea_v_skew', 'fea_v_kurt',
#        'fea_a_std', 'fea_a_max', 'fea_a_min', 'fea_a_p2p', 'fea_a_unicnt',
#        'fea_a_Q25', 'fea_a_Q75', 'fea_a_skew', 'fea_a_kurt', 'fea_dt_mean',
#        'fea_dt_Q25', 'fea_dt_Q75', 'fea_dt_kurt', 'fea_v_Q50', 'fea_a_Q50']
# len(cols)

In [17]:
# col_lst = []
# cv_scores_lst = []
# cv_scores_mean_lst = []
# cv_scores_std_lst = []
# cab_train_lst = []
# cab_test_lst = []


# col_bst = ''
# cv_scores_bst = -1
# cv_scores_mean_bst = -1
# cv_scores_std_bst = 99
# cab_train_bst = []
# cab_test_bst = []

# for col in tqdm(features):
#     cab_train, cab_test, cv_scores, cv_scores_mean, cv_scores_std = cab_model(x_train.drop(col, axis=1), y_train, x_test.drop(col, axis=1), y_train, sd)
    
#     col_lst.append(col)
#     cv_scores_lst.append(cv_scores)
#     cv_scores_mean_lst.append(cv_scores_mean)
#     cv_scores_std_lst.append(cv_scores_std)
#     cab_train_lst.append(cab_train)
#     cab_test_lst.append(cab_test)
    
#     if cv_scores_mean>cv_scores_mean_bst and (cv_scores_std<=cv_scores_std_bst or (cv_scores_std>cv_scores_std_bst and cv_scores_std-cv_scores_std_bst<=0.001)):
#         col_bst = col
#         cv_scores_bst = cv_scores
#         cv_scores_mean_bst = cv_scores_mean
#         cv_scores_std_bst = cv_scores_std
#         cab_train_bst = cab_train
#         cab_test_bst = cab_test
#         print('->'*20, 'fea_bst: {}, cv_scores_bst:{} , cv_scores_mean_bst:{}, cv_scores_std:{}'.format(col_bst,cv_scores_bst,cv_scores_mean_bst,cv_scores_std) )

In [18]:
# sd_lst = []
# cv_scores_lst = []
# cv_scores_mean_lst = []
# cv_scores_std_lst = []
# cab_train_lst = []
# cab_test_lst = []


# sd_bst = -1
# cv_scores_bst = -1
# cv_scores_mean_bst = -1
# cv_scores_std_bst = 99
# cab_train_bst = []
# cab_test_bst = []

# for sd in tqdm(range(100)):
#     cab_train, cab_test, cv_scores, cv_scores_mean, cv_scores_std = cab_model(x_train, y_train, x_test, y_train, sd)
    
#     sd_lst.append(sd)
#     cv_scores_lst.append(cv_scores)
#     cv_scores_mean_lst.append(cv_scores_mean)
#     cv_scores_std_lst.append(cv_scores_std)
#     cab_train_lst.append(cab_train)
#     cab_test_lst.append(cab_test)
    
#     if cv_scores_mean>cv_scores_mean_bst and (cv_scores_std<=cv_scores_std_bst or (cv_scores_std>cv_scores_std_bst and cv_scores_std-cv_scores_std_bst<=0.001)):
#         sd_bst = sd
#         cv_scores_bst = cv_scores
#         cv_scores_mean_bst = cv_scores_mean
#         cv_scores_std_bst = cv_scores_std
#         cab_train_bst = cab_train
#         cab_test_bst = cab_test
#         print('->'*20, 'sd_bst: {}, cv_scores_bst:{} , cv_scores_mean_bst:{}, cv_scores_std:{}'.format(sd_bst,cv_scores_bst,cv_scores_mean_bst,cv_scores_std) )

In [19]:
# from sklearn.preprocessing import StandardScaler
# import joblib

# # 创建 StandardScaler 对象
# scaler = StandardScaler()

# # 使用 X_train 拟合 scaler
# scaler.fit(x_train)

# # 保存 scaler 对象
# joblib.dump(scaler, '../model/scaler_v7.pkl')

# # 使用拟合好的 scaler 转换
# x_train = scaler.transform(x_train)
# x_test = scaler.transform(x_test)

# x_train = pd.DataFrame(x_train, columns = features)
# x_test = pd.DataFrame(x_test, columns = features)

In [20]:
# Final list of feature columns fed to the models.
x_train.columns

Index(['fea_v_mean', 'fea_v_std', 'fea_v_max', 'fea_v_min', 'fea_v_p2p',
       'fea_v_unicnt', 'fea_v_Q25', 'fea_v_Q50', 'fea_v_Q75', 'fea_v_skew',
       'fea_v_kurt', 'fea_a_std', 'fea_a_max', 'fea_a_min', 'fea_a_p2p',
       'fea_a_unicnt', 'fea_a_Q25', 'fea_a_Q50', 'fea_a_Q75', 'fea_a_skew',
       'fea_a_kurt', 'fea_dt_mean', 'fea_dt_Q25', 'fea_dt_Q75', 'fea_dt_kurt'],
      dtype='object')

In [None]:
# Seed for the StratifiedKFold shuffle inside cv_model.
sd = 518885135

# 5-fold CatBoost CV. The filename array is passed as `train_y_2`, which
# cv_model currently ignores (only its commented-out GroupKFold split uses it).
# NOTE(review): train has many rows per filename, so row-level stratified folds
# can put rows of one file in both the training and validation split — the CV
# score may be optimistic; confirm whether a grouped split was intended.
cab_train, cab_test, cv_scores, cv_scores_mean, cv_scores_std, cab_model_lst = cab_model(x_train, y_train, x_test, train_df['filename'].values, sd)

************************************ 1 ************************************
0:	learn: 0.4464535	test: 0.4464598	best: 0.4464598 (0)	total: 41.2ms	remaining: 5m 29s
500:	learn: 0.0129499	test: 0.0132466	best: 0.0132466 (500)	total: 16.7s	remaining: 4m 9s
1000:	learn: 0.0121183	test: 0.0126366	best: 0.0126366 (1000)	total: 33.6s	remaining: 3m 55s
1500:	learn: 0.0116794	test: 0.0124094	best: 0.0124094 (1500)	total: 50.9s	remaining: 3m 40s
2000:	learn: 0.0113957	test: 0.0123148	best: 0.0123148 (2000)	total: 1m 8s	remaining: 3m 24s
2500:	learn: 0.0111597	test: 0.0122484	best: 0.0122484 (2500)	total: 1m 26s	remaining: 3m 9s
3000:	learn: 0.0109626	test: 0.0122095	best: 0.0122093 (2998)	total: 1m 43s	remaining: 2m 52s
3500:	learn: 0.0107990	test: 0.0121885	best: 0.0121884 (3494)	total: 2m 1s	remaining: 2m 35s
bestTest = 0.01217305487
bestIteration = 3839
Shrink model to first 3840 iterations.


In [None]:
# 5-fold XGBoost CV with the same fold seed. This overwrites `cv_scores`,
# `cv_scores_mean` and `cv_scores_std` from the CatBoost run. `y_train` is
# passed as `train_y_2`, which cv_model does not use.
xgb_train, xgb_test, cv_scores, cv_scores_mean, cv_scores_std, xgb_model_lst = xgb_model(x_train, y_train, x_test, y_train, sd)

************************************ 1 ************************************
[0]	train-mlogloss:1.72947	eval-mlogloss:1.73126
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 500 rounds.
[500]	train-mlogloss:0.56726	eval-mlogloss:0.62343
[1000]	train-mlogloss:0.52926	eval-mlogloss:0.59997
[1500]	train-mlogloss:0.51743	eval-mlogloss:0.59632
[2000]	train-mlogloss:0.51016	eval-mlogloss:0.59459
[2500]	train-mlogloss:0.50429	eval-mlogloss:0.59340
[3000]	train-mlogloss:0.49962	eval-mlogloss:0.59238
[3500]	train-mlogloss:0.49593	eval-mlogloss:0.59187
[4000]	train-mlogloss:0.49271	eval-mlogloss:0.59163
[4500]	train-mlogloss:0.48989	eval-mlogloss:0.59139
[5000]	train-mlogloss:0.48752	eval-mlogloss:0.59110
[5500]	train-mlogloss:0.48527	eval-mlogloss:0.59093
[6000]	train-mlogloss:0.48335	eval-mlogloss:0.59088


In [26]:
# Persist every fold's CatBoost model, one pickle per fold index.
for fold_idx, fitted in enumerate(tqdm(cab_model_lst)):
    joblib.dump(fitted, '../model/v51_sd518885135_feacut_10drop_merge_v13_cab_{}.pkl'.format(fold_idx))

# Disabled: reload saved CatBoost folds and average their test probabilities.
# test = np.zeros((x_test.shape[0], len(np.unique(y_train))))
# for i in tqdm(range(5)):
#     model = joblib.load( '../model/v51_sd518885135_feacut_10drop_merge_v41_cab_{}.pkl'.format(i))
#     print('load model {}'.format(i))
#     test_pred_proba = model.predict_proba(x_test)
#     test += test_pred_proba / 5


# Persist every fold's XGBoost model the same way.
for fold_idx, fitted in enumerate(tqdm(xgb_model_lst)):
    joblib.dump(fitted, '../model/v51_sd518885135_feacut_10drop_merge_v13_xgb_{}.pkl'.format(fold_idx))

# Disabled: reload saved XGBoost folds and average their test probabilities.
# test = np.zeros((x_test.shape[0], len(np.unique(y_train))))
# for i in tqdm(range(5)):
#     model = joblib.load('../model/v51_sd518885135_feacut_10drop_merge_v43_xgb_{}.pkl'.format(i) )
#     print('load model {}'.format(i))
#     test_pred_proba = model.predict(xgb.DMatrix(x_test), ntree_limit=model.best_ntree_limit)
#     test += test_pred_proba / 5



100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:17<00:00,  3.45s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.40it/s]


In [None]:
# 合并概率矩阵和 file 列
combined_df = pd.concat([test_df[['filename']].reset_index(drop=True), pd.DataFrame(1.0*cab_test )], axis=1)
# combined_df = pd.concat([test_df[['filename']].reset_index(drop=True), pd.DataFrame(0.5*cab_test +0.5*xgb_test )], axis=1)

# 根据 'file' 列分组并计算 n 列的最大值
max_values = combined_df.groupby('filename').mean().reset_index()
print(max_values.shape)
test_df_out = max_values.copy()
print(max_values.shape)

In [None]:
col_label = 'label_combined_num'

# Per-file predicted class and its averaged probability. `max_values` holds
# the filename in column 0 and one probability column per class after it.
class_proba = max_values.iloc[:, 1:].values
test_df_out[col_label] = np.argmax(class_proba, axis=1)
test_df_out['prob'] = np.max(class_proba, axis=1)
print(test_df_out[col_label].value_counts().head())

file = 'v51_sd518885135_feacut_10drop_merge_v13_cab_5fold_mean'
test_df_out['label_combined'] = test_df_out['label_combined_num'].apply(lambda x: dict_num2str[x])

# Decode every class id ONCE into its per-label values (v/2 + 0.5 per label
# value v), then expand to all rows by indexing the lookup table. The previous
# code re-split each row's label string once per label column.
decoded_classes = np.array([
    [int(part) / 2 + 0.5 for part in dict_num2str[class_id].split(',')]
    for class_id in range(len(dict_num2str))
])
label_values = np.round(decoded_classes[test_df_out[col_label].values], 4)
label_frame = pd.DataFrame(label_values, columns=cols_label, index=test_df_out.index)

# Add all label columns in one concat; dropping any existing ones first keeps
# the cell safe to re-run.
test_df_out = pd.concat(
    [test_df_out.drop(columns=cols_label, errors='ignore'), label_frame],
    axis=1,
)
print(test_df_out['Zone_Air_Temperature_Sensor'].value_counts())

cols_out = ['filename'] + cols_label
test_df_out[cols_out].to_csv('../res/{}.csv'.format(file), index=False)

In [None]:
# 示例 CSV 文件列表
csv_files = ['../res/{}.csv'.format(file)]  # 替换为你的 CSV 文件名
output_filename = '../tar/{}.tar.gz'.format(file)

# 调用函数
compress_csv_to_tar_gz(csv_files, output_filename)

print(f"已将 CSV 文件压缩为 {output_filename}")

In [None]:
0    70397
1    37300
2    34790
9    13737
5    13590
Name: label_combined_num, dtype: int64
100%|██████████████████████████████████████████████████████████████████████████████████| 94/94 [01:39<00:00,  1.06s/it]
0.0    304271
1.0      9839
0.5      1610

In [46]:
import tarfile
import os
import numpy as np
import pandas as pd


# Overlay the new predictions on an earlier submission: rows are stacked
# old-then-new and only the last row per filename is kept, so a file present
# in both keeps its new prediction, and files missing from the new run keep
# the old one.
df_bef = pd.read_csv('../res/v41_sd61_feacut_10drop_merge_v14_cab_5fold.csv')
df_aft = pd.read_csv('../res/{}.csv'.format(file) )

print(df_bef.shape, df_aft.shape)
df_mge = pd.concat([df_bef, df_aft], axis=0)
df_mge = df_mge.groupby('filename').tail(1)

file_fix = '{}_fix'.format(file)
# index=False keeps the column layout identical to the primary submission
# file; without it the row index is written as an extra unnamed first column.
df_mge.to_csv('../res/{}.csv'.format(file_fix), index=False)

In [47]:
# 指定要压缩的CSV文件路径
csv_file = '../res/{}.csv'.format(file_fix)

# 指定输出的tar.gz文件路径
output_tar_gz_path = '../tar/{}.tar.gz'.format(file_fix)

# 检查CSV文件是否存在
if not os.path.exists(csv_file):
    print(f'File {csv_file} does not exist.')
else:
    # 创建一个tar.gz文件
    with tarfile.open(output_tar_gz_path, 'w:gz') as tar:
        # 添加文件到tar归档中
        tar.add(csv_file, arcname=os.path.basename(csv_file))
        print(f'Added {csv_file} to {output_tar_gz_path}')

    print(f'File {csv_file} has been compressed into {output_tar_gz_path}')

Added ../res/v51_sd518885135_feacut_10drop_merge_v43_5cab5xgb_5fold_max_fix.csv to ../tar/v51_sd518885135_feacut_10drop_merge_v43_5cab5xgb_5fold_max_fix.tar.gz
File ../res/v51_sd518885135_feacut_10drop_merge_v43_5cab5xgb_5fold_max_fix.csv has been compressed into ../tar/v51_sd518885135_feacut_10drop_merge_v43_5cab5xgb_5fold_max_fix.tar.gz


In [None]:
# Bundle the fold-averaged test probabilities of both models with the label
# mappings needed to decode them. The out-of-fold train matrices are left out.
dict_prob = {
#             'cab_train':cab_train, 
            'cab_test':cab_test,
#             'xgb_train':xgb_train, 
            'xgb_test':xgb_test,
            'dict_num2str':dict_num2str, 'dict_str2num':dict_str2num, 'cols_label':cols_label}


# Save to a .npy file. The value is a dict, so loading it back needs
# np.load(..., allow_pickle=True).item(), as the commented-out loader further
# down does.
# NOTE(review): the file name says "v12" while the models and submission in
# this notebook are tagged "v13" — confirm the intended version tag.
np.save('../res/v51_sd518885135_feacut_10drop_merge_v12_5fold.npy', dict_prob)

In [486]:
# v41_cab 0.441	0.417	0.576	0.411	0.987	
# v41_lgb 0.399	0.343	0.445	0.397	0.988
# v41_xgb 0.444	0.399	0.541	0.414	0.987

# v41_8cab_2lgb      0.446	0.427	0.594	0.416	0.987
# v41_7cab_3lgb      0.449	0.428	0.598	0.419	0.987
# v41_7xgb_3cab      0.457	0.430	0.595	0.423	0.987
# v41_6xgb_4cab      0.461	0.434	0.600	0.426	0.987
# v41_5xgb_5cab      0.461	0.434	0.600	0.426	0.987
# v41_4xgb_4cab_2lgb 0.460	0.428	0.590	0.427	0.987
# v41_sd61_feacut_1_5cab_5xgb 0.469	0.439	0.603	0.432	0.987
# v41_sd61_feacut_1_6cab_4xgb 0.470	0.435	0.598	0.432	0.987

# v41_sd61_feacut_6drop_6xgb_4cab 0.470	0.433	0.591	0.434	0.988


# v42_cab 0.429	0.414	0.588	0.397	0.987
# v41-sd1024_cab 0.441	0.423	0.596	0.411	0.987	
# v41-sd61_cab 0.444	0.423	0.590	0.413	0.987

# v41_sd61_feacut_1_lgb 0.398	0.340	0.441	0.397	0.988
# v41_sd61_feacut_3_cab 0.448	0.416	0.575	0.421	0.987
# v41-sd61_feacut_6drop_cab 0.454	0.422	0.583	0.425	0.987
# v41-sd61_feacut_7drop_cab 0.455	0.424	0.584	0.428	0.987

# v41-sd61_feacut_7drop_v4_cab 0.468	0.433	0.595	0.437	0.987
# v41_sd61_feacut_7drop_v4_5cab_5xgb	0.479	0.445	0.606	0.438	0.988

# v41_sd61_feacut_10drop_v1_cab 0.471	0.436	0.595	0.439	0.987
# v41_sd61_feacut_10drop_v1_5cab_5xgb 0.482	0.450	0.617	0.442	0.987
# v41_sd61_feacut_10drop_v1_55cab_45xgb	0.483	0.451	0.619	0.443	0.987	
# v41_sd61_feacut_10drop_v2_cab_5fold 	0.487	0.446	0.601	0.457	0.987
# v41_sd61_feacut_10drop_v23_cab_5fold 0.487	0.443	0.588	0.468	0.988
# v41_sd61_feacut_10drop_v25_cab_5fold 0.489	0.448	0.593	0.469	0.988
# v41_sd61_feacut_10drop_merge_v1_55cab_45xgb 0.498	0.461	0.617	0.465	0.988
# v41_sd61_feacut_10drop_merge_v12_cab_5fold 0.495	0.453	0.606	0.471	0.988

# v41_sd61_feacut_10drop_merge_v13_cab_5fold 0.496	0.456	0.604	0.472	0.988


# v41_sd61_feacut_10drop_merge_v24_cab_5fold_fix 0.567	0.516	0.644	0.557	0.989
# v41_sd61_feacut_10drop_merge_v25_cab_5fold_fix 0.570	0.514	0.642	0.563	0.989	
# v41_sd61_feacut_10drop_merge_v26_cab_5fold_fix 0.575	0.518	0.647	0.568	0.989

# v41_sd61_feacut_10drop_merge_v27_cab_5fold_fix 0.576	0.520	0.649	0.570	0.989	
# v41_sd61_feacut_10drop_merge_v28_cab_5fold_fix 0.579	0.522	0.648	0.572	0.989

# v51_sd518885_feacut_10drop_merge_v1_cab_5fold         0.580	0.523	0.648	0.574	0.989	
# v51_sd518885_feacut_10drop_merge_v1_cab_5fold_mean	0.580	0.531	0.652	0.570	0.989	
# v51_sd518885_feacut_10drop_merge_v22_cab_5fold_max  0.580	0.524	0.650	0.576	0.989
# v51_sd518885_feacut_10drop_merge_v22_cab_5fold_mean 0.581	0.533	0.656	0.572	0.990	

# v51_sd518885135_feacut_10drop_merge_v22_cab_5fold_mean 0.582	0.535	0.657	0.573	0.990

# v51_sd518885135_feacut_10drop_merge_v22_5cab5xgb_5fold_mean 0.591 0.552 0.683 0.571 0.990


# v51_sd518885135_feacut_10drop_merge_v41_cab_5fold_mean 0.586	0.534	0.659	0.579	0.990	
# v51_sd518885135_feacut_10drop_merge_v41_xgb_5fold_mean 0.582	0.537	0.671	0.564	0.990
# v51_sd518885135_feacut_10drop_merge_v41_6cab4xgb_5fold_mean 0.594 0.549 0.677 0.578 0.990

# v51_sd518885135_feacut_10drop_merge_v42_xgb_5fold_mean 0.583	0.540	0.671	0.564	0.989
# v51_sd518885135_feacut_10drop_merge_v42_6cab4xgb_5fold_mean 0.594	0.551	0.680	0.577	0.990	
# v51_sd518885135_feacut_10drop_merge_v42_5cab5xgb_5fold_max 0.598	0.545	0.673	0.585	0.989
# v51_sd518885135_feacut_10drop_merge_v42_55cab45xgb_5fold_max 0.598	0.544	0.672	0.587	0.989

# v51_sd518885135_feacut_10drop_merge_v43_5cab5xgb_5fold_max	0.598	0.545	0.673	0.586	0.989



In [None]:
# 全量数据 切片
# v51_sd518885135_feacut_10drop_merge_v10_cab_5fold_max	0.554	0.504	0.642	0.544	0.989
# v51_sd518885135_feacut_10drop_merge_v10_5cab5xgb_5fold_max 0.557	0.512	0.655	0.539	0.989

In [597]:
# dict_prob = np.load('../res/v41_lgb_cab_xgb.npy', allow_pickle=True).item()

# lgb_train = dict_prob['lgb_train']
# cab_train = dict_prob['cab_train']

In [None]:
# xgboost                            1.0.1
# lightgbm                           3.1.1