In [3]:

import os
import gc 
import math 

import pandas as pd 
import numpy as np

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from tqdm import tqdm 
import matplotlib.pyplot as plt 
import time 
import warnings
warnings.filterwarnings('ignore')


# 读取数据

In [4]:
# Directory that holds train.csv and testA.csv (later cells also read sample_submit.csv
# from it and write submit.csv into it). Set HEARTBEAT_DATA_DIR to point the notebook
# at another location; the original author's folder stays the default.
path = os.environ.get(
    'HEARTBEAT_DATA_DIR',
    '/Users/ying.xie/Documents/finance_analysis/datawhale_projects/heartbeat_tianchi',
)
train_data = pd.read_csv(os.path.join(path, 'train.csv'))
test_data = pd.read_csv(os.path.join(path, 'testA.csv'))

In [5]:
# Column dtypes, non-null counts and memory footprint of the raw training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 100000 non-null  int64  
 1   heartbeat_signals  100000 non-null  object 
 2   label              100000 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.3+ MB


In [6]:
# Same summary for the raw test frame.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 20000 non-null  int64 
 1   heartbeat_signals  20000 non-null  object
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [7]:
# Preview the first raw training rows; heartbeat_signals is a comma-separated string.
train_data.head()

Unnamed: 0,id,heartbeat_signals,label
0,0,"0.9912297987616655,0.9435330436439665,0.764677...",0.0
1,1,"0.9714822034884503,0.9289687459588268,0.572932...",0.0
2,2,"1.0,0.9591487564065292,0.7013782792997189,0.23...",2.0
3,3,"0.9757952826275774,0.9340884687738161,0.659636...",0.0
4,4,"0.0,0.055816398940721094,0.26129357194994196,0...",2.0


In [8]:
# Preview the first raw test rows.
test_data.head()

Unnamed: 0,id,heartbeat_signals
0,100000,"0.9915713654170097,1.0,0.6318163407681274,0.13..."
1,100001,"0.6075533139615096,0.5417083883163654,0.340694..."
2,100002,"0.9752726292239277,0.6710965234906665,0.686758..."
3,100003,"0.9956348033996116,0.9170249621481004,0.521096..."
4,100004,"1.0,0.8879490481178918,0.745564725322326,0.531..."


In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score

## 数据预处理

In [9]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` in place to the narrowest dtype that fits their range.

    Signed and unsigned integer columns are narrowed within their own family,
    float columns are narrowed to float16/float32 (float64 when neither fits,
    e.g. for all-NaN or infinite data), and object columns are converted with
    ``astype('str')``. Every other dtype (bool, datetime, categorical, pandas
    extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    allow_float16 : bool, default True
        When True, float columns whose range fits are stored as float16, which
        keeps only about three significant decimal digits. Pass False to stop
        at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f" Memory usage of dataframe is {start_mem:.2f} MB")

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    small_floats = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy dtypes are handled; extension dtypes have no `.kind` contract here.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind == 'O':
            df[col] = df[col].astype('str')
            continue
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'f':
            # NaN bounds fail every comparison, so such columns fall through to float64.
            target = np.float64
            for candidate in small_floats:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
        else:
            target = None
            for candidate in (signed_ints if kind == 'i' else unsigned_ints):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the ratio so a frame reporting zero bytes does not raise.
    saved_pct = (start_mem - end_mem) * 100 / start_mem if start_mem else 0.0

    print(f" Memory usage after optimization is {end_mem:.2f}MB")
    print(f" Decreased by {saved_pct:.1f}%")

    return df
        

In [10]:
# reduce_mem_usage assigns the converted columns back onto the frame it receives and
# returns that same frame, so `train` and `train_data` refer to one object from here on.
train = reduce_mem_usage(train_data)

 Memory usage of dataframe is 2.29 MB
 Memory usage after optimization is 1.34MB
 Decreased by 41.7%


In [11]:
# Same in-place downcast for the test frame; `test` is the same object as `test_data`.
test = reduce_mem_usage(test_data)

 Memory usage of dataframe is 0.31 MB
 Memory usage after optimization is 0.23MB
 Decreased by 25.0%


## 简单预处理

In [12]:
def _expand_signals(raw_frame, has_label):
    """Split each comma-separated signal string into one float column per sample.

    ``raw_frame`` is read positionally: column 0 is the id, column 1 the signal
    string and, when ``has_label`` is true, column 2 the label.

    Returns a ``(rows, frame)`` pair: the flat Python rows and the DataFrame
    built from them, with columns ``id``, ``s_0`` ... ``s_{n-1}`` and, when
    ``has_label`` is true, ``label``.
    """
    rows = []
    for record in raw_frame.values:
        samples = [float(value) for value in record[1].split(',')]
        tail = [record[2]] if has_label else []
        rows.append([record[0]] + samples + tail)

    frame = pd.DataFrame(np.array(rows))
    # Everything in a row except the id (and the label, if present) is a signal sample.
    n_signals = len(rows[0]) - (2 if has_label else 1)
    frame.columns = (
        ['id']
        + [f's_{i}' for i in range(n_signals)]
        + (['label'] if has_label else [])
    )
    return rows, frame


# Read from the raw frames (train_data / test_data) rather than from `train` / `test`:
# those names are rebound below, so reading from them would make a second run of this
# cell fail on the already-expanded frames.
train_list, train = _expand_signals(train_data, has_label=True)
train = reduce_mem_usage(train)

test_list, test = _expand_signals(test_data, has_label=False)
test = reduce_mem_usage(test)
    

 Memory usage of dataframe is 157.93 MB
 Memory usage after optimization is 39.67MB
 Decreased by 74.9%
 Memory usage of dataframe is 31.43 MB
 Memory usage after optimization is 7.90MB
 Decreased by 74.9%


In [13]:
# Check the expanded training layout: id, one s_<k> column per signal sample, then label.
train.head()

Unnamed: 0,id,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,...,s_196,s_197,s_198,s_199,s_200,s_201,s_202,s_203,s_204,label
0,0.0,0.991211,0.943359,0.764648,0.618652,0.379639,0.190796,0.040222,0.026001,0.031708,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.97168,0.929199,0.572754,0.178467,0.122986,0.132324,0.094421,0.0896,0.030487,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,1.0,0.958984,0.701172,0.231812,0.0,0.080688,0.128418,0.1875,0.280762,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,3.0,0.975586,0.934082,0.659668,0.249878,0.237061,0.281494,0.249878,0.249878,0.241455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.055817,0.26123,0.359863,0.433105,0.453613,0.499023,0.542969,0.616699,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [14]:
# Check the expanded test layout: id followed by the s_<k> signal columns (no label).
test.head()

Unnamed: 0,id,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,...,s_195,s_196,s_197,s_198,s_199,s_200,s_201,s_202,s_203,s_204
0,100000.0,0.991699,1.0,0.631836,0.13623,0.041412,0.102722,0.12085,0.123413,0.10791,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100001.0,0.607422,0.541504,0.340576,0.0,0.090698,0.164917,0.195068,0.168823,0.198853,...,0.389893,0.386963,0.367188,0.364014,0.360596,0.357178,0.350586,0.350586,0.350586,0.36377
2,100002.0,0.975098,0.670898,0.686523,0.708496,0.71875,0.716797,0.720703,0.70166,0.59668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100003.0,0.995605,0.916992,0.520996,0.0,0.221802,0.404053,0.490479,0.527344,0.518066,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100004.0,1.0,0.888184,0.745605,0.531738,0.380371,0.224609,0.091125,0.057648,0.003914,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 训练数据/测试数据准备

In [15]:
# Model inputs are the signal columns only: drop the bookkeeping columns by name.
x_train = train.drop(columns=['id', 'label'])
# Target vector for training.
y_train = train.loc[:, 'label']
# The test frame has no label column, so only the id is removed.
x_test = test.drop(columns=['id'])

# 训练模型

In [16]:
def abs_sum(y_pre, y_tru):
    """Return the sum of absolute element-wise differences between two arrays.

    Parameters
    ----------
    y_pre, y_tru : array-like
        Numeric inputs of any dimensionality (scalars, vectors, matrices, ...)
        whose shapes are equal or broadcast against each other.

    Returns
    -------
    numpy.float64
        ``sum(|y_pre - y_tru|)`` over every element, accumulated in float64.
    """
    # Cast to float64 so low-precision inputs (e.g. float16) are not summed in their own dtype.
    y_pre = np.asarray(y_pre, dtype=np.float64)
    y_tru = np.asarray(y_tru, dtype=np.float64)
    # One NumPy reduction over all axes: works for any rank, unlike nested builtin sum().
    return np.abs(y_pre - y_tru).sum()

In [27]:
def cv_model(clf, train_x, train_y, test_x, clf_name, folds=5, seed=2021, num_class=4):
    """Train a LightGBM multiclass model with K-fold CV and average its test predictions.

    Parameters
    ----------
    clf : module
        The ``lightgbm`` module; its ``Dataset``, ``train`` and callback
        factories are used.
    train_x : pandas.DataFrame
        Training features; fold rows are selected positionally with ``.iloc``.
    train_y : array-like
        Training labels. They are cast to integers to build the one-hot
        validation targets, so they are expected to be class ids in
        ``[0, num_class)``.
    test_x : pandas.DataFrame
        Features that every fold's model predicts on.
    clf_name : str
        Model family selector; only ``'lgb'`` is implemented.
    folds : int, default 5
        Number of shuffled K-fold splits.
    seed : int, default 2021
        Seed used for both the fold shuffle and LightGBM.
    num_class : int, default 4
        Number of target classes.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(test_x), num_class)``: the mean over folds of the
        predicted class probabilities for ``test_x``.

    Raises
    ------
    ValueError
        If ``clf_name`` is not ``'lgb'``.
    """
    if clf_name != 'lgb':
        # Fail early with a clear message instead of a NameError after the first fold.
        raise ValueError(f"Unsupported clf_name {clf_name!r}; only 'lgb' is implemented")

    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    # Positional indexing on a plain array: independent of whatever index train_y carries.
    labels = np.asarray(train_y)
    test_proba = np.zeros((test_x.shape[0], num_class))
    # Row k of the identity matrix is the one-hot vector of class k. Unlike an encoder
    # refitted per fold, the columns stay aligned even if a fold lacks a class.
    one_hot_rows = np.eye(num_class)
    cv_scores = []

    params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': num_class,
        'num_leaves': 2 ** 5,
        'feature_fraction': 0.8,
        # NOTE(review): bagging_fraction is not set; confirm bagging_freq has the intended effect.
        'bagging_freq': 4,
        'learning_rate': 0.1,
        'seed': seed,
        # NOTE(review): nthread and n_jobs both look like thread-count settings with
        # different values; consider keeping only one.
        'nthread': 28,
        'n_jobs': 24,
        'verbose': -1,
    }

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, labels)):
        print(f"*********** {i+1} ************")
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        if hasattr(clf, 'log_evaluation'):
            # Newer LightGBM: logging and early stopping are configured through callbacks.
            model = clf.train(dict(params),
                              train_set=train_matrix,
                              valid_sets=[valid_matrix],
                              num_boost_round=2000,
                              callbacks=[clf.early_stopping(200),
                                         clf.log_evaluation(100)])
        else:
            # Older LightGBM without the log_evaluation callback: keep the keyword form.
            model = clf.train(dict(params),
                              train_set=train_matrix,
                              valid_sets=[valid_matrix],
                              num_boost_round=2000,
                              verbose_eval=100,
                              early_stopping_rounds=200)

        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        val_onehot = one_hot_rows[val_y.astype(int)]
        print('预测的概率矩阵为:')
        print(test_pred)

        test_proba += test_pred
        score = abs_sum(val_onehot, val_pred)
        cv_scores.append(score)
        print(cv_scores)

    print(f"{clf_name}_score_train_list {cv_scores}")
    print(f"{clf_name}_score_mean {np.mean(cv_scores)}")
    print(f"{clf_name}_score_std {np.std(cv_scores)}")

    # Each fold added its prediction once; divide to get the per-fold mean.
    return test_proba / kf.n_splits
    

In [19]:
def lgb_model(x_train, y_train, x_test):
    """Run ``cv_model`` with the LightGBM module and the ``'lgb'`` tag.

    Returns whatever ``cv_model`` returns for the given train/test inputs.
    """
    return cv_model(lgb, x_train, y_train, x_test, 'lgb')

In [28]:
# Run the cross-validated LightGBM training; lgb_test receives the fold-averaged
# test-set predictions returned by cv_model.
lgb_test = lgb_model(x_train, y_train, x_test)

*********** 1 ************
Training until validation scores don't improve for 200 rounds
[100]	valid_0's multi_logloss: 0.0533137
[200]	valid_0's multi_logloss: 0.0423642
[300]	valid_0's multi_logloss: 0.0414219
[400]	valid_0's multi_logloss: 0.0427645
Early stopping, best iteration is:
[259]	valid_0's multi_logloss: 0.041237
预测的概率矩阵为:
[[9.99919845e-01 7.60732054e-05 1.17983493e-06 2.90163499e-06]
 [2.13311303e-04 9.40944123e-04 9.98845697e-01 4.73504059e-08]
 [4.33107342e-06 6.63071871e-07 2.44483431e-06 9.99992561e-01]
 ...
 [8.31359160e-02 3.46899032e-04 9.16478761e-01 3.84240990e-05]
 [9.99879669e-01 1.19852330e-04 2.94194980e-07 1.84078657e-07]
 [9.62974182e-01 2.07470761e-02 8.26377118e-03 8.01497109e-03]]
[681.3519143142246]
*********** 2 ************
Training until validation scores don't improve for 200 rounds
[100]	valid_0's multi_logloss: 0.0585298
[200]	valid_0's multi_logloss: 0.047303
[300]	valid_0's multi_logloss: 0.0454707
[400]	valid_0's multi_logloss: 0.0463542
Early 

In [29]:
# Display the averaged prediction matrix (one row per test sample).
lgb_test

array([[9.99958730e-01, 3.90181673e-05, 5.66762133e-07, 1.68502797e-06],
       [7.76489458e-05, 3.85607851e-04, 9.99536727e-01, 1.60762734e-08],
       [3.54458940e-06, 3.29177999e-07, 1.12820566e-06, 9.99994998e-01],
       ...,
       [5.18447260e-02, 6.74267289e-04, 9.47462457e-01, 1.85498585e-05],
       [9.99900682e-01, 9.91248868e-05, 1.24044994e-07, 6.87668540e-08],
       [9.26122147e-01, 1.52383057e-02, 4.54663771e-02, 1.31731705e-02]])

In [30]:
# Wrap the prediction matrix in a DataFrame; with no column names given, the columns
# are the integer positions 0, 1, 2, ... used by the cells below.
temp = pd.DataFrame(lgb_test)
temp

Unnamed: 0,0,1,2,3
0,0.999959,3.901817e-05,5.667621e-07,1.685028e-06
1,0.000078,3.856079e-04,9.995367e-01,1.607627e-08
2,0.000004,3.291780e-07,1.128206e-06,9.999950e-01
3,0.999963,1.926525e-05,1.771848e-05,2.018472e-08
4,0.999960,8.852832e-06,2.971621e-05,9.887409e-07
...,...,...,...,...
19995,0.998484,4.747561e-04,2.884610e-04,7.529623e-04
19996,0.999811,1.609610e-04,2.735612e-05,1.862927e-07
19997,0.051845,6.742673e-04,9.474625e-01,1.854986e-05
19998,0.999901,9.912489e-05,1.240450e-07,6.876685e-08


In [34]:
# Load the submission template from the data directory; the label_* columns are
# overwritten in the following cells.
result = pd.read_csv(os.path.join(path,'sample_submit.csv'))

In [36]:
# Column 0 of the predictions becomes label_0. The assignment aligns on the row index,
# so it assumes the template rows are in the same order as the test set (TODO confirm).
result['label_0'] = temp[0]

In [38]:
# Column 1 of the predictions becomes label_1 (index-aligned assignment).
result['label_1'] = temp[1]

In [39]:
# Column 2 of the predictions becomes label_2 (index-aligned assignment).
result['label_2'] = temp[2]

In [40]:
# Column 3 of the predictions becomes label_3 (index-aligned assignment).
result['label_3'] = temp[3]

In [41]:
# Inspect the filled-in submission frame before writing it out.
result

Unnamed: 0,id,label_0,label_1,label_2,label_3
0,100000,0.999959,3.901817e-05,5.667621e-07,1.685028e-06
1,100001,0.000078,3.856079e-04,9.995367e-01,1.607627e-08
2,100002,0.000004,3.291780e-07,1.128206e-06,9.999950e-01
3,100003,0.999963,1.926525e-05,1.771848e-05,2.018472e-08
4,100004,0.999960,8.852832e-06,2.971621e-05,9.887409e-07
...,...,...,...,...,...
19995,119995,0.998484,4.747561e-04,2.884610e-04,7.529623e-04
19996,119996,0.999811,1.609610e-04,2.735612e-05,1.862927e-07
19997,119997,0.051845,6.742673e-04,9.474625e-01,1.854986e-05
19998,119998,0.999901,9.912489e-05,1.240450e-07,6.876685e-08


In [42]:
# Write the submission next to the input data; index=False keeps the row index out of the file.
result.to_csv(os.path.join(path, 'submit.csv'),index=False)

In [43]:
# Display the frame that was just written to submit.csv.
result

Unnamed: 0,id,label_0,label_1,label_2,label_3
0,100000,0.999959,3.901817e-05,5.667621e-07,1.685028e-06
1,100001,0.000078,3.856079e-04,9.995367e-01,1.607627e-08
2,100002,0.000004,3.291780e-07,1.128206e-06,9.999950e-01
3,100003,0.999963,1.926525e-05,1.771848e-05,2.018472e-08
4,100004,0.999960,8.852832e-06,2.971621e-05,9.887409e-07
...,...,...,...,...,...
19995,119995,0.998484,4.747561e-04,2.884610e-04,7.529623e-04
19996,119996,0.999811,1.609610e-04,2.735612e-05,1.862927e-07
19997,119997,0.051845,6.742673e-04,9.474625e-01,1.854986e-05
19998,119998,0.999901,9.912489e-05,1.240450e-07,6.876685e-08
