# 融合各个模型结果

In [1]:
import os
import random
import time
import copy
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pack_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import DataLoader, Dataset, SequentialSampler

# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

%load_ext autoreload
%autoreload 2
torch.__version__

'1.4.0'

In [2]:
def setup_seed(seed):
    """Seed every random number generator used in this notebook.

    The same value is handed, in order, to ``torch.manual_seed``,
    ``torch.cuda.manual_seed_all``, ``np.random.seed`` and ``random.seed``.
    Afterwards ``torch.backends.cudnn.deterministic`` is set to ``True``.

    Args:
        seed: Seed value passed unchanged to each of the seeding functions.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


# Single seed shared by the whole notebook; applied once here.
GLOBAL_SEED = 2020
setup_seed(seed=GLOBAL_SEED)

In [3]:
# Directories used by this notebook, all relative to the working directory.
data_path = './processed_data/'   # holds processed_data_numerical.pkl
res_path = './result/'            # per-model .npy outputs; submission.csv is written here too
save_path = './processed_result'  # created below if it does not exist yet

# exist_ok=True replaces the separate os.path.exists() check: the cell stays
# safe to re-run and there is no window between checking and creating.
os.makedirs(save_path, exist_ok=True)

## 读取数据

In [4]:
# Load the preprocessed frame and shift both label columns down by one
# (the submission cell adds the 1 back before writing the CSV).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle(os.path.join(data_path, 'processed_data_numerical.pkl')).assign(
    age=lambda frame: frame['age'] - 1,
    gender=lambda frame: frame['gender'] - 1,
)

In [5]:
# Show the entries of the result directory so the files to blend can be chosen.
os.listdir(res_path)

['lstm_v8_300size_win40_5folds_1.4638.npy',
 '.ipynb_checkpoints',
 'lstm_v10_300size_win30_10folds_1.4648.npy',
 'lstm_v1_300size_win10_5folds_1.4634.npy',
 'lstm_v11_128_128_10folds_1.4646.npy',
 'attention_lstm_v1_128_128_5folds_1.4613.npy',
 'lstm_v5_512size_win10_5folds_1.4624.npy',
 'lstm_v4_128_128_5folds_1.4629.npy',
 'lstm_v6_300size_win20_5folds_1.4642.npy',
 'lstm_v2_300size_win10_dropout_5folds_1.4644.npy',
 'submission.csv',
 'lstm_v3_300size_win100_5folds_1.4624.npy',
 'lstm_v9_300size_win50_5folds_1.4642.npy',
 'lstm_v7_300size_win30_5folds_1.4642.npy']

In [6]:
def load_res(name, n_train=3000000, n_pred=12):
    """Load one saved result array and split it into train/test parts.

    The array stored at ``res_path/name`` is cut row-wise at ``n_train`` and
    column-wise at ``n_pred``. The returned pieces are slices of the loaded
    array, not copies.

    Args:
        name: File name of the ``.npy`` array inside ``res_path``.
        n_train: Number of leading rows that form the training part.
            Defaults to 3000000, the value that used to be hard-coded.
        n_pred: Number of leading columns returned as ``X_*``. Defaults to
            12, the value that used to be hard-coded.

    Returns:
        A tuple ``(X_train, y_train, X_test)`` where
        ``X_train = res[:n_train, :n_pred]``,
        ``y_train = res[:n_train, n_pred:]`` and
        ``X_test = res[n_train:, :n_pred]``.
    """
    res = np.load(os.path.join(res_path, name))
    X_train = res[:n_train, :n_pred]
    # Everything to the right of the first n_pred columns, training rows only.
    y_train = res[:n_train, n_pred:]
    X_test = res[n_train:, :n_pred]
    return X_train, y_train, X_test

# Result files to blend, one saved .npy array per model run.
select_res = [
 'lstm_v10_300size_win30_10folds_1.4648.npy',
 'lstm_v11_128_128_10folds_1.4646.npy',
 'lstm_v6_300size_win20_5folds_1.4642.npy',
 'lstm_v7_300size_win30_5folds_1.4642.npy',
 'lstm_v8_300size_win40_5folds_1.4638.npy',
 'lstm_v9_300size_win50_5folds_1.4642.npy',
 'lstm_v1_300size_win10_5folds_1.4634.npy',
 'lstm_v2_300size_win10_dropout_5folds_1.4644.npy',
 'lstm_v3_300size_win100_5folds_1.4624.npy',
 'lstm_v4_128_128_5folds_1.4629.npy',
 'lstm_v5_512size_win10_5folds_1.4624.npy',
 'attention_lstm_v1_128_128_5folds_1.4613.npy']

# Load each file once, then regroup the (X_train, y_train, X_test) triples
# into one list per part, keeping the order of select_res.
X_train_list, y_train_list, X_test_list = (
    list(part) for part in zip(*(load_res(name) for name in select_res))
)

# Stack along a new leading axis: one slice per file in select_res.
X_train = np.stack(X_train_list)
X_test = np.stack(X_test_list)
# Only the first file's y_train is kept.
# NOTE(review): presumably every file carries the same labels; confirm.
y_train = y_train_list[0]

In [25]:
# Blend by averaging the stacked test outputs over axis 0 (one slice per
# selected file). The mean is computed once and reused for both targets
# instead of being recomputed over the full array for each of them.
test_mean = X_test.mean(axis=0)
# The first 10 columns are scored for age, the remaining columns for gender.
y_pred_age = test_mean[:, :10].argmax(axis=1)
y_pred_gender = test_mean[:, 10:].argmax(axis=1)

# Take the rows from position 3000000 onward and the last two columns of df.
# NOTE(review): assumes those two columns are 'age' and 'gender'; confirm.
df_submit = df.iloc[3000000:, -2:].rename({'age': 'predicted_age', 'gender':'predicted_gender'}, axis=1)
# argmax is 0-based; add 1 to undo the "- 1" applied to the labels at load time.
df_submit['predicted_age'] = y_pred_age + 1
df_submit['predicted_gender'] = y_pred_gender + 1
df_submit.to_csv(os.path.join(res_path, "submission.csv"))

In [26]:
# Display the submission frame as a final visual check of the written values.
df_submit

Unnamed: 0_level_0,predicted_age,predicted_gender
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3000001,3,1
3000002,7,2
3000003,2,2
3000004,3,1
3000005,4,1
...,...,...
3999996,2,1
3999997,2,1
3999998,2,1
3999999,3,1
