# 工具导入

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import gc
from collections import Counter
import copy

import warnings
warnings.filterwarnings("ignore")
 
%matplotlib inline

# 数据读取

In [2]:
'''
读取数据集
'''
test_data = pd.read_csv('./data_format1/test_format1.csv')
train_data = pd.read_csv('./data_format1/train_format1.csv')

user_info = pd.read_csv('./data_format1/user_info_format1.csv')
user_log = pd.read_csv('./data_format1/user_log_format1.csv')

In [3]:
test_data.head(5)

Unnamed: 0,user_id,merchant_id,prob
0,163968,4605,
1,360576,1581,
2,98688,1964,
3,98688,3645,
4,295296,3361,


In [4]:
train_data.head(5)

Unnamed: 0,user_id,merchant_id,label
0,34176,3906,0
1,34176,121,0
2,34176,4356,1
3,34176,2217,0
4,230784,4818,0


In [5]:
user_info.head(5)

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [6]:
user_log.head(5)

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661.0,829,0
1,328862,844400,1271,2882,2661.0,829,0
2,328862,575153,1271,2882,2661.0,829,0
3,328862,996875,1271,2882,2661.0,829,0
4,328862,1086186,1271,1253,1049.0,829,0


# 内存压缩

In [7]:
def read_csv(file_name, num_rows):
    return pd.read_csv(file_name, nrows=num_rows)

检测超出机器可表示范围的数值，强行转换类型：

In [8]:
def reduce_mem_usage(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
                    
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

# 对数据进行内存压缩

In [9]:
num_rows = None
num_rows = 200 * 10000 # 1000条测试代码使用

train_file = './data_format1/train_format1.csv'
test_file = './data_format1/test_format1.csv'
user_info_file = './data_format1/user_info_format1.csv'
user_log_file = './data_format1/user_log_format1.csv'

train_data = reduce_mem_usage(read_csv(train_file, num_rows))
test_data = reduce_mem_usage(read_csv(test_file, num_rows))
user_info = reduce_mem_usage(read_csv(user_info_file, num_rows))
user_log = reduce_mem_usage(read_csv(user_log_file, num_rows))

Memory usage after optimization is: 1.74 MB
Decreased by 70.8%
Memory usage after optimization is: 3.49 MB
Decreased by 41.7%
Memory usage after optimization is: 3.24 MB
Decreased by 66.7%
Memory usage after optimization is: 32.43 MB
Decreased by 69.6%


In [10]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   user_id      260864 non-null  int32
 1   merchant_id  260864 non-null  int16
 2   label        260864 non-null  int8 
dtypes: int16(1), int32(1), int8(1)
memory usage: 1.7 MB


In [11]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261477 entries, 0 to 261476
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      261477 non-null  int32  
 1   merchant_id  261477 non-null  int16  
 2   prob         0 non-null       float64
dtypes: float64(1), int16(1), int32(1)
memory usage: 3.5 MB


In [12]:
user_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    424170 non-null  int32  
 1   age_range  421953 non-null  float16
 2   gender     417734 non-null  float16
dtypes: float16(2), int32(1)
memory usage: 3.2 MB


In [13]:
user_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int32  
 1   item_id      int32  
 2   cat_id       int16  
 3   seller_id    int16  
 4   brand_id     float16
 5   time_stamp   int16  
 6   action_type  int8   
dtypes: float16(1), int16(3), int32(2), int8(1)
memory usage: 32.4 MB


# 数据处理
## 合并用户信息

In [14]:
del test_data['prob']
all_data = train_data.append(test_data)
all_data = all_data.merge(user_info,on=['user_id'],how='left')
all_data.head(5)

Unnamed: 0,user_id,merchant_id,label,age_range,gender
0,34176,3906,0.0,6.0,0.0
1,34176,121,0.0,6.0,0.0
2,34176,4356,1.0,6.0,0.0
3,34176,2217,0.0,6.0,0.0
4,230784,4818,0.0,0.0,0.0


In [15]:
# 释放内存
del train_data, test_data, user_info
gc.collect()

4

## 将用户行为日志根据时间进行排序

In [16]:
user_log = user_log.sort_values(['user_id','time_stamp'])
user_log

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
61975,16,980982,437,650,4276.0,914,0
61976,16,980982,437,650,4276.0,914,0
61977,16,980982,437,650,4276.0,914,0
61978,16,962763,19,650,4276.0,914,0
61979,16,391126,437,650,4276.0,914,0
...,...,...,...,...,...,...,...
541446,424164,61016,737,859,3724.0,1111,0
541447,424164,72017,662,606,376.0,1111,0
541448,424164,125913,1577,606,376.0,1111,0
541449,424164,20716,1238,606,376.0,1111,0


## 根据用户id分组，汇总Item_id,cat_id,seller_id,brand_id,time_stamp,action_type字段

In [17]:
# 合并函数
list_join_func = lambda x: " ".join([str(i) for i in x])

agg_dict = {
            'item_id' : list_join_func,
            'cat_id' : list_join_func,
            'seller_id' : list_join_func,
            'brand_id' : list_join_func,
            'time_stamp' : list_join_func,
            'action_type' : list_join_func
        }

new_column = {
            'item_id' : 'item_path',
            'cat_id' : 'cat_path',
            'seller_id' : 'seller_path',
            'brand_id' : 'brand_path',
            'time_stamp' : 'time_stamp_path',
            'action_type' : 'action_type_path'
}

user_log_path = user_log.groupby('user_id').agg(agg_dict).reset_index().rename(columns=new_column)

In [18]:
user_log_path.head(5)

Unnamed: 0,user_id,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path
0,16,980982 980982 980982 962763 391126 827174 6731...,437 437 437 19 437 437 437 437 895 19 437 437 ...,650 650 650 650 650 650 650 650 3948 650 650 6...,4276.0 4276.0 4276.0 4276.0 4276.0 4276.0 4276...,914 914 914 914 914 914 914 914 914 914 914 91...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0 ...
1,19,388018 388018 88673 88673 88673 88673 846066 5...,949 949 614 614 614 614 420 1401 948 948 513 1...,2772 2772 4066 4066 4066 4066 4951 4951 2872 2...,2112.0 2112.0 1552.0 1552.0 1552.0 1552.0 5200...,710 710 711 711 711 711 908 908 1105 1105 1105...,0 2 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,41,60215 1004605 60215 60215 60215 60215 628525 5...,1308 1308 1308 1308 1308 1308 1271 656 656 656...,2128 3207 2128 2128 2128 2128 3142 4618 4618 4...,3848.0 3848.0 3848.0 3848.0 3848.0 3848.0 1014...,521 521 521 521 521 522 529 828 828 828 828 82...,0 0 0 2 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 ...
3,56,889499 528459 765746 553259 889499 22435 40047...,662 1075 662 1577 662 11 184 1604 11 11 177 11...,4048 601 3104 3828 4048 4766 2419 2768 2565 26...,5360.0 1040.0 8240.0 1446.0 5360.0 4360.0 3428...,517 520 525 528 602 602 610 610 610 610 610 61...,3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 ...
4,155,979639 890128 981780 211366 211366 797946 4567...,267 1271 1505 267 267 1075 1075 407 407 1075 4...,2429 4785 3784 800 800 1595 1418 2662 2662 315...,2276.0 1422.0 5692.0 6328.0 6328.0 5800.0 7140...,529 529 602 604 604 607 607 607 607 607 607 60...,0 0 0 2 2 0 0 0 0 0 0 2 0 0 0 0 0 0 0 2 0 0 2 ...


## 合并用户label信息和用户行为信息

In [19]:
all_data_path = all_data.merge(user_log_path,on='user_id',how='inner')

In [20]:
all_data_path.head(5)

Unnamed: 0,user_id,merchant_id,label,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path
0,105600,1487,0.0,6.0,1.0,986160 681407 681407 910680 681407 592698 3693...,35 1554 1554 119 1554 662 1095 662 35 833 833 ...,4811 4811 4811 1897 4811 3315 2925 1340 1875 4...,127.0 127.0 127.0 4704.0 127.0 1605.0 6000.0 1...,518 518 518 520 520 524 524 524 525 525 525 52...,2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,110976,159,0.0,5.0,0.0,396970 961553 627712 926681 1012423 825576 149...,1023 420 407 1505 962 602 184 1606 351 1505 11...,1435 1648 223 3178 2418 1614 3004 2511 2285 78...,5504.0 7780.0 1751.0 7540.0 6652.0 8116.0 5328...,517 520 522 522 527 530 530 530 601 601 602 60...,2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 2 0 ...
2,374400,302,0.0,5.0,1.0,256546 202393 927572 2587 10956 549283 270303 ...,1188 646 1175 1188 1414 681 1175 681 681 115 1...,805 390 4252 3979 1228 2029 2029 2029 4252 923...,1842.0 5920.0 133.0 6304.0 7584.0 133.0 133.0 ...,517 604 604 604 607 609 609 609 609 615 621 62...,2 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,189312,1760,0.0,4.0,0.0,290583 166235 556025 217894 166235 556025 5589...,601 601 601 601 601 601 601 601 601 601 601 60...,3139 3139 3524 3139 3139 3524 3139 3139 3139 3...,549.0 549.0 549.0 549.0 549.0 549.0 549.0 549....,924 924 924 924 924 924 924 924 924 924 924 92...,0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,189312,2511,0.0,4.0,0.0,290583 166235 556025 217894 166235 556025 5589...,601 601 601 601 601 601 601 601 601 601 601 60...,3139 3139 3524 3139 3139 3524 3139 3139 3139 3...,549.0 549.0 549.0 549.0 549.0 549.0 549.0 549....,924 924 924 924 924 924 924 924 924 924 924 92...,0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


## 删除数据并回收内存

In [21]:
# user_log数据不再需要，释放内存
del user_log
gc.collect()

0

# 定义数据统计函数：
## 1. 统计数据总数

In [22]:
def cnt_(x):
    try:
        return len(x.split(' '))
    except:
        return -1

## 2. 统计唯一数据总数

In [23]:
def nunique_(x):
    try:
        return len(set(x.split(' ')))
    except:
        return -1

## 3. 统计数据最大值

In [24]:
def max_(x):
    try:
        return np.max([int(i) for i in x.split(' ')])
    except:
        return -1

## 4. 统计数据最小值

In [25]:
def min_(x):
    try:
        return np.min([int(i) for i in x.split(' ')])
    except:
        return -1

## 5. 统计数据的标准差

In [26]:
def std_(x):
    try:
        return np.std([float(i) for i in x.split(' ')])
    except:
        return -1

## 6. 统计数据中top N的数据

In [27]:
def most_n(x,n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][0]
    except:
        return -1

## 7. 统计数据中top N的数据的总数

In [28]:
def most_n_cnt(x,n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][1]
    except:
        return -1

## 8. 函数封装

In [29]:
def user_cnt(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(cnt_)
    return df_data

def user_nunique(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(nunique_)
    return df_data
    
def user_max(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(max_)
    return df_data

def user_min(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(min_)
    return df_data
    
def user_std(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(std_)
    return df_data

def user_most_n(df_data, single_col, name, n=1):
    func = lambda x: most_n(x, n)
    df_data[name] = df_data[single_col].apply(func)
    return df_data

def user_most_n_cnt(df_data, single_col, name, n=1):
    func = lambda x: most_n_cnt(x, n)
    df_data[name] = df_data[single_col].apply(func)
    return df_data

# 提取商铺的基本统计特征

In [30]:
all_data_test = all_data_path.head(2000)
all_data_test.head(5)

Unnamed: 0,user_id,merchant_id,label,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path
0,105600,1487,0.0,6.0,1.0,986160 681407 681407 910680 681407 592698 3693...,35 1554 1554 119 1554 662 1095 662 35 833 833 ...,4811 4811 4811 1897 4811 3315 2925 1340 1875 4...,127.0 127.0 127.0 4704.0 127.0 1605.0 6000.0 1...,518 518 518 520 520 524 524 524 525 525 525 52...,2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,110976,159,0.0,5.0,0.0,396970 961553 627712 926681 1012423 825576 149...,1023 420 407 1505 962 602 184 1606 351 1505 11...,1435 1648 223 3178 2418 1614 3004 2511 2285 78...,5504.0 7780.0 1751.0 7540.0 6652.0 8116.0 5328...,517 520 522 522 527 530 530 530 601 601 602 60...,2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 2 0 ...
2,374400,302,0.0,5.0,1.0,256546 202393 927572 2587 10956 549283 270303 ...,1188 646 1175 1188 1414 681 1175 681 681 115 1...,805 390 4252 3979 1228 2029 2029 2029 4252 923...,1842.0 5920.0 133.0 6304.0 7584.0 133.0 133.0 ...,517 604 604 604 607 609 609 609 609 615 621 62...,2 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,189312,1760,0.0,4.0,0.0,290583 166235 556025 217894 166235 556025 5589...,601 601 601 601 601 601 601 601 601 601 601 60...,3139 3139 3524 3139 3139 3524 3139 3139 3139 3...,549.0 549.0 549.0 549.0 549.0 549.0 549.0 549....,924 924 924 924 924 924 924 924 924 924 924 92...,0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,189312,2511,0.0,4.0,0.0,290583 166235 556025 217894 166235 556025 5589...,601 601 601 601 601 601 601 601 601 601 601 60...,3139 3139 3524 3139 3139 3524 3139 3139 3139 3...,549.0 549.0 549.0 549.0 549.0 549.0 549.0 549....,924 924 924 924 924 924 924 924 924 924 924 92...,0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [31]:
# 统计用户点击总次数
all_data_test = user_cnt(all_data_test,'action_type_path','user_cnt')
# 统计不同店铺个数
all_data_test = user_nunique(all_data_test,  'seller_path', 'seller_nunique')
# 统计不同商品品类的个数
all_data_test = user_nunique(all_data_test,  'cat_path', 'cat_nunique')
# 统计不同品牌的个数
all_data_test = user_nunique(all_data_test,  'brand_path', 'brand_nunique')
# 统计不同商品的数量
all_data_test = user_nunique(all_data_test,'item_path','item_nunique')
# 统计每个用户的活跃天数
all_data_test = user_nunique(all_data_test,'time_stamp_path','time_stamp_nunique')
# 统计每个用户不同行为种类数
all_data_test = user_nunique(all_data_test,'action_type_path','action_type_nunique')

In [32]:
all_data_test.head(5)

Unnamed: 0,user_id,merchant_id,label,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique
0,105600,1487,0.0,6.0,1.0,986160 681407 681407 910680 681407 592698 3693...,35 1554 1554 119 1554 662 1095 662 35 833 833 ...,4811 4811 4811 1897 4811 3315 2925 1340 1875 4...,127.0 127.0 127.0 4704.0 127.0 1605.0 6000.0 1...,518 518 518 520 520 524 524 524 525 525 525 52...,2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,310,96,37,88,217,29,2
1,110976,159,0.0,5.0,0.0,396970 961553 627712 926681 1012423 825576 149...,1023 420 407 1505 962 602 184 1606 351 1505 11...,1435 1648 223 3178 2418 1614 3004 2511 2285 78...,5504.0 7780.0 1751.0 7540.0 6652.0 8116.0 5328...,517 520 522 522 527 530 530 530 601 601 602 60...,2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 2 0 ...,274,181,70,159,233,52,3
2,374400,302,0.0,5.0,1.0,256546 202393 927572 2587 10956 549283 270303 ...,1188 646 1175 1188 1414 681 1175 681 681 115 1...,805 390 4252 3979 1228 2029 2029 2029 4252 923...,1842.0 5920.0 133.0 6304.0 7584.0 133.0 133.0 ...,517 604 604 604 607 609 609 609 609 615 621 62...,2 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,278,57,59,62,148,35,3
3,189312,1760,0.0,4.0,0.0,290583 166235 556025 217894 166235 556025 5589...,601 601 601 601 601 601 601 601 601 601 601 60...,3139 3139 3524 3139 3139 3524 3139 3139 3139 3...,549.0 549.0 549.0 549.0 549.0 549.0 549.0 549....,924 924 924 924 924 924 924 924 924 924 924 92...,0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,237,49,35,45,170,9,2
4,189312,2511,0.0,4.0,0.0,290583 166235 556025 217894 166235 556025 5589...,601 601 601 601 601 601 601 601 601 601 601 60...,3139 3139 3524 3139 3139 3524 3139 3139 3139 3...,549.0 549.0 549.0 549.0 549.0 549.0 549.0 549....,924 924 924 924 924 924 924 924 924 924 924 92...,0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,237,49,35,45,170,9,2


In [33]:
# 最晚时间
all_data_test = user_max(all_data_test,'time_stamp_path','time_stamp_max')
# 最早时间
all_data_test = user_min(all_data_test,'time_stamp_path','time_stamp_min')
# 活跃天数方差
all_data_test = user_std(all_data_test,'time_stamp_path','time_stamp_std')
# 最早和最晚相差天数
all_data_test['time_stamp_range'] = all_data_test['time_stamp_max'] - all_data_test['time_stamp_min']

In [34]:
# 每个用户最喜欢的店铺
all_data_test = user_most_n(all_data_test,'seller_path','seller_most_1',n=1)
# 每个用户最喜欢的商品品类
all_data_test = user_most_n(all_data_test,'cat_path','cat_most_1',n=1)
# 每个用户最喜欢的品牌
all_data_test = user_most_n(all_data_test,'brand_path','brand_most_1',n=1)
# 每个用户最频繁的行为
all_data_test = user_most_n(all_data_test,'action_type_path','action_type_1',n=1)

In [35]:
# 每个用户最喜欢的店铺 行为总次数
all_data_test = user_most_n_cnt(all_data_test,'seller_path','seller_most_1_cnt',n=1)
# 每个用户最喜欢的商品品类 行为总次数
all_data_test = user_most_n_cnt(all_data_test,'cat_path','cat_most_1_cnt',n=1)
# 每个用户最喜欢的品牌 行为总次数
all_data_test = user_most_n_cnt(all_data_test,'brand_path','brand_most_1_cnt',n=1)
# 每个用户最频繁的行为 行为总次数
all_data_test = user_most_n_cnt(all_data_test,'action_type_path','action_type_1_cnt',n=1)

In [36]:
all_data_test.head(5)

Unnamed: 0,user_id,merchant_id,label,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,...,time_stamp_std,time_stamp_range,seller_most_1,cat_most_1,brand_most_1,action_type_1,seller_most_1_cnt,cat_most_1_cnt,brand_most_1_cnt,action_type_1_cnt
0,105600,1487,0.0,6.0,1.0,986160 681407 681407 910680 681407 592698 3693...,35 1554 1554 119 1554 662 1095 662 35 833 833 ...,4811 4811 4811 1897 4811 3315 2925 1340 1875 4...,127.0 127.0 127.0 4704.0 127.0 1605.0 6000.0 1...,518 518 518 520 520 524 524 524 525 525 525 52...,...,196.143254,593,1704,629,5580.0,0,35,43,35,299
1,110976,159,0.0,5.0,0.0,396970 961553 627712 926681 1012423 825576 149...,1023 420 407 1505 962 602 184 1606 351 1505 11...,1435 1648 223 3178 2418 1614 3004 2511 2285 78...,5504.0 7780.0 1751.0 7540.0 6652.0 8116.0 5328...,517 520 522 522 527 530 530 530 601 601 602 60...,...,188.604871,594,3645,662,3928.0,0,9,56,11,259
2,374400,302,0.0,5.0,1.0,256546 202393 927572 2587 10956 549283 270303 ...,1188 646 1175 1188 1414 681 1175 681 681 115 1...,805 390 4252 3979 1228 2029 2029 2029 4252 923...,1842.0 5920.0 133.0 6304.0 7584.0 133.0 133.0 ...,517 604 604 604 607 609 609 609 609 615 621 62...,...,145.929386,594,1369,1213,3332.0,0,93,29,48,241
3,189312,1760,0.0,4.0,0.0,290583 166235 556025 217894 166235 556025 5589...,601 601 601 601 601 601 601 601 601 601 601 60...,3139 3139 3524 3139 3139 3524 3139 3139 3139 3...,549.0 549.0 549.0 549.0 549.0 549.0 549.0 549....,924 924 924 924 924 924 924 924 924 924 924 92...,...,53.067342,187,361,602,5736.0,0,45,68,45,228
4,189312,2511,0.0,4.0,0.0,290583 166235 556025 217894 166235 556025 5589...,601 601 601 601 601 601 601 601 601 601 601 60...,3139 3139 3524 3139 3139 3524 3139 3139 3139 3...,549.0 549.0 549.0 549.0 549.0 549.0 549.0 549....,924 924 924 924 924 924 924 924 924 924 924 92...,...,53.067342,187,361,602,5736.0,0,45,68,45,228


# 分别统计用户的点击、加购、购买、收藏的特征
## 不同行为的业务函数定义：

In [37]:
# 统计符合action_type的行为总数
def col_cnt_(df_data, columns_list, action_type):
    try:
        data_dict = {}

        col_list = copy.deepcopy(columns_list)
        if action_type != None:
            col_list += ['action_type_path']

        for col in col_list:
            data_dict[col] = df_data[col].split(' ')

        path_len = len(data_dict[col])

        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
            data_out.append(data_txt)

        return len(data_out)  
    except:
        return -1
    
# 统计符合action_type的非重复行为次数
def col_nuique_(df_data, columns_list, action_type):
    try:
        data_dict = {}

        col_list = copy.deepcopy(columns_list)
        if action_type != None:
            col_list += ['action_type_path']

        for col in col_list:
            data_dict[col] = df_data[col].split(' ')

        path_len = len(data_dict[col])

        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
            data_out.append(data_txt)

        return len(set(data_out))
    except:
        return -1
    
    
    
    
# 封装函数
def user_col_cnt(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_cnt_(x, columns_list, action_type), axis=1)
    return df_data

def user_col_nunique(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_nuique_(x, columns_list, action_type), axis=1)
    return df_data

## 统计店铺被用户点击的次数、加购的次数、购买的次数、收藏的次数

In [38]:
# 点击次数
all_data_test = user_col_cnt(all_data_test,  ['seller_path'], '0', 'user_cnt_0')
# 加购次数
all_data_test = user_col_cnt(all_data_test,  ['seller_path'], '1', 'user_cnt_1')
# 购买次数
all_data_test = user_col_cnt(all_data_test,  ['seller_path'], '2', 'user_cnt_2')
# 收藏次数
all_data_test = user_col_cnt(all_data_test,  ['seller_path'], '3', 'user_cnt_3')

# 不同店铺点击次数
all_data_test = user_col_nunique(all_data_test,  ['seller_path'], '0', 'seller_nunique_0')
# 不同店铺加购次数
all_data_test = user_col_nunique(all_data_test,  ['seller_path'], '1', 'seller_nunique_1')
# 不同店铺购买次数
all_data_test = user_col_nunique(all_data_test,  ['seller_path'], '2', 'seller_nunique_2')
# 不同店铺收藏次数
all_data_test = user_col_nunique(all_data_test,  ['seller_path'], '3', 'seller_nunique_3')

In [39]:
all_data_test.head(5)

Unnamed: 0,user_id,merchant_id,label,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,...,brand_most_1_cnt,action_type_1_cnt,user_cnt_0,user_cnt_1,user_cnt_2,user_cnt_3,seller_nunique_0,seller_nunique_1,seller_nunique_2,seller_nunique_3
0,105600,1487,0.0,6.0,1.0,986160 681407 681407 910680 681407 592698 3693...,35 1554 1554 119 1554 662 1095 662 35 833 833 ...,4811 4811 4811 1897 4811 3315 2925 1340 1875 4...,127.0 127.0 127.0 4704.0 127.0 1605.0 6000.0 1...,518 518 518 520 520 524 524 524 525 525 525 52...,...,35,299,310,310,310,310,97,1,9,1
1,110976,159,0.0,5.0,0.0,396970 961553 627712 926681 1012423 825576 149...,1023 420 407 1505 962 602 184 1606 351 1505 11...,1435 1648 223 3178 2418 1614 3004 2511 2285 78...,5504.0 7780.0 1751.0 7540.0 6652.0 8116.0 5328...,517 520 522 522 527 530 530 530 601 601 602 60...,...,11,259,274,274,274,274,181,1,9,6
2,374400,302,0.0,5.0,1.0,256546 202393 927572 2587 10956 549283 270303 ...,1188 646 1175 1188 1414 681 1175 681 681 115 1...,805 390 4252 3979 1228 2029 2029 2029 4252 923...,1842.0 5920.0 133.0 6304.0 7584.0 133.0 133.0 ...,517 604 604 604 607 609 609 609 609 615 621 62...,...,48,241,278,278,278,278,56,1,9,7
3,189312,1760,0.0,4.0,0.0,290583 166235 556025 217894 166235 556025 5589...,601 601 601 601 601 601 601 601 601 601 601 60...,3139 3139 3524 3139 3139 3524 3139 3139 3139 3...,549.0 549.0 549.0 549.0 549.0 549.0 549.0 549....,924 924 924 924 924 924 924 924 924 924 924 92...,...,45,228,237,237,237,237,50,1,6,1
4,189312,2511,0.0,4.0,0.0,290583 166235 556025 217894 166235 556025 5589...,601 601 601 601 601 601 601 601 601 601 601 60...,3139 3139 3524 3139 3139 3524 3139 3139 3139 3...,549.0 549.0 549.0 549.0 549.0 549.0 549.0 549....,924 924 924 924 924 924 924 924 924 924 924 92...,...,45,228,237,237,237,237,50,1,6,1


# 组合特征
## 查看提取的特征

In [40]:
list(all_data_test.columns)

['user_id',
 'merchant_id',
 'label',
 'age_range',
 'gender',
 'item_path',
 'cat_path',
 'seller_path',
 'brand_path',
 'time_stamp_path',
 'action_type_path',
 'user_cnt',
 'seller_nunique',
 'cat_nunique',
 'brand_nunique',
 'item_nunique',
 'time_stamp_nunique',
 'action_type_nunique',
 'time_stamp_max',
 'time_stamp_min',
 'time_stamp_std',
 'time_stamp_range',
 'seller_most_1',
 'cat_most_1',
 'brand_most_1',
 'action_type_1',
 'seller_most_1_cnt',
 'cat_most_1_cnt',
 'brand_most_1_cnt',
 'action_type_1_cnt',
 'user_cnt_0',
 'user_cnt_1',
 'user_cnt_2',
 'user_cnt_3',
 'seller_nunique_0',
 'seller_nunique_1',
 'seller_nunique_2',
 'seller_nunique_3']

# 利用countvector，tfidf提取特征
Refercence:https://www.cnblogs.com/nxf-rabbit75/p/9353212.html

In [41]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from scipy import sparse

tfidfVec = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, ngram_range=(1, 1), max_features=100)

columns_list = ['seller_path']
for i, col in enumerate(columns_list):
    all_data_test[col] = all_data_test[col].astype(str)
    tfidfVec.fit(all_data_test[col])
    data_ = tfidfVec.transform(all_data_test[col])
    if i == 0:
        data_cat = data_
    else:
        data_cat = sparse.hstack((data_cat, data_))

## 特征重命名&合并——原数据+TF-IDF文本特征提取结果 
### (2000 Instances × 100 words）

In [42]:
df_tfidf = pd.DataFrame(data_cat.toarray())
df_tfidf.columns = ['tfidf_' + str(i) for i in df_tfidf.columns]
all_data_test = pd.concat([all_data_test, df_tfidf],axis=1)

In [43]:
all_data_test.head(5)

Unnamed: 0,user_id,merchant_id,label,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,...,tfidf_90,tfidf_91,tfidf_92,tfidf_93,tfidf_94,tfidf_95,tfidf_96,tfidf_97,tfidf_98,tfidf_99
0,105600,1487,0.0,6.0,1.0,986160 681407 681407 910680 681407 592698 3693...,35 1554 1554 119 1554 662 1095 662 35 833 833 ...,4811 4811 4811 1897 4811 3315 2925 1340 1875 4...,127.0 127.0 127.0 4704.0 127.0 1605.0 6000.0 1...,518 518 518 520 520 524 524 524 525 525 525 52...,...,0.0,0.0,0.0,0.115594,0.0,0.0,0.0,0.0,0.0,0.0
1,110976,159,0.0,5.0,0.0,396970 961553 627712 926681 1012423 825576 149...,1023 420 407 1505 962 602 184 1606 351 1505 11...,1435 1648 223 3178 2418 1614 3004 2511 2285 78...,5504.0 7780.0 1751.0 7540.0 6652.0 8116.0 5328...,517 520 522 522 527 530 530 530 601 601 602 60...,...,0.0,0.151756,0.0,0.0,0.438598,0.0,0.163503,0.0,0.0,0.0
2,374400,302,0.0,5.0,1.0,256546 202393 927572 2587 10956 549283 270303 ...,1188 646 1175 1188 1414 681 1175 681 681 115 1...,805 390 4252 3979 1228 2029 2029 2029 4252 923...,1842.0 5920.0 133.0 6304.0 7584.0 133.0 133.0 ...,517 604 604 604 607 609 609 609 609 615 621 62...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.203564,0.0
3,189312,1760,0.0,4.0,0.0,290583 166235 556025 217894 166235 556025 5589...,601 601 601 601 601 601 601 601 601 601 601 60...,3139 3139 3524 3139 3139 3524 3139 3139 3139 3...,549.0 549.0 549.0 549.0 549.0 549.0 549.0 549....,924 924 924 924 924 924 924 924 924 924 924 92...,...,0.0,0.0,0.0,0.0,0.053659,0.0,0.015003,0.0,0.0,0.0
4,189312,2511,0.0,4.0,0.0,290583 166235 556025 217894 166235 556025 5589...,601 601 601 601 601 601 601 601 601 601 601 60...,3139 3139 3524 3139 3139 3524 3139 3139 3139 3...,549.0 549.0 549.0 549.0 549.0 549.0 549.0 549....,924 924 924 924 924 924 924 924 924 924 924 92...,...,0.0,0.0,0.0,0.0,0.053659,0.0,0.015003,0.0,0.0,0.0


# embeeding特征
#### 离散数字特征——>100维向量

In [46]:
import gensim

# Train Word2Vec model

model = gensim.models.Word2Vec(all_data_test['seller_path'].apply(lambda x: x.split(' ')), window=5, min_count=5, workers=4)

def mean_w2v_(x, model, size=100):
    try:
        i = 0
        for word in x.split(' '):
            if word in model.wv.vocab:
                i += 1
                if i == 1:
                    vec = np.zeros(size)
                vec += model.wv[word]
        return vec / i 
    except:
        return  np.zeros(size)


def get_mean_w2v(df_data, columns, model, size):
    data_array = []
    for index, row in df_data.iterrows():
        w2v = mean_w2v_(row[columns], model, size)
        data_array.append(w2v)
    return pd.DataFrame(data_array)

df_embeeding = get_mean_w2v(all_data_test, 'seller_path', model, 100)
df_embeeding.columns = ['embeeding_' + str(i) for i in df_embeeding.columns]

## embeeding特征和原始特征合并

In [50]:
all_data_test = pd.concat([all_data_test,df_embeeding],axis=1)

# Stacking特征
Reference:https://www.youtube.com/watch?v=lcXKFS65BI0

In [77]:
# 导入模型
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from scipy import sparse
import xgboost
import lightgbm
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss,mean_absolute_error,mean_squared_error
from sklearn.naive_bayes import MultinomialNB,GaussianNB

In [78]:
"""
-- 回归
-- stacking 回归特征
"""
def stacking_reg(clf,train_x,train_y,test_x,clf_name,kf,label_split=None):
    train=np.zeros((train_x.shape[0],1))
    test=np.zeros((test_x.shape[0],1))
    test_pre=np.empty((folds,test_x.shape[0],1))
    cv_scores=[]
    for i,(train_index,test_index) in enumerate(kf.split(train_x,label_split)):       
        tr_x=train_x[train_index]
        tr_y=train_y[train_index]
        te_x=train_x[test_index]
        te_y = train_y[test_index]
        if clf_name in ["rf","ada","gb","et","lr"]:
            clf.fit(tr_x,tr_y)
            pre=clf.predict(te_x).reshape(-1,1)
            train[test_index]=pre
            test_pre[i,:]=clf.predict(test_x).reshape(-1,1)
            cv_scores.append(mean_squared_error(te_y, pre))
        elif clf_name in ["xgb"]:
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x, label=te_y, missing=-1)
            params = {'booster': 'gbtree',
                      'eval_metric': 'rmse',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      'nthread': 12
                      }
            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'),
                         (test_matrix, 'eval')
                         ]
            if test_matrix:
                model = clf.train(params, train_matrix, num_boost_round=num_round,evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre= model.predict(test_matrix,ntree_limit=model.best_ntree_limit).reshape(-1,1)
                train[test_index]=pre
                test_pre[i, :]= model.predict(z, ntree_limit=model.best_ntree_limit).reshape(-1,1)
                cv_scores.append(mean_squared_error(te_y, pre))

        elif clf_name in ["lgb"]:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                      'boosting_type': 'gbdt',
                      'objective': 'regression_l2',
                      'metric': 'mse',
                      'min_child_weight': 1.5,
                      'num_leaves': 2**5,
                      'lambda_l2': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'learning_rate': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      'nthread': 12,
                      'silent': True,
                      }
            num_round = 10000
            early_stopping_rounds = 100
            if test_matrix:
                model = clf.train(params, train_matrix,num_round,valid_sets=test_matrix,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre= model.predict(te_x,num_iteration=model.best_iteration).reshape(-1,1)
                train[test_index]=pre
                test_pre[i, :]= model.predict(test_x, num_iteration=model.best_iteration).reshape(-1,1)
                cv_scores.append(mean_squared_error(te_y, pre))
        else:
            raise IOError("Please add new clf.")
        print("%s now score is:"%clf_name,cv_scores)
    test[:]=test_pre.mean(axis=0)
    print("%s_score_list:"%clf_name,cv_scores)
    print("%s_score_mean:"%clf_name,np.mean(cv_scores))
    return train.reshape(-1,1),test.reshape(-1,1)

def rf_reg(x_train, y_train, x_valid, kf, label_split=None):
    randomforest = RandomForestRegressor(n_estimators=600, max_depth=20, n_jobs=-1, random_state=2017, max_features="auto",verbose=1)
    rf_train, rf_test = stacking_reg(randomforest, x_train, y_train, x_valid, "rf", kf, label_split=label_split)
    return rf_train, rf_test,"rf_reg"

def ada_reg(x_train, y_train, x_valid, kf, label_split=None):
    adaboost = AdaBoostRegressor(n_estimators=30, random_state=2017, learning_rate=0.01)
    ada_train, ada_test = stacking_reg(adaboost, x_train, y_train, x_valid, "ada", kf, label_split=label_split)
    return ada_train, ada_test,"ada_reg"

def gb_reg(x_train, y_train, x_valid, kf, label_split=None):
    gbdt = GradientBoostingRegressor(learning_rate=0.04, n_estimators=100, subsample=0.8, random_state=2017,max_depth=5,verbose=1)
    gbdt_train, gbdt_test = stacking_reg(gbdt, x_train, y_train, x_valid, "gb", kf, label_split=label_split)
    return gbdt_train, gbdt_test,"gb_reg"

def et_reg(x_train, y_train, x_valid, kf, label_split=None):
    extratree = ExtraTreesRegressor(n_estimators=600, max_depth=35, max_features="auto", n_jobs=-1, random_state=2017,verbose=1)
    et_train, et_test = stacking_reg(extratree, x_train, y_train, x_valid, "et", kf, label_split=label_split)
    return et_train, et_test,"et_reg"

def lr_reg(x_train, y_train, x_valid, kf, label_split=None):
    lr_reg=LinearRegression(n_jobs=-1)
    lr_train, lr_test = stacking_reg(lr_reg, x_train, y_train, x_valid, "lr", kf, label_split=label_split)
    return lr_train, lr_test, "lr_reg"

def xgb_reg(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_reg(xgboost, x_train, y_train, x_valid, "xgb", kf, label_split=label_split)
    return xgb_train, xgb_test,"xgb_reg"

def lgb_reg(x_train, y_train, x_valid, kf, label_split=None):
    lgb_train, lgb_test = stacking_reg(lightgbm, x_train, y_train, x_valid, "lgb", kf, label_split=label_split)
    return lgb_train, lgb_test,"lgb_reg"

In [79]:
"""
-- 分类
-- stacking 分类特征
"""
def stacking_clf(clf,train_x,train_y,test_x,clf_name,kf,label_split=None):
    train=np.zeros((train_x.shape[0],1))
    test=np.zeros((test_x.shape[0],1))
    test_pre=np.empty((folds,test_x.shape[0],1))
    cv_scores=[]
    for i,(train_index,test_index) in enumerate(kf.split(train_x,label_split)):       
        tr_x=train_x[train_index]
        tr_y=train_y[train_index]
        te_x=train_x[test_index]
        te_y = train_y[test_index]

        if clf_name in ["rf","ada","gb","et","lr","knn","gnb"]:
            clf.fit(tr_x,tr_y)
            pre=clf.predict_proba(te_x)
            
            train[test_index]=pre[:,0].reshape(-1,1)
            test_pre[i,:]=clf.predict_proba(test_x)[:,0].reshape(-1,1)
            
            cv_scores.append(log_loss(te_y, pre[:,0].reshape(-1,1)))
        elif clf_name in ["xgb"]:
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x)
            params = {'booster': 'gbtree',
                      'objective': 'multi:softprob',
                      'eval_metric': 'mlogloss',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      "num_class": 2
                      }

            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'),
                         (test_matrix, 'eval')
                         ]
            if test_matrix:
                model = clf.train(params, train_matrix, num_boost_round=num_round,evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre= model.predict(test_matrix,ntree_limit=model.best_ntree_limit)
                train[test_index]=pre[:,0].reshape(-1,1)
                test_pre[i, :]= model.predict(z, ntree_limit=model.best_ntree_limit)[:,0].reshape(-1,1)
                cv_scores.append(log_loss(te_y, pre[:,0].reshape(-1,1)))
        elif clf_name in ["lgb"]:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                      'boosting_type': 'gbdt',
                      #'boosting_type': 'dart',
                      'objective': 'multiclass',
                      'metric': 'multi_logloss',
                      'min_child_weight': 1.5,
                      'num_leaves': 2**5,
                      'lambda_l2': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'learning_rate': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      "num_class": 2,
                      'silent': True,
                      }
            num_round = 10000
            early_stopping_rounds = 100
            if test_matrix:
                model = clf.train(params, train_matrix,num_round,valid_sets=test_matrix,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre= model.predict(te_x,num_iteration=model.best_iteration)
                train[test_index]=pre[:,0].reshape(-1,1)
                test_pre[i, :]= model.predict(test_x, num_iteration=model.best_iteration)[:,0].reshape(-1,1)
                cv_scores.append(log_loss(te_y, pre[:,0].reshape(-1,1)))
        else:
            raise IOError("Please add new clf.")
        print("%s now score is:"%clf_name,cv_scores)
    test[:]=test_pre.mean(axis=0)
    print("%s_score_list:"%clf_name,cv_scores)
    print("%s_score_mean:"%clf_name,np.mean(cv_scores))
    return train.reshape(-1,1),test.reshape(-1,1)

def rf_clf(x_train, y_train, x_valid, kf, label_split=None):
    randomforest = RandomForestClassifier(n_estimators=1200, max_depth=20, n_jobs=-1, random_state=2017, max_features="auto",verbose=1)
    rf_train, rf_test = stacking_clf(randomforest, x_train, y_train, x_valid, "rf", kf, label_split=label_split)
    return rf_train, rf_test,"rf"

def ada_clf(x_train, y_train, x_valid, kf, label_split=None):
    adaboost = AdaBoostClassifier(n_estimators=50, random_state=2017, learning_rate=0.01)
    ada_train, ada_test = stacking_clf(adaboost, x_train, y_train, x_valid, "ada", kf, label_split=label_split)
    return ada_train, ada_test,"ada"

def gb_clf(x_train, y_train, x_valid, kf, label_split=None):
    gbdt = GradientBoostingClassifier(learning_rate=0.04, n_estimators=100, subsample=0.8, random_state=2017,max_depth=5,verbose=1)
    gbdt_train, gbdt_test = stacking_clf(gbdt, x_train, y_train, x_valid, "gb", kf, label_split=label_split)
    return gbdt_train, gbdt_test,"gb"

def et_clf(x_train, y_train, x_valid, kf, label_split=None):
    extratree = ExtraTreesClassifier(n_estimators=1200, max_depth=35, max_features="auto", n_jobs=-1, random_state=2017,verbose=1)
    et_train, et_test = stacking_clf(extratree, x_train, y_train, x_valid, "et", kf, label_split=label_split)
    return et_train, et_test,"et"

def xgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_clf(xgboost, x_train, y_train, x_valid, "xgb", kf, label_split=label_split)
    return xgb_train, xgb_test,"xgb"

def lgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_clf(lightgbm, x_train, y_train, x_valid, "lgb", kf, label_split=label_split)
    return xgb_train, xgb_test,"lgb"

def gnb_clf(x_train, y_train, x_valid, kf, label_split=None):
    gnb=GaussianNB()
    gnb_train, gnb_test = stacking_clf(gnb, x_train, y_train, x_valid, "gnb", kf, label_split=label_split)
    return gnb_train, gnb_test,"gnb"

def lr_clf(x_train, y_train, x_valid, kf, label_split=None):
    logisticregression=LogisticRegression(n_jobs=-1,random_state=2017,C=0.1,max_iter=200)
    lr_train, lr_test = stacking_clf(logisticregression, x_train, y_train, x_valid, "lr", kf, label_split=label_split)
    return lr_train, lr_test, "lr"

def knn_clf(x_train, y_train, x_valid, kf, label_split=None):
    kneighbors=KNeighborsClassifier(n_neighbors=200,n_jobs=-1)
    knn_train, knn_test = stacking_clf(kneighbors, x_train, y_train, x_valid, "lr", kf, label_split=label_split)
    return knn_train, knn_test, "knn"

## 获取训练和验证数据

In [80]:
features_columns = [c for c in all_data_test.columns if c not in ['label', 'prob', 'seller_path', 'cat_path', 'brand_path', 'action_type_path', 'item_path', 'time_stamp_path']]
x_train = all_data_test[~all_data_test['label'].isna()][features_columns].values
y_train = all_data_test[~all_data_test['label'].isna()]['label'].values
x_valid = all_data_test[all_data_test['label'].isna()][features_columns].values

## 处理函数值Inf以及nan的函数

In [81]:
def get_matrix(data):
    where_are_nan = np.isnan(data)
    where_are_inf = np.isinf(data)
    data[where_are_nan] = 0
    data[where_are_inf] = 0
    return data

In [82]:
x_train = np.float_(get_matrix(np.float_(x_train)))
y_train = np.int_(y_train)
x_valid = x_train

## 导入划分数据函数（stacking原始训练数据folder=5）

In [83]:
from sklearn.model_selection import StratifiedKFold, KFold
folds = 5
seed = 1
kf = KFold(n_splits=5, shuffle=True, random_state=0)

## 使用lgb和xgb分类模型构造stacking特征

In [84]:
clf_list = [lgb_clf, xgb_clf]
clf_list_col = ['lgb_clf', 'xgb_clf']

## 训练模型，获取stacking特征

In [85]:
clf_list = clf_list
column_list = []
train_data_list=[]
test_data_list=[]
for clf in clf_list:
    train_data,test_data,clf_name=clf(x_train, y_train, x_valid, kf, label_split=None)
    train_data_list.append(train_data)
    test_data_list.append(test_data)
train_stacking = np.concatenate(train_data_list, axis=1)
test_stacking = np.concatenate(test_data_list, axis=1)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7302
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 130
[LightGBM] [Info] Start training from score -0.065873
[LightGBM] [Info] Start training from score -2.752786
[1]	valid_0's multi_logloss: 0.294257
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_logloss: 0.294024
[3]	valid_0's multi_logloss: 0.293904
[4]	valid_0's multi_logloss: 0.293788
[5]	valid_0's multi_logloss: 0.293497
[6]	valid_0's multi_logloss: 0.293532
[7]	valid_0's multi_logloss: 0.293465
[8]	valid_0's multi_logloss: 0.293354
[9]	valid_0's multi_logloss: 0.293289
[10]	valid_0's multi_logloss: 0.293341
[11]	valid_0's multi_logloss: 0.293093
[12]	valid_0's multi_logloss: 0.293153
[13]	valid_0's multi_logloss: 0.292745
[14]	valid_0's multi_logloss: 0.292656
[15]	valid_0's multi_logloss: 0.292593
[16]	valid_0's multi_logloss: 0.292552
[17]	valid_0's multi_logloss: 

[8]	valid_0's multi_logloss: 0.219223
[9]	valid_0's multi_logloss: 0.219039
[10]	valid_0's multi_logloss: 0.218863
[11]	valid_0's multi_logloss: 0.218607
[12]	valid_0's multi_logloss: 0.218579
[13]	valid_0's multi_logloss: 0.218359
[14]	valid_0's multi_logloss: 0.21808
[15]	valid_0's multi_logloss: 0.218005
[16]	valid_0's multi_logloss: 0.217997
[17]	valid_0's multi_logloss: 0.217931
[18]	valid_0's multi_logloss: 0.217667
[19]	valid_0's multi_logloss: 0.217793
[20]	valid_0's multi_logloss: 0.217735
[21]	valid_0's multi_logloss: 0.217838
[22]	valid_0's multi_logloss: 0.217838
[23]	valid_0's multi_logloss: 0.217279
[24]	valid_0's multi_logloss: 0.217145
[25]	valid_0's multi_logloss: 0.21713
[26]	valid_0's multi_logloss: 0.21723
[27]	valid_0's multi_logloss: 0.21743
[28]	valid_0's multi_logloss: 0.217448
[29]	valid_0's multi_logloss: 0.217584
[30]	valid_0's multi_logloss: 0.217565
[31]	valid_0's multi_logloss: 0.217577
[32]	valid_0's multi_logloss: 0.217764
[33]	valid_0's multi_logloss: 0

[74]	valid_0's multi_logloss: 0.240958
[75]	valid_0's multi_logloss: 0.241211
[76]	valid_0's multi_logloss: 0.241437
[77]	valid_0's multi_logloss: 0.24174
[78]	valid_0's multi_logloss: 0.241909
[79]	valid_0's multi_logloss: 0.242171
[80]	valid_0's multi_logloss: 0.242445
[81]	valid_0's multi_logloss: 0.242893
[82]	valid_0's multi_logloss: 0.243169
[83]	valid_0's multi_logloss: 0.243308
[84]	valid_0's multi_logloss: 0.243393
[85]	valid_0's multi_logloss: 0.243805
[86]	valid_0's multi_logloss: 0.2439
[87]	valid_0's multi_logloss: 0.244187
[88]	valid_0's multi_logloss: 0.244425
[89]	valid_0's multi_logloss: 0.244619
[90]	valid_0's multi_logloss: 0.244737
[91]	valid_0's multi_logloss: 0.24508
[92]	valid_0's multi_logloss: 0.245388
[93]	valid_0's multi_logloss: 0.245415
[94]	valid_0's multi_logloss: 0.245609
[95]	valid_0's multi_logloss: 0.245686
[96]	valid_0's multi_logloss: 0.245885
[97]	valid_0's multi_logloss: 0.246124
[98]	valid_0's multi_logloss: 0.246243
[99]	valid_0's multi_logloss:

[44]	valid_0's multi_logloss: 0.217907
[45]	valid_0's multi_logloss: 0.218017
[46]	valid_0's multi_logloss: 0.218144
[47]	valid_0's multi_logloss: 0.218513
[48]	valid_0's multi_logloss: 0.218683
[49]	valid_0's multi_logloss: 0.218913
[50]	valid_0's multi_logloss: 0.219313
[51]	valid_0's multi_logloss: 0.219464
[52]	valid_0's multi_logloss: 0.219578
[53]	valid_0's multi_logloss: 0.219496
[54]	valid_0's multi_logloss: 0.219683
[55]	valid_0's multi_logloss: 0.219667
[56]	valid_0's multi_logloss: 0.219676
[57]	valid_0's multi_logloss: 0.219743
[58]	valid_0's multi_logloss: 0.219943
[59]	valid_0's multi_logloss: 0.219924
[60]	valid_0's multi_logloss: 0.220239
[61]	valid_0's multi_logloss: 0.220424
[62]	valid_0's multi_logloss: 0.220389
[63]	valid_0's multi_logloss: 0.220474
[64]	valid_0's multi_logloss: 0.220483
[65]	valid_0's multi_logloss: 0.220838
[66]	valid_0's multi_logloss: 0.220847
[67]	valid_0's multi_logloss: 0.221146
[68]	valid_0's multi_logloss: 0.221075
[69]	valid_0's multi_logl

[107]	train-mlogloss:0.19899	eval-mlogloss:0.29172
[108]	train-mlogloss:0.19830	eval-mlogloss:0.29185
[109]	train-mlogloss:0.19783	eval-mlogloss:0.29198
[110]	train-mlogloss:0.19715	eval-mlogloss:0.29224
[111]	train-mlogloss:0.19649	eval-mlogloss:0.29236
[112]	train-mlogloss:0.19575	eval-mlogloss:0.29224
[113]	train-mlogloss:0.19517	eval-mlogloss:0.29216
[114]	train-mlogloss:0.19449	eval-mlogloss:0.29238
[115]	train-mlogloss:0.19395	eval-mlogloss:0.29254
[116]	train-mlogloss:0.19331	eval-mlogloss:0.29264
[117]	train-mlogloss:0.19265	eval-mlogloss:0.29276
[118]	train-mlogloss:0.19214	eval-mlogloss:0.29283
[119]	train-mlogloss:0.19172	eval-mlogloss:0.29302
[120]	train-mlogloss:0.19115	eval-mlogloss:0.29296
[121]	train-mlogloss:0.19056	eval-mlogloss:0.29282
[122]	train-mlogloss:0.19000	eval-mlogloss:0.29294
[123]	train-mlogloss:0.18929	eval-mlogloss:0.29311
[124]	train-mlogloss:0.18879	eval-mlogloss:0.29308
[125]	train-mlogloss:0.18836	eval-mlogloss:0.29323
[126]	train-mlogloss:0.18797	ev

[79]	train-mlogloss:0.23827	eval-mlogloss:0.23526
[80]	train-mlogloss:0.23732	eval-mlogloss:0.23458
[81]	train-mlogloss:0.23651	eval-mlogloss:0.23403
[82]	train-mlogloss:0.23540	eval-mlogloss:0.23340
[83]	train-mlogloss:0.23444	eval-mlogloss:0.23264
[84]	train-mlogloss:0.23348	eval-mlogloss:0.23199
[85]	train-mlogloss:0.23249	eval-mlogloss:0.23128
[86]	train-mlogloss:0.23179	eval-mlogloss:0.23077
[87]	train-mlogloss:0.23095	eval-mlogloss:0.23048
[88]	train-mlogloss:0.23023	eval-mlogloss:0.22990
[89]	train-mlogloss:0.22937	eval-mlogloss:0.22952
[90]	train-mlogloss:0.22853	eval-mlogloss:0.22913
[91]	train-mlogloss:0.22784	eval-mlogloss:0.22859
[92]	train-mlogloss:0.22709	eval-mlogloss:0.22845
[93]	train-mlogloss:0.22633	eval-mlogloss:0.22792
[94]	train-mlogloss:0.22566	eval-mlogloss:0.22747
[95]	train-mlogloss:0.22481	eval-mlogloss:0.22719
[96]	train-mlogloss:0.22415	eval-mlogloss:0.22671
[97]	train-mlogloss:0.22349	eval-mlogloss:0.22637
[98]	train-mlogloss:0.22260	eval-mlogloss:0.22603


[241]	train-mlogloss:0.16098	eval-mlogloss:0.21940
[242]	train-mlogloss:0.16079	eval-mlogloss:0.21935
[243]	train-mlogloss:0.16055	eval-mlogloss:0.21919
[244]	train-mlogloss:0.16037	eval-mlogloss:0.21934
[245]	train-mlogloss:0.16005	eval-mlogloss:0.21943
[246]	train-mlogloss:0.15985	eval-mlogloss:0.21933
[247]	train-mlogloss:0.15962	eval-mlogloss:0.21931
[248]	train-mlogloss:0.15941	eval-mlogloss:0.21938
[249]	train-mlogloss:0.15916	eval-mlogloss:0.21933
[250]	train-mlogloss:0.15897	eval-mlogloss:0.21947
[251]	train-mlogloss:0.15866	eval-mlogloss:0.21941
[252]	train-mlogloss:0.15845	eval-mlogloss:0.21944
[253]	train-mlogloss:0.15820	eval-mlogloss:0.21958
[254]	train-mlogloss:0.15800	eval-mlogloss:0.21965
[255]	train-mlogloss:0.15768	eval-mlogloss:0.21967
[256]	train-mlogloss:0.15749	eval-mlogloss:0.21958
[257]	train-mlogloss:0.15727	eval-mlogloss:0.21973
[258]	train-mlogloss:0.15708	eval-mlogloss:0.21970
[259]	train-mlogloss:0.15695	eval-mlogloss:0.21968
[260]	train-mlogloss:0.15666	ev

[93]	train-mlogloss:0.21964	eval-mlogloss:0.23819
[94]	train-mlogloss:0.21879	eval-mlogloss:0.23790
[95]	train-mlogloss:0.21798	eval-mlogloss:0.23767
[96]	train-mlogloss:0.21725	eval-mlogloss:0.23749
[97]	train-mlogloss:0.21646	eval-mlogloss:0.23715
[98]	train-mlogloss:0.21575	eval-mlogloss:0.23708
[99]	train-mlogloss:0.21508	eval-mlogloss:0.23688
[100]	train-mlogloss:0.21448	eval-mlogloss:0.23681
[101]	train-mlogloss:0.21384	eval-mlogloss:0.23663
[102]	train-mlogloss:0.21308	eval-mlogloss:0.23649
[103]	train-mlogloss:0.21221	eval-mlogloss:0.23632
[104]	train-mlogloss:0.21141	eval-mlogloss:0.23627
[105]	train-mlogloss:0.21054	eval-mlogloss:0.23603
[106]	train-mlogloss:0.20994	eval-mlogloss:0.23591
[107]	train-mlogloss:0.20922	eval-mlogloss:0.23580
[108]	train-mlogloss:0.20862	eval-mlogloss:0.23578
[109]	train-mlogloss:0.20805	eval-mlogloss:0.23573
[110]	train-mlogloss:0.20720	eval-mlogloss:0.23565
[111]	train-mlogloss:0.20667	eval-mlogloss:0.23564
[112]	train-mlogloss:0.20604	eval-mlog

[20]	train-mlogloss:0.40351	eval-mlogloss:0.42691
[21]	train-mlogloss:0.39582	eval-mlogloss:0.42007
[22]	train-mlogloss:0.38845	eval-mlogloss:0.41369
[23]	train-mlogloss:0.38135	eval-mlogloss:0.40765
[24]	train-mlogloss:0.37447	eval-mlogloss:0.40168
[25]	train-mlogloss:0.36793	eval-mlogloss:0.39611
[26]	train-mlogloss:0.36160	eval-mlogloss:0.39064
[27]	train-mlogloss:0.35553	eval-mlogloss:0.38564
[28]	train-mlogloss:0.34969	eval-mlogloss:0.38087
[29]	train-mlogloss:0.34411	eval-mlogloss:0.37616
[30]	train-mlogloss:0.33896	eval-mlogloss:0.37190
[31]	train-mlogloss:0.33389	eval-mlogloss:0.36774
[32]	train-mlogloss:0.32911	eval-mlogloss:0.36388
[33]	train-mlogloss:0.32439	eval-mlogloss:0.35989
[34]	train-mlogloss:0.31988	eval-mlogloss:0.35625
[35]	train-mlogloss:0.31554	eval-mlogloss:0.35267
[36]	train-mlogloss:0.31141	eval-mlogloss:0.34941
[37]	train-mlogloss:0.30740	eval-mlogloss:0.34636
[38]	train-mlogloss:0.30350	eval-mlogloss:0.34342
[39]	train-mlogloss:0.29983	eval-mlogloss:0.34078


[183]	train-mlogloss:0.17029	eval-mlogloss:0.29519
[184]	train-mlogloss:0.16997	eval-mlogloss:0.29530
[185]	train-mlogloss:0.16963	eval-mlogloss:0.29537
[186]	train-mlogloss:0.16938	eval-mlogloss:0.29526
[187]	train-mlogloss:0.16911	eval-mlogloss:0.29540
[188]	train-mlogloss:0.16880	eval-mlogloss:0.29547
[189]	train-mlogloss:0.16852	eval-mlogloss:0.29533
[190]	train-mlogloss:0.16812	eval-mlogloss:0.29536
[191]	train-mlogloss:0.16776	eval-mlogloss:0.29531
[192]	train-mlogloss:0.16749	eval-mlogloss:0.29526
[193]	train-mlogloss:0.16711	eval-mlogloss:0.29524
[194]	train-mlogloss:0.16688	eval-mlogloss:0.29547
[195]	train-mlogloss:0.16661	eval-mlogloss:0.29557
[196]	train-mlogloss:0.16640	eval-mlogloss:0.29544
[197]	train-mlogloss:0.16622	eval-mlogloss:0.29546
[198]	train-mlogloss:0.16595	eval-mlogloss:0.29565
[199]	train-mlogloss:0.16567	eval-mlogloss:0.29567
[200]	train-mlogloss:0.16534	eval-mlogloss:0.29578
[201]	train-mlogloss:0.16518	eval-mlogloss:0.29584
[202]	train-mlogloss:0.16487	ev

[125]	train-mlogloss:0.20252	eval-mlogloss:0.22084
[126]	train-mlogloss:0.20197	eval-mlogloss:0.22077
[127]	train-mlogloss:0.20139	eval-mlogloss:0.22073
[128]	train-mlogloss:0.20106	eval-mlogloss:0.22061
[129]	train-mlogloss:0.20033	eval-mlogloss:0.22065
[130]	train-mlogloss:0.19982	eval-mlogloss:0.22052
[131]	train-mlogloss:0.19931	eval-mlogloss:0.22048
[132]	train-mlogloss:0.19880	eval-mlogloss:0.22031
[133]	train-mlogloss:0.19834	eval-mlogloss:0.22027
[134]	train-mlogloss:0.19794	eval-mlogloss:0.22008
[135]	train-mlogloss:0.19753	eval-mlogloss:0.22008
[136]	train-mlogloss:0.19688	eval-mlogloss:0.21997
[137]	train-mlogloss:0.19645	eval-mlogloss:0.21999
[138]	train-mlogloss:0.19600	eval-mlogloss:0.22003
[139]	train-mlogloss:0.19554	eval-mlogloss:0.21983
[140]	train-mlogloss:0.19505	eval-mlogloss:0.21978
[141]	train-mlogloss:0.19453	eval-mlogloss:0.21988
[142]	train-mlogloss:0.19409	eval-mlogloss:0.21966
[143]	train-mlogloss:0.19372	eval-mlogloss:0.21965
[144]	train-mlogloss:0.19312	ev

## 原始特征和stacking特征合并

In [86]:
# 合并所有特征
train = pd.DataFrame(np.concatenate([x_train, train_stacking], axis=1))
test = np.concatenate([x_valid, test_stacking], axis=1)

## 特征重命名

In [91]:
df_train_all = pd.DataFrame(train)
df_train_all.columns = features_columns + clf_list_col
df_test_all = pd.DataFrame(test)
df_test_all.columns = features_columns + clf_list_col

## 获取数据ID以及特征标签LABEL

In [92]:
df_train_all['label'] = all_data_test['label']

## 训练数据和测试数据保存

In [93]:
df_train_all.to_csv('train_all.csv',header=True,index=False)
df_test_all.to_csv('test_all.csv',header=True,index=False)

In [94]:
df_train_all

Unnamed: 0,user_id,merchant_id,age_range,gender,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique,...,embeeding_93,embeeding_94,embeeding_95,embeeding_96,embeeding_97,embeeding_98,embeeding_99,lgb_clf,xgb_clf,label
0,105600.0,1487.0,6.0,1.0,310.0,96.0,37.0,88.0,217.0,29.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.940233,0.903583,0.0
1,110976.0,159.0,5.0,0.0,274.0,181.0,70.0,159.0,233.0,52.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.948859,0.915382,0.0
2,374400.0,302.0,5.0,1.0,278.0,57.0,59.0,62.0,148.0,35.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.910761,0.945769,0.0
3,189312.0,1760.0,4.0,0.0,237.0,49.0,35.0,45.0,170.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.924750,0.928050,0.0
4,189312.0,2511.0,4.0,0.0,237.0,49.0,35.0,45.0,170.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.933608,0.931714,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,305721.0,3734.0,3.0,1.0,35.0,13.0,7.0,16.0,21.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.942053,0.957956,0.0
1996,109881.0,2639.0,4.0,0.0,284.0,51.0,55.0,53.0,186.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.930127,0.915183,0.0
1997,185145.0,4950.0,4.0,1.0,84.0,31.0,34.0,33.0,58.0,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.929671,0.926973,0.0
1998,131385.0,1582.0,3.0,1.0,92.0,38.0,18.0,36.0,63.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.930781,0.956378,0.0


In [95]:
df_test_all

Unnamed: 0,user_id,merchant_id,age_range,gender,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique,...,embeeding_92,embeeding_93,embeeding_94,embeeding_95,embeeding_96,embeeding_97,embeeding_98,embeeding_99,lgb_clf,xgb_clf
0,105600.0,1487.0,6.0,1.0,310.0,96.0,37.0,88.0,217.0,29.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.934078,0.856887
1,110976.0,159.0,5.0,0.0,274.0,181.0,70.0,159.0,233.0,52.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.939018,0.941427
2,374400.0,302.0,5.0,1.0,278.0,57.0,59.0,62.0,148.0,35.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.927639,0.935769
3,189312.0,1760.0,4.0,0.0,237.0,49.0,35.0,45.0,170.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.934111,0.931642
4,189312.0,2511.0,4.0,0.0,237.0,49.0,35.0,45.0,170.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.932230,0.927470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,305721.0,3734.0,3.0,1.0,35.0,13.0,7.0,16.0,21.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.939274,0.940982
1996,109881.0,2639.0,4.0,0.0,284.0,51.0,55.0,53.0,186.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.925575,0.897880
1997,185145.0,4950.0,4.0,1.0,84.0,31.0,34.0,33.0,58.0,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.941051,0.938261
1998,131385.0,1582.0,3.0,1.0,92.0,38.0,18.0,36.0,63.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.945235,0.955478
