In [55]:
import time
import numpy as np
import pandas as pd
import os
import gc
from sklearn import metrics
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from numpy import array
from sklearn.model_selection import KFold
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

In [56]:
# 读取数据
train = pd.read_csv('../data/train.csv', parse_dates=["activation_date"]).sample(10000)
test = pd.read_csv('../data/test.csv', parse_dates=["activation_date"]).sample(10000)

In [57]:
train.head(1)

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
745706,a602da549e32,96e1cbb95473,Красноярский край,Красноярск,Личные вещи,"Одежда, обувь, аксессуары",Женская одежда,Обувь,< 35,Сапоги Mario Cerutti ботфорты - замшевые,"Сапоги Mario Cerutti ботфорты - замшевые, кабл...",5000.0,38,2017-03-27,Private,fc5f2ea53d2419530fc9ec9bf9ef63f53512ad677cf265...,499.0,0.0


In [58]:
traindex = train.shape[0]
testdex = test.shape[0]
train_y = train["deal_probability"]

print("Train shape: {} Rows, {} Columns".format(*train.shape))
print("Test shape: {} Rows, {} Columns".format(*test.shape))

Train shape: 10000 Rows, 18 Columns
Test shape: 10000 Rows, 17 Columns


In [59]:
df = pd.concat([train, test], axis=0, ignore_index=True)

In [60]:
df.isnull().sum()

activation_date             0
category_name               0
city                        0
deal_probability        10000
description               760
image                    1569
image_top_1              1569
item_id                     0
item_seq_number             0
param_1                   871
param_2                  9034
param_3                 11816
parent_category_name        0
price                    1093
region                      0
title                       0
user_id                     0
user_type                   0
dtype: int64

样本对应的特征包括：类别，文本，数值，日期。

对于特征的处理主要分为三大块：

1. 特征工程

2. 文本NLP

In [61]:
print('df.shape: ', df.shape)
df.head(1)

df.shape:  (20000, 18)


Unnamed: 0,activation_date,category_name,city,deal_probability,description,image,image_top_1,item_id,item_seq_number,param_1,param_2,param_3,parent_category_name,price,region,title,user_id,user_type
0,2017-03-27,"Одежда, обувь, аксессуары",Красноярск,0.0,"Сапоги Mario Cerutti ботфорты - замшевые, кабл...",fc5f2ea53d2419530fc9ec9bf9ef63f53512ad677cf265...,499.0,a602da549e32,38,Женская одежда,Обувь,< 35,Личные вещи,5000.0,Красноярский край,Сапоги Mario Cerutti ботфорты - замшевые,96e1cbb95473,Private


In [62]:
### 1. 价格特征
# 对价格进行均值填充
df["price"].fillna(np.nanmean(df["price"].values))

# 对价格取log后进行填充
df["price_new"] = np.log(df["price"].values)
df["price_new"].fillna(np.nanmean(df["price_new"].values), inplace=True)

In [63]:
# image_top_1  -  Avito的图像分类代码。
# 采用-999进行填充，决策树会将-999分为一类，而线性模型不可以。
df["image_top_1"].fillna(-999, inplace=True)

In [64]:
# 2. 时间特征
# activation_date 广告上架时间
df["Weekday"] = df['activation_date'].dt.weekday
df["Weekday of Year"] = df['activation_date'].dt.week
df["Day of Month"] = df['activation_date'].dt.day

In [65]:
# 3. 类别特征
# 类别特征很多，这里直接将这些类别进行labelencoder；
# 由于采用的是基于决策树的lightgbm，所以没有必要进行onehot编码。
categorical = ["user_id", "region", "city", "parent_category_name", "category_name", "user_type",
               "image_top_1", "param_1", "param_2", "param_3"]
print("Encoding: ", categorical)

lbl = preprocessing.LabelEncoder()
for col in categorical:
    df[col] = lbl.fit_transform(df[col].astype(str))

Encoding:  ['user_id', 'region', 'city', 'parent_category_name', 'category_name', 'user_type', 'image_top_1', 'param_1', 'param_2', 'param_3']


In [66]:
# 采用lightgbm可以对类别特征做自动填充，对于文本类和数值类的还需要自己做一些填充工作。

In [67]:
# 4. 文本特征
# 采用NA进行填充
textfeatures = ["description", "title"]
df["description"].fillna("NA", inplace=True)

for col in textfeatures:
    df[col] = df[col].astype(str)
    df[col] = df[col].astype(str).fillna('missing') 
    df[col] = df[col].str.lower() # 小写所有文本
    df[col + '_num_words'] = df[col].apply(lambda d: len(d.split())) # 文本长度
    df[col + '_num_unique_words'] = df[col].apply(lambda d: len(set(w for w in d.split())))
    df[col + '_words_vs_unique'] = df[col + '_num_unique_words'] / df[col + '_num_words'] * 100 # 统计独特的单词

In [68]:
df.head()

Unnamed: 0,activation_date,category_name,city,deal_probability,description,image,image_top_1,item_id,item_seq_number,param_1,...,price_new,Weekday,Weekday of Year,Day of Month,description_num_words,description_num_unique_words,description_words_vs_unique,title_num_words,title_num_unique_words,title_words_vs_unique
0,2017-03-27,28,342,0.0,"сапоги mario cerutti ботфорты - замшевые, кабл...",fc5f2ea53d2419530fc9ec9bf9ef63f53512ad677cf265...,1934,a602da549e32,38,94,...,8.517193,0,13,27,17,17,100.0,6,6,100.0
1,2017-03-15,33,487,0.5,создание сайтов:/\n• сайт-визитка /\n• одностр...,e230c0c870eb6be7063c54976e906fb772d0c92c6d4dd7...,1097,b5e8b448b012,9,11,...,8.29405,2,11,15,87,56,64.367816,4,4,100.0
2,2017-03-17,42,138,0.0,"рубец ягнят, рубец говяжий, обрезь шей ягнят, ...",,0,1e7224621bbd,47,30,...,4.248495,4,11,17,10,8,80.0,6,6,100.0
3,2017-03-20,24,203,0.0,продаю звуковаю карту k-mic kx-2 родом из кита...,bc3697ebbba2248b9e7011f6429c516c2bae6382d3e7f6...,1571,a34a95096c06,18,84,...,7.600902,0,12,20,43,41,95.348837,4,4,100.0
4,2017-03-19,41,727,0.0,"бортики и балдахин в кроватку ,в идеальном сос...",57cc16db7b171822721e81abba0379f3842ee3c59fe23d...,19,a06fb8d6e926,5,194,...,6.684612,6,11,19,8,8,100.0,5,5,100.0


In [69]:
# TFIDF Vectorizer
russian_stop = set(stopwords.words('russian'))

params = {
    "stop_words": russian_stop,
    "analyzer": "word",
    "token_pattern": r'\w{1,}',
    "norm": "l2",
    "max_features": 1000,
    "smooth_idf": False,
    "ngram_range": (1,1) # n-gram的n-values的下限和上限范围
    # 比如'Python is useful'中的ngram_range(1,3)之后可得到'Python'  'is'  'useful'  'Python is'  'is useful' 和'Python is useful'如果是ngram_range (1,1) 则只能得到单个单词'Python'  'is'和'useful'
}

TSVD的作用是对矩阵进行降维，我们可以指定降维后的主题个数K，这里K指定为10。

In [70]:
n_comp = 10
for col in textfeatures:
    tfidf_vec = TfidfVectorizer(params)
    full_tfidf = tfidf_vec.fit_transform(df[col].values.tolist()) # 将文本corpus输入，TF-IDF权重矩阵
    svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
    svd_obj.fit(full_tfidf)
    full_svd = pd.DataFrame(svd_obj.transform(full_tfidf))
    df = pd.concat([df, full_svd], axis=1)

In [71]:
df.head()

Unnamed: 0,activation_date,category_name,city,deal_probability,description,image,image_top_1,item_id,item_seq_number,param_1,...,0,1,2,3,4,5,6,7,8,9
0,2017-03-27,28,342,0.0,"сапоги mario cerutti ботфорты - замшевые, кабл...",fc5f2ea53d2419530fc9ec9bf9ef63f53512ad677cf265...,1934,a602da549e32,38,94,...,0.0001477091,0.0004667072,0.001633061,0.007124,0.018737,1e-05,-0.00752,-0.000748,-0.001052,0.008968233
1,2017-03-15,33,487,0.5,создание сайтов:/\n• сайт-визитка /\n• одностр...,e230c0c870eb6be7063c54976e906fb772d0c92c6d4dd7...,1097,b5e8b448b012,9,11,...,4.586683e-08,1.137147e-07,8.774628e-07,2e-06,3e-06,-3e-06,4e-06,2e-06,-3e-06,-3.087632e-06
2,2017-03-17,42,138,0.0,"рубец ягнят, рубец говяжий, обрезь шей ягнят, ...",,0,1e7224621bbd,47,30,...,0.0003147257,0.009857759,0.006955826,0.048131,0.064083,-0.068758,0.104394,0.079145,-0.056302,-0.049469
3,2017-03-20,24,203,0.0,продаю звуковаю карту k-mic kx-2 родом из кита...,bc3697ebbba2248b9e7011f6429c516c2bae6382d3e7f6...,1571,a34a95096c06,18,84,...,9.843634e-06,1.10297e-06,1.77804e-05,5e-06,2.5e-05,-2.1e-05,-3.3e-05,1.9e-05,1e-05,1.462484e-07
4,2017-03-19,41,727,0.0,"бортики и балдахин в кроватку ,в идеальном сос...",57cc16db7b171822721e81abba0379f3842ee3c59fe23d...,19,a06fb8d6e926,5,194,...,0.0001245459,0.0003859618,0.002074231,0.001746,0.005239,-0.004293,-0.003056,-0.001794,-0.001129,-0.002231097


In [72]:
drop_columns = ["item_id", "user_id", "title", "description",'deal_probability',"activation_date", "image"]
df = df.drop(drop_columns, axis=1)

In [73]:
train_X = df.loc[:traindex-1, :]
test_X = df.loc[traindex:, :]
print(train_X.shape)
print(test_X.shape)

(10000, 41)
(10000, 41)


In [74]:
train_X.head()

Unnamed: 0,category_name,city,image_top_1,item_seq_number,param_1,param_2,param_3,parent_category_name,price,region,...,0,1,2,3,4,5,6,7,8,9
0,28,342,1934,38,94,120,103,4,5000.0,10,...,0.0001477091,0.0004667072,0.001633061,0.007124,0.018737,1e-05,-0.00752,-0.000748,-0.001052,0.008968233
1,33,487,1097,9,11,14,298,7,4000.0,12,...,4.586683e-08,1.137147e-07,8.774628e-07,2e-06,3e-06,-3e-06,4e-06,2e-06,-3e-06,-3.087632e-06
2,42,138,0,47,30,49,298,3,70.0,16,...,0.0003147257,0.009857759,0.006955826,0.048131,0.064083,-0.068758,0.104394,0.079145,-0.056302,-0.049469
3,24,203,1571,18,84,49,298,8,2000.0,19,...,9.843634e-06,1.10297e-06,1.77804e-05,5e-06,2.5e-05,-2.1e-05,-3.3e-05,1.9e-05,1e-05,1.462484e-07
4,41,727,19,5,194,49,298,4,800.0,26,...,0.0001245459,0.0003859618,0.002074231,0.001746,0.005239,-0.004293,-0.003056,-0.001794,-0.001129,-0.002231097


K-Fold API，n-split就是K值。

In [75]:
### 调参
from sklearn.model_selection import KFold

categorical = ["region", "city", "parent_category_name", "category_name", "user_type",
               "image_top_1", "param_1", "param_2", "param_3"]

N_Split = 5

In [76]:
kfold = KFold(n_splits=N_Split, shuffle=False, random_state=2019)

score_all = []
for n_round in range(3500, 5000, 500): # 不同的循环次数
    for learning_rate in [0.01, 0.05, 0.01]: # 不同的学习率
        params = {
            "objective": "regression",
            "boosting_type": "gbdt",
            "metric": "rmse",
            "learning_rate": 0.01,
            
            "max_depth": 7,
            "num_leaves": 100,
            
            'max_bin':  255,
            "min_data_in_leaf": 50,

            'feature_fraction': 0.70, # 每颗树训练之前选择70%的特征
            'bagging_fraction': 0.70, # 加速训练，处理过拟合
            'bagging_freq': 5,

            # "drop_rate": 0.1,
            # "max_drop": 50,
            "min_child_samples": 10,  
            "min_child_weight": 150,     

            'verbose': 1
        }
        
        current_score_tmp_all = []
        kf = kfold.split(train_X, train_y)
        
        for train_index, valid_index in kf:
            X_train_x, X_train_y, valid_x, valid_y = train_X.iloc[train_index], train_y.iloc[train_index], train_X.iloc[valid_index], train_y.iloc[valid_index]
            train_data = lgb.Dataset(X_train_x, X_train_y, categorical_feature=categorical)
            valid_data = lgb.Dataset(valid_x, valid_y, categorical_feature=categorical)
            bst = lgb.train(params, 
                            train_data,
                           num_boost_round=n_round,
                           valid_sets=valid_data,
                           verbose_eval=50, # 迭代多少次打印
                           early_stopping_rounds=50 # 有多少次分数没有提高则停止
                           )
            
            print("Model Evaluate....")
            pred_y = bst.predict(valid_x)
            current_score = np.sqrt(metrics.mean_squared_error(valid_y, pred_y))
            current_score_tmp_all.append(current_score)
        cur_score = np.sum(current_score_tmp_all) / N_Split
        print('RMSE score:', cur_score, '[learnig_rate]', learning_rate,'[n_round]',n_round)
        score_all.append([cur_score, learning_rate, n_round])

Training until validation scores don't improve for 50 rounds.
[50]	valid_0's rmse: 0.253047
[100]	valid_0's rmse: 0.247155
[150]	valid_0's rmse: 0.244106
[200]	valid_0's rmse: 0.242686
[250]	valid_0's rmse: 0.241984
[300]	valid_0's rmse: 0.241726
[350]	valid_0's rmse: 0.241613
Early stopping, best iteration is:
[341]	valid_0's rmse: 0.241554
Model Evaluate....
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's rmse: 0.248225
[100]	valid_0's rmse: 0.241979
[150]	valid_0's rmse: 0.239186
[200]	valid_0's rmse: 0.23778
[250]	valid_0's rmse: 0.236946
[300]	valid_0's rmse: 0.236487
[350]	valid_0's rmse: 0.236002
[400]	valid_0's rmse: 0.235882
[450]	valid_0's rmse: 0.235983
Early stopping, best iteration is:
[425]	valid_0's rmse: 0.235816
Model Evaluate....
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's rmse: 0.236632
[100]	valid_0's rmse: 0.231554
[150]	valid_0's rmse: 0.229422
[200]	valid_0's rmse: 0.22861
[250]	valid_0's rmse: 0.22851

[300]	valid_0's rmse: 0.241726
[350]	valid_0's rmse: 0.241613
Early stopping, best iteration is:
[341]	valid_0's rmse: 0.241554
Model Evaluate....
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's rmse: 0.248225
[100]	valid_0's rmse: 0.241979
[150]	valid_0's rmse: 0.239186
[200]	valid_0's rmse: 0.23778
[250]	valid_0's rmse: 0.236946
[300]	valid_0's rmse: 0.236487
[350]	valid_0's rmse: 0.236002
[400]	valid_0's rmse: 0.235882
[450]	valid_0's rmse: 0.235983
Early stopping, best iteration is:
[425]	valid_0's rmse: 0.235816
Model Evaluate....
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's rmse: 0.236632
[100]	valid_0's rmse: 0.231554
[150]	valid_0's rmse: 0.229422
[200]	valid_0's rmse: 0.22861
[250]	valid_0's rmse: 0.228514
[300]	valid_0's rmse: 0.228479
Early stopping, best iteration is:
[287]	valid_0's rmse: 0.22843
Model Evaluate....
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's rmse: 0.248993
[100]	v

[100]	valid_0's rmse: 0.241979
[150]	valid_0's rmse: 0.239186
[200]	valid_0's rmse: 0.23778
[250]	valid_0's rmse: 0.236946
[300]	valid_0's rmse: 0.236487
[350]	valid_0's rmse: 0.236002
[400]	valid_0's rmse: 0.235882
[450]	valid_0's rmse: 0.235983
Early stopping, best iteration is:
[425]	valid_0's rmse: 0.235816
Model Evaluate....
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's rmse: 0.236632
[100]	valid_0's rmse: 0.231554
[150]	valid_0's rmse: 0.229422
[200]	valid_0's rmse: 0.22861
[250]	valid_0's rmse: 0.228514
[300]	valid_0's rmse: 0.228479
Early stopping, best iteration is:
[287]	valid_0's rmse: 0.22843
Model Evaluate....
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's rmse: 0.248993
[100]	valid_0's rmse: 0.243342
[150]	valid_0's rmse: 0.240836
[200]	valid_0's rmse: 0.23988
[250]	valid_0's rmse: 0.239095
[300]	valid_0's rmse: 0.238777
[350]	valid_0's rmse: 0.238832
[400]	valid_0's rmse: 0.238751
[450]	valid_0's rmse: 0.238693

In [77]:
score_all

[[0.23567720053520808, 0.01, 3500],
 [0.23567720053520808, 0.05, 3500],
 [0.23567720053520808, 0.01, 3500],
 [0.23567720053520808, 0.01, 4000],
 [0.23567720053520808, 0.05, 4000],
 [0.23567720053520808, 0.01, 4000],
 [0.23567720053520808, 0.01, 4500],
 [0.23567720053520808, 0.05, 4500],
 [0.23567720053520808, 0.01, 4500]]

### LightGBM

1. 支持分类特征

使用本地分类特征，LightGBM可以提供良好的精确度，不像简单的one-hot编码，LightGBM可以找到分类特征的最优分割，相对于one-hot编码结果，LightGBM可以提供准确的最优分割。

用categorical_feature指定分类特征。

对于高基数的分类特征，最好把它转化为数字特征。