In [55]:
import time
import numpy as np
import pandas as pd
import os
import gc
from sklearn import metrics
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from numpy import array
from sklearn.model_selection import KFold
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the data. Only a 10,000-row random sample of each file is kept so the
# notebook stays fast. A fixed random_state makes the sample (and therefore
# every downstream number) reproducible across kernel restarts.
train = pd.read_csv('../data/train.csv', parse_dates=["activation_date"]).sample(10000, random_state=2019)
test = pd.read_csv('../data/test.csv', parse_dates=["activation_date"]).sample(10000, random_state=2019)

In [None]:
# Display the first row of the sampled training frame.
train.head(1)

In [None]:
# Keep the target and the row counts of both frames; `traindex` is used later
# to cut the combined frame back into its train and test parts.
train_y = train["deal_probability"]
traindex, testdex = len(train), len(test)

print("Train shape: {} Rows, {} Columns".format(train.shape[0], train.shape[1]))
print("Test shape: {} Rows, {} Columns".format(test.shape[0], test.shape[1]))

In [None]:
# Stack train on top of test (row-wise is the default axis) and renumber the
# rows 0..n-1 so features are engineered once for both parts.
df = pd.concat((train, test), ignore_index=True)

In [None]:
# Number of missing values per column of the combined frame.
df.isna().sum()

样本对应的特征包括：类别，文本，数值，日期。

对于特征的处理，本笔记主要涉及以下两大块：

1. 特征工程

2. 文本NLP

In [None]:
# Print the shape of the combined frame, then display its first row.
print('df.shape: ', df.shape)
df.head(1)

In [None]:
### 1. Price features
# Two imputed views of the price are produced:
#   * "price_new": log-scaled price, missing values filled with the mean of the logs
#   * "price":     raw price, missing values filled with the mean price
raw_price = df["price"]

# log1p instead of log: a price of 0 would otherwise become -inf, which drags
# the mean (and so every imputed value) to -inf as well.
log_price = np.log1p(raw_price)
df["price_new"] = log_price.fillna(log_price.mean())  # Series.mean() skips NaN

# The fill result has to be assigned back; calling fillna() without using its
# return value (as the previous version did) leaves the column unchanged.
df["price"] = raw_price.fillna(raw_price.mean())

In [None]:
# image_top_1 - Avito's classification code for the image.
# Missing values are filled with the sentinel -999: a decision tree can split
# the sentinel off into its own group, whereas a linear model could not.
# The result is assigned back rather than using inplace=True on a column
# selection, which does not reliably update the parent frame (pandas copy-on-write).
df["image_top_1"] = df["image_top_1"].fillna(-999)

In [None]:
# 2. Date features
# activation_date is the date the ad was placed.
activation = df["activation_date"]

df["Weekday"] = activation.dt.weekday
# `Series.dt.week` was removed in pandas 2.0; isocalendar().week is its replacement.
# It returns a nullable UInt32 column, so cast to a plain int64.
# NOTE(review): the cast raises if activation_date contains missing values.
# The column name is kept unchanged although the value is the ISO week of the year.
df["Weekday of Year"] = activation.dt.isocalendar().week.astype("int64")
df["Day of Month"] = activation.dt.day

In [None]:
# 3. Categorical features
# There are many categorical columns; each one is simply label-encoded.
# The model is the tree-based LightGBM, so one-hot encoding is unnecessary.
categorical = [
    "user_id",
    "region",
    "city",
    "parent_category_name",
    "category_name",
    "user_type",
    "image_top_1",
    "param_1",
    "param_2",
    "param_3",
]
print("Encoding: ", categorical)

lbl = preprocessing.LabelEncoder()
for col in categorical:
    # Cast to str first so every value (including missing ones) is encodable.
    as_text = df[col].astype(str)
    df[col] = lbl.fit_transform(as_text)

In [None]:
# LightGBM can fill in missing values of categorical features automatically; text and numeric features still need some manual filling.

In [None]:
# 4. Text features
# Missing descriptions are filled with the placeholder "NA". The result is
# assigned back rather than using inplace=True on a column selection, which
# does not reliably update the parent frame (pandas copy-on-write).
textfeatures = ["description", "title"]
df["description"] = df["description"].fillna("NA")

for col in textfeatures:
    # Fill BEFORE casting to str: astype(str) turns NaN into the literal "nan",
    # after which fillna() has nothing left to fill.
    df[col] = df[col].fillna('missing').astype(str).str.lower()  # lower-case all text
    df[col + '_num_words'] = df[col].apply(lambda d: len(d.split()))  # number of words
    df[col + '_num_unique_words'] = df[col].apply(lambda d: len(set(d.split())))  # number of distinct words
    # Share of distinct words, in percent. Texts with no words give 0/0 = NaN.
    df[col + '_words_vs_unique'] = df[col + '_num_unique_words'] / df[col + '_num_words'] * 100

In [None]:
# Display the first rows of the frame after the text features were added.
df.head()

In [None]:
# TF-IDF vectorizer settings
# scikit-learn (>= 1.2) validates `stop_words` and accepts only a list (or the
# string "english"), so the Russian stop words are kept as a de-duplicated,
# sorted list rather than a set.
russian_stop = sorted(set(stopwords.words('russian')))

params = {
    "stop_words": russian_stop,
    "analyzer": "word",
    "token_pattern": r'\w{1,}',
    "norm": "l2",
    "max_features": 1000,
    "smooth_idf": False,
    "ngram_range": (1, 1)  # lower and upper bound of n for the extracted n-grams
    # e.g. for 'Python is useful', ngram_range=(1, 3) yields 'Python', 'is', 'useful',
    # 'Python is', 'is useful' and 'Python is useful'; ngram_range=(1, 1) yields only
    # the single words 'Python', 'is' and 'useful'.
}

TSVD的作用是对矩阵进行降维，我们可以指定降维后的主题个数K，这里K指定为10。

In [None]:
# For every text column: build a TF-IDF matrix, reduce it to `n_comp`
# components with truncated SVD, and append the components as new columns.
n_comp = 10

# The settings dict must be unpacked into keyword arguments; passed
# positionally it is either rejected or silently ignored, depending on the
# scikit-learn version. Recent versions also refuse a set for `stop_words`,
# so it is normalised to a list here.
tfidf_params = dict(params)
if isinstance(tfidf_params.get("stop_words"), (set, frozenset)):
    tfidf_params["stop_words"] = sorted(tfidf_params["stop_words"])

for col in textfeatures:
    tfidf_vec = TfidfVectorizer(**tfidf_params)
    full_tfidf = tfidf_vec.fit_transform(df[col].values.tolist())  # corpus -> TF-IDF weight matrix
    svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack', random_state=2019)
    full_svd = pd.DataFrame(
        svd_obj.fit_transform(full_tfidf),
        # Name the components per source column. Unnamed frames would add the
        # columns 0..9 once per text column, and LightGBM rejects duplicate
        # feature names.
        columns=["svd_{}_{}".format(col, i + 1) for i in range(n_comp)],
        index=df.index,  # align row-for-row with df
    )
    df = pd.concat([df, full_svd], axis=1)

In [None]:
# Display the first rows of the frame after the SVD components were appended.
df.head()

In [None]:
# Remove identifiers, the raw text, the date, the image reference and the
# target itself, so that only model inputs remain in the frame.
drop_columns = [
    "item_id",
    "user_id",
    "title",
    "description",
    'deal_probability',
    "activation_date",
    "image",
]
df = df.drop(columns=drop_columns)

In [None]:
# Cut the combined frame back into its two parts: the first `traindex` rows
# came from train, the remaining rows from test.
train_X = df.iloc[:traindex]
test_X = df.iloc[traindex:]
print(train_X.shape)
print(test_X.shape)

In [None]:
# Display the first rows of the training feature matrix.
train_X.head()

K-Fold API：参数 `n_splits` 就是 K 值（折数）。

In [None]:
### Hyper-parameter tuning
from sklearn.model_selection import KFold

# Columns handed to LightGBM as categorical features.
categorical = [
    "region",
    "city",
    "parent_category_name",
    "category_name",
    "user_type",
    "image_top_1",
    "param_1",
    "param_2",
    "param_3",
]

# Number of cross-validation folds.
N_Split = 5

In [None]:
# Grid search over the number of boosting rounds and the learning rate.
# Every combination is scored by the mean validation RMSE over N_Split folds.

# random_state only has an effect when shuffling is on; current scikit-learn
# raises a ValueError for shuffle=False combined with a random_state.
kfold = KFold(n_splits=N_Split, shuffle=True, random_state=2019)

# Settings shared by every grid point; learning_rate is added per grid point.
base_params = {
    "objective": "regression",
    "boosting_type": "gbdt",
    "metric": "rmse",

    "max_depth": 7,
    "num_leaves": 100,

    'max_bin': 255,
    # "min_child_samples" is an alias of "min_data_in_leaf"; passing both with
    # different values is ambiguous, so only the primary name is kept.
    "min_data_in_leaf": 50,

    'feature_fraction': 0.70,  # use 70% of the features for each tree
    'bagging_fraction': 0.70,  # speeds up training and counters over-fitting
    'bagging_freq': 5,

    # "drop_rate": 0.1,
    # "max_drop": 50,
    "min_child_weight": 150,

    'verbose': 1
}

score_all = []
for n_round in range(3500, 5000, 500):  # candidate numbers of boosting rounds
    # The grid used to read [0.01, 0.05, 0.01] (0.01 twice) and the value was
    # never passed to the model, so every run trained with 0.01.
    # NOTE(review): 0.1 is assumed to be the intended third value - confirm.
    for learning_rate in [0.01, 0.05, 0.1]:  # candidate learning rates
        params = dict(base_params, learning_rate=learning_rate)

        current_score_tmp_all = []
        for train_index, valid_index in kfold.split(train_X, train_y):
            X_train_x, X_train_y = train_X.iloc[train_index], train_y.iloc[train_index]
            valid_x, valid_y = train_X.iloc[valid_index], train_y.iloc[valid_index]

            train_data = lgb.Dataset(X_train_x, X_train_y, categorical_feature=categorical)
            # reference=train_data makes the validation set reuse the training bins.
            valid_data = lgb.Dataset(valid_x, valid_y, categorical_feature=categorical,
                                     reference=train_data)

            # LightGBM 4 removed the `verbose_eval` / `early_stopping_rounds`
            # keyword arguments of train(); the callbacks are their replacement.
            bst = lgb.train(
                params,
                train_data,
                num_boost_round=n_round,
                valid_sets=[valid_data],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=50),  # stop after 50 rounds without improvement
                    lgb.log_evaluation(period=50),           # log the metric every 50 rounds
                ],
            )

            print("Model Evaluate....")
            pred_y = bst.predict(valid_x, num_iteration=bst.best_iteration)
            current_score = np.sqrt(metrics.mean_squared_error(valid_y, pred_y))
            current_score_tmp_all.append(current_score)

        cur_score = np.mean(current_score_tmp_all)
        print('RMSE score:', cur_score, '[learnig_rate]', learning_rate, '[n_round]', n_round)
        score_all.append([cur_score, learning_rate, n_round])

In [None]:
# Display the collected [mean RMSE, learning_rate, n_round] entries of the grid search.
score_all

### LightGBM

1. 支持分类特征

LightGBM 原生支持分类特征：无需先做 one-hot 编码，它可以直接在类别取值上寻找最优分割，因此相比 one-hot 编码后的结果，通常能得到更准确的分割和更好的精度。

用categorical_feature指定分类特征。

对于高基数的分类特征，最好把它转化为数字特征。