In [1]:
import numpy as np 
import pandas as pd 
import os

from xgboost import XGBClassifier
import lightgbm as lgb

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, cross_val_score, train_test_split,StratifiedKFold
from sklearn.metrics import f1_score, recall_score, accuracy_score, roc_auc_score
from sklearn.preprocessing.data import OneHotEncoder, Binarizer, PolynomialFeatures
from mlxtend.classifier import StackingClassifier

import matplotlib
import matplotlib.pyplot as plt
import pandas_profiling
from collections import Counter

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))



/kaggle/input/sdadasda/case2_training.csv
/kaggle/input/sdadasda/case2_testing.csv


In [2]:
# Read the training and testing sets from the Kaggle input directory
train = pd.read_csv('/kaggle/input/sdadasda/case2_training.csv')
test = pd.read_csv('/kaggle/input/sdadasda/case2_testing.csv')

# Data analysis: summary statistics for the training columns
train.describe()

Unnamed: 0,ID,Region,Date,Weekday,Apartment,Beds,Review,Pic Quality,Price,Accept
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,25000.5,5.49546,182.8243,3.99276,0.80184,1.49576,4.509402,0.750261,316.963436,0.27032
std,14433.901067,2.862611,105.653134,2.00837,0.398617,0.693189,0.460287,0.193746,92.644791,0.44413
min,1.0,1.0,1.0,1.0,0.0,1.0,3.0,0.036809,80.0,0.0
25%,12500.75,3.0,91.0,2.0,1.0,1.0,4.306552,0.630497,251.637842,0.0
50%,25000.5,5.0,183.0,4.0,1.0,1.0,4.652005,0.794165,315.334386,0.0
75%,37500.25,8.0,274.0,6.0,1.0,2.0,4.855238,0.908876,378.92037,1.0
max,50000.0,10.0,365.0,7.0,1.0,4.0,4.999996,0.999985,726.527643,1.0


In [3]:
# Data visualization: build a pandas-profiling report of the training set
# and save it to 'EDA.html'
pfr = pandas_profiling.ProfileReport(train)
pfr.to_file('EDA.html')

HBox(children=(FloatProgress(value=0.0, description='variables', max=10.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='correlations', max=6.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='interactions [continuous]', max=49.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=0.0, description='table', max=1.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='missing', max=2.0, style=ProgressStyle(description_width=…









HBox(children=(FloatProgress(value=0.0, description='package', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='build report structure', max=1.0, style=ProgressStyle(des…




In [4]:
# No missing values: the non-null counts printed below all equal the row count
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ID           50000 non-null  int64  
 1   Region       50000 non-null  int64  
 2   Date         50000 non-null  int64  
 3   Weekday      50000 non-null  int64  
 4   Apartment    50000 non-null  int64  
 5   Beds         50000 non-null  int64  
 6   Review       50000 non-null  float64
 7   Pic Quality  50000 non-null  float64
 8   Price        50000 non-null  float64
 9   Accept       50000 non-null  int64  
dtypes: float64(3), int64(7)
memory usage: 3.8 MB


In [5]:
# The class label 'Accept' takes the values 0 and 1
train.head(10)

Unnamed: 0,ID,Region,Date,Weekday,Apartment,Beds,Review,Pic Quality,Price,Accept
0,1,4,32,5,1,1,4.526975,0.739994,230.552998,0
1,2,2,87,4,1,1,4.548455,0.862642,348.203425,0
2,3,5,277,5,1,2,4.965732,0.515548,430.462327,0
3,4,3,246,2,1,1,4.752735,0.561547,149.305433,1
4,5,1,155,2,1,2,3.89096,0.817142,165.264184,1
5,6,2,116,5,1,1,4.943372,0.990913,270.001967,0
6,7,8,115,4,1,3,4.322199,0.59314,576.754677,0
7,8,7,140,1,1,3,4.811223,0.786569,457.293737,1
8,9,5,132,7,1,2,4.577265,0.799762,435.935012,0
9,10,10,88,5,1,2,4.851933,0.943481,307.155959,0


In [6]:
# The Pearson correlation coefficients single out Price as the one strong
# feature, so it is worth mining more price-related features.
corr_matrix = train.corr()
corr_matrix['Accept'].sort_values()

Price         -0.295958
Region        -0.004934
Review        -0.002587
Pic Quality   -0.001084
Weekday       -0.000923
ID            -0.000375
Apartment      0.000037
Beds           0.003853
Date           0.007600
Accept         1.000000
Name: Accept, dtype: float64

In [7]:
def mode(input_data, n_bin = 5):
    """Return the mean of the values lying in the most populated bin.

    A plain mean cannot describe where the values are most concentrated
    (the price / review-score range that people see most often), so the
    values are cut into ``n_bin`` equal-width bins and only the busiest bin
    is averaged: the result is the mean of the densest region rather than
    the global mean.

    Parameters
    ----------
    input_data : pandas.Series
        Numeric values to summarise.
    n_bin : int, default 5
        Number of equal-width bins passed to ``pd.cut``.

    Returns
    -------
    float
        Mean of the values in the most frequent bin. When several bins are
        equally populated the lowest-numbered one is used. For an empty
        series the result of ``input_data.mean()`` is returned.
    """
    if input_data.empty:
        return input_data.mean()

    bin_ids = pd.cut(input_data, n_bin, labels=False)
    tally = Counter(bin_ids)
    # max() keeps the first maximal candidate, so ties go to the lowest bin
    # index and an all-zero tally falls back to bin 0.
    busiest = max(range(n_bin), key=lambda idx: tally[idx])
    return input_data.loc[bin_ids == busiest].mean()

In [8]:
# Single grouping columns used for the "difference from the group" features:
# (group-by column, name suffix for the group mean, name suffix for the
# mean of the most populated bin).
_SINGLE_KEY_GROUPS = (
    ('Region', 'average region', 'Frequent region'),
    ('Weekday', 'average weekday', 'Frequent weekday'),
    ('Date', 'average date', 'Frequent date'),
    ('Apartment', 'average apartment', 'Frequent apartment'),
    ('Beds', 'average bed', 'Frequent bed'),
    ('month', 'average month', 'Frequent month'),
)

# Pairs of grouping columns, same layout as above. The suffixes are spelled
# out literally because the generated column names are not fully regular.
_PAIR_KEY_GROUPS = (
    (('Region', 'Weekday'), 'average region and average weekday', 'Frequent region and Frequent weekday'),
    (('Region', 'Date'), 'average region and average day', 'Frequent region and Frequent day'),
    (('Region', 'Apartment'), 'average region and if Apartment', 'Frequent region and if Apartment'),
    (('Region', 'Beds'), 'average region and average bed num', 'Frequent region and Frequent bed num'),
    (('Region', 'month'), 'average region and average month', 'Frequent region and Frequent month'),
    (('Weekday', 'Apartment'), 'average weekday and if Apartment', 'Frequent weekday and if Apartment'),
    (('Weekday', 'Beds'), 'average weekday and bed num', 'Frequent weekday and bed num'),
    (('Date', 'Apartment'), 'average day and if Apartment', 'Frequent day and if Apartment'),
    (('Date', 'Beds'), 'average day and bed num', 'Frequent day and bed num'),
    (('Apartment', 'Beds'), 'if Apartment and bed num', 'Frequent if Apartment and bed num'),
    (('Apartment', 'month'), 'if Apartment and average month', 'if Apartment and Frequent month'),
    (('Beds', 'month'), 'bed num and average month', 'bed num and Frequent month'),
)


def _group_deviation(df, keys, value_col, center):
    """Return ``value_col`` minus ``center(group values)`` for each row's group."""
    return df.groupby(list(keys))[value_col].transform(lambda grp: grp - center(grp))


def _deviation_features(df):
    """Build every "difference from the group" column as one DataFrame.

    Columns are generated in this order: first all mean-based features, then
    all features based on ``mode`` (mean of the most populated bin). Within
    each family: single-key Price, single-key Review, pair-key Price,
    pair-key Review, pair-key p/Rev.
    """
    single_targets = (('Price', 'expensive than'), ('Review', 'positive review than'))
    pair_targets = single_targets + (('p/Rev', 'pos & exp than'),)

    features = {}
    # suffix_pos selects the name suffix matching the centre statistic.
    for suffix_pos, center in ((1, pd.Series.mean), (2, mode)):
        for value_col, prefix in single_targets:
            for spec in _SINGLE_KEY_GROUPS:
                features[prefix + ' ' + spec[suffix_pos]] = _group_deviation(
                    df, [spec[0]], value_col, center)
        for value_col, prefix in pair_targets:
            for spec in _PAIR_KEY_GROUPS:
                features[prefix + ' ' + spec[suffix_pos]] = _group_deviation(
                    df, spec[0], value_col, center)
    # One DataFrame built in a single step avoids inserting ~100 columns one
    # at a time into the working frame.
    return pd.DataFrame(features)


def feature_extraction(df):
    """Derive the model features from a raw listing table.

    The input must contain the columns 'ID', 'Region', 'Date', 'Weekday',
    'Apartment', 'Beds', 'Review', 'Pic Quality' and 'Price'. Any other
    column (e.g. 'Accept') is passed through unchanged.

    Added features, in output order:

    * 'month', 'isweekend', 'Review_range' - calendar and bucketed-review
      features;
    * 'p/picqual', 'p/Rev', 'p/Beds', 'Rev/Beds' - simple ratios;
    * for Price and Review (and, for key pairs, p/Rev): the value minus the
      mean of its group, and the value minus ``mode`` of its group, where the
      group is defined by one key or a pair of keys among Region, Weekday,
      Date, Apartment, Beds and month;
    * 'fc0'..'fc14' - degree-2 interaction-only polynomial features of
      Apartment, Beds, Review, Pic Quality and Price;
    * 'oh10'..'oh19' and 'oh20'..'oh26' - one-hot encodings of Region
      (categories 1..10) and Weekday (categories 1..7).

    All group statistics are computed from the frame that is passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listing table. It is not modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame without the 'ID' column and with all features appended,
        carrying the same index as ``df``.
    """
    # Work on a copy so the caller's frame keeps its original columns and the
    # cell can be re-run safely.
    df = df.copy()

    # Feature engineering
    # month: rentals have low and peak seasons (days 360+ fold into month 12)
    # isweekend: weekends are peak time for trips, weekdays for business travel
    # Review_range: review score cut into 11 equal-width buckets
    # NOTE(review): the bucket edges come from the range of this frame, so two
    # frames with different Review ranges get different buckets.
    df['month'] = np.where(df['Date'] < 360, df['Date'] // 30 + 1, 12)
    df['isweekend'] = df['Weekday'].isin([6, 7]).astype('int64')
    df['Review_range'] = pd.cut(df['Review'], 11, labels=False)

    # Simple feature ratios
    df['p/picqual'] = df['Price'] / df['Pic Quality']  # value for money 1
    df['p/Rev'] = df['Price'] / df['Review']  # value for money 2
    df['p/Beds'] = df['Price'] / df['Beds']  # price per bed
    df['Rev/Beds'] = df['Review'] / df['Beds']  # review score per bed

    # How far each price / review / price-per-review sits from the typical
    # value of comparable listings (same region, day, apartment flag, ...).
    # Groups are taken from the data, so every row belongs to a group.
    deviations = _deviation_features(df)

    # Feature crosses: pairwise interaction terms
    fc = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    columns1 = ['fc' + str(i) for i in range(15)]
    # index=df.index keeps the rows aligned in the final concat even when the
    # input does not carry a default 0..n-1 index.
    fc_values = pd.DataFrame(
        fc.fit_transform(df[['Apartment', 'Beds', 'Review', 'Pic Quality', 'Price']]),
        columns=columns1, index=df.index)

    # One-hot encoding of the categorical features. The categories are pinned
    # so that each output column means the same category for every frame, and
    # a category absent from one frame cannot change the column count.
    region_encoder = OneHotEncoder(categories=[list(range(1, 11))])
    oh1 = region_encoder.fit_transform(df['Region'].values.reshape(-1, 1))
    columns2 = ['oh1' + str(i) for i in range(10)]
    oh1_values = pd.DataFrame(oh1.toarray(), columns=columns2, index=df.index)

    weekday_encoder = OneHotEncoder(categories=[list(range(1, 8))])
    oh2 = weekday_encoder.fit_transform(df['Weekday'].values.reshape(-1, 1))
    columns3 = ['oh2' + str(i) for i in range(7)]
    oh2_values = pd.DataFrame(oh2.toarray(), columns=columns3, index=df.index)

    df = pd.concat(
        [df.drop(columns=['ID']), deviations, fc_values, oh1_values, oh2_values],
        axis=1)

    return df

In [9]:
# 特征选择--留下最具区分性特征
# 基于LGB的特征排序并选择
# 基于LGB的特征排序并选择
def feature_selection(df, labels):
    """Rank features with LightGBM and report those that are never used.

    A single ``LGBMClassifier`` is fitted twice, each time on a different
    75/25 random split (``random_state`` 0 and 1) with early stopping on the
    held-out part. The per-feature importances of both fits are averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; its column names become the feature names.
    labels : array-like
        Target values. The class weight is computed as
        ``(len(labels) - sum(labels)) / sum(labels)``, i.e. negatives over
        positives when the labels are 0/1 -- TODO confirm labels are binary.

    Returns
    -------
    zero_features : list
        Names of the features whose averaged importance is exactly 0.0.
    feature_importances : pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by importance in
        descending order.
    """
    n_runs = 2

    # Weight of the positive class used to counter the class imbalance.
    n_positive = np.sum(labels)
    pos_weight = (len(labels) - n_positive) / n_positive

    clf = lgb.LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        learning_rate=0.1,
        n_estimators=1000,
        reg_alpha=0.1,
        reg_lambda=0.1,
        scale_pos_weight=pos_weight,
    )

    # Accumulates the importances over the independent runs.
    importance_sum = np.zeros(df.shape[1])
    for seed in range(n_runs):
        # A fresh train/validation split per run, seeded by the run number.
        X_fit, X_val, y_fit, y_val = train_test_split(
            df, labels, test_size=0.25, random_state=seed)

        # Early stopping is monitored on the validation part of the split.
        clf.fit(X_fit, y_fit,
                eval_set=[(X_val, y_val)],
                eval_metric='auc',
                early_stopping_rounds=100,
                verbose=200)

        importance_sum += clf.feature_importances_

    # Average over the runs and order from most to least important.
    ranking = pd.DataFrame({
        'feature': list(df.columns),
        'importance': importance_sum / float(n_runs),
    })
    ranking = ranking.sort_values('importance', ascending=False)

    # Features that no tree ever split on in either run.
    is_zero = ranking['importance'] == 0.0
    zero_features = list(ranking.loc[is_zero, 'feature'])
    print('There are %d features with 0.0 importance' % len(zero_features))
    return zero_features, ranking

In [10]:
# Engineer features for the training set, then find the features that
# LightGBM never uses and drop them.
fe_train = feature_extraction(train)
zero_features_list, feature_importances_list = feature_selection(
    fe_train.drop(columns=['Accept']),
    fe_train['Accept'],
)
fs_train = fe_train.drop(columns=zero_features_list)

# Apply the same feature engineering and the same column drop to the test set.
fe_test = feature_extraction(test)
fs_test = fe_test.drop(columns=zero_features_list)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[9]	valid_0's auc: 0.789077	valid_0's binary_logloss: 0.506875
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[9]	valid_0's auc: 0.791743	valid_0's binary_logloss: 0.505493
There are 66 features with 0.0 importance


In [11]:
def _build_voting_clf(weights, seed):
    """Assemble the soft-voting ensemble of XGBoost, random forest, GBDT and LightGBM.

    ``weights`` is the positive-class weight passed to the models that accept
    one; ``seed`` seeds the models that take a random state.
    """
    xgb_model = XGBClassifier(scale_pos_weight=weights,
                              learning_rate=0.01,
                              n_estimators=300,
                              max_depth=5,
                              min_child_weight=5,
                              gamma=0.1,
                              subsample=0.7,
                              colsample_bytree=0.6,
                              reg_alpha=0, reg_lambda=1,
                              objective='binary:logistic', nthread=4, seed=seed)

    rf_model = RandomForestClassifier(n_estimators=200,
                                      min_samples_split=100,
                                      min_samples_leaf=20,
                                      max_depth=9,
                                      max_features=11,
                                      random_state=seed,
                                      class_weight={0: 1, 1: weights},
                                      oob_score=True)

    gc_model = GradientBoostingClassifier(n_estimators=300,
                                          learning_rate=0.01,
                                          min_samples_split=850,
                                          min_samples_leaf=60,
                                          max_depth=7,
                                          max_features=7,
                                          subsample=0.8,
                                          random_state=seed)

    lgb_clf = lgb.LGBMClassifier(objective='binary', boosting_type='gbdt',
                                 learning_rate=0.1, n_estimators=300,
                                 reg_alpha=0.1, reg_lambda=0.1, scale_pos_weight=weights)

    return VotingClassifier(
        estimators=[('xgb', xgb_model), ('rf', rf_model), ('gc', gc_model), ('lgb', lgb_clf)],
        voting='soft')


def stackmodel(train_, test_, bin_=5):
    """Train a soft-voting ensemble on repeated stratified splits and average its test predictions.

    Despite the name, the models are combined with a ``VotingClassifier``
    (``voting='soft'``), not with a stacking meta-learner.

    Parameters
    ----------
    train_ : pandas.DataFrame
        Training features plus the target column ``'Accept'``.
    test_ : pandas.DataFrame or array-like
        Test features. They are matched to the training features by position,
        so the columns must be in the same order as ``train_`` without
        ``'Accept'``.
    bin_ : int, default 5
        Number of stratified shuffle splits (70% train / 30% validation each).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_test, 2)``: the class probabilities predicted for
        ``test_``, averaged over the ``bin_`` splits.

    For every split the validation accuracy, macro recall, F1, AUC and the
    training AUC are printed.
    """
    seed = 32
    split = StratifiedShuffleSplit(n_splits=bin_, test_size=0.3, random_state=seed)

    # The models are fitted on plain arrays, so predict on a plain array as
    # well; this avoids feature-name mismatches between fit and predict.
    test_data = np.array(test_)
    cv_pred = np.zeros((test_data.shape[0], 2))

    for rou, (train_index, val_index) in enumerate(split.split(train_, train_['Accept'])):
        # split() yields positional indices, so select rows with .iloc;
        # .loc would only be correct for a default RangeIndex.
        train_df = train_.iloc[train_index]
        val_df = train_.iloc[val_index]

        train_data = np.array(train_df.drop(['Accept'], axis=1))
        train_label = np.array(train_df['Accept'])
        val_data = np.array(val_df.drop(['Accept'], axis=1))
        val_label = np.array(val_df['Accept'])

        # Ratio of non-positive to positive samples in this split's training part.
        weights = (len(train_label) - np.sum(train_label)) / np.sum(train_label)

        voting_clf = _build_voting_clf(weights, seed)
        voting_clf.fit(train_data, train_label.ravel())

        predictions = voting_clf.predict(val_data)
        accuracy = accuracy_score(val_label, predictions)
        recall = recall_score(val_label, predictions, average='macro')
        f1 = f1_score(val_label, predictions)
        auc = roc_auc_score(val_label, voting_clf.predict_proba(val_data)[:, 1])
        auc_train = roc_auc_score(train_label, voting_clf.predict_proba(train_data)[:, 1])
        print('round%s:acc-->%.4f, recall-->%.4f, f1-->%.4f, auc-->%.4f, auc_train-->%.4f' % (rou, accuracy, recall, f1, auc, auc_train))

        cv_pred += voting_clf.predict_proba(test_data)

    return cv_pred / bin_

In [12]:
# Single model (LightGBM only)
def lgb_model(train_X, label, test, n_splits=5):
    """Train LightGBM with stratified K-fold CV and return the fold-averaged test prediction.

    Parameters
    ----------
    train_X : pandas.DataFrame
        Training features.
    label : pandas.Series
        Target values, aligned with ``train_X`` by position.
    test : pandas.DataFrame
        Test features. Columns are matched to ``train_X`` by position and
        must be equal in number.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(test)`` with the predicted scores, averaged
        over the folds.

    Notes
    -----
    The inputs are not modified: column renaming happens on copies. For each
    fold the macro recall at a 0.5 threshold is printed, followed by the mean
    validation AUC over all folds.
    """
    params_lgb = {
        'boosting_type': 'gbdt',
        'n_estimators': 300,
        'max_depth': -1,
        'objective': 'binary',
        'metric': {'auc'},
        'num_leaves': 16,
        'learning_rate': 0.1,
        'feature_fraction': 1.,
        'bagging_fraction': 1.,
        'reg_lambda': 0.5,
        'reg_alpha': 0.3,
        'random_state': 1024,
        'n_jobs': -1,
    }

    # Replace the column names by their position. Work on copies so the
    # caller's frames keep their original column names.
    train_X = train_X.copy()
    test = test.copy()
    train_X.columns = list(range(train_X.shape[1]))
    test.columns = train_X.columns

    cv_pred = np.zeros(test.shape[0])
    cv_best_auc_all = 0

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)

    for fold, (train_fold, validate) in enumerate(skf.split(train_X, label)):
        print("model: lgb. fold: ", fold, "training...")
        X_train, label_train = train_X.iloc[train_fold], label.iloc[train_fold]
        X_validate, label_validate = train_X.iloc[validate], label.iloc[validate]

        dtrain = lgb.Dataset(X_train, label_train)
        dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)

        # valid_sets order matters: the second entry is reported as 'valid_1'.
        bst = lgb.train(params_lgb, dtrain, valid_sets=(dtrain, dvalid),
                        verbose_eval=50, early_stopping_rounds=100)

        cv_pred += bst.predict(test, num_iteration=bst.best_iteration)
        cv_best_auc_all += bst.best_score['valid_1']['auc']

        # Out-of-fold scores, thresholded at 0.5 for the recall report.
        score_va = bst.predict(X_validate, num_iteration=bst.best_iteration)
        pre = (score_va >= 0.5).astype(int)
        recall = recall_score(label_validate, pre, average='macro')
        print('recall%.6f' %(recall))

    cv_pred /= n_splits
    cv_best_auc_all /= n_splits
    print("lgb cv score for valid is: ", cv_best_auc_all)

    return cv_pred

In [13]:
# The stacked (ensemble) model performs better than the single model.
result = stackmodel(fs_train, fs_test, bin_=5)

# Build the frame column by column: concatenating the integer IDs with the
# float probabilities into one array would promote the IDs to float and
# write them as "1.0", "2.0", ... in the CSV.
output = pd.DataFrame(
    {'ID': test['ID'].values, 'possibility': result[:, 1]},
    columns=['ID', 'possibility'],
)
output.to_csv('output.csv', index=False)

round0:acc-->0.7147, recall-->0.7319, f1-->0.5931, auc-->0.7922, auc_train-->0.8812
round1:acc-->0.7085, recall-->0.7284, f1-->0.5887, auc-->0.7891, auc_train-->0.8825
round2:acc-->0.7160, recall-->0.7344, f1-->0.5958, auc-->0.7928, auc_train-->0.8826
round3:acc-->0.7087, recall-->0.7292, f1-->0.5895, auc-->0.7899, auc_train-->0.8840
round4:acc-->0.7071, recall-->0.7271, f1-->0.5872, auc-->0.7911, auc_train-->0.8822


In [14]:
# Single LightGBM model: cross-validated prediction for the test set.
result_single = lgb_model(fs_train.drop(['Accept'], axis=1), fs_train['Accept'], fs_test)

# Build the frame column by column: concatenating the integer IDs with the
# float probabilities into one array would promote the IDs to float and
# write them as "1.0", "2.0", ... in the CSV.
output_single = pd.DataFrame(
    {'ID': test['ID'].values, 'possibility': np.asarray(result_single).reshape(-1)},
    columns=['ID', 'possibility'],
)
output_single.to_csv('output_single.csv', index=False)

model: lgb. fold:  0 training...
Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.809776	valid_1's auc: 0.796743
[100]	training's auc: 0.830914	valid_1's auc: 0.79592
Early stopping, best iteration is:
[28]	training's auc: 0.801024	valid_1's auc: 0.796953
recall0.584935
model: lgb. fold:  1 training...
Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.812896	valid_1's auc: 0.786091
[100]	training's auc: 0.834761	valid_1's auc: 0.78744
[150]	training's auc: 0.853312	valid_1's auc: 0.786058
[200]	training's auc: 0.868789	valid_1's auc: 0.786183
Early stopping, best iteration is:
[107]	training's auc: 0.837688	valid_1's auc: 0.787456
recall0.621132
model: lgb. fold:  2 training...
Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.813742	valid_1's auc: 0.785796
[100]	training's auc: 0.837077	valid_1's auc: 0.783069
[150]	training's auc: 0.854702	valid_1's auc: 0.781311
Early stopping, 