### Packages

In [1]:
import time
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

### Input data

In [2]:
data_path = '../data/'
train_data = pd.read_csv(data_path + 'train_dataset.csv')
test_data = pd.read_csv(data_path + 'test_dataset.csv')
sample_sub = pd.read_csv(data_path + 'submit_example.csv')

In [4]:
train_data.head(1)

Unnamed: 0,用户编码,用户实名制是否通过核实,用户年龄,是否大学生客户,是否黑名单客户,是否4G不健康客户,用户网龄（月）,用户最近一次缴费距今时长（月）,缴费用户最近一次缴费金额（元）,用户近6个月平均消费值（元）,...,当月是否景点游览,当月是否体育场馆消费,当月网购类应用使用次数,当月物流快递类应用使用次数,当月金融理财类应用使用总次数,当月视频播放类应用使用次数,当月飞机类应用使用次数,当月火车类应用使用次数,当月旅游资讯类应用使用次数,信用分
0,a4651f98c82948b186bdcdc8108381b4,1,44,0,0,0,186,1,99.8,163.86,...,1,1,713,0,2740,7145,0,0,30,664


In [6]:
print(train_data.columns)

Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',
       '用户网龄（月）', '用户最近一次缴费距今时长（月）', '缴费用户最近一次缴费金额（元）', '用户近6个月平均消费值（元）',
       '用户账单当月总费用（元）', '用户当月账户余额（元）', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',
       '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',
       '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',
       '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',
       '当月旅游资讯类应用使用次数', '信用分'],
      dtype='object')


### Feature Engineering

In [7]:
#top up amount, 充值金额是整数，和小数，应该对应不同的充值途径？

def produce_offline_feat(train_data):
    train_data['不同充值途径']=0
    train_data['不同充值途径'][(train_data['缴费用户最近一次缴费金额（元）']%10==0)&train_data['缴费用户最近一次缴费金额（元）']!=0]=1

    return train_data

train_data = produce_offline_feat(train_data)
test_data = produce_offline_feat(test_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [8]:
def produce_fee_rate(train_data):
    train_data['当前费用稳定性']=train_data['用户账单当月总费用（元）']/(train_data['用户近6个月平均消费值（元）']+1)
    
    ##当月话费/当月账户余额
    train_data['用户余额比例']=train_data['用户账单当月总费用（元）']/(train_data['用户当月账户余额（元）']+1)
    return train_data

train_data = produce_fee_rate(train_data)
test_data = produce_fee_rate(test_data)

In [9]:
#获取特征
def get_features(data):
    data.loc[data['用户年龄']==0,'用户年龄']=data['用户年龄'].mode()
    data['缴费金额是否能覆盖当月账单']=data['缴费用户最近一次缴费金额（元）']-data['用户账单当月总费用（元）']
    data['最近一次缴费是否超过平均消费额']=data['缴费用户最近一次缴费金额（元）']-data['用户近6个月平均消费值（元）']
    data['当月账单是否超过平均消费额']=data['用户账单当月总费用（元）']-data['用户近6个月平均消费值（元）']
    
    #映射年龄
    def map_age(x):
        if x<=18:
            return 1
        elif x<=30:
            return 2
        elif x<=35:
            return 3
        elif x<=45:
            return 4
        else:
            return 5
    data['是否大学生_黑名单']=data['是否大学生客户']+data['是否黑名单客户']
    data['是否去过高档商场']=data['当月是否到过福州山姆会员店']+data['当月是否逛过福州仓山万达']
    data['是否去过高档商场']=data['是否去过高档商场'].map(lambda x:1 if x>=1 else 0)
    data['是否_商场_电影']=data['是否去过高档商场']*data['当月是否看电影']
    data['是否_商场_体育馆']=data['是否去过高档商场']*data['当月是否体育场馆消费']
    data['是否_商场_旅游']=data['是否去过高档商场']*data['当月是否景点游览']
    data['是否_电影_体育馆']=data['当月是否看电影']*data['当月是否体育场馆消费']
    data['是否_电影_旅游']=data['当月是否看电影']*data['当月是否景点游览']
    data['是否_旅游_体育馆']=data['当月是否景点游览']*data['当月是否体育场馆消费']
    
    data['是否_商场_旅游_体育馆']=data['是否去过高档商场']*data['当月是否景点游览']*data['当月是否体育场馆消费']
    data['是否_商场_电影_体育馆']=data['是否去过高档商场']*data['当月是否看电影']*data['当月是否体育场馆消费']
    data['是否_商场_电影_旅游']=data['是否去过高档商场']*data['当月是否看电影']*data['当月是否景点游览']
    data['是否_体育馆_电影_旅游']=data['当月是否体育场馆消费']*data['当月是否看电影']*data['当月是否景点游览']
    
    data['是否_商场_体育馆_电影_旅游']=data['是否去过高档商场']*data['当月是否体育场馆消费']*data['当月是否看电影']*data['当月是否景点游览']
    
    discretize_features=['交通类应用使用次数','当月物流快递类应用使用次数','当月飞机类应用使用次数','当月火车类应用使用次数','当月旅游资讯类应用使用次数']
    data['交通类应用使用次数']=data['当月飞机类应用使用次数']+data['当月火车类应用使用次数']
    
    data['6个月平均占比总费用']=data['用户近6个月平均消费值（元）']/data['用户账单当月总费用（元）']+1
    
    data['用户账单当月总费用（零钱）'] = data['用户账单当月总费用（元）'].apply(lambda x:1 if(x-int(x))>0 else 0)
    data['用户账单当月总费用（整数）'] = data['用户账单当月总费用（元）'].apply(lambda x:int(x))
    
    del data['用户账单当月总费用（元）']
    
    def map_discretize(x):
        if x==0:
            return 0
        elif x<=5:
            return 1
        elif x<=15:
            return 2
        elif x<=50:
            return 3
        elif x<=100:
            return 4
        else:
            return 5
        
    for col in discretize_features[:]:
        data[col]=data[col].map(lambda x:map_discretize(x))
    
    return data

train_data=get_features(train_data)
test_data=get_features(test_data)

### Training

In [11]:
#para
params = {'bagging_fraction': 0.8, 'bagging_freq': 2, 'boosting_type': 'gbdt', 'feature_fraction': 0.6, 'lambda_l2': 2.087923442506413, 'learning_rate': 0.010693808471895612, 'max_depth': 6, 'metric': 'mae', 'min_data_in_leaf': 26, 'nthread': 4, 'num_leaves': 40, 'objective': 'regression_l1', 'seed': 2019, 'verbose': -1}


In [12]:
#para
params2 = {'bagging_fraction': 0.8, 'bagging_freq': 2, 'boosting_type': 'gbdt', 'feature_fraction': 0.6, 'lambda_l2': 4.891146817725019, 'learning_rate': 0.022239925089113915, 'max_depth': 5, 'metric': 'mae', 'min_data_in_leaf': 20, 'nthread': 4, 'num_leaves': 61, 'objective': 'regression_l2', 'seed': 2019, 'verbose': -1}


In [14]:
cv_pred_all = 0
en_amount = 3
for seed in range(en_amount):
    NFOLDS = 5
    train_label = train_data['信用分']
    kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed)
    kf = kfold.split(train_data, train_label)

    train_data_use = train_data.drop(['用户编码','信用分','是否黑名单客户'], axis=1)
    test_data_use = test_data.drop(['用户编码','是否黑名单客户'], axis=1)


    cv_pred = np.zeros(test_data.shape[0])
    valid_best_l1_all = 0

    feature_importance_df = pd.DataFrame()
    count = 0
    for i, (train_fold, validate) in enumerate(kf):
        print('fold: ',i, ' training')
        X_train, X_validate, label_train, label_validate = \
        train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \
        train_label[train_fold], train_label[validate]
        dtrain = lgb.Dataset(X_train, label_train)
        dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)
        bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)
        cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)
        valid_best_l1_all += bst.best_score['valid_0']['l1']

        count += 1

    cv_pred /= NFOLDS
    valid_best_l1_all /= NFOLDS
    
    cv_pred_all += cv_pred
cv_pred_all /= en_amount
print('cv score for valid is: ', 1/(1+valid_best_l1_all))



fold:  0  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1812]	valid_0's l1: 14.7542
fold:  1  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1293]	valid_0's l1: 14.8632
fold:  2  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1771]	valid_0's l1: 14.6869
fold:  3  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1966]	valid_0's l1: 14.6055
fold:  4  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[2505]	valid_0's l1: 14.4855
fold:  0  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1828]	valid_0's l1: 14.7499
fold:  1  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1898]	valid_0

In [15]:
cv_pred_all2 = 0
en_amount = 3
for seed in range(en_amount):
    NFOLDS = 5
    train_label = train_data['信用分']
    kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=(seed + 2019))
    kf = kfold.split(train_data, train_label)

    train_data_use = train_data.drop(['用户编码','信用分','是否黑名单客户'], axis=1)
    test_data_use = test_data.drop(['用户编码','是否黑名单客户'], axis=1)


    cv_pred = np.zeros(test_data.shape[0])
    valid_best_l2_all = 0

    feature_importance_df = pd.DataFrame()
    count = 0
    for i, (train_fold, validate) in enumerate(kf):
        print('fold: ',i, ' training')
        X_train, X_validate, label_train, label_validate = \
        train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \
        train_label[train_fold], train_label[validate]
        dtrain = lgb.Dataset(X_train, label_train)
        dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)
        bst = lgb.train(params2, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)
        cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)
        valid_best_l2_all += bst.best_score['valid_0']['l1']

        count += 1

    cv_pred /= NFOLDS
    valid_best_l2_all /= NFOLDS
    
    cv_pred_all2 += cv_pred
    
cv_pred_all2 /= en_amount
print('cv score for valid is: ', 1/(1+valid_best_l2_all))



fold:  0  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1195]	valid_0's l1: 14.7742
fold:  1  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1055]	valid_0's l1: 14.662
fold:  2  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1332]	valid_0's l1: 14.756
fold:  3  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[776]	valid_0's l1: 14.5359
fold:  4  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[861]	valid_0's l1: 14.7618
fold:  0  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1255]	valid_0's l1: 14.7435
fold:  1  training
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[941]	valid_0's l1

In [16]:
print('cv score for valid is: ', 1/(1+valid_best_l1_all))
print('cv score for valid is: ', 1/(1+valid_best_l2_all))

cv score for valid is:  0.0638053928486
cv score for valid is:  0.0637120314866


### Submit

In [17]:
test_data_sub = test_data[['用户编码']]
test_data_sub['score'] = (cv_pred_all2 + cv_pred_all)/2
test_data_sub.columns = ['id','score']
test_data_sub['score1'] = cv_pred_all
test_data_sub['score2'] = cv_pred_all2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [18]:
test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
test_data_sub[['id','score']].to_csv('../summit/summit.csv',index=False)