In [51]:
import gc
import glob
import os
import json
import matplotlib.pyplot as plt
import pprint

import numpy as np
import pandas as pd

from joblib import Parallel, delayed
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
import scipy as sp

from sklearn.model_selection import train_test_split, GridSearchCV

from functools import partial
from collections import Counter

import random

# 评价函数

In [2]:
#%% 评价函数 Metric used for this competition 
# (Quadratic Weigthed Kappa aka Quadratic Cohen Kappa Score)
def metric(y1,y2):
    return cohen_kappa_score(y1, y2, weights = 'quadratic')


# Make scorer for scikit-learn
scorer = make_scorer(metric)

# Cross验证函数

In [297]:
from sklearn.model_selection import StratifiedKFold

#
def split_score(model, x, y, n=10):
    y_pre = np.zeros(y.shape[0])
    kfold = StratifiedKFold(n_splits=n, random_state=1337)
    for train_index, test_index in kfold.split(x,y):
        model.fit(x.iloc[train_index], y.iloc[train_index])
        y_pre[test_index] = model.predict(x.iloc[test_index])
    
#    score = metric(y_pre, y)
    print("{}折后的Kappa加权得分为:带补充".format(n))
    
    return y_pre

#
def fix_y(y, coef):
    y_fix = np.copy(y)
    for i, pred in enumerate(y_fix):
        if pred < coef[0]:
            y_fix[i] = 0
        elif pred >= coef[0] and pred < coef[1]:
            y_fix[i] = 1
        elif pred >= coef[1] and pred < coef[2]:
            y_fix[i] = 2
        elif pred >= coef[2] and pred < coef[3]:
            y_fix[i] = 3
        else:
            y_fix[i] = 4    
    return y_fix

# 
def _kappa_loss(y, y_true, coef):
    y_fix = np.copy(y)
    for i, pred in enumerate(y_fix):
        if pred < coef[0]:
            y_fix[i] = 0
        elif pred >= coef[0] and pred < coef[1]:
            y_fix[i] = 1
        elif pred >= coef[1] and pred < coef[2]:
            y_fix[i] = 2
        elif pred >= coef[2] and pred < coef[3]:
            y_fix[i] = 3
        else:
            y_fix[i] = 4
            
    loss = metric(y_fix, y_true)
    return -loss

# 寻找分类的最佳参数
def search_coef(x1, x2):
    loss_partial = partial(_kappa_loss, x1, x2)
    initial_coef = [1.55, 2.05, 2.5, 3]
    coef = sp.optimize.basinhopping(loss_partial, initial_coef, niter=500, T=1,
                                              stepsize=0.2, minimizer_kwargs={"method": 'nelder-mead'}, 
                                              take_step=None, accept_test=None, callback=None, 
                                              interval=100, disp=True, niter_success=20, seed=None)
    return coef

# 读取数据、划分验证集

In [178]:
df_train  = pd.read_csv('train.csv')

#读取唯一的RescuerID
RescuerID = set(df_train['RescuerID'].unique())

#随机生成RescuerID
j_test = random.sample(RescuerID, int(len(RescuerID)*0.15))
j_train = RescuerID - set(j_test)

df_test = df_train[df_train['RescuerID'].isin(j_test)]
df_train = df_train[df_train['RescuerID'].isin(j_train)]

train = df_train.copy()
test  = df_test.copy()

In [179]:
labels_breed = pd.read_csv('breed_labels.csv')
labels_state = pd.read_csv('color_labels.csv')
labels_color = pd.read_csv('state_labels.csv')

In [299]:
model_lgb.feature_importances_
a = pd.DataFrame(model_lgb.feature_importances_, index=x_train.columns)
a.sort_values(by=0, ascending=False)

Unnamed: 0,0
Breed1,111
Age,97
RescuerID_count,96
annots_score_MEAN,69
State,63
main_breed_BreedName,56
annots_score_SUM,49
SVD_Description_9,43
Quantity,42
SVD_Description_8,41


In [180]:
#%% 删除异常值
cul_drop = ['375905770', 'da8d4a273', '27e74e45c', '7b5bee232', '0327b8e94']
df_train = df_train[~df_train['PetID'].isin(cul_drop)]

# 提取 sentiment 的特征

In [181]:
def extract_sentiment_feature(i, x):    
    feature_sentiment = pd.DataFrame(columns=['PetID', 'token', 'sentence_magnitude', 'sentence_score','document_magnitude', 'document_score'])

    if x == 'train':
        set_file = 'train'
    else:
        set_file = 'train' 
        
    file_name = '{}_sentiment/{}.json'.format(set_file,i)
    try:
        f = open(file_name, 'r')
        sentiment_file = json.load(f)
            
        token = [x['name'] for x in sentiment_file['entities']]
        token = ' '.join(token)
            
        sentences_sentiment = [x['sentiment'] for x in sentiment_file['sentences']]
        sentences_sentiment = pd.DataFrame.from_dict(
            sentences_sentiment, orient='columns').sum()
        sentenceSentiment_magnitude = sentences_sentiment['magnitude']
        sentenceSentiment_score     = sentences_sentiment['score']
            
        docementSentiment_magnitude = sentiment_file['documentSentiment']['magnitude']
        documentSentiment_score     = sentiment_file['documentSentiment']['score']
            
        new = pd.DataFrame(
                {'PetID'               :[i], 
                 'token'               : [token],
                 'sentence_magnitude'  : [sentenceSentiment_magnitude],
                 'sentence_score'      : [sentenceSentiment_score],
                 'document_magnitude'  : [docementSentiment_magnitude], 
                 'document_score'      : [documentSentiment_score]})  
        feature_sentiment = feature_sentiment.append(new)
    except:
        print('{}没找到'.format(file_name))
    
    for each in feature_sentiment.columns:
        if each not in ['PetID','token']:
            feature_sentiment[each] = feature_sentiment[each].astype(float)

    return feature_sentiment

#%%
train_feature_sentiment = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_sentiment_feature)(i, 'train') for i in train.PetID)
train_feature_sentiment = [x for x in train_feature_sentiment]
train_feature_sentiment = pd.concat(train_feature_sentiment, ignore_index=True, sort=False)

test_feature_sentiment = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_sentiment_feature)(i, 'test') for i in test.PetID)
test_feature_sentiment = [x for x in test_feature_sentiment]
test_feature_sentiment = pd.concat(test_feature_sentiment, ignore_index=True, sort=False)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  37 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 9415 tasks      | elapsed:    9.9s
[Parallel(n_jobs=8)]: Done 13012 out of 13012 | elapsed:   13.2s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done 488 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 1981 out of 1981 | elapsed:    1.9s finished


# 提取 metadata 的特征

In [182]:
#%% 提取 metadata 的特征
#file_name = 'train_metadata/000a290e4-1.json'
#f = open(file_name, 'r')
#metadatafile = json.load(f)
def extract_metadata_feature(i, x):
    feature_metadata = pd.DataFrame()
    if x == 'train':
        set_file = 'train'
    else:
        set_file = 'train'
        
    metadata_filenames = sorted(glob.glob('{}_metadata/{}*.json'.format(set_file, i)))
    if len(metadata_filenames) > 0:
        feature_metadata_sub = pd.DataFrame(columns=['PetID', 'annots_score', 'color_score', 'color_pixelfrac', 'crop_conf','crop_importance', 'annots_top_desc'])
        for ff in metadata_filenames:
            f = open(ff, 'rb')
            file = json.load(f)
            #label
            if 'labelAnnotations' in file:
                file_annots = file['labelAnnotations'][:int(len(file['labelAnnotations']) * 0.3)]
                file_top_score = np.asarray([x['score'] for x in file_annots]).mean()
                file_top_desc = [x['description'] for x in file_annots]            
            else:
                file_top_score = np.nan
                file_top_desc = ['']
            #colors
            file_colors = file['imagePropertiesAnnotation']['dominantColors']['colors']            
            file_color_score = np.asarray([x['score'] for x in file_colors]).mean()
            file_color_pixelfrac = np.asarray([x['pixelFraction'] for x in file_colors]).mean()            
            #crops
            file_crops = file['cropHintsAnnotation']['cropHints']                
            file_crop_conf = np.asarray([x['confidence'] for x in file_crops]).mean()
            if 'importanceFraction' in file_crops[0].keys():
                file_crop_importance = np.asarray([x['importanceFraction'] for x in file_crops]).mean()
            else:
                file_crop_importance = np.nan
                
            new = pd.DataFrame(
                    {
                            'PetID'          : [i],
                            'annots_score'   : [file_top_score],
                            'color_score'     : [file_color_score],
                            'color_pixelfrac' : [file_color_pixelfrac],
                            'crop_conf'       : [file_crop_conf],
                            'crop_importance' : [file_crop_importance],
                            'annots_top_desc' : [' '.join(file_top_desc)]})
            feature_metadata_sub = feature_metadata_sub.append(new)
                
        metadata_desc = feature_metadata_sub.groupby(['PetID'])['annots_top_desc'].unique()
        metadata_desc = metadata_desc.reset_index()
        metadata_desc['annots_top_desc'] = metadata_desc['annots_top_desc'].apply(lambda x:' '.join(x))
        feature_metadata_sub.drop(['annots_top_desc'], axis=1, inplace=True)

        for each in feature_metadata_sub:
            if each not in ['PetID']:
                feature_metadata_sub[each] = feature_metadata_sub[each].astype(float)
        
        
        feature_metadata_sub = feature_metadata_sub.groupby(['PetID']).agg(['mean', 'sum'])
        feature_metadata_sub.columns = ['{}_{}'.format(c[0], c[1].upper()) for c in feature_metadata_sub.columns.tolist()]  
        feature_metadata_sub = feature_metadata_sub.reset_index()
            
        feature_metadata = feature_metadata.append(feature_metadata_sub)
    return feature_metadata

#
#for each in 
train_feature_metadata = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_metadata_feature)(i, 'train') for i in train.PetID)
train_feature_metadata = [x for x in train_feature_metadata]
train_feature_metadata = pd.concat(train_feature_metadata, ignore_index=True, sort=False)

test_feature_metadata = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_metadata_feature)(i, 'test') for i in test.PetID)
test_feature_metadata = [x for x in test_feature_metadata]
test_feature_metadata = pd.concat(test_feature_metadata, ignore_index=True, sort=False)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 352 tasks      | elapsed:    7.1s
[Parallel(n_jobs=8)]: Done 852 tasks      | elapsed:   17.1s
[Parallel(n_jobs=8)]: Done 1552 tasks      | elapsed:   30.9s
[Parallel(n_jobs=8)]: Done 2452 tasks      | elapsed:   48.7s
[Parallel(n_jobs=8)]: Done 3552 tasks      | elapsed:  1.2min
[Parallel(n_jobs=8)]: Done 4852 tasks      | elapsed:  1.6min
[Parallel(n_jobs=8)]: Done 6352 tasks      | elapsed:  2.1min
[Parallel(n_jobs=8)]: Done 8052 tasks      | elapsed:  2.7min
[Parallel(n_jobs=8)]: Done 9952 tasks      | elapsed:  3.3min
[Parallel(n_jobs=8)]: Done 12052 tasks      | elapsed:  4.0min
[Parallel(n_jobs=8)]: Done 13012 out of 13012 | elapsed:  4.3min finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 352 task

In [300]:
#%% 连接sentiment和metadata和原始数据
x_train = df_train.merge(train_feature_sentiment, how='left', on='PetID')
x_train = x_train.merge(train_feature_metadata, how='left', on='PetID')

y_train = x_train['AdoptionSpeed']
x_train.drop(['AdoptionSpeed'], axis=1, inplace=True)

x_test = df_test.merge(test_feature_sentiment, how='left', on='PetID')
x_test = x_test.merge(test_feature_metadata, how='left', on='PetID')

y_test = x_test['AdoptionSpeed']
x_test.drop(['AdoptionSpeed'], axis=1, inplace=True)

(13009, 38)

# NLP

In [301]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import SparsePCA, TruncatedSVD, LatentDirichletAllocation, NMF

col_text = ['Description', 'token']

x = x_train.append(x_test).reset_index()
x = x[['Description', 'token', 'PetID']]

n_components = 10

x[col_text] = x[col_text].fillna('MISSING')
text_features = []


for i in  ['Description', 'token']:
    svd_ = TruncatedSVD(n_components=n_components)
    nmf_ = NMF(n_components=n_components)
    
    tfidf_col = TfidfVectorizer(min_df=3, max_df=0.9).fit_transform(x.loc[:, i])

    
    svd_col = svd_.fit_transform(tfidf_col)
    svd_col = pd.DataFrame(svd_col)
    svd_col = svd_col.add_prefix('SVD_{}_'.format(i))
    
    nmf_col = nmf_.fit_transform(tfidf_col)
    nmf_col = pd.DataFrame(nmf_col)
    nmf_col = nmf_col.add_prefix('NMF_{}_'.format(i))

    text_features.append(svd_col)
    text_features.append(nmf_col)
    
    x.drop(i, axis=1, inplace=True)
    
# Combine all extracted features:
text_features = pd.concat(text_features, axis=1)

# Concatenate with main DF:
x = pd.concat([x, text_features], axis=1)

x_train = x_train.merge(x, how='left', on='PetID')
x_test  = x_test.merge(x, how='left', on='PetID')


# 增加新的特征
1、是否需要收费
2、年份
3、Color的笛卡尔积（效果不要）

In [302]:
x_train['IsFree'] = x_train['Fee'].apply(lambda x:1 if x>0 else 0)
x_test['IsFree']  = x_test['Fee'].apply(lambda x:1 if x>0 else 0)

x_train['Year'] = x_train['Age'].apply(lambda x:round(x/12))
x_test['Year']  = x_test['Age'].apply(lambda x:round(x/12))

x = x_train.append(x_test)
x['Age_qcut'] = pd.qcut(x['Age'], 5,  duplicates='drop')
x['Age_qcut'] = pd.factorize(x['Age_qcut'])[0]
x_train = x_train.merge(x[['PetID','Age_qcut']], how='left', on='PetID')
x_test  = x_test.merge(x[['PetID','Age_qcut']], how='left', on='PetID')

#效果不好
#x_train['Color_Mix'] = x_train['Color1'].astype(str)+x_train['Color2'].astype(str)+x_train['Color3'].astype(str)
#x_train['Color_Mix'] = pd.factorize(x_train['Color_Mix'])[0]
#x_test['Color_Mix'] = x_test['Color1'].astype(str)+x_test['Color2'].astype(str)+x_test['Color3'].astype(str)
#x_test['Color_Mix'] = pd.factorize(x_test['Color_Mix'])[0]

# RescuerID 处理

In [303]:
#%% RescuerID 处理

df = df_train.append(df_test)
data_rescuer = df.groupby(['RescuerID'])['PetID'].count().reset_index()
data_rescuer.columns = ['RescuerID', 'RescuerID_count']
#data_rescuer['rank_Rescuer_count'] = data_rescuer['RescuerID_count'].rank(pct=True)

x_train = x_train.merge(data_rescuer, how='left', on='RescuerID')
x_test  = x_test.merge(data_rescuer, how='left', on='RescuerID')

#x_train.drop(['RescuerID_count'], axis=1, inplace=True)
#x_test.drop(['RescuerID_count'], axis=1, inplace=True)

x_train['single'] = x_train['RescuerID_count'].apply(lambda x:1 if x<3 else 0)
x_train['middle'] = x_train['RescuerID_count'].apply(lambda x:1 if (x>2 and x<6) else 0)
x_train['Charities'] = x_train['RescuerID_count'].apply(lambda x:1 if x>5 else 0)

x_test['single'] = x_test['RescuerID_count'].apply(lambda x:1 if x<3 else 0)
x_test['middle'] = x_test['RescuerID_count'].apply(lambda x:1 if (x>2 and x<6) else 0)
x_test['Charities'] = x_test['RescuerID_count'].apply(lambda x:1 if x>5 else 0)

In [304]:
x_test['RescuerID_count'].unique()

array([ 52,   1,   4,   2,  11, 156,  17,  42,  41,   5,  25,  10,   3,
        48,   6,   9,  12,  28,   7,  33,  26,  44,  22,   8])

# 处理Breed

In [305]:
x_train['HasSecondBreed'] = x_train['Breed2'].map(lambda x:1 if x != 0 else 0)
x_test['HasSecondBreed'] = x_test['Breed2'].map(lambda x:1 if x != 0 else 0)

train_breed_main = x_train[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))

train_breed_main = train_breed_main.iloc[:, 2:]
train_breed_main = train_breed_main.add_prefix('main_breed_')

train_breed_second = x_train[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))


train_breed_second = train_breed_second.iloc[:, 2:]
train_breed_second = train_breed_second.add_prefix('second_breed_')

x_train = pd.concat(
    [x_train, train_breed_main, train_breed_second], axis=1)

##############
test_breed_main = x_test[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))

test_breed_main = test_breed_main.iloc[:, 2:]
test_breed_main = test_breed_main.add_prefix('main_breed_')

test_breed_second = x_test[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))

test_breed_second = test_breed_second.iloc[:, 2:]
test_breed_second = test_breed_second.add_prefix('second_breed_')

x_test = pd.concat(
    [x_test, test_breed_main, test_breed_second], axis=1)

print(x_train.shape, x_test.shape)

categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']
for i in categorical_columns:
    x_train.loc[:, i] = pd.factorize(x_train.loc[:, i])[0]
    x_test.loc[:,i]   = pd.factorize(x_test.loc[:, i])[0]


(13009, 80) (1981, 80)


# 等某些特征进行rank

In [306]:
# 对某些特征进行rank
cols_rank = ['sentence_magnitude', 'sentence_score', 'document_magnitude','document_score']#,
#       'annots_score_MEAN', 'annots_score_SUM','color_score_MEAN', 'color_score_SUM', 'color_pixelfrac_MEAN',
#       'color_pixelfrac_SUM', 'crop_conf_MEAN', 'crop_conf_SUM','crop_importance_MEAN', 'crop_importance_SUM']

x = x_train.append(x_test)
x[cols_rank] = x[cols_rank].fillna(0)
df_cols_rank = x[cols_rank].rank(pct=True).rename(columns=lambda s:'rank.'+s)
df_cols_rank = pd.concat([df_cols_rank, x['PetID']], axis=1)

x_train = x_train.merge(df_cols_rank, how='left', on='PetID')
x_test =  x_test.merge(df_cols_rank, how='left', on='PetID')

In [307]:
x_train.isnull().sum()

Type                          0
Name                       1009
Age                           0
Breed1                        0
Breed2                        0
Gender                        0
Color1                        0
Color2                        0
Color3                        0
MaturitySize                  0
FurLength                     0
Vaccinated                    0
Dewormed                      0
Sterilized                    0
Health                        0
Quantity                      0
Fee                           0
State                         0
RescuerID                     0
VideoAmt                      0
Description                  12
PetID                         0
PhotoAmt                      0
token                       418
sentence_magnitude          418
sentence_score              418
document_magnitude          418
document_score              418
SVD_Description_0             0
SVD_Description_1             0
                           ... 
SVD_toke

In [223]:
x_train[['second_breed_Type','HasSecondBreed']]

Unnamed: 0,second_breed_Type,HasSecondBreed
0,,0
1,,0
2,,0
3,,0
4,,0
5,,0
6,2.0,1
7,,0
8,,0
9,,0


# 数据清理

In [308]:
drop_columns = ['Name', 'RescuerID', 'Description', 'PetID', 'token']


x_train.drop(drop_columns, axis=1, inplace=True)
x_test.drop(drop_columns, axis=1, inplace=True)


x_train = x_train.fillna(0)
x_test  = x_test.fillna(0)

# LGB 算法

In [309]:
from lightgbm.sklearn import LGBMRegressor


model_lgb = LGBMRegressor(
        learning_rate    = 0.05,
        n_estimators     = 200,
        max_depth        = 4,
        num_leaves       = 10,
        subsample        = 0.7,      #训练时采样一定比例的数据	
        colsample_bytree = 0.7,
        n_jobs           = -1,
        random_state     = 4,
        objective        = 'regression',
        eval_metric      = 'scorer',
        min_child_samples = 3         #叶子节点具有的最小记录数	

        )
        
model_lgb.fit(x_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
       eval_metric='scorer', importance_type='split', learning_rate=0.05,
       max_depth=4, min_child_samples=3, min_child_weight=0.001,
       min_split_gain=0.0, n_estimators=200, n_jobs=-1, num_leaves=10,
       objective='regression', random_state=4, reg_alpha=0.0,
       reg_lambda=0.0, silent=True, subsample=0.7,
       subsample_for_bin=200000, subsample_freq=0)

In [310]:
y_lgb = split_score(model_lgb, x_train, y_train)

coe = search_coef(y_lgb, y_train)
best_coe = coe['x']
print('lgb的最佳系数为{}'.format(best_coe))


result_lgb = model_lgb.predict(x_test)
result_lgb_fix = fix_y(result_lgb, best_coe)
print('lgb后的分布:',Counter(result_lgb_fix))
print('融合后的二次加权Kappa系数为', metric(result_lgb_fix, y_test))
print('y_test的真实分布为',Counter(y_test))

10折后的Kappa加权得分为:带补充
basinhopping step 0: f -0.429915
basinhopping step 1: f -0.430518 trial_f -0.430518 accepted 1  lowest_f -0.430518
found new global minimum on step 1 with function value -0.430518
basinhopping step 2: f -0.430774 trial_f -0.430774 accepted 1  lowest_f -0.430774
found new global minimum on step 2 with function value -0.430774
basinhopping step 3: f -0.429031 trial_f -0.429031 accepted 1  lowest_f -0.430774
basinhopping step 4: f -0.430137 trial_f -0.430137 accepted 1  lowest_f -0.430774
basinhopping step 5: f -0.430345 trial_f -0.430345 accepted 1  lowest_f -0.430774
basinhopping step 6: f -0.431175 trial_f -0.431175 accepted 1  lowest_f -0.431175
found new global minimum on step 6 with function value -0.431175
basinhopping step 7: f -0.430258 trial_f -0.430258 accepted 1  lowest_f -0.431175
basinhopping step 8: f -0.430187 trial_f -0.430187 accepted 1  lowest_f -0.431175
basinhopping step 9: f -0.430645 trial_f -0.430645 accepted 1  lowest_f -0.431175
basinhopping s

In [293]:
#调参
parameters = {
            'reg_alpha': [0,0.01,0.1,1],
            'reg_lambda': [0,0.01,0.1,1]
            }

gsearch = GridSearchCV(model_lgb, param_grid=parameters, scoring=scorer, cv=10, n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

# 随机森林模型

In [289]:
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(
    n_estimators      = 250,
    
    max_features      = 0.7,#选择最适属性时划分的特征不能超过此值。
#    max_depth         = 6, #设置树的最大深度，默认为None
    
    min_samples_leaf  = 3, #叶子节点最少的样本数
    min_samples_split = 2,#根据属性划分节点时，每个划分最少的样本数
    
    criterion = 'mse',
    
    n_jobs            = -1
    )

model_rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.7, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [296]:
y_rf = split_score(model_rf, x_train, y_train)

coe = search_coef(y_rf, y_train)
best_coe = coe['x']
print('lgb的最佳系数为{}'.format(best_coe))
    
result_rf = model_rf.predict(x_test)
result_rf_fix = fix_y(result_rf, best_coe)
print('随机森林融合后的分布:',Counter(result_rf_fix))
print('融合后的二次加权Kappa系数为', metric(result_rf_fix, y_test))
print('y_test的真实分布为',Counter(y_test))

10折后的Kappa加权得分为:带补充
lgb的最佳系数为[1.6662308  2.12006523 2.52696015 2.80978917]
随机森林融合后的分布: Counter({2.0: 662, 4.0: 511, 1.0: 435, 3.0: 360, 0.0: 13})
融合后的二次加权Kappa系数为 0.3629689307927427
y_test的真实分布为 Counter({4: 619, 2: 499, 1: 418, 3: 386, 0: 59})


# XGB

In [282]:
from xgboost.sklearn import XGBRegressor

model_xgb = XGBRegressor(
    learning_rate    = 0.05,
    n_estimatores    = 200, 
    
    early_stopping_rounds=20,  
    
    max_depth        = 4, 
    min_child_weight = 5,
    
    gamma            = 0,
    
    subsample        =  0.8,
    colsample_bytree = 0.6,
    
    reg_alpha        = 1,
    reg_lambda       = 0.1,
    nthread      = -1)
#    objective        = 'regression',    
#    eval_metric      = 'scorer')      

model_xgb.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, early_stopping_rounds=20, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=4,
       min_child_weight=5, missing=None, n_estimatores=200,
       n_estimators=100, n_jobs=1, nthread=-1, objective='reg:linear',
       random_state=0, reg_alpha=1, reg_lambda=0.1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.8)

In [295]:
y_xgb = split_score(model_xgb, x_train, y_train)

coe = search_coef(y_xgb, y_train)
best_coe = coe['x']
print('xgb的最佳系数为{}'.format(best_coe))

result_xgb = model_xgb.predict(x_test)
result_xgb_fix = fix_y(result_xgb, best_coe)
print('xgb后的分布:',Counter(result_xgb_fix))
print('xgb后的二次加权Kappa系数为', metric(result_xgb_fix, y_test))
print('y_test的真实分布为',Counter(y_test))

10折后的Kappa加权得分为:带补充
xgb的最佳系数为[1.71307733 2.19394624 2.42778017 2.83029307]
xgb后的分布: Counter({1.0: 547, 3.0: 531, 4.0: 477, 2.0: 414, 0.0: 12})
xgb后的二次加权Kappa系数为 0.42120763258365623
y_test的真实分布为 Counter({4: 619, 2: 499, 1: 418, 3: 386, 0: 59})


In [358]:
#调参
parameters = {'max_depth': range(6,14,2), 'min_child_weight': range(3,13,2)}
gsearch = GridSearchCV(model_xgb, param_grid=parameters, scoring=scorer, cv=10, n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

{'max_depth': 12, 'min_child_weight': 11}

# 后处理

In [138]:
y_lgb = model_lgb.predict(x_train)

y_xgb = model_xgb.predict(x_train)
y_rf  = model_rf.predict(x_train)


In [154]:
y = (result_lgb + result_xgb + result_rf)/3
result = fix_y(y, best_coe)

print('融合后的分布:',Counter(result))
print('融合后的二次加权Kappa系数为', metric(result, y_test))
print('y_test的真实分布为',Counter(y_test))


融合后的分布: Counter({2.0: 1140, 3.0: 840, 1.0: 194, 4.0: 143})
融合后的二次加权Kappa系数为 0.30060168046082947
y_test的真实分布为 Counter({2: 655, 4: 552, 3: 531, 1: 521, 0: 58})


In [288]:
result

array([2., 3., 2., ..., 3., 1., 4.])