In [1]:
import gc
import glob
import os
import json
import matplotlib.pyplot as plt
import pprint

import numpy as np
import pandas as pd

from joblib import Parallel, delayed
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
import scipy as sp

from sklearn.model_selection import train_test_split, GridSearchCV

from functools import partial
from collections import Counter

import random
import math

# 评价函数

In [2]:
#%% 评价函数 Metric used for this competition 
# (Quadratic Weigthed Kappa aka Quadratic Cohen Kappa Score)
def metric(y1,y2):
    return cohen_kappa_score(y1, y2, weights = 'quadratic')


# Make scorer for scikit-learn
scorer = make_scorer(metric)

# Cross验证函数

In [3]:
from sklearn.model_selection import StratifiedKFold

#
def split_score(model, x, y, n=10):
    y_pre = np.zeros(y.shape[0])
    kfold = StratifiedKFold(n_splits=n, random_state=4)
    for train_index, test_index in kfold.split(x,y):
        model.fit(x.iloc[train_index], y.iloc[train_index])
        y_pre[test_index] = model.predict(x.iloc[test_index])
    
#    score = metric(y_pre, y)
    print("{}折后的Kappa加权得分为:带补充".format(n))
    
    return y_pre

#
def fix_y(y, coef):
    y_fix = np.copy(y)
    for i, pred in enumerate(y_fix):
        if pred < coef[0]:
            y_fix[i] = 0
        elif pred >= coef[0] and pred < coef[1]:
            y_fix[i] = 1
        elif pred >= coef[1] and pred < coef[2]:
            y_fix[i] = 2
        elif pred >= coef[2] and pred < coef[3]:
            y_fix[i] = 3
        else:
            y_fix[i] = 4    
    return y_fix

# 
def _kappa_loss(y, y_true, coef):
    y_fix = np.copy(y)
    for i, pred in enumerate(y_fix):
        if pred < coef[0]:
            y_fix[i] = 0
        elif pred >= coef[0] and pred < coef[1]:
            y_fix[i] = 1
        elif pred >= coef[1] and pred < coef[2]:
            y_fix[i] = 2
        elif pred >= coef[2] and pred < coef[3]:
            y_fix[i] = 3
        else:
            y_fix[i] = 4
            
    loss = metric(y_fix, y_true)
    return -loss

# 寻找分类的最佳参数
def search_coef(x1, x2):
    loss_partial = partial(_kappa_loss, x1, x2)
    initial_coef = [1.55, 2.05, 2.5, 3]
    coef = sp.optimize.minimize(loss_partial, initial_coef, method= 'nelder-mead')

    return coef

# 读取数据、划分验证集

In [4]:
df_train  = pd.read_csv('train.csv')
x = df_train.copy()

#读取唯一的RescuerID
RescuerID = set(df_train['RescuerID'].unique())

#随机生成RescuerID
j_test = random.sample(RescuerID, int(len(RescuerID)*0.17))
j_train = RescuerID - set(j_test)

df_test = df_train[df_train['RescuerID'].isin(j_test)]
df_train = df_train[df_train['RescuerID'].isin(j_train)]

train = df_train.copy()
test  = df_test.copy()

In [5]:
labels_breed = pd.read_csv('breed_labels.csv')
labels_state = pd.read_csv('color_labels.csv')
labels_color = pd.read_csv('state_labels.csv')

In [6]:
#%% 删除异常值
cul_drop = ['375905770', 'da8d4a273', '27e74e45c', '7b5bee232', '0327b8e94']
df_train = df_train[~df_train['PetID'].isin(cul_drop)]

# 提取 sentiment 的特征

In [7]:
def extract_sentiment_feature(i, x):    
#    feature_sentiment = pd.DataFrame(columns=['PetID', 'token', 'sentence_magnitude', 'sentence_score','document_magnitude', 'document_score'])
    feature_sentiment = pd.DataFrame()

    if x == 'train':
        set_file = 'train'
    else:
        set_file = 'train' 
        
    file_name = '{}_sentiment/{}.json'.format(set_file,i)
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            sentiment_file = json.load(f)
            
            token = [x['name'] for x in sentiment_file['entities']]
            token = ' '.join(token)

            sentences_sentiment = [x['sentiment'] for x in sentiment_file['sentences']]
            sentences_sentiment = pd.DataFrame.from_dict(
                sentences_sentiment, orient='columns').sum()
            sentenceSentiment_magnitude = sentences_sentiment['magnitude']
            sentenceSentiment_score     = sentences_sentiment['score']

            docementSentiment_magnitude = sentiment_file['documentSentiment']['magnitude']
            documentSentiment_score     = sentiment_file['documentSentiment']['score']
            
            new = pd.DataFrame(
                    {'PetID'               :[i], 
    #                 'token'               : [token],
                     'sentence_magnitude'  : [sentenceSentiment_magnitude],
                     'sentence_score'      : [sentenceSentiment_score],
                     'document_magnitude'  : [docementSentiment_magnitude], 
                     'document_score'      : [documentSentiment_score]})  
            feature_sentiment = feature_sentiment.append(new)
    except:
        print('{}没找到'.format(file_name))
    
    for each in feature_sentiment.columns:
        if each not in ['PetID','token']:
            feature_sentiment[each] = feature_sentiment[each].astype(float)

    return feature_sentiment

#%%
train_feature_sentiment = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_sentiment_feature)(i, 'train') for i in train.PetID)
train_feature_sentiment = [x for x in train_feature_sentiment]
train_feature_sentiment = pd.concat(train_feature_sentiment, ignore_index=True, sort=False)

test_feature_sentiment = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_sentiment_feature)(i, 'test') for i in test.PetID)
test_feature_sentiment = [x for x in test_feature_sentiment]
test_feature_sentiment = pd.concat(test_feature_sentiment, ignore_index=True, sort=False)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 1859 tasks      | elapsed:    7.5s
[Parallel(n_jobs=8)]: Done 5359 tasks      | elapsed:   15.4s
[Parallel(n_jobs=8)]: Done 10259 tasks      | elapsed:   26.3s
[Parallel(n_jobs=8)]: Done 12480 out of 12480 | elapsed:   30.9s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done 304 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 2513 out of 2513 | elapsed:    5.1s finished


# 提取 metadata 的特征

In [372]:
#%% 提取 metadata 的特征
#file_name = 'train_metadata/000a290e4-1.json'
#f = open(file_name, 'r')
#metadatafile = json.load(f)
def extract_metadata_feature(i, x):
    feature_metadata = pd.DataFrame()
    if x == 'train':
        set_file = 'train'
    else:
        set_file = 'train'
        
    metadata_filenames = sorted(glob.glob('{}_metadata/{}*.json'.format(set_file, i)))
    if len(metadata_filenames) > 0:
        feature_metadata_sub = pd.DataFrame()
        for ff in metadata_filenames:
            f = open(ff, 'rb')
            file = json.load(f)
            #label
            if 'labelAnnotations' in file:
                file_annots = file['labelAnnotations'][:1]
                file_top_score = np.asarray([x['score'] for x in file_annots]).mean()
                file_top_desc = [x['description'] for x in file_annots]            
            else:
                file_top_score = np.nan
                file_top_desc = ['']
            #colors
            file_colors = file['imagePropertiesAnnotation']['dominantColors']['colors']            
            file_color_score = np.asarray([x['score'] for x in file_colors]).mean()
            file_color_pixelfrac = np.asarray([x['pixelFraction'] for x in file_colors]).mean()            
            #crops
            file_crops = file['cropHintsAnnotation']['cropHints']                
            file_crop_conf = np.asarray([x['confidence'] for x in file_crops]).mean()
            if 'importanceFraction' in file_crops[0].keys():
                file_crop_importance = np.asarray([x['importanceFraction'] for x in file_crops]).mean()
            else:
                file_crop_importance = np.nan
                
            new = pd.DataFrame(
                    {
                            'PetID'          : [i],
                            'annots_score'   : [file_top_score],
                            'color_score'     : [file_color_score],
                            'color_pixelfrac' : [file_color_pixelfrac]})
#                            'crop_conf'       : [file_crop_conf],
#                            'crop_importance' : [file_crop_importance],
#                            'annots_top_desc' : [' '.join(file_top_desc)]})
            feature_metadata_sub = feature_metadata_sub.append(new)
                
#        metadata_desc = feature_metadata_sub.groupby(['PetID'])['annots_top_desc'].unique()
#        metadata_desc = metadata_desc.reset_index()
#        metadata_desc['annots_top_desc'] = metadata_desc['annots_top_desc'].apply(lambda x:' '.join(x))
#        feature_metadata_sub.drop(['annots_top_desc'], axis=1, inplace=True)

        for each in feature_metadata_sub:
            if each not in ['PetID']:
                feature_metadata_sub[each] = feature_metadata_sub[each].astype(float)
        
        
        feature_metadata_sub = feature_metadata_sub.groupby(['PetID']).agg(['mean', 'sum'])
        feature_metadata_sub.columns = ['{}_{}'.format(c[0], c[1].upper()) for c in feature_metadata_sub.columns.tolist()]  
        feature_metadata_sub = feature_metadata_sub.reset_index()
                    
        feature_metadata = feature_metadata.append(feature_metadata_sub)
    return feature_metadata

#
#for each in 
train_feature_metadata = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_metadata_feature)(i, 'train') for i in train.PetID)
train_feature_metadata = [x for x in train_feature_metadata]
train_feature_metadata = pd.concat(train_feature_metadata, ignore_index=True, sort=False)

test_feature_metadata = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_metadata_feature)(i, 'test') for i in test.PetID)
test_feature_metadata = [x for x in test_feature_metadata]
test_feature_metadata = pd.concat(test_feature_metadata, ignore_index=True, sort=False)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  88 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done 688 tasks      | elapsed:   12.6s
[Parallel(n_jobs=8)]: Done 1688 tasks      | elapsed:   30.6s
[Parallel(n_jobs=8)]: Done 3088 tasks      | elapsed:   55.7s
[Parallel(n_jobs=8)]: Done 4888 tasks      | elapsed:  1.5min
[Parallel(n_jobs=8)]: Done 7088 tasks      | elapsed:  2.1min
[Parallel(n_jobs=8)]: Done 9688 tasks      | elapsed:  2.9min
[Parallel(n_jobs=8)]: Done 12688 tasks      | elapsed:  3.8min
[Parallel(n_jobs=8)]: Done 12773 out of 12773 | elapsed:  3.8min finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  88 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done 688 tasks      | elapsed:   12.6s
[Parallel(n_jobs=8)]: Done 1688 tasks      | elapsed:   30.7s
[Parallel(n_jobs=8)]: Done 2220 out of 2220 | elapsed:   40.1s finished


In [10]:
#%% 连接sentiment和metadata和原始数据
x_train = df_train.merge(train_feature_sentiment, how='left', on='PetID')
#x_train = x_train.merge(train_feature_metadata, how='left', on='PetID')

y_train = x_train['AdoptionSpeed']
x_train.drop(['AdoptionSpeed'], axis=1, inplace=True)

x_test = df_test.merge(test_feature_sentiment, how='left', on='PetID')
#x_test = x_test.merge(test_feature_metadata, how='left', on='PetID')

y_test = x_test['AdoptionSpeed']
x_test.drop(['AdoptionSpeed'],  axis=1, inplace=True)

# NLP

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import SparsePCA, TruncatedSVD, LatentDirichletAllocation, NMF

col_text = ['Description']

x = x_train.append(x_test).reset_index()
x = x[['Description', 'PetID']]

n_components = 50

x[col_text] = x[col_text].fillna('MISSING')
text_features = []


for i in  ['Description']:
    svd_ = TruncatedSVD(n_components=n_components)
        
    tfv = CountVectorizer(min_df=3,  
                          max_df=0.9,
                          stop_words = 'english')
    
    tfidf_col = tfv.fit_transform(x.loc[:, i])

    
    svd_col = svd_.fit_transform(tfidf_col)
    svd_col = pd.DataFrame(svd_col)
    svd_col = svd_col.add_prefix('SVD_{}_'.format(i))


    text_features.append(svd_col)    
    
    x.drop(i, axis=1, inplace=True)
    
# Combine all extracted features:
text_features = pd.concat(text_features, axis=1)

# Concatenate with main DF:
x = pd.concat([x, text_features], axis=1)

x_train = x_train.merge(x, how='left', on='PetID')
x_test  = x_test.merge(x, how='left', on='PetID')


In [13]:
from textblob import TextBlob

x = x_train.append(x_test)
x = x[['PetID', 'Description']]

x['Description'] = x['Description'].fillna("Missing")
x['Description'] = x['Description'].apply(lambda x:TextBlob(x))

x['polarity']     = x['Description'].apply(lambda x:x.sentiment[0])
x['subjectivity'] = x['Description'].apply(lambda x:x.sentiment[1])

#对情感进行分箱
bin=[-2,0,0.3,2]
x['polarity'] = pd.cut(x['polarity'], bins=bin, labels=range(3))
x['polarity'] = x['polarity'].astype(np.int32)

x_train = x_train.merge(x[['PetID', 'polarity']], how='left', on='PetID')
x_test  = x_test.merge(x[['PetID', 'polarity']], how='left', on='PetID')

# 增加新的特征
1、是否需要收费
2、年份 floor(x/12)
3、该品种的数量
4、是否是稀有品种(分普通稀有和特别稀有)
5、是否很常见
6、是否是稀有Color
7、按照片的多少进行分组

In [14]:
x_train['IsFree'] = x_train['Fee'].apply(lambda x:True if x == 0 else False)
x_test['IsFree']  = x_test['Fee'].apply(lambda x:True if x == 0 else False)

x_train['Year'] = x_train['Age'].apply(lambda x:math.floor(x/12))
x_test['Year']  = x_test['Age'].apply(lambda x:math.floor(x/12))

x = x_train.append(x_test)
x['Age_qcut'] = pd.qcut(x['Age'], 5,  duplicates='drop')
x['Age_qcut'] = pd.factorize(x['Age_qcut'])[0]
x_train = x_train.merge(x[['PetID','Age_qcut']], how='left', on='PetID')
x_test  = x_test.merge(x[['PetID','Age_qcut']], how='left', on='PetID')

x = x_train.append(x_test)
Breed1_count = x.groupby('Breed1').size().to_frame('Breed1_count').reset_index()
x_train = x_train.merge(Breed1_count, how='left', on='Breed1')
x_test  = x_test.merge(Breed1_count, how='left', on='Breed1')


a = x['Breed1'].value_counts().sort_values(ascending = False).cumsum()/len(x)
rare1_index = a[a > 0.85].index.tolist()
x_train['IsRare1'] = x_train['Breed1'].isin(rare1_index).apply(lambda x:True if x == True else False)
x_test['IsRare1']  = x_test['Breed1'].isin(rare1_index).apply(lambda x:True if x == True else False)
rare2_index = a[a > 0.72].index.tolist()
x_train['IsRare2'] = x_train['Breed1'].isin(rare2_index).apply(lambda x:True if x == True else False)
x_test['IsRare2']  = x_test['Breed1'].isin(rare2_index).apply(lambda x:True if x == True else False)


x_train['Is_COMMON'] = x_train['Breed1'].apply(lambda x:True if (x == 265 or x == 307 or x == 266) else False)
x_test['Is_COMMON']  = x_test['Breed1'].apply(lambda x:True if (x == 265 or x == 307 or x == 266) else False)


bin=[-0.5,0.5,1.5,4.5,1000]
x_train['Photo_cut'] = pd.cut(x_train['PhotoAmt'], bins=bin, labels=range(4)).astype(np.int32)
x_test['Photo_cut']  = pd.cut(x_test['PhotoAmt'], bins=bin, labels=range(4)).astype(np.int32)

# 是否是稀有颜色
x_train['Is_rare_color1'] = x_train['Color1'].apply(lambda x:True if x==5 or x==6 or x==7 else False)
x_test['Is_rare_color1'] = x_test['Color1'].apply(lambda x:True if x==5 or x==6 or x==7 else False)
x_train['Is_rare_color2'] = x_train['Color2'].apply(lambda x:True if x==6 else False)
x_test['Is_rare_color2'] = x_test['Color2'].apply(lambda x:True if x==6 else False)

#年龄是否小于二月
x_train['Is_less_than_2month']= x_train['Age'].apply(lambda x:True if x<3 else False)
x_test['Is_less_than_2month'] = x_test['Age'].apply(lambda x:True if x<3 else False)


# RescuerID 处理

In [15]:
#%% RescuerID 处理

df = df_train.append(df_test)
data_rescuer = df.groupby(['RescuerID'])['PetID'].size().reset_index()
data_rescuer.columns = ['RescuerID', 'RescuerID_count']
#data_rescuer['rank_Rescuer_count'] = data_rescuer['RescuerID_count'].rank(pct=True)

x_train = x_train.merge(data_rescuer, how='left', on='RescuerID')
x_test  = x_test.merge(data_rescuer, how='left', on='RescuerID')

x = x_train.append(x_test)
x['RescuerID_count_cut'] = pd.qcut(x['RescuerID_count'], 5, labels=range(4), duplicates='drop').astype(np.int32)

x_train = x_train.merge(x[['PetID', 'RescuerID_count_cut']], how='left', on='PetID')
x_test  = x_test.merge(x [['PetID', 'RescuerID_count_cut']], how='left', on='PetID')

#x_train.drop(['RescuerID_count'], axis=1, inplace=True)
#x_test.drop(['RescuerID_count'], axis=1, inplace=True)

# 处理Breed

In [16]:
# 增加特征 是否有第二血统
x_train['HasSecondBreed'] = x_train['Breed2'].map(lambda x:True if x != 0 else False)
x_test['HasSecondBreed'] = x_test['Breed2'].map(lambda x:True if x != 0 else False)

train_breed_main = x_train[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))

train_breed_main = train_breed_main.iloc[:, 2:]
train_breed_main = train_breed_main.add_prefix('main_breed_')

train_breed_second = x_train[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))


train_breed_second = train_breed_second.iloc[:, 2:]
train_breed_second = train_breed_second.add_prefix('second_breed_')

x_train = pd.concat(
    [x_train, train_breed_main, train_breed_second], axis=1)

##############
test_breed_main = x_test[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))

test_breed_main = test_breed_main.iloc[:, 2:]
test_breed_main = test_breed_main.add_prefix('main_breed_')

test_breed_second = x_test[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))

test_breed_second = test_breed_second.iloc[:, 2:]
test_breed_second = test_breed_second.add_prefix('second_breed_')

x_test = pd.concat(
    [x_test, test_breed_main, test_breed_second], axis=1)

print(x_train.shape, x_test.shape)

categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']
#for i in categorical_columns:
#    x_train.loc[:, i] = pd.factorize(x_train.loc[:, i])[0]
#    x_test.loc[:,i]   = pd.factorize(x_test.loc[:, i])[0]

# 增加特征 是否纯种
x_train['True_Pure'] = False
x_train.loc[(x_train['main_breed_BreedName'] != 'Mixed Breed')&
                    ((x_train['main_breed_BreedName'] == x_train['second_breed_BreedName'])|
                   (x_train['second_breed_BreedName'].isnull())),'True_Pure'] = True

x_test['True_Pure'] = False
x_test.loc[(x_test['main_breed_BreedName'] != 'Mixed Breed')&
                    ((x_test['main_breed_BreedName'] == x_test['second_breed_BreedName'])|
                   (x_test['second_breed_BreedName'].isnull())),'True_Pure'] = True

# 是否纯种狗
x_train['Is_Pure_Dog'] = (x_train['True_Pure'] == True) & (x_train['Type'] == 1)
x_test['Is_Pure_Dog']  = (x_test['True_Pure'] == True)  & (x_test['Type'] == 1)


#删除没用特征
x_train.drop(['main_breed_BreedName', 'second_breed_BreedName', 'main_breed_Type', 'second_breed_Type'], axis=1, inplace=True)
x_test.drop(['main_breed_BreedName', 'second_breed_BreedName', 'main_breed_Type', 'second_breed_Type'], axis=1, inplace=True)

(12475, 96) (2513, 96)


# 等某些特征进行rank

In [17]:
# 对某些特征进行rank
#cols_rank = ['sentence_magnitude', 'sentence_score', 'document_magnitude','document_score']#,
#       'annots_score_MEAN', 'annots_score_SUM','color_score_MEAN', 'color_score_SUM', 'color_pixelfrac_MEAN',
#       'color_pixelfrac_SUM', 'crop_conf_MEAN', 'crop_conf_SUM','crop_importance_MEAN', 'crop_importance_SUM']

#x = x_train.append(x_test)
#x[cols_rank] = x[cols_rank].fillna(0)
#df_cols_rank = x[cols_rank].rank(pct=True).rename(columns=lambda s:'rank.'+s)
#df_cols_rank = pd.concat([df_cols_rank, x['PetID']], axis=1)

#x_train = x_train.merge(df_cols_rank, how='left', on='PetID')
#x_test =  x_test.merge(df_cols_rank, how='left', on='PetID')

# 加入图像特征

In [382]:
a = pd.read_csv('train_img_features.csv')

a.columns

x_train = x_train.merge(a, how='left', on='PetID')
x_test = x_test.merge(a, how='left', on='PetID')


# 数据清理

In [18]:
drop_columns = ['Name', 'RescuerID', 'Description', 'PetID', 'token', 'annots_top_desc']
drop_columns = ['Name', 'RescuerID', 'Description', 'PetID']


x_train.drop(drop_columns, axis=1, inplace=True)
x_test.drop(drop_columns, axis=1, inplace=True)


x_train = x_train.fillna(0)
x_test  = x_test.fillna(0)

# LGB 算法

In [21]:
from lightgbm.sklearn import LGBMRegressor

model_lgb = LGBMRegressor(
        learning_rate    = 0.01,
        n_estimators     = 500,
        max_depth        = 4,
        num_leaves       = 8 ,
        subsample        = 0.9,      #训练时采样一定比例的数据	
        colsample_bytree = 0.6,
        n_jobs           = -1,
        random_state     = 4,
        objective        = 'regression',
#        reg_alpha        = 0.1,
        eval_metric      = 'scorer',
        min_child_samples = 3         #叶子节点具有的最小记录数	

        )
        


In [22]:
y_lgb = split_score(model_lgb, x_train, y_train)

coe = search_coef(y_lgb, y_train)
best_lgb_coe = coe['x']
print('lgb的最佳系数为{}'.format(best_lgb_coe))

model_lgb.fit(x_train, y_train)
result_lgb = model_lgb.predict(x_test)
result_lgb_fix = fix_y(result_lgb, best_lgb_coe)
print('lgb后的分布:',Counter(result_lgb_fix))
print('融合后的二次加权Kappa系数为', metric(result_lgb_fix, y_test))
print('y_test的真实分布为',Counter(y_test))

10折后的Kappa加权得分为:带补充
lgb的最佳系数为[1.62493356 2.2024686  2.42634022 2.77896379]
lgb后的分布: Counter({3.0: 656, 1.0: 650, 4.0: 643, 2.0: 563, 0.0: 1})
融合后的二次加权Kappa系数为 0.341037767833419
y_test的真实分布为 Counter({4: 753, 2: 607, 3: 564, 1: 509, 0: 80})


In [386]:
#参数重要性
model_lgb.feature_importances_
a = pd.DataFrame(model_lgb.feature_importances_, index=x_train.columns)
a.sort_values(by=0, ascending=False)

Unnamed: 0,0
RescuerID_count,31
Breed1,30
Age,28
Fee,19
annots_score_SUM,18
Quantity,17
126,17
annots_score_MEAN,15
Breed1_count,14
49,13


# XGB

In [387]:
from xgboost.sklearn import XGBRegressor

model_xgb = XGBRegressor(
    learning_rate    = 0.1,
    n_estimatores    = 100, 
    
    early_stopping_rounds=20,  
    
    max_depth        = 5, 
    min_child_weight = 5,
    
    gamma            = 0,
    
    subsample        =  0.9,
    colsample_bytree = 0.7,
        
    reg_alpha        = 1,
#    reg_lambda       = 0.1,
    nthread      = -1)
#    objective        = 'regression',    
#    eval_metric      = 'scorer')      

model_xgb.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, early_stopping_rounds=20, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=5, missing=None, n_estimatores=100,
       n_estimators=100, n_jobs=1, nthread=-1, objective='reg:linear',
       random_state=0, reg_alpha=1, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.9)

In [388]:
y_xgb = split_score(model_xgb, x_train, y_train)

coe = search_coef(y_xgb, y_train)
best_xgb_coe = coe['x']
print('xgb的最佳系数为{}'.format(best_xgb_coe))

model_xgb.fit(x_train, y_train)
result_xgb = model_xgb.predict(x_test)
result_xgb_fix = fix_y(result_xgb, best_xgb_coe)
print('xgb后的分布:',Counter(result_xgb_fix))
print('xgb后的二次加权Kappa系数为', metric(result_xgb_fix, y_test))
print('y_test的真实分布为',Counter(y_test))

10折后的Kappa加权得分为:带补充
xgb的最佳系数为[1.65966313 2.07945665 2.53704873 2.85860824]
xgb后的分布: Counter({2.0: 691, 4.0: 631, 3.0: 487, 1.0: 347, 0.0: 64})
xgb后的二次加权Kappa系数为 0.3651553140051612
y_test的真实分布为 Counter({4: 757, 2: 537, 1: 448, 3: 417, 0: 61})


In [389]:
#参数重要性
model_xgb.feature_importances_
a = pd.DataFrame(model_xgb.feature_importances_, index=x_train.columns)
a.sort_values(by=0, ascending=False)

Unnamed: 0,0
Age,0.015806
RescuerID_count,0.013347
annots_score_MEAN,0.010186
Breed1,0.010186
SVD_Description_49,0.008430
annots_score_SUM,0.007727
35,0.007727
9,0.007727
Quantity,0.007025
SVD_Description_33,0.006674


# 后处理

In [390]:
best_coe  = (best_lgb_coe + best_xgb_coe) / 2
best_coe = best_xgb_coe

result = (result_lgb + result_xgb) / 2
result_fix = fix_y(result, best_coe)

print('融合后的二次加权Kappa系数为', metric(result_fix, y_test))



融合后的二次加权Kappa系数为 0.38851531123422645


In [391]:
y_lgb = model_lgb.predict(x_train)

y_xgb = model_xgb.predict(x_train)
y_rf  = model_rf.predict(x_train)


NameError: name 'model_rf' is not defined

In [None]:
y = (result_lgb + result_xgb + result_rf)/3
result = fix_y(y, best_coe)

print('融合后的分布:',Counter(result))
print('融合后的二次加权Kappa系数为', metric(result, y_test))
print('y_test的真实分布为',Counter(y_test))


In [None]:
# 特征重要性
model_lgb.feature_importances_
a = pd.DataFrame(model_lgb.feature_importances_, index=x_train.columns)
a.sort_values(by=0, ascending=False)