In [5]:
import gc
import glob
import os
import json
import matplotlib.pyplot as plt
import pprint

import numpy as np
import pandas as pd

from joblib import Parallel, delayed
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
import scipy as sp

from sklearn.model_selection import train_test_split, GridSearchCV

from functools import partial
from collections import Counter

import random
import math

# 评价函数

In [6]:
#%% 评价函数 Metric used for this competition 
# (Quadratic Weigthed Kappa aka Quadratic Cohen Kappa Score)
def metric(y1,y2):
    return cohen_kappa_score(y1, y2, weights = 'quadratic')


# Make scorer for scikit-learn
scorer = make_scorer(metric)

# Cross验证函数

In [7]:
from sklearn.model_selection import StratifiedKFold

#
def split_score(model, x, y, n=10):
    y_pre = np.zeros(y.shape[0])
    kfold = StratifiedKFold(n_splits=n, random_state=4)
    for train_index, test_index in kfold.split(x,y):
        model.fit(x.iloc[train_index], y.iloc[train_index])
        y_pre[test_index] = model.predict(x.iloc[test_index])
    
#    score = metric(y_pre, y)
    print("{}折后的Kappa加权得分为:带补充".format(n))
    
    return y_pre

#
def fix_y(y, coef):
    y_fix = np.copy(y)
    for i, pred in enumerate(y_fix):
        if pred < coef[0]:
            y_fix[i] = 0
        elif pred >= coef[0] and pred < coef[1]:
            y_fix[i] = 1
        elif pred >= coef[1] and pred < coef[2]:
            y_fix[i] = 2
        elif pred >= coef[2] and pred < coef[3]:
            y_fix[i] = 3
        else:
            y_fix[i] = 4    
    return y_fix

# 
def _kappa_loss(y, y_true, coef):
    y_fix = np.copy(y)
    for i, pred in enumerate(y_fix):
        if pred < coef[0]:
            y_fix[i] = 0
        elif pred >= coef[0] and pred < coef[1]:
            y_fix[i] = 1
        elif pred >= coef[1] and pred < coef[2]:
            y_fix[i] = 2
        elif pred >= coef[2] and pred < coef[3]:
            y_fix[i] = 3
        else:
            y_fix[i] = 4
            
    loss = metric(y_fix, y_true)
    return -loss

# 寻找分类的最佳参数
def search_coef(x1, x2):
    loss_partial = partial(_kappa_loss, x1, x2)
    initial_coef = [1.55, 2.05, 2.5, 3]
    coef = sp.optimize.basinhopping(loss_partial, initial_coef, niter=500, T=1,
                                              stepsize=0.2, minimizer_kwargs={"method": 'nelder-mead'}, 
                                              take_step=None, accept_test=None, callback=None, 
                                              interval=100, disp=True, niter_success=10, seed=None)
    return coef

# 读取数据、划分验证集

In [8]:
df_train  = pd.read_csv('train.csv')
x = df_train.copy()

#读取唯一的RescuerID
RescuerID = set(df_train['RescuerID'].unique())

#随机生成RescuerID
j_test = random.sample(RescuerID, int(len(RescuerID)*0.2))
j_train = RescuerID - set(j_test)

df_test = df_train[df_train['RescuerID'].isin(j_test)]
df_train = df_train[df_train['RescuerID'].isin(j_train)]

train = df_train.copy()
test  = df_test.copy()

In [9]:
labels_breed = pd.read_csv('breed_labels.csv')
labels_state = pd.read_csv('color_labels.csv')
labels_color = pd.read_csv('state_labels.csv')

In [10]:
#%% 删除异常值
cul_drop = ['375905770', 'da8d4a273', '27e74e45c', '7b5bee232', '0327b8e94']
df_train = df_train[~df_train['PetID'].isin(cul_drop)]

# 提取 sentiment 的特征

In [59]:
def extract_sentiment_feature(i, x):    
#    feature_sentiment = pd.DataFrame(columns=['PetID', 'token', 'sentence_magnitude', 'sentence_score','document_magnitude', 'document_score'])
    feature_sentiment = pd.DataFrame()

    if x == 'train':
        set_file = 'train'
    else:
        set_file = 'train' 
        
    file_name = '{}_sentiment/{}.json'.format(set_file,i)
    try:
        f = open(file_name, 'r')
        sentiment_file = json.load(f)
            
        token = [x['name'] for x in sentiment_file['entities']]
        token = ' '.join(token)
            
        sentences_sentiment = [x['sentiment'] for x in sentiment_file['sentences']]
        sentences_sentiment = pd.DataFrame.from_dict(
            sentences_sentiment, orient='columns').sum()
        sentenceSentiment_magnitude = sentences_sentiment['magnitude']
        sentenceSentiment_score     = sentences_sentiment['score']
            
        docementSentiment_magnitude = sentiment_file['documentSentiment']['magnitude']
        documentSentiment_score     = sentiment_file['documentSentiment']['score']
            
        new = pd.DataFrame(
                {'PetID'               :[i], 
#                 'token'               : [token],
                 'sentence_magnitude'  : [sentenceSentiment_magnitude],
                 'sentence_score'      : [sentenceSentiment_score],
                 'document_magnitude'  : [docementSentiment_magnitude], 
                 'document_score'      : [documentSentiment_score]})  
        feature_sentiment = feature_sentiment.append(new)
    except:
        print('{}没找到'.format(file_name))
    
    for each in feature_sentiment.columns:
        if each not in ['PetID','token']:
            feature_sentiment[each] = feature_sentiment[each].astype(float)

    return feature_sentiment

#%%
train_feature_sentiment = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_sentiment_feature)(i, 'train') for i in train.PetID)
train_feature_sentiment = [x for x in train_feature_sentiment]
train_feature_sentiment = pd.concat(train_feature_sentiment, ignore_index=True, sort=False)

test_feature_sentiment = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_sentiment_feature)(i, 'test') for i in test.PetID)
test_feature_sentiment = [x for x in test_feature_sentiment]
test_feature_sentiment = pd.concat(test_feature_sentiment, ignore_index=True, sort=False)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 12147 out of 12147 | elapsed:    7.3s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done 632 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 2846 out of 2846 | elapsed:    1.6s finished


# 提取 metadata 的特征

In [13]:
#%% 提取 metadata 的特征
#file_name = 'train_metadata/000a290e4-1.json'
#f = open(file_name, 'r')
#metadatafile = json.load(f)
def extract_metadata_feature(i, x):
    feature_metadata = pd.DataFrame()
    if x == 'train':
        set_file = 'train'
    else:
        set_file = 'train'
        
    metadata_filenames = sorted(glob.glob('{}_metadata/{}*.json'.format(set_file, i)))
    if len(metadata_filenames) > 0:
        feature_metadata_sub = pd.DataFrame(columns=['PetID', 'annots_score', 'color_score', 'color_pixelfrac', 'crop_conf','crop_importance', 'annots_top_desc'])
        for ff in metadata_filenames:
            f = open(ff, 'rb')
            file = json.load(f)
            #label
            if 'labelAnnotations' in file:
                file_annots = file['labelAnnotations'][:int(len(file['labelAnnotations']) * 0.3)]
                file_top_score = np.asarray([x['score'] for x in file_annots]).mean()
                file_top_desc = [x['description'] for x in file_annots]            
            else:
                file_top_score = np.nan
                file_top_desc = ['']
            #colors
            file_colors = file['imagePropertiesAnnotation']['dominantColors']['colors']            
            file_color_score = np.asarray([x['score'] for x in file_colors]).mean()
            file_color_pixelfrac = np.asarray([x['pixelFraction'] for x in file_colors]).mean()            
            #crops
            file_crops = file['cropHintsAnnotation']['cropHints']                
            file_crop_conf = np.asarray([x['confidence'] for x in file_crops]).mean()
            if 'importanceFraction' in file_crops[0].keys():
                file_crop_importance = np.asarray([x['importanceFraction'] for x in file_crops]).mean()
            else:
                file_crop_importance = np.nan
                
            new = pd.DataFrame(
                    {
                            'PetID'          : [i],
                            'annots_score'   : [file_top_score],
                            'color_score'     : [file_color_score],
                            'color_pixelfrac' : [file_color_pixelfrac],
                            'crop_conf'       : [file_crop_conf],
                            'crop_importance' : [file_crop_importance],
                            'annots_top_desc' : [' '.join(file_top_desc)]})
            feature_metadata_sub = feature_metadata_sub.append(new)
                
        metadata_desc = feature_metadata_sub.groupby(['PetID'])['annots_top_desc'].unique()
        metadata_desc = metadata_desc.reset_index()
        metadata_desc['annots_top_desc'] = metadata_desc['annots_top_desc'].apply(lambda x:' '.join(x))
        feature_metadata_sub.drop(['annots_top_desc'], axis=1, inplace=True)

        for each in feature_metadata_sub:
            if each not in ['PetID']:
                feature_metadata_sub[each] = feature_metadata_sub[each].astype(float)
        
        
        feature_metadata_sub = feature_metadata_sub.groupby(['PetID']).agg(['mean', 'sum'])
        feature_metadata_sub.columns = ['{}_{}'.format(c[0], c[1].upper()) for c in feature_metadata_sub.columns.tolist()]  
        feature_metadata_sub = feature_metadata_sub.reset_index()
        
        feature_metadata_sub = feature_metadata_sub.merge(metadata_desc, how='left', on='PetID')
            
        feature_metadata = feature_metadata.append(feature_metadata_sub)
    return feature_metadata

#
#for each in 
train_feature_metadata = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_metadata_feature)(i, 'train') for i in train.PetID)
train_feature_metadata = [x for x in train_feature_metadata]
train_feature_metadata = pd.concat(train_feature_metadata, ignore_index=True, sort=False)

test_feature_metadata = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_metadata_feature)(i, 'test') for i in test.PetID)
test_feature_metadata = [x for x in test_feature_metadata]
test_feature_metadata = pd.concat(test_feature_metadata, ignore_index=True, sort=False)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 352 tasks      | elapsed:    7.3s
[Parallel(n_jobs=8)]: Done 852 tasks      | elapsed:   17.3s
[Parallel(n_jobs=8)]: Done 1552 tasks      | elapsed:   31.4s
[Parallel(n_jobs=8)]: Done 2452 tasks      | elapsed:   49.3s
[Parallel(n_jobs=8)]: Done 3552 tasks      | elapsed:  1.2min
[Parallel(n_jobs=8)]: Done 4852 tasks      | elapsed:  1.6min
[Parallel(n_jobs=8)]: Done 6352 tasks      | elapsed:  2.1min
[Parallel(n_jobs=8)]: Done 8052 tasks      | elapsed:  2.7min
[Parallel(n_jobs=8)]: Done 9952 tasks      | elapsed:  3.3min
[Parallel(n_jobs=8)]: Done 11452 out of 11452 | elapsed:  3.8min finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 352 tasks      | elapsed:    7.4s
[Parallel(n_jobs=8)]: Done 852 tasks 

In [589]:
train_feature_metadata[train_feature_metadata['PetID']=='86e1089a3']['annots_top_desc']

0    cat black cat
Name: annots_top_desc, dtype: object

In [60]:
#%% 连接sentiment和metadata和原始数据
x_train = df_train.merge(train_feature_sentiment, how='left', on='PetID')
#x_train = x_train.merge(train_feature_metadata, how='left', on='PetID')

y_train = x_train['AdoptionSpeed']
x_train.drop(['AdoptionSpeed'], axis=1, inplace=True)

x_test = df_test.merge(test_feature_sentiment, how='left', on='PetID')
#x_test = x_test.merge(test_feature_metadata, how='left', on='PetID')

y_test = x_test['AdoptionSpeed']
x_test.drop(['AdoptionSpeed'],  axis=1, inplace=True)

# NLP(情感分析，形容词的数量)

In [64]:
from textblob import TextBlob

x = x_train.append(x_test)
x = x[['PetID', 'Description']]

x['Description'] = x['Description'].fillna("Missing")
x['Description'] = x['Description'].apply(lambda x:TextBlob(x))

x['polarity']     = x['Description'].apply(lambda x:x.sentiment[0])
x['subjectivity'] = x['Description'].apply(lambda x:x.sentiment[1])

x_train = x_train.merge(x[['PetID', 'polarity', 'subjectivity']], how='left', on='PetID')
x_Test  = x_test.merge(x[['PetID', 'polarity', 'subjectivity']], how='left', on='PetID')

In [67]:
##################################统计TF###############################################

import nltk
from nltk.corpus import stopwords
x = x_train.append(x_test)
x = x[['PetID', 'Description']]

# 填充缺失值
x['Description'] = x['Description'].fillna('Missing')

# 第一步分词
x['Description'] = x['Description'].apply(lambda x : nltk.word_tokenize(x))


#1 去掉标点符号和停用词
#去掉标点符号
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
x['Description'] = x['Description'].apply(lambda x : [word for word in x if word not in english_punctuations])

# 提取布朗语料库的形容词
brown_corpus = (nltk.corpus.brown.tagged_words())
objective = [word for (word, tap) in brown_corpus if tap == 'JJ']
# 根据布朗语料库提取形容词
x['Description'] = x['Description'].apply(lambda x:list(set(x).intersection(set(objective))))
x['Description'] = x['Description'].apply(lambda x:" ".join(x))

x['Adjective_count'] = x['Description'].apply(lambda x:len(x))
#######构造词频统计
#from sklearn.feature_extraction.text import CountVectorizer
#
#countvec = CountVectorizer( lowercase  = True, 
#                            stop_words = 'english',
#                            min_df     = 10,
#                            max_df     = 0.9
#                            )
#b = countvec.fit_transform(x['Description'])

##################################SVD降维###############################################
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import SparsePCA, TruncatedSVD, LatentDirichletAllocation, NMF

#countvec = CountVectorizer( lowercase  = True, 
#                            stop_words = 'english',
#                            min_df     = 10,
#                            max_df     = 0.9
#                            )
#TF_data = countvec.fit_transform(x['Description'])

#n_components = 10
#svd_ = TruncatedSVD(n_components=n_components)

#nmf_ = NMF(n_components=n_components)

#nmf_col = nmf_.fit_transform(TF_data)
#nmf_col = pd.DataFrame(nmf_col)
#nmf_col = nmf_col.add_prefix('SVD_{}_'.format('describe'))

#x = pd.concat([x.reset_index()['PetID'], nmf_col], axis=1)

x_train = x_train.merge(x[['PetID', 'Adjective_count']], how='left', on='PetID')
x_test  =  x_test.merge(x[['PetID', 'Adjective_count']], how='left', on='PetID')

print('Done')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Done


In [70]:
x

Unnamed: 0,PetID,Description,Adjective_count
0,6296e909a,alone near just temporary,25
1,3422e4906,near adorable pregnant acceptable long precaut...,80
2,5842f1ff5,interested alert Good good very master active,45
3,850a43f90,such like looking cute playful young handsome,45
4,d24c30b4b,stray,5
5,1caa6fcdb,interested just outside still,29
6,97aa9eeac,just,4
7,c06d167ca,healthy active sure,19
8,8b693ca84,serious,7
9,aaedd873d,neuter happy active very,24


# 增加新的特征
1、是否需要收费
2、年份 floor(x/12)
3、该品种的数量
4、是否是稀有品种(分普通稀有和特别稀有)
5、是否很常见
6、Color的笛卡尔积（效果不好）

In [206]:
x_train['IsFree'] = x_train['Fee'].apply(lambda x:1 if x>0 else 0)
x_test['IsFree']  = x_test['Fee'].apply(lambda x:1 if x>0 else 0)

x_train['Year'] = x_train['Age'].apply(lambda x:math.floor(x/12))
x_test['Year']  = x_test['Age'].apply(lambda x:math.floor(x/12))

x = x_train.append(x_test)
x['Age_qcut'] = pd.qcut(x['Age'], 5,  duplicates='drop')
x['Age_qcut'] = pd.factorize(x['Age_qcut'])[0]
x_train = x_train.merge(x[['PetID','Age_qcut']], how='left', on='PetID')
x_test  = x_test.merge(x[['PetID','Age_qcut']], how='left', on='PetID')

x = x_train.append(x_test)
Breed1_count = x.groupby('Breed1').size().to_frame('Breed1_count').reset_index()
x_train = x_train.merge(Breed1_count, how='left', on='Breed1')
x_test = x_test.merge(Breed1_count, how='left', on='Breed1')


a = x['Breed1'].value_counts().sort_values(ascending = False).cumsum()/len(x)
rare1_index = a[a > 0.85].index.tolist()
x_train['IsRare1'] = x_train['Breed1'].isin(rare1_index).apply(lambda x:1 if x == True else 0)
x_test['IsRare1']  = x_test['Breed1'].isin(rare1_index).apply(lambda x:1 if x == True else 0)
rare2_index = a[a > 0.72].index.tolist()
x_train['IsRare2'] = x_train['Breed1'].isin(rare2_index).apply(lambda x:1 if x == True else 0)
x_test['IsRare2']  = x_test['Breed1'].isin(rare2_index).apply(lambda x:1 if x == True else 0)

x_train['Is_Mixed Breed_ID307']          = x_train['Breed1'].apply(lambda x:1 if x == 307 else 0)
x_test['Is_Mixed Breed_ID307']           = x_test['Breed1'].apply(lambda x:1 if x == 307 else 0)
x_train['Is_Domestic Short Hair_ID266']  = x_train['Breed1'].apply(lambda x:1 if x == 266 else 0)
x_test['Is_Domestic Short Hair_ID266']   = x_test['Breed1'].apply(lambda x:1 if x == 266 else 0)
x_train['Is_Domestic Medium Hair_ID265'] = x_train['Breed1'].apply(lambda x:1 if x == 265 else 0)
x_test['Is_Domestic Medium Hair_ID265']  = x_test['Breed1'].apply(lambda x:1 if x == 265 else 0)

x_train['Is_COMMON'] = x_train['Breed1'].apply(lambda x:1 if (x == 265 or x == 307 or x == 266) else 0)
x_test['Is_COMMON']  = x_test['Breed1'].apply(lambda x:1 if (x == 265 or x == 307 or x == 266) else 0)


#是否是纯种


#效果不好
#x_train['Color_Mix'] = x_train['Color1'].astype(str)+x_train['Color2'].astype(str)+x_train['Color3'].astype(str)
#x_train['Color_Mix'] = pd.factorize(x_train['Color_Mix'])[0]
#x_test['Color_Mix'] = x_test['Color1'].astype(str)+x_test['Color2'].astype(str)+x_test['Color3'].astype(str)
#x_test['Color_Mix'] = pd.factorize(x_test['Color_Mix'])[0]

In [207]:
x_test.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'sentence_magnitude',
       'sentence_score', 'document_magnitude', 'document_score',
       'SVD_describe_0', 'SVD_describe_1', 'SVD_describe_2', 'SVD_describe_3',
       'SVD_describe_4', 'SVD_describe_5', 'SVD_describe_6', 'SVD_describe_7',
       'SVD_describe_8', 'SVD_describe_9', 'IsFree', 'Year', 'Age_qcut',
       'Breed1_count', 'IsRare1', 'IsRare2', 'Is_Mixed Breed_ID307',
       'Is_Domestic Short Hair_ID266', 'Is_Domestic Medium Hair_ID265',
       'Is_COMMON'],
      dtype='object')

In [208]:
x_train['PhotoAmt'].value_counts()

1.0     2451
2.0     2003
3.0     1988
5.0     1672
4.0     1522
6.0      484
7.0      349
0.0      271
8.0      244
9.0      176
11.0     145
10.0     145
12.0      82
13.0      73
14.0      62
15.0      40
16.0      33
20.0      24
17.0      23
19.0      17
18.0      16
30.0      15
24.0      14
21.0      14
23.0      11
22.0       8
26.0       8
28.0       7
27.0       6
25.0       6
29.0       5
Name: PhotoAmt, dtype: int64

# RescuerID 处理

In [209]:
#%% RescuerID 处理

df = df_train.append(df_test)
data_rescuer = df.groupby(['RescuerID'])['PetID'].count().reset_index()
data_rescuer.columns = ['RescuerID', 'RescuerID_count']
#data_rescuer['rank_Rescuer_count'] = data_rescuer['RescuerID_count'].rank(pct=True)

x_train = x_train.merge(data_rescuer, how='left', on='RescuerID')
x_test  = x_test.merge(data_rescuer, how='left', on='RescuerID')

#x_train.drop(['RescuerID_count'], axis=1, inplace=True)
#x_test.drop(['RescuerID_count'], axis=1, inplace=True)

x_train['single'] = x_train['RescuerID_count'].apply(lambda x:1 if x<3 else 0)
x_train['middle'] = x_train['RescuerID_count'].apply(lambda x:1 if (x>2 and x<6) else 0)
x_train['Charities'] = x_train['RescuerID_count'].apply(lambda x:1 if x>5 else 0)

x_test['single'] = x_test['RescuerID_count'].apply(lambda x:1 if x<3 else 0)
x_test['middle'] = x_test['RescuerID_count'].apply(lambda x:1 if (x>2 and x<6) else 0)
x_test['Charities'] = x_test['RescuerID_count'].apply(lambda x:1 if x>5 else 0)

# 处理Breed

In [210]:
# 增加特征 是否有第二血统
x_train['HasSecondBreed'] = x_train['Breed2'].map(lambda x:1 if x != 0 else 0)
x_test['HasSecondBreed'] = x_test['Breed2'].map(lambda x:1 if x != 0 else 0)

train_breed_main = x_train[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))

train_breed_main = train_breed_main.iloc[:, 2:]
train_breed_main = train_breed_main.add_prefix('main_breed_')

train_breed_second = x_train[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))


train_breed_second = train_breed_second.iloc[:, 2:]
train_breed_second = train_breed_second.add_prefix('second_breed_')

x_train = pd.concat(
    [x_train, train_breed_main, train_breed_second], axis=1)

##############
test_breed_main = x_test[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))

test_breed_main = test_breed_main.iloc[:, 2:]
test_breed_main = test_breed_main.add_prefix('main_breed_')

test_breed_second = x_test[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))

test_breed_second = test_breed_second.iloc[:, 2:]
test_breed_second = test_breed_second.add_prefix('second_breed_')

x_test = pd.concat(
    [x_test, test_breed_main, test_breed_second], axis=1)

print(x_train.shape, x_test.shape)

categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']
#for i in categorical_columns:
#    x_train.loc[:, i] = pd.factorize(x_train.loc[:, i])[0]
#    x_test.loc[:,i]   = pd.factorize(x_test.loc[:, i])[0]

# 增加特征 是否纯种
x_train['True_Pure'] = 0
x_train.loc[(x_train['main_breed_BreedName'] != 'Mixed Breed')&
                    ((x_train['main_breed_BreedName'] == x_train['second_breed_BreedName'])|
                   (x_train['second_breed_BreedName'].isnull())),'True_Pure'] = 1

x_test['True_Pure'] = 0
x_test.loc[(x_test['main_breed_BreedName'] != 'Mixed Breed')&
                    ((x_test['main_breed_BreedName'] == x_test['second_breed_BreedName'])|
                   (x_test['second_breed_BreedName'].isnull())),'True_Pure'] = 1

x_train[['main_breed_BreedName', 'True_Pure', 'second_breed_BreedName']]

#删除没用特征
x_train.drop(['main_breed_BreedName', 'second_breed_BreedName', 'main_breed_Type', 'second_breed_Type'], axis=1, inplace=True)
x_test.drop(['main_breed_BreedName', 'second_breed_BreedName', 'main_breed_Type', 'second_breed_Type'], axis=1, inplace=True)

(11914, 56) (3075, 56)


In [211]:
# 增加特征 是否纯种
x_train.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'sentence_magnitude',
       'sentence_score', 'document_magnitude', 'document_score',
       'SVD_describe_0', 'SVD_describe_1', 'SVD_describe_2', 'SVD_describe_3',
       'SVD_describe_4', 'SVD_describe_5', 'SVD_describe_6', 'SVD_describe_7',
       'SVD_describe_8', 'SVD_describe_9', 'IsFree', 'Year', 'Age_qcut',
       'Breed1_count', 'IsRare1', 'IsRare2', 'Is_Mixed Breed_ID307',
       'Is_Domestic Short Hair_ID266', 'Is_Domestic Medium Hair_ID265',
       'Is_COMMON', 'RescuerID_count', 'single', 'middle', 'Charities',
       'HasSecondBreed', 'True_Pure'],
      dtype='object')

# 等某些特征进行rank

In [212]:
# 对某些特征进行rank
#cols_rank = ['sentence_magnitude', 'sentence_score', 'document_magnitude','document_score']#,
#       'annots_score_MEAN', 'annots_score_SUM','color_score_MEAN', 'color_score_SUM', 'color_pixelfrac_MEAN',
#       'color_pixelfrac_SUM', 'crop_conf_MEAN', 'crop_conf_SUM','crop_importance_MEAN', 'crop_importance_SUM']

#x = x_train.append(x_test)
#x[cols_rank] = x[cols_rank].fillna(0)
#df_cols_rank = x[cols_rank].rank(pct=True).rename(columns=lambda s:'rank.'+s)
#df_cols_rank = pd.concat([df_cols_rank, x['PetID']], axis=1)

#x_train = x_train.merge(df_cols_rank, how='left', on='PetID')
#x_test =  x_test.merge(df_cols_rank, how='left', on='PetID')

# 数据清理

In [213]:
drop_columns = ['Name', 'RescuerID', 'Description', 'PetID', 'token', 'annots_top_desc']
drop_columns = ['Name', 'RescuerID', 'Description', 'PetID']


x_train.drop(drop_columns, axis=1, inplace=True)
x_test.drop(drop_columns, axis=1, inplace=True)


x_train = x_train.fillna(0)
x_test  = x_test.fillna(0)

# LGB 算法

In [214]:
from lightgbm.sklearn import LGBMRegressor

model_lgb = LGBMRegressor(
        learning_rate    = 0.1,
        n_estimators     = 300,
        max_depth        = 4,
        num_leaves       = 8 ,
        subsample        = 0.7,      #训练时采样一定比例的数据	
        colsample_bytree = 0.7,
        n_jobs           = -1,
        random_state     = 4,
        objective        = 'regression',
        eval_metric      = 'scorer',
        min_child_samples = 3         #叶子节点具有的最小记录数	

        )
        


In [215]:
y_lgb = split_score(model_lgb, x_train, y_train)

coe = search_coef(y_lgb, y_train)
best_lgb_coe = coe['x']
print('lgb的最佳系数为{}'.format(best_lgb_coe))

model_lgb.fit(x_train, y_train)
result_lgb = model_lgb.predict(x_test)
result_lgb_fix = fix_y(result_lgb, best_lgb_coe)
print('lgb后的分布:',Counter(result_lgb_fix))
print('融合后的二次加权Kappa系数为', metric(result_lgb_fix, y_test))
print('y_test的真实分布为',Counter(y_test))

10折后的Kappa加权得分为:带补充
basinhopping step 0: f -0.424523
basinhopping step 1: f -0.424697 trial_f -0.424697 accepted 1  lowest_f -0.424697
found new global minimum on step 1 with function value -0.424697
basinhopping step 2: f -0.423883 trial_f -0.423883 accepted 1  lowest_f -0.424697
basinhopping step 3: f -0.425726 trial_f -0.425726 accepted 1  lowest_f -0.425726
found new global minimum on step 3 with function value -0.425726
basinhopping step 4: f -0.42595 trial_f -0.42595 accepted 1  lowest_f -0.42595
found new global minimum on step 4 with function value -0.42595
basinhopping step 5: f -0.423497 trial_f -0.423497 accepted 1  lowest_f -0.42595
basinhopping step 6: f -0.425468 trial_f -0.425468 accepted 1  lowest_f -0.42595
basinhopping step 7: f -0.425926 trial_f -0.425926 accepted 1  lowest_f -0.42595
basinhopping step 8: f -0.425862 trial_f -0.425862 accepted 1  lowest_f -0.42595
basinhopping step 9: f -0.42566 trial_f -0.42566 accepted 1  lowest_f -0.42595
basinhopping step 10: f -

In [137]:
y_train.shape

(11447,)

In [216]:
#参数重要性
model_lgb.feature_importances_
a = pd.DataFrame(model_lgb.feature_importances_, index=x_train.columns)
a.sort_values(by=0, ascending=False)

Unnamed: 0,0
RescuerID_count,180
Breed1,133
Age,98
PhotoAmt,98
SVD_describe_9,96
SVD_describe_1,93
Breed1_count,76
SVD_describe_8,76
SVD_describe_0,75
SVD_describe_4,70


In [516]:
#调参
parameters = {
            'reg_alpha': [0,0.01,0.1,1],
            'reg_lambda': [0,0.01,0.1,1]
            }

gsearch = GridSearchCV(model_lgb, param_grid=parameters, scoring=scorer, cv=10, n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

# LGB 分类模型

In [218]:
x_train.columns

Index(['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'VideoAmt',
       'PhotoAmt', 'sentence_magnitude', 'sentence_score',
       'document_magnitude', 'document_score', 'SVD_describe_0',
       'SVD_describe_1', 'SVD_describe_2', 'SVD_describe_3', 'SVD_describe_4',
       'SVD_describe_5', 'SVD_describe_6', 'SVD_describe_7', 'SVD_describe_8',
       'SVD_describe_9', 'IsFree', 'Year', 'Age_qcut', 'Breed1_count',
       'IsRare1', 'IsRare2', 'Is_Mixed Breed_ID307',
       'Is_Domestic Short Hair_ID266', 'Is_Domestic Medium Hair_ID265',
       'Is_COMMON', 'RescuerID_count', 'single', 'middle', 'Charities',
       'HasSecondBreed', 'True_Pure'],
      dtype='object')

In [197]:
from lightgbm.sklearn import LGBMClassifier

model_lgb_class = LGBMClassifier(
        learning_rate    = 0.1,
        n_estimators     = 500,
        max_depth        = 4,
        num_leaves       = 8 ,
        subsample        = 0.7,      #训练时采样一定比例的数据	
        colsample_bytree = 0.7,
        n_jobs           = -1,
        random_state     = 4,
        eval_metric      = 'scorer',
        min_child_samples = 3         #叶子节点具有的最小记录数	
        )

In [198]:
model_lgb_class.fit(x_train, y_train)
result_lgb_class = model_lgb_class.predict(x_test)
metric(result_lgb_class, y_test)

0.31808482442869823

In [201]:
#参数重要性
model_lgb_class.feature_importances_
a = pd.DataFrame(model_lgb_class.feature_importances_, index=x_train.columns)
a.sort_values(by=0, ascending=False)

Unnamed: 0,0
RescuerID_count,1003
SVD_describe_4,997
SVD_describe_6,860
SVD_describe_8,839
SVD_describe_3,829
SVD_describe_1,772
SVD_describe_7,769
SVD_describe_5,750
SVD_describe_9,741
Age,738


# XGB

In [199]:
from xgboost.sklearn import XGBRegressor

model_xgb = XGBRegressor(
    learning_rate    = 0.05,
    n_estimatores    = 500, 
    
    early_stopping_rounds=20,  
    
    max_depth        = 4, 
    min_child_weight = 5,
    
    gamma            = 0,
    
    subsample        =  0.8,
    colsample_bytree = 0.6,
    
    reg_alpha        = 1,
    reg_lambda       = 0.1,
    nthread      = -1)
#    objective        = 'regression',    
#    eval_metric      = 'scorer')      

model_xgb.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, early_stopping_rounds=20, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=4,
       min_child_weight=5, missing=None, n_estimatores=500,
       n_estimators=100, n_jobs=1, nthread=-1, objective='reg:linear',
       random_state=0, reg_alpha=1, reg_lambda=0.1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.8)

In [200]:
y_xgb = split_score(model_xgb, x_train, y_train)

coe = search_coef(y_xgb, y_train)
best_xgb_coe = coe['x']
print('xgb的最佳系数为{}'.format(best_xgb_coe))

result_xgb = model_xgb.predict(x_test)
result_xgb_fix = fix_y(result_xgb, best_xgb_coe)
print('xgb后的分布:',Counter(result_xgb_fix))
print('xgb后的二次加权Kappa系数为', metric(result_xgb_fix, y_test))
print('y_test的真实分布为',Counter(y_test))

10折后的Kappa加权得分为:带补充
basinhopping step 0: f -0.408559
basinhopping step 1: f -0.411163 trial_f -0.411163 accepted 1  lowest_f -0.411163
found new global minimum on step 1 with function value -0.411163
basinhopping step 2: f -0.410285 trial_f -0.410285 accepted 1  lowest_f -0.411163
basinhopping step 3: f -0.410037 trial_f -0.410037 accepted 1  lowest_f -0.411163
basinhopping step 4: f -0.411028 trial_f -0.411028 accepted 1  lowest_f -0.411163
basinhopping step 5: f -0.410433 trial_f -0.410433 accepted 1  lowest_f -0.411163
basinhopping step 6: f -0.411793 trial_f -0.411793 accepted 1  lowest_f -0.411793
found new global minimum on step 6 with function value -0.411793
basinhopping step 7: f -0.411807 trial_f -0.411807 accepted 1  lowest_f -0.411807
found new global minimum on step 7 with function value -0.411807
basinhopping step 8: f -0.408987 trial_f -0.408987 accepted 1  lowest_f -0.411807
basinhopping step 9: f -0.409127 trial_f -0.409127 accepted 1  lowest_f -0.411807
basinhopping s

In [98]:
#参数重要性
model_xgb.feature_importances_
a = pd.DataFrame(model_xgb.feature_importances_, index=x_train.columns)
a.sort_values(by=0, ascending=False)

Unnamed: 0,0
RescuerID_count,0.054110
Age,0.048630
Breed1,0.037671
annots_score_MEAN,0.037671
annots_score_SUM,0.036986
Breed1_count,0.032877
SVD_annots_top_desc_1,0.026712
SVD_annots_top_desc_2,0.026027
Quantity,0.026027
State,0.025342


In [358]:
#调参
parameters = {'max_depth': range(6,14,2), 'min_child_weight': range(3,13,2)}
gsearch = GridSearchCV(model_xgb, param_grid=parameters, scoring=scorer, cv=10, n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

{'max_depth': 12, 'min_child_weight': 11}

# 随机森林模型

In [289]:
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(
    n_estimators      = 250,
    
    max_features      = 0.7,#选择最适属性时划分的特征不能超过此值。
#    max_depth         = 6, #设置树的最大深度，默认为None
    
    min_samples_leaf  = 3, #叶子节点最少的样本数
    min_samples_split = 2,#根据属性划分节点时，每个划分最少的样本数
    
    criterion = 'mse',
    
    n_jobs            = -1
    )

model_rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.7, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [517]:
y_rf = split_score(model_rf, x_train, y_train)

coe = search_coef(y_rf, y_train)
best_rf_coe = coe['x']
print('lgb的最佳系数为{}'.format(best_rf_coe))
    
result_rf = model_rf.predict(x_test)
result_rf_fix = fix_y(result_rf, best_rf_coe)
print('随机森林融合后的分布:',Counter(result_rf_fix))
print('融合后的二次加权Kappa系数为', metric(result_rf_fix, y_test))
print('y_test的真实分布为',Counter(y_test))

KeyboardInterrupt: 

# 后处理

In [472]:
a = pd.DataFrame([y_lgb, y_xgb])
a.T

Unnamed: 0,0,1
0,2.097558,1.984937
1,2.339441,2.366128
2,2.230871,2.241388
3,3.255839,3.019631
4,2.142082,2.266630
5,1.994521,2.046473
6,2.106402,2.015116
7,2.500434,2.497560
8,1.855198,1.979238
9,3.210996,3.091297


In [493]:
best_coe  = (best_lgb_coe + best_xgb_coe) / 2
best_coe = best_xgb_coe

result = (result_lgb + result_xgb) / 2
result_fix = fix_y(result, best_coe)

print('融合后的二次加权Kappa系数为', metric(result_fix, y_test))



融合后的二次加权Kappa系数为 0.4254420615705451


In [138]:
y_lgb = model_lgb.predict(x_train)

y_xgb = model_xgb.predict(x_train)
y_rf  = model_rf.predict(x_train)


In [154]:
y = (result_lgb + result_xgb + result_rf)/3
result = fix_y(y, best_coe)

print('融合后的分布:',Counter(result))
print('融合后的二次加权Kappa系数为', metric(result, y_test))
print('y_test的真实分布为',Counter(y_test))


融合后的分布: Counter({2.0: 1140, 3.0: 840, 1.0: 194, 4.0: 143})
融合后的二次加权Kappa系数为 0.30060168046082947
y_test的真实分布为 Counter({2: 655, 4: 552, 3: 531, 1: 521, 0: 58})


In [299]:
# 特征重要性
model_lgb.feature_importances_
a = pd.DataFrame(model_lgb.feature_importances_, index=x_train.columns)
a.sort_values(by=0, ascending=False)

Unnamed: 0,0
Breed1,111
Age,97
RescuerID_count,96
annots_score_MEAN,69
State,63
main_breed_BreedName,56
annots_score_SUM,49
SVD_Description_9,43
Quantity,42
SVD_Description_8,41
