In [3]:
import gc
import glob
import os
import json
import matplotlib.pyplot as plt
import pprint

import numpy as np
import pandas as pd

from joblib import Parallel, delayed
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
import scipy as sp

from sklearn.model_selection import train_test_split, GridSearchCV

from functools import partial
from collections import Counter


# 评价函数

In [4]:
#%% 评价函数 Metric used for this competition 
# (Quadratic Weigthed Kappa aka Quadratic Cohen Kappa Score)
def metric(y1,y2):
    return cohen_kappa_score(y1, y2, weights = 'quadratic')


# Make scorer for scikit-learn
scorer = make_scorer(metric)

# Cross验证函数

In [5]:
from sklearn.model_selection import StratifiedKFold

#
def split_score(model, x, y, n=10):
    y_pre = np.zeros(y.shape[0])
    kfold = StratifiedKFold(n_splits=n, random_state=1337)
    for train_index, test_index in kfold.split(x,y):
        model.fit(x.iloc[train_index], y.iloc[train_index])
        y_pre[test_index] = model.predict(x.iloc[test_index])
    
#    score = metric(y_pre, y)
    print("{}折后的Kappa加权得分为:带补充".format(n))
    
    return y_pre

#
def fix_y(y, coef):
    y_fix = np.copy(y)
    for i, pred in enumerate(y_fix):
        if pred < coef[0]:
            y_fix[i] = 0
        elif pred >= coef[0] and pred < coef[1]:
            y_fix[i] = 1
        elif pred >= coef[1] and pred < coef[2]:
            y_fix[i] = 2
        elif pred >= coef[2] and pred < coef[3]:
            y_fix[i] = 3
        else:
            y_fix[i] = 4    
    return y_fix

# 
def _kappa_loss(y, y_true, coef):
    y_fix = np.copy(y)
    for i, pred in enumerate(y_fix):
        if pred < coef[0]:
            y_fix[i] = 0
        elif pred >= coef[0] and pred < coef[1]:
            y_fix[i] = 1
        elif pred >= coef[1] and pred < coef[2]:
            y_fix[i] = 2
        elif pred >= coef[2] and pred < coef[3]:
            y_fix[i] = 3
        else:
            y_fix[i] = 4
            
    loss = metric(y_fix, y_true)
    return -loss

# 寻找分类的最佳参数
def search_coef(x1, x2):
    loss_partial = partial(_kappa_loss, x1, x2)
    initial_coef = [0.5, 1.5, 2.5, 3.5]
    coef = sp.optimize.basinhopping(loss_partial, initial_coef, niter=500, T=1,
                                              stepsize=0.2, minimizer_kwargs={"method": 'nelder-mead'}, 
                                              take_step=None, accept_test=None, callback=None, 
                                              interval=100, disp=True, niter_success=20, seed=None)
    return coef

In [6]:
initial_coef = [0.5, 1.5, 2.5, 3.5]
#        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')
        self.coef_ = sp.optimize.basinhopping(loss_partial, initial_coef, niter=500, T=1,
                                              stepsize=0.2, minimizer_kwargs={"method": 'nelder-mead'}, 
                                              take_step=None, accept_test=None, callback=None, 
                                              interval=100, disp=True, niter_success=20, seed=None)

IndentationError: unexpected indent (<ipython-input-6-08733a7f93b9>, line 3)

# 读取数据

In [7]:
df_train  = pd.read_csv('train.csv')
df_test   = pd.read_csv('test.csv')

train = df_train.copy()
test  = df_test.copy()

labels_breed = pd.read_csv('breed_labels.csv')
labels_state = pd.read_csv('color_labels.csv')
labels_color = pd.read_csv('state_labels.csv')

In [8]:
a = df_test.groupby('RescuerID').size()
type(a)
a.describe()

count    1518.000000
mean        2.600791
std         6.862242
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max       146.000000
dtype: float64

In [29]:
import nltk
nltk.pos_tag(['i', 'like'])

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  Searched in:
    - '/root/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/root/anaconda3/nltk_data'
    - '/root/anaconda3/share/nltk_data'
    - '/root/anaconda3/lib/nltk_data'
**********************************************************************


In [77]:
a.head()

0    [Nibble, is, a, 3+, month, old, ball, of, cute...
1    [I, just, found, it, alone, yesterday, near, m...
2    [Their, pregnant, mother, was, dumped, by, her...
3    [Good, guard, dog, ,, very, alert, ,, active, ...
4    [This, handsome, yet, cute, boy, is, up, for, ...
Name: Description, dtype: object

In [192]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import brown.tagged_words #导入布朗语料库

a = df_train['Description'].apply(lambda x:nltk.word_tokenize(x))

#1 去掉标点符号和停用词
#去掉标点符号
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
b = a.apply(lambda x:[word for word in x if word not in english_punctuations])

#去掉停用词
stops = set(stopwords.words("english"))
c = b.apply(lambda x:[set(x).intersection(objective)])


In [193]:
c

0        [{energetic, responsible, playful, little, old...
1                         [{near, alone, temporary, just}]
2        [{long, healthy, adorable, pregnant, precautio...
3        [{active, master, very, Good, interested, aler...
4        [{such, looking, playful, young, cute, like, h...
5                                                [{stray}]
6                     [{outside, interested, still, just}]
7                                                 [{just}]
8                                [{active, sure, healthy}]
9        [{long, very, gentle, soft, high, cute, like, ...
10                                             [{serious}]
11       [{super, active, long, routine, playful, singl...
12                         [{active, happy, very, neuter}]
13                    [{long, evident, cute, stray, good}]
14                                        [{red, healthy}]
15       [{loyal, looking, fun-loving, friendly, great,...
16                                  [{friendly, adorable

In [185]:
set(b[1]).intersection(objective)

{'alone', 'just', 'near', 'temporary'}

In [173]:
brown_corpus = (nltk.corpus.brown.tagged_words())
objective = [word for (word, tap) in brown_corpus if tap == 'JJ']

In [174]:
len(objective)

64028

In [149]:
from sklearn.feature_extraction.text import CountVectorizer

f = e.fillna('Missing')
countvec = CountVectorizer( lowercase  = True, 
                            stop_words = 'english',
                            min_df     = 10
                            )
a = countvec.fit_transform(f)

print(a.shape)
countvec.vocabulary_

(14993, 844)


{'nibble': 498,
 'old': 512,
 'energetic': 222,
 'playful': 546,
 'couple': 158,
 'neutered': 493,
 'clinic': 134,
 'little': 417,
 'kitty': 387,
 'responsible': 609,
 'temporary': 731,
 'pregnant': 562,
 'irresponsible': 374,
 'healthy': 323,
 'adorable': 27,
 'vaccinated': 797,
 'ready': 592,
 'long': 421,
 'acceptable': 10,
 'tie': 741,
 'precautionary': 559,
 'interested': 370,
 'adopt': 23,
 'good': 299,
 'active': 13,
 'cute': 166,
 'young': 836,
 'stray': 703,
 'hometown': 334,
 'outside': 518,
 'sure': 716,
 'manja': 444,
 'gentle': 293,
 'high': 328,
 'soft': 678,
 'super': 715,
 'quiet': 585,
 'cuddle': 163,
 'toys': 752,
 'busy': 108,
 'half': 315,
 'normal': 504,
 'single': 662,
 'second': 634,
 'info': 360,
 'puppy': 580,
 'happy': 319,
 'vaccinate': 796,
 'come': 142,
 'persian': 535,
 'red': 596,
 'friendly': 280,
 'neighbour': 489,
 'fun': 282,
 'loving': 432,
 'great': 305,
 'loyal': 434,
 'english': 223,
 'male': 442,
 'safe': 626,
 'cats': 117,
 'clean': 131,
 'black

(14993,)

str

In [7]:
a = df_train.groupby('RescuerID').size()
type(a)
a.describe()

count    5595.000000
mean        2.679714
std        10.384820
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max       459.000000
dtype: float64

In [8]:
#%% 删除异常值
cul_drop = ['375905770', 'da8d4a273', '27e74e45c', '7b5bee232', '0327b8e94']
df_train = df_train[~df_train['PetID'].isin(cul_drop)]

# 提取 sentiment 的特征

In [9]:
def extract_sentiment_feature(i, x):    
    feature_sentiment = pd.DataFrame(columns=['PetID', 'token', 'sentence_magnitude', 'sentence_score','document_magnitude', 'document_score'])

    if x == 'train':
        set_file = 'train'
    else:
        set_file = 'test'
    
        
    file_name = '{}_sentiment/{}.json'.format(set_file,i)
    try:
        f = open(file_name, 'r')
        sentiment_file = json.load(f)
            
        token = [x['name'] for x in sentiment_file['entities']]
        token = ' '.join(token)
            
        sentences_sentiment = [x['sentiment'] for x in sentiment_file['sentences']]
        sentences_sentiment = pd.DataFrame.from_dict(
            sentences_sentiment, orient='columns').sum()
        sentenceSentiment_magnitude = sentences_sentiment['magnitude']
        sentenceSentiment_score     = sentences_sentiment['score']
            
        docementSentiment_magnitude = sentiment_file['documentSentiment']['magnitude']
        documentSentiment_score     = sentiment_file['documentSentiment']['score']
            
        new = pd.DataFrame(
                {'PetID'               :[i], 
                 'token'               : [token],
                 'sentence_magnitude'  : [sentenceSentiment_magnitude],
                 'sentence_score'      : [sentenceSentiment_score],
                 'document_magnitude'  : [docementSentiment_magnitude], 
                 'document_score'      : [documentSentiment_score]})  
        feature_sentiment = feature_sentiment.append(new)
    except:
        print('{}没找到'.format(file_name))
    
    for each in feature_sentiment.columns:
        if each not in ['PetID','token']:
            feature_sentiment[each] = feature_sentiment[each].astype(float)

    return feature_sentiment

#%%
train_feature_sentiment = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_sentiment_feature)(i, 'train') for i in train.PetID)
train_feature_sentiment = [x for x in train_feature_sentiment]
train_feature_sentiment = pd.concat(train_feature_sentiment, ignore_index=True, sort=False)

test_feature_sentiment = Parallel(n_jobs=8, verbose=1)(
        delayed(extract_sentiment_feature)(i, 'test') for i in test.PetID)
test_feature_sentiment = [x for x in test_feature_sentiment]
test_feature_sentiment = pd.concat(test_feature_sentiment, ignore_index=True, sort=False)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  36 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 6704 tasks      | elapsed:    7.3s
[Parallel(n_jobs=8)]: Done 14993 out of 14993 | elapsed:   15.0s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done 384 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 3948 out of 3948 | elapsed:    3.9s finished


In [None]:
train_feature_sentiment.columns

In [None]:
#%% 提取 metadata 的特征
#file_name = 'train_metadata/000a290e4-1.json'
#f = open(file_name, 'r')
#metadatafile = json.load(f)
def extract_metadata_feature(i, x):
    feature_metadata = pd.DataFrame()
    if x == 'train':
        set_file = 'train'
    else:
        set_file = 'test'
        
        
    metadata_filenames = sorted(glob.glob('{}_metadata/{}*.json'.format(set_file, i)))
    if len(metadata_filenames) > 0:
        feature_metadata_sub = pd.DataFrame(columns=['PetID', 'annots_score', 'color_score', 'color_pixelfrac', 'crop_conf','crop_importance', 'annots_top_desc'])
        for ff in metadata_filenames:
            f = open(ff, 'rb')
            file = json.load(f)
            #label
            if 'labelAnnotations' in file:
                file_annots = file['labelAnnotations'][:int(len(file['labelAnnotations']) * 0.3)]
                file_top_score = np.asarray([x['score'] for x in file_annots]).mean()
                file_top_desc = [x['description'] for x in file_annots]            
            else:
                file_top_score = np.nan
                file_top_desc = ['']
            #colors
            file_colors = file['imagePropertiesAnnotation']['dominantColors']['colors']            
            file_color_score = np.asarray([x['score'] for x in file_colors]).mean()
            file_color_pixelfrac = np.asarray([x['pixelFraction'] for x in file_colors]).mean()            
            #crops
            file_crops = file['cropHintsAnnotation']['cropHints']                
            file_crop_conf = np.asarray([x['confidence'] for x in file_crops]).mean()
            if 'importanceFraction' in file_crops[0].keys():
                file_crop_importance = np.asarray([x['importanceFraction'] for x in file_crops]).mean()
            else:
                file_crop_importance = np.nan
    
                
            new = pd.DataFrame(
                    {
                            'PetID'          : [i],
                            'annots_score'   : [file_top_score],
                            'color_score'     : [file_color_score],
                            'color_pixelfrac' : [file_color_pixelfrac],
                            'crop_conf'       : [file_crop_conf],
                            'crop_importance' : [file_crop_importance],
                            'annots_top_desc' : [' '.join(file_top_desc)]})
            feature_metadata_sub = feature_metadata_sub.append(new)
                
        metadata_desc = feature_metadata_sub.groupby(['PetID'])['annots_top_desc'].unique()
        metadata_desc = metadata_desc.reset_index()
        metadata_desc['annots_top_desc'] = metadata_desc['annots_top_desc'].apply(lambda x:' '.join(x))
        feature_metadata_sub.drop(['annots_top_desc'], axis=1, inplace=True)

        for each in feature_metadata_sub:
            if each not in ['PetID']:
                feature_metadata_sub[each] = feature_metadata_sub[each].astype(float)
        
        
        feature_metadata_sub = feature_metadata_sub.groupby(['PetID']).agg(['mean', 'sum'])
        feature_metadata_sub.columns = ['{}_{}'.format(c[0], c[1].upper()) for c in feature_metadata_sub.columns.tolist()]  
        feature_metadata_sub = feature_metadata_sub.reset_index()
            
        feature_metadata = feature_metadata.append(feature_metadata_sub)
    return feature_metadata


#
    

#for each in 
#train_feature_metadata = extract_metadata_feature('fffd78a11-1', 'train')

#train_feature_metadata = Parallel(n_jobs=8, verbose=1)(
#        delayed(extract_metadata_feature)(i, 'train') for i in train.PetID)
#train_feature_metadata = [x for x in train_feature_metadata]
#train_feature_metadata = pd.concat(train_feature_metadata, ignore_index=True, sort=False)

#test_feature_metadata = Parallel(n_jobs=8, verbose=1)(
#        delayed(extract_metadata_feature)(i, 'test') for i in test.PetID)
#test_feature_metadata = [x for x in test_feature_metadata]
#test_feature_metadata = pd.concat(test_feature_metadata, ignore_index=True, sort=False)

In [10]:
#%% 连接sentiment和metadata和原始数据
x_train = df_train.merge(train_feature_sentiment, how='left', on='PetID')
#x_train = x_train.merge(train_feature_metadata, how='left', on='PetID')

y_train = x_train['AdoptionSpeed']
x_train.drop(['AdoptionSpeed'], axis=1, inplace=True)

x_test = df_test.merge(test_feature_sentiment, how='left', on='PetID')
#x_test = x_test.merge(test_feature_metadata, how='left', on='PetID')

# NLP

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import SparsePCA, TruncatedSVD, LatentDirichletAllocation, NMF

col_text = ['Description', 'annots_top_desc']

x = x_train.append(x_test).reset_index()
x = x[['Description', 'PetID', 'annots_top_desc']]

n_components = 5

x[col_text] = x[col_text].fillna('MISSING')
text_features = []


for i in col_text:
    svd_ = TruncatedSVD(n_components=n_components)
    nmf_ = NMF(n_components=n_components)
    
    tfidf_col = TfidfVectorizer(min_df=3, max_df=0.9).fit_transform(x.loc[:, i])

    
    svd_col = svd_.fit_transform(tfidf_col)
    svd_col = pd.DataFrame(svd_col)
    svd_col = svd_col.add_prefix('SVD_{}_'.format(i))
    
    nmf_col = nmf_.fit_transform(tfidf_col)
    nmf_col = pd.DataFrame(nmf_col)
    nmf_col = nmf_col.add_prefix('NMF_{}_'.format(i))

    text_features.append(svd_col)
    text_features.append(nmf_col)
    
    x.drop(i, axis=1, inplace=True)
    
# Combine all extracted features:
text_features = pd.concat(text_features, axis=1)

# Concatenate with main DF:
x = pd.concat([x, text_features], axis=1)

x_train = x_train.merge(x, how='left', on='PetID')
x_test  = x_test.merge(x, how='left', on='PetID')

In [None]:
x_train.columns

# 增加新的特征
1、是否需要收费
2、年份
3、Color的笛卡尔积（效果不要）

In [None]:
x_train['IsFree'] = x_train['Fee'].apply(lambda x:1 if x>0 else 0)
x_test['IsFree']  = x_test['Fee'].apply(lambda x:1 if x>0 else 0)

x_train['Year'] = x_train['Age'].apply(lambda x:round(x/12))
x_test['Year']  = x_test['Age'].apply(lambda x:round(x/12))

x = x_train.append(x_test)
x['Age_qcut'] = pd.qcut(x['Age'], 5,  duplicates='drop')
x['Age_qcut'] = pd.factorize(x['Age_qcut'])[0]
x_train = x_train.merge(x[['PetID','Age_qcut']], how='left', on='PetID')
x_test  = x_test.merge(x[['PetID','Age_qcut']], how='left', on='PetID')

#效果不好
#x_train['Color_Mix'] = x_train['Color1'].astype(str)+x_train['Color2'].astype(str)+x_train['Color3'].astype(str)
#x_train['Color_Mix'] = pd.factorize(x_train['Color_Mix'])[0]
#x_test['Color_Mix'] = x_test['Color1'].astype(str)+x_test['Color2'].astype(str)+x_test['Color3'].astype(str)
#x_test['Color_Mix'] = pd.factorize(x_test['Color_Mix'])[0]

# RescuerID 处理

In [22]:
#%% RescuerID 处理

df = df_train.append(df_test)
data_rescuer = df.groupby(['RescuerID'])['PetID'].count().reset_index()
data_rescuer.columns = ['RescuerID', 'RescuerID_count']
#data_rescuer['rank_Rescuer_count'] = data_rescuer['RescuerID_count'].rank(pct=True)

x_train = x_train.merge(data_rescuer, how='left', on='RescuerID')
x_test  = x_test.merge(data_rescuer, how='left', on='RescuerID')

#x_train.drop(['RescuerID_count'], axis=1, inplace=True)
#x_test.drop(['RescuerID_count'], axis=1, inplace=True)

x_train['single'] = x_train['RescuerID_count'].apply(lambda x:1 if x<3 else 0)
x_train['middle'] = x_train['RescuerID_count'].apply(lambda x:1 if (x>2 and x<6) else 0)
x_train['Charities'] = x_train['RescuerID_count'].apply(lambda x:1 if x>5 else 0)

x_test['single'] = x_test['RescuerID_count'].apply(lambda x:1 if x<3 else 0)
x_test['middle'] = x_test['RescuerID_count'].apply(lambda x:1 if (x>2 and x<6) else 0)
x_test['Charities'] = x_test['RescuerID_count'].apply(lambda x:1 if x>5 else 0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [23]:
x_test['RescuerID_count'].unique()

array([146,   4,  14,  81,   1,   7,  37,  15,   2,  18,  10,  64,   3,
        49,   5,   6,   9,  67,   8,  12,  26,  62,  11,  27,  22,  34,
        25,  74,  48,  31,  59,  16,  54,  23,  21,  35,  17,  43,  19,
        13])

In [11]:
x_train['single'] = x_train['RescuerID_count'].apply(lambda x:1 if x<3 else 0)
x_train['middle'] = x_train['RescuerID_count'].apply(lambda x:1 if (x>2 and x<6) else 0)
x_train['Charities'] = x_train['RescuerID_count'].apply(lambda x:1 if x>5 else 0)

In [12]:
x_train[['single', 'middle', 'Charities', 'RescuerID_count']]
x_train

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,NMF_token_2,NMF_token_3,NMF_token_4,IsFree,Year,Age_qcut,RescuerID_count,single,middle,Charities
0,2,Nibble,3,299,0,1,1,7,0,1,...,0.000000,0.028520,0.005579,1,0,0,8,0,0,1
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,0.000000,0.024119,0.018522,0,0,1,1,1,0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,0.000956,0.012173,0.036687,0,0,1,459,0,0,1
3,1,Miko,4,307,0,2,1,2,0,2,...,0.000774,0.000000,0.067366,1,0,2,50,0,0,1
4,1,Hunter,1,307,0,1,1,0,0,2,...,0.033239,0.007047,0.013453,0,0,1,134,0,0,1
5,2,,3,266,0,2,5,6,0,2,...,0.001423,0.080696,0.011082,0,0,0,4,0,1,0
6,2,BULAT,12,264,264,1,1,0,0,2,...,0.000000,0.041352,0.013317,1,1,3,10,0,0,1
7,1,Siu Pak & Her 6 Puppies,0,307,0,2,1,2,7,2,...,0.003908,0.004248,0.022216,0,0,1,3,0,1,0
8,2,,2,265,0,2,6,0,0,2,...,0.000887,0.024057,0.000000,0,0,1,7,0,0,1
9,2,Kitty,12,265,0,2,1,7,0,2,...,0.000000,0.077242,0.000000,0,1,3,1,1,0,0


# 处理Breed

In [13]:
x_train['HasSecondBreed'] = x_train['Breed2'].map(lambda x:1 if x != 0 else 0)
x_test['HasSecondBreed'] = x_test['Breed2'].map(lambda x:1 if x != 0 else 0)

train_breed_main = x_train[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))

train_breed_main = train_breed_main.iloc[:, 2:]
train_breed_main = train_breed_main.add_prefix('main_breed_')

train_breed_second = x_train[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))


train_breed_second = train_breed_second.iloc[:, 2:]
train_breed_second = train_breed_second.add_prefix('second_breed_')

x_train = pd.concat(
    [x_train, train_breed_main, train_breed_second], axis=1)

##############
test_breed_main = x_test[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))

test_breed_main = test_breed_main.iloc[:, 2:]
test_breed_main = test_breed_main.add_prefix('main_breed_')

test_breed_second = x_test[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))

test_breed_second = test_breed_second.iloc[:, 2:]
test_breed_second = test_breed_second.add_prefix('second_breed_')

x_test = pd.concat(
    [x_test, test_breed_main, test_breed_second], axis=1)

print(x_train.shape, x_test.shape)

categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']
for i in categorical_columns:
    x_train.loc[:, i] = pd.factorize(x_train.loc[:, i])[0]
    x_test.loc[:,i]   = pd.factorize(x_test.loc[:, i])[0]


(14988, 60) (3948, 60)


# 等某些特征进行rank

In [14]:
# 对某些特征进行rank
cols_rank = ['sentence_magnitude', 'sentence_score', 'document_magnitude','document_score']#,
#       'annots_score_MEAN', 'annots_score_SUM','color_score_MEAN', 'color_score_SUM', 'color_pixelfrac_MEAN',
#       'color_pixelfrac_SUM', 'crop_conf_MEAN', 'crop_conf_SUM','crop_importance_MEAN', 'crop_importance_SUM']

x = x_train.append(x_test)
x[cols_rank] = x[cols_rank].fillna(0)
df_cols_rank = x[cols_rank].rank(pct=True).rename(columns=lambda s:'rank.'+s)
df_cols_rank = pd.concat([df_cols_rank, x['PetID']], axis=1)

x_train = x_train.merge(df_cols_rank, how='left', on='PetID')
x_test =  x_test.merge(df_cols_rank, how='left', on='PetID')

In [333]:
x_train.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'token',
       'sentence_magnitude', 'sentence_score', 'document_magnitude',
       'document_score', 'SVD_Description_0', 'SVD_Description_1',
       'SVD_Description_2', 'SVD_Description_3', 'SVD_Description_4',
       'NMF_Description_0', 'NMF_Description_1', 'NMF_Description_2',
       'NMF_Description_3', 'NMF_Description_4', 'SVD_token_0', 'SVD_token_1',
       'SVD_token_2', 'SVD_token_3', 'SVD_token_4', 'NMF_token_0',
       'NMF_token_1', 'NMF_token_2', 'NMF_token_3', 'NMF_token_4', 'IsFree',
       'Year', 'Age_qcut', 'RescuerID_count', 'HasSecondBreed',
       'main_breed_Type', 'main_breed_BreedName', 'second_breed_Type',
       'second_breed_BreedName', 'rank.sentence_magnitude',
       'rank.sente

# 数据清理

In [15]:
drop_columns = ['Name', 'RescuerID', 'Description', 'PetID', 'token']
col = ['sentence_magnitude', 'sentence_score', 'document_magnitude', 'document_score']

x_train.drop(drop_columns, axis=1, inplace=True)
x_test.drop(drop_columns, axis=1, inplace=True)

x_train[col] = x_train[col].astype(float)
x_test[col]  = x_test[col].astype(float)

x_train = x_train.fillna(0)
x_test  = x_test.fillna(0)

# LGB 算法

In [16]:
from lightgbm.sklearn import LGBMRegressor


model_lgb = LGBMRegressor(
        learning_rate    = 0.01,
        n_estimators     = 2000,
        max_depth        = 5,
        num_leaves       = 30,
        subsample        = 0.7,      #训练时采样一定比例的数据	
        colsample_bytree = 0.7,
        n_jobs           = -1,
        random_state     = 4,
        objective        = 'regression',
        eval_metric      = 'scorer',
        min_child_samples = 5         #叶子节点具有的最小记录数	

        )
        
model_lgb.fit(x_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
       eval_metric='scorer', importance_type='split', learning_rate=0.01,
       max_depth=5, min_child_samples=5, min_child_weight=0.001,
       min_split_gain=0.0, n_estimators=2000, n_jobs=-1, num_leaves=30,
       objective='regression', random_state=4, reg_alpha=0.0,
       reg_lambda=0.0, silent=True, subsample=0.7,
       subsample_for_bin=200000, subsample_freq=0)

In [353]:
#调参
parameters = {
            'reg_alpha': [0,0.01,0.1,1],
            'reg_lambda': [0,0.01,0.1,1]
            }

gsearch = GridSearchCV(model_lgb, param_grid=parameters, scoring=scorer, cv=10, n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

# SVM多分类

In [531]:
from sklearn.svm import SVC
import sklearn.svm as svm 

clf = svm.SVC(decision_function_shape='ovo')
clf.fit(x_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [532]:
y_pre = model_lgb.predict(x_train)
#y_pre = np.round(y_pre)
val = cross_val_score(clf, x_train, y_train, scoring = scorer, cv=10).mean()
print('不分组预测{}'.format(val))   



不分组预测0.25924689193170236


# 随机森林模型

In [17]:
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(
    n_estimators      = 200,
    
    max_features      = 0.4,#选择最适属性时划分的特征不能超过此值。
#    max_depth         = 6, #设置树的最大深度，默认为None
    
    min_samples_leaf  = 1, #叶子节点最少的样本数
    min_samples_split = 2,#根据属性划分节点时，每个划分最少的样本数
    
    criterion = 'mse',
    
    n_jobs            = -1
    )

model_rf

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.4, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

# XGB

In [18]:
from xgboost.sklearn import XGBRegressor

model_xgb = XGBRegressor(
    learning_rate    = 0.05,
    n_estimatores    = 2000, 
    
    early_stopping_rounds=20,  
    
    max_depth        = 10, 
    min_child_weight = 5,
    
    gamma            = 0,
    
    subsample        =  0.9,
    colsample_bytree = 0.6,
    
#    reg_alpha        = 3,
#    reg_lambda       = 0.1,
    nthread      = -1)
#    objective        = 'regression',    
#    eval_metric      = 'scorer')      


In [358]:
#调参
parameters = {'max_depth': range(6,14,2), 'min_child_weight': range(3,13,2)}
gsearch = GridSearchCV(model_xgb, param_grid=parameters, scoring=scorer, cv=10, n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

{'max_depth': 12, 'min_child_weight': 11}

# 后处理

In [None]:
x_train.drop('Resc')

In [26]:
y_rf = split_score(model_rf, x_train, y_train)
    
coe = search_coef(y_rf, y_train)
best_coe = coe['x']

yy_rf = fix_y(y_rf, best_coe)
print('随机森林二次加权Kappa系数为:', metric(yy_rf, y_train))

10折后的Kappa加权得分为:带补充
basinhopping step 0: f -0.439404
basinhopping step 1: f -0.439423 trial_f -0.439423 accepted 1  lowest_f -0.439423
found new global minimum on step 1 with function value -0.439423
basinhopping step 2: f -0.439474 trial_f -0.439474 accepted 1  lowest_f -0.439474
found new global minimum on step 2 with function value -0.439474
basinhopping step 3: f -0.438211 trial_f -0.438211 accepted 1  lowest_f -0.439474
basinhopping step 4: f -0.439639 trial_f -0.439639 accepted 1  lowest_f -0.439639
found new global minimum on step 4 with function value -0.439639
basinhopping step 5: f -0.439 trial_f -0.439 accepted 1  lowest_f -0.439639
basinhopping step 6: f -0.43943 trial_f -0.43943 accepted 1  lowest_f -0.439639
basinhopping step 7: f -0.43916 trial_f -0.43916 accepted 1  lowest_f -0.439639
basinhopping step 8: f -0.439851 trial_f -0.439851 accepted 1  lowest_f -0.439851
found new global minimum on step 8 with function value -0.439851
basinhopping step 9: f -0.439707 trial_f 

(14988,)

In [19]:
y_lgb = split_score(model_lgb, x_train, y_train)

coe = search_coef(y_lgb, y_train)
best_coe = coe['x']

yy_lgb = fix_y(y_lgb, best_coe)
print('lgb二次加权Kappa系数为:', metric(yy_lgb, y_train))

10折后的Kappa加权得分为:带补充
lgb二次加权Kappa系数为: 0.4432320614037043


In [24]:
y_xgb = split_score(model_xgb, x_train, y_train)

coe = search_coef(y_xgb, y_train)

best_coe = coe['x']
print('最佳参数为',best_coe)

yy_xgb = fix_y(y_xgb, best_coe)
print('xgb二次加权Kappa系数为:', metric(yy_xgb, y_train))

10折后的Kappa加权得分为:带补充
basinhopping step 0: f -0.439899
basinhopping step 1: f -0.440282 trial_f -0.440282 accepted 1  lowest_f -0.440282
found new global minimum on step 1 with function value -0.440282
basinhopping step 2: f -0.440457 trial_f -0.440457 accepted 1  lowest_f -0.440457
found new global minimum on step 2 with function value -0.440457
basinhopping step 3: f -0.439552 trial_f -0.439552 accepted 1  lowest_f -0.440457
basinhopping step 4: f -0.439495 trial_f -0.439495 accepted 1  lowest_f -0.440457
basinhopping step 5: f -0.440775 trial_f -0.440775 accepted 1  lowest_f -0.440775
found new global minimum on step 5 with function value -0.440775
basinhopping step 6: f -0.440836 trial_f -0.440836 accepted 1  lowest_f -0.440836
found new global minimum on step 6 with function value -0.440836
basinhopping step 7: f -0.440319 trial_f -0.440319 accepted 1  lowest_f -0.440836
basinhopping step 8: f -0.440774 trial_f -0.440774 accepted 1  lowest_f -0.440836
basinhopping step 9: f -0.44050

KeyboardInterrupt: 

# 真实 2.21

In [27]:
y_rf  = split_score(model_rf, x_train, y_train)
y_xgb = split_score(model_xgb, x_train, y_train)
y_lgb = split_score(model_lgb, x_train, y_train)

y = (y_lgb + y_xgb + y_rf)/3

10折后的Kappa加权得分为:带补充
10折后的Kappa加权得分为:带补充


KeyboardInterrupt: 

In [290]:
print('xgb分布',(Counter(yy_xgb)))
print('lgb分布',(Counter(yy_lgb)))
print('rf分布',(Counter(yy_rf)))
print('融合后y分布',(Counter(yy)))
print('真实分布',(Counter(y_train)))

xgb分布 Counter({2.0: 4872, 3.0: 3866, 4.0: 3459, 1.0: 2604, 0.0: 187})
lgb分布 Counter({2.0: 4727, 4.0: 3680, 3.0: 3354, 1.0: 3127, 0.0: 100})
rf分布 Counter({2.0: 5439, 3.0: 3869, 4.0: 3349, 1.0: 2120, 0.0: 211})
融合后y分布 Counter({3.0: 4399, 2.0: 4174, 4.0: 3060, 1.0: 2911, 0.0: 449})
真实分布 Counter({4: 4195, 2: 4036, 3: 3257, 1: 3090, 0: 410})


In [29]:
y = (y_lgb + y_xgb + y_rf)/3

sort_y = y.argsort()[:int(0.030*len(y_rf))]
y[sort_y] = y[sort_y] - 0.2

coe = search_coef(y, y_train)

best_coe = coe['x']
print(best_coe)  

yy = fix_y(y, best_coe)
print('融合后的分布:',Counter(yy))
print('融合后的二次加权Kappa系数为', metric(yy, y_train))
print('y的真实分布为',Counter(y_train))

basinhopping step 0: f -0.425257
basinhopping step 1: f -0.425249 trial_f -0.425249 accepted 1  lowest_f -0.425257
basinhopping step 2: f -0.424822 trial_f -0.424822 accepted 1  lowest_f -0.425257
basinhopping step 3: f -0.424511 trial_f -0.424511 accepted 1  lowest_f -0.425257
basinhopping step 4: f -0.425249 trial_f -0.425249 accepted 1  lowest_f -0.425257
basinhopping step 5: f -0.459372 trial_f -0.459372 accepted 1  lowest_f -0.459372
found new global minimum on step 5 with function value -0.459372
basinhopping step 6: f -0.459356 trial_f -0.459356 accepted 1  lowest_f -0.459372
basinhopping step 7: f -0.458796 trial_f -0.458796 accepted 1  lowest_f -0.459372
basinhopping step 8: f -0.459541 trial_f -0.459541 accepted 1  lowest_f -0.459541
found new global minimum on step 8 with function value -0.459541
basinhopping step 9: f -0.458982 trial_f -0.458982 accepted 1  lowest_f -0.459541
basinhopping step 10: f -0.460106 trial_f -0.460106 accepted 1  lowest_f -0.460106
found new global

In [22]:
model_lgb.feature_importances_
a = pd.DataFrame(model_lgb.feature_importances_, index=x_train.columns)
a.sort_values(by=0, ascending=False)

Unnamed: 0,0
RescuerID_count,2495
SVD_Description_0,2035
SVD_Description_1,1861
PhotoAmt,1835
SVD_Description_4,1778
Age,1746
SVD_token_0,1732
SVD_Description_3,1727
SVD_token_4,1600
Breed1,1576


In [38]:
int(2.7)

2

In [12]:
x = x_train.append(x_test)
a = x['Breed1'].value_counts().sort_values(ascending = False).cumsum()/len(x)
rare_index = a[a > 0.9].index.tolist()
rare_index
a

307    0.381284
266    0.631707
265    0.716677
299    0.742871
264    0.763836
292    0.780999
285    0.795469
205    0.808988
141    0.821610
179    0.832700
109    0.843103
218    0.853295
254    0.861322
189    0.867976
103    0.874525
213    0.880756
243    0.886724
20     0.892533
283    0.897497
247    0.902302
152    0.906686
195    0.911016
128    0.915294
78     0.919149
306    0.922951
303    0.926120
69     0.929077
76     0.931612
60     0.934147
276    0.936576
         ...   
99     0.998469
146    0.998521
3      0.998574
6      0.998627
290    0.998680
258    0.998733
278    0.998785
130    0.998838
23     0.998891
257    0.998944
222    0.998997
104    0.999049
2      0.999102
116    0.999155
14     0.999208
93     0.999261
142    0.999313
81     0.999366
64     0.999419
112    0.999472
125    0.999525
192    0.999578
94     0.999630
61     0.999683
126    0.999736
298    0.999789
123    0.999842
217    0.999894
139    0.999947
16     1.000000
Name: Breed1, Length: 18