再現ミス  
more_title_pca  
title_disc_pca  

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_log_error, log_loss, roc_curve

import lightgbm as lgb ##2.3.1
import catboost
from catboost import CatBoostRegressor, Pool, CatBoost
import pickle
import os
from tqdm.notebook import tqdm

In [3]:
SEED = 43

def make_dir(path):
    if not os.path.exists(path):
        os.mkdir(path)
        
EXP_ID = "exp"
make_dir("../output/" + EXP_ID)
DATADIR = "../input/"
OUTPUT_DIR = "../output/" + EXP_ID + "/"

In [4]:
train_data = pd.read_csv(DATADIR + "train.csv")
test_data = pd.read_csv(DATADIR + "test.csv")

all_df = pd.concat([train_data, test_data], axis=0).reset_index(drop=True)

In [5]:
import re
from sklearn.preprocessing import LabelEncoder
def label_encoder(_df_input, cols):
    """
    colsに対してLabelEncodingをする関数
    """
    df_input = _df_input.copy()
    df_out = pd.DataFrame()
    
    for c in cols:
        df_input[c] = np.where(df_input[c].isnull(), "NULL", df_input[c])
        le = LabelEncoder()
        df_out[c] = le.fit_transform(df_input[c])
        
    return df_out

def count_encoder(df_input, cols):
    """
    colsに対してCountEncodingをする関数
    """
    df_out = pd.DataFrame()
    if type(cols) == str:
        c = cols
        dic = df_input[c].value_counts().to_dict()
        df_out[c + "CE"] = df_input[c].map(dic)
        return df_out
    
    for c in cols:
        dic = df_input[c].value_counts().to_dict()
        df_out[c + "CE"] = df_input[c].map(dic)
    return df_out

In [6]:
def count_feature(df_input):
    """
    main tableのCountEncoding
    """
    df_out = pd.DataFrame()
    
    cols = ["principal_maker", "copyright_holder", "acquisition_method", "acquisition_credit_line"]
    cols += ["art_series_id"]
    cols += ["description"] #0307
    cols += ["long_title", "principal_or_first_maker", "sub_title", "more_title", 
             "acquisition_date", "dating_presenting_date", "dating_period"] #dating_year_early
    
    cols += ["dating_year_early", "dating_year_late"] #0313
    for c in cols:
        dic = df_input[c].value_counts().to_dict()
        df_out[c + "CE"] = df_input[c].map(dic)
        
    assert len(df_out.columns) == len(set(df_out.columns))
    return df_out

In [7]:
def list_process(l):
    output_l = []
    for w in l:
        if w is np.nan:
            continue
        output_l.append(str(w))
        
    if len(output_l)==0:
        return np.nan
    
    return ", ".join(output_l)



def merge_tables(df_input):
    """
    複数のtableを結合する関数
    """
    material = pd.read_csv(DATADIR + "material.csv").groupby("object_id")["name"].apply(list).apply(lambda x:", ".join(sorted(x))).rename("material")
    tech = pd.read_csv(DATADIR + "technique.csv").groupby("object_id")["name"].apply(list).apply(lambda x:", ".join(sorted(x))).rename("tech")
    obj_type = pd.read_csv(DATADIR + "object_collection.csv").groupby("object_id")["name"].apply(list).apply(lambda x:", ".join(sorted(x))).rename("obj_type")
    person = pd.read_csv(DATADIR + "historical_person.csv").groupby("object_id")["name"].apply(list).apply(lambda x:", ".join(sorted(x))).rename("person")
    place = pd.read_csv(DATADIR + "production_place.csv").groupby("object_id")["name"].apply(list).apply(lambda x:", ".join(sorted(x))).rename("place")
    
    _maker = pd.read_csv(DATADIR + "principal_maker.csv")
    maker_occu = pd.read_csv(DATADIR + "principal_maker_occupation.csv").groupby("id")["name"].apply(list).apply(lambda x:", ".join(sorted(x)))
    maker_occu = maker_occu.rename("occu")
    _maker = pd.merge(_maker, maker_occu, on="id", how="left").drop("id", axis=1)
    
    cols = ['qualification', 'roles', 'productionPlaces', 'maker_name', "occu"]
    maker = pd.concat([_maker.groupby("object_id")[c].apply(list).apply(lambda x:list_process(x)) for c in cols], axis=1)
    
    tables = pd.concat([material, tech,
                        obj_type, person, place, maker], axis=1).reset_index().rename(columns={"index":"object_id"})

    tables_out = pd.merge(df_input, tables, on="object_id", how="left")
    return tables_out



def tables_onehot(df_input):
    """
    複数tableの要素をOneHot
    """
    outputs = []
    for name in ["material", "technique", "object_collection", "historical_person", "production_place"]:
        _df = pd.read_csv(DATADIR + f"{name}.csv")
        if name == "production_place":
            _df["name"] = _df["name"].apply(lambda x: x.replace("? ", ""))
        outputs.append(pd.crosstab(_df["object_id"], _df["name"]).add_prefix(f"{name}_OH_"))
    outputs = pd.concat(outputs, axis=1)
    
    idx = outputs.sum(axis=0) > 25 #30
    _df_out = outputs[idx[idx].index]
    _df_out = _df_out.reset_index().rename(columns={"index":"object_id"})
    cols = _df_out.columns
    df_out = pd.merge(df_input, _df_out, on="object_id", how="left")[[c for c in cols if c!="object_id"]]
    
    assert df_input.shape[0] == df_out.shape[0]
    df_out.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in df_out.columns]
    return df_out


def null_features(df_input):
    """
    全tableにおいてNaNになっている要素についてOneHot化
    SVDで次元削減して特徴化
    """
    df_out = pd.DataFrame()
    tables = merge_tables(df_input)
    df_out["num_null"] = tables.isnull().sum(axis=1)
    
    for name in ["color", "palette"]:
        color_df = pd.read_csv(DATADIR + f"{name}.csv")
        color_ids = color_df["object_id"].unique()
        tables[name] = np.where(df_input["object_id"].isin(color_ids),
                                1,
                                np.nan)
    
    tables = pd.concat([tables, 
                        subtitle_feature(all_df)[["h", "w", "d", "l", "weight"]]], axis=1)
    
    cols = tables.columns[tables.isnull().astype(int).sum(axis=0) > 10]
    cols = [c for c in cols if c not in ["dating_sorting_date", "dating_year_early"]]
    
    print(len(cols))
    
    null_df = tables[cols].isnull().astype(int)
    
    pipeline_ = Pipeline([
        ('svd', TruncatedSVD(n_components=10, n_iter=30, random_state=42)), #5
    ])
    pipeline_.fit(null_df[:len(train_data)])
    arr = pipeline_.transform(null_df)
    df_out = pd.concat([df_out, 
                        pd.DataFrame(arr).add_prefix(f"null_svd")], axis=1)
    return df_out

In [8]:
def lazy_merge_tables_feature(df_input):
    """
    全テーブルの要素について文字長特徴
    """
    df_out = pd.DataFrame()
    _df = merge_tables(df_input)
    
    for c in ["tech", "material", "obj_type", "place", "person"]:
        df_out[f"{c}_len"] = _df[c].str.len()
        
        dic = _df[c].value_counts().to_dict()
        df_out[f"{c}_CE"] = _df[c].map(dic)

    for c in ["qualification", "roles", "maker_name", "occu"]:
        df_out[f"{c}_len"] = _df[c].str.len()
        
        dic = _df[c].value_counts().to_dict()
        df_out[f"{c}_CE"] = _df[c].map(dic)
        
    
    _df = _df.drop(["object_id", "art_series_id",
                    "dating_sorting_date", "dating_period", "dating_year_early", "dating_year_late",
                    "likes"], 
                    axis=1).fillna("")
    
    tmp_df = pd.DataFrame()
    for c in _df.columns:
        tmp_df[c] = _df[c].astype(str).str.len()
        
    df_out["all_text_len"] = tmp_df.sum(axis=1)
    return df_out

# color

In [9]:
#from gensim.models import word2vec, KeyedVectors
#word2vec
def resnet50_feature(df_input):
    """
    ratioにそってrandom patternで画像生成(adversarial attackみたいな画像)
    それに対してresnet50で特徴抽出したcsvを結合しPCA
    """
    emb_df = pd.read_pickle("../feature/ResNet50_palette_embedding1000.pkl")
    
    cols = [c for c in emb_df.columns if c!="object_id"]
    pca = PCA(n_components=3, random_state=42)
    arr = pca.fit_transform(emb_df[cols].values)
    #pca.fit(emb_df[cols][:len(train_data)].values)
    #arr = pca.transform(emb_df[cols].values)
    
    _df_out = pd.DataFrame(arr).add_prefix("random_resnet")
    output_cols = _df_out.columns
    
    _df_out = pd.concat([emb_df["object_id"], _df_out], axis=1)
    
    df_out = pd.merge(df_input, _df_out, on="object_id", how="left")[output_cols]
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

In [21]:
import colorsys
def _agg_df(df_input, color, func, prefix):
    """
    palette_feature(下) 内部で用いる関数
    HSV等を計算しratioを掛けて重み付き和
    
    func : HSVなどを算出する関数 (e.g. colorsys.rgb_to_hsv)
    """
    arr = color.apply(lambda x:list(func(x[1], x[2], x[3])), axis=1).values.tolist()
    _df = pd.concat([color[["object_id", "percentage"]], pd.DataFrame(arr, columns=["val1", "val2", "val3"])], axis=1)
    
    for c in ["val1", "val2", "val3"]:
        _df[c] *= _df["percentage"]
    _df = _df.groupby("object_id").sum()
    tmp_df = pd.merge(df_input, _df, on="object_id", how="left")[["val1", "val2", "val3"]].add_prefix(prefix)
    
    return tmp_df


def palette_feature(df_input):
    """ 
    HSL, HSV, YIQの計算　-> ratioを掛けて重み付き和
    """
    df_out = pd.DataFrame()
    palette = pd.read_csv(DATADIR + "palette.csv")
    palette = palette.rename(columns={"ratio":"percentage",
                                      "color_r":"R",
                                      "color_g":"G",
                                      "color_b":"B"})
    for c in ["R", "G", "B"]:
        palette[c] /= 255
    df_out = [_agg_df(df_input, palette, colorsys.rgb_to_hls, "HLS_"),
              _agg_df(df_input, palette, colorsys.rgb_to_hsv, "HSV_"),
              _agg_df(df_input, palette, colorsys.rgb_to_yiq, "YIQ_")]
    df_out = pd.concat(df_out, axis=1).iloc[:, 1:]
    df_out = df_out.add_prefix("palette_")
    
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

In [None]:
def ExG_agg_feature(df_input):
    """ 
    ExG (and ExR, ExB) ベースのpalette特徴
    
    df_out1 : 各要素に対してExGを計算して, ratioは使わずobject_id単位で統計量計算
    df_out2 : 各要素に対してExGを計算して, ratioを使ってobject_id単位で統計量計算
    df_out3 : 各要素に対してExGを計算して, ExRとExGの値が0以上となったものだけを抽出, ratioを掛けて重み付き和
    (ExBは分布の平均値が小さく不採用, 閾値を変えると効いた可能性は高い)
    (また重み付き和だけでなく，ratioの和も効く気がする)
    df_out4 : 各要素に対してgamma_grayを計算して, object_id単位で統計量
    """
    palette = pd.read_csv(DATADIR + "palette.csv")
    for c in ['color_r', 'color_g', 'color_b']:
        palette[c] = palette[c].astype(float)
    df_out = pd.DataFrame()
    palette = palette.copy()
    
    palette["RGBsum"] = palette.filter(regex="color_").sum(axis=1)
    palette["ExR"] = (2*palette["color_r"] - palette["color_g"] - palette["color_b"]) / palette["RGBsum"]
    palette["ExG"] = (-palette["color_r"] + 2*palette["color_g"] - palette["color_b"]) / palette["RGBsum"]
    palette["ExB"] = (-palette["color_r"] - palette["color_g"] + 2*palette["color_b"]) / palette["RGBsum"]
    
    cols = ["ExR", "ExG", "ExB"]
    df_out1 = palette.groupby("object_id").agg(["mean", "std", "max", "min"])[["RGBsum", "ExR", "ExG", "ExB"]]
    df_out1.columns = ["_".join(c) for c in df_out1.columns]
    
    tmp_df = palette[["ratio", "object_id", "RGBsum", "ExR", "ExG", "ExB"]].copy()
    for c in ["RGBsum", "ExR", "ExG", "ExB"]:
        tmp_df[c] *= tmp_df["ratio"]
    df_out2 = tmp_df.groupby("object_id").agg(["mean", "std", "max", "min"])[["RGBsum", "ExR", "ExG", "ExB"]]
    df_out2.columns = ["_".join(c) + "_prod_ratio" for c in df_out2.columns]
    
    
    df_out3 = []
    for c in ["ExR", "ExG"]:#, "ExB"]:
        tmp_df = palette[palette[c] > 0][["object_id", "ratio", c]]
        tmp_df[f"ratio_count_{c}"] = tmp_df["ratio"] * tmp_df[c]
        df_out3.append(
            pd.merge(df_input, 
                     tmp_df.groupby("object_id")[f"ratio_count_{c}"].sum(),
                     on="object_id", how="left"
                    )[f"ratio_count_{c}"].fillna(0)
        )
    df_out3 = pd.concat(df_out3, axis=1)
    
    
    tmp_df = palette.copy()
    tmp_df["gamma_gray"] = tmp_df["color_r"]*0.299 + tmp_df["color_g"]*0.587 + tmp_df["color_b"]*0.114
    #tmp_df["r_gamma_gray"] = tmp_df["gamma_gray"] * tmp_df["ratio"]

    df_out4 = pd.concat([tmp_df.groupby("object_id")["gamma_gray"].agg(["max", "min", "mean"]).add_suffix("_gamma_gray"),
                        #tmp_df.groupby("object_id")["r_gamma_gray"].agg(["max", "min", "sum"]).add_suffix("_r") #相関高杉
                       ], axis=1)
    
    df_out = pd.concat([pd.merge(df_input, df_out1, on="object_id", how="left")[df_out1.columns],
                        pd.merge(df_input, df_out2, on="object_id", how="left")[df_out2.columns],
                        df_out3,
                        pd.merge(df_input, df_out4, on="object_id", how="left")[df_out4.columns]
                       ], axis=1)
    
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

In [None]:
def color_palette_agg(df_input):
    """
    [１つの色(代表色)にまとめた上での特徴量]
    rgb*ratioしてobject_id単位でsum
    - 各成分(rgb)の値
    - 各成分(rgb)の割合 r/r+g+b ...
    - HLS
    """
    df_out = pd.DataFrame()
    palette = pd.read_csv(DATADIR + "palette.csv")
    for c in ["color_r", "color_g", "color_b"]:
        palette[c] *= palette["ratio"]

    agg_df = palette.groupby("object_id").sum()
    for c in ["color_r", "color_g", "color_b"]:
        dic = agg_df[c].to_dict()
        df_out["agg_" + c] = df_input["object_id"].map(dic)
    
    for c in ["agg_color_r", "agg_color_g", "agg_color_b"]:
        df_out[c + "_ratio_in_rgb"] = df_out[c] / df_out.filter(regex="agg_color_").sum(axis=1)    

    palette["hls"] = (palette.filter(regex="color").max(axis=1) + palette.filter(regex="color").min(axis=1)) / 2
    dic = palette.groupby("object_id")["hls"].sum().to_dict()
    df_out["sum_rHLS"] = df_input["object_id"].map(dic)
    
    assert df_input.shape[0] == df_out.shape[0]
    return df_out


def lazy_ratio_agg(df_input):
    """
    ratioのobject_id単位で統計量
    """
    palette = pd.read_csv(DATADIR + "palette.csv")
    df_out = pd.merge(df_input, 
                      palette.groupby("object_id")["ratio"].agg(["max", "min", "std"]),
                      on="object_id", how="left")[["max", "min", "std"]].add_suffix("_ratio")
    return df_out

In [11]:
from scipy.stats import circmean
def calc_circmean(df_input):
    """
    HSVのHに対して
    (最終subには使っていない)
    """
    data = []
    for idx_label, idx_count in zip(df_input["H"], df_input["percentage"]*10000):
        data += [idx_label] * int(idx_count)
    mean = circmean(data, high = 1, low = 0)
    std = np.where(abs(df_input["H"] - mean) > 0.5,
                   1 - abs(df_input["H"] - mean),
                   abs(df_input["H"] - mean)
                  ).sum()
    
    return np.array([mean, std, df_input["V"].max(), df_input["V"].std(), (df_input["S"]*df_input["V"]).max()])


def palette_HSV(df_input):
    """
    HSV特化の特徴
    ratioあり
    (最終subには使っていない)
    """
    palette = pd.read_csv(DATADIR + "palette.csv")
    palette = palette.rename(columns={"ratio":"percentage",
                                      "color_r":"R",
                                      "color_g":"G",
                                      "color_b":"B"})
    for c in ["R", "G", "B"]:
        palette[c] /= 255

    arr = palette.apply(lambda x:list(colorsys.rgb_to_hsv(x[1], x[2], x[3])), axis=1).values.tolist()
    _df = pd.concat([palette[["object_id", "percentage"]], pd.DataFrame(arr, columns=["H", "S", "V"])], axis=1)

    df_out = pd.DataFrame(_df.groupby("object_id").apply(calc_circmean).values.tolist(),
                         columns=["Hcirc_mean", "Hcirc_std", "S_max", "S_std", "SV_prod"])
    df_out["object_id"] = _df.groupby("object_id").size().index
    
    cols = [c for c in df_out.columns if c != "object_id"]
    _df_out = pd.merge(df_input, df_out, how="left", on="object_id")[cols]
    
    assert df_input.shape[0] == _df_out.shape[0]
    return _df_out

In [12]:
from PIL import ImageColor
import colorsys

def _agg_df_for_color(df_input, color, func, prefix):
    """
    color.csvに対してHSV等を計算し，percentageで重み付き和
    """
    arr = color.apply(lambda x:list(func(x[3], x[4], x[5])), axis=1).values.tolist()
    _df = pd.concat([color[["object_id", "percentage"]], pd.DataFrame(arr, columns=["R", "G", "B"])], axis=1)
    
    for c in ["R", "G", "B"]:
        _df[c] *= _df["percentage"]
    _df = _df.groupby("object_id").sum()
    tmp_df = pd.merge(df_input, _df, on="object_id", how="left")[["R", "G", "B"]].add_prefix(prefix)
    
    return tmp_df


def color_feature(df_input):
    """
    color.csvに対してHSV等の計算
    """
    df_out = pd.DataFrame()
    color = pd.read_csv(DATADIR + "color.csv")
    color = color.sort_values(["object_id", "percentage"]).reset_index(drop=True)

    color = pd.concat([color,
                       pd.DataFrame(color['hex'].str.strip().map(ImageColor.getrgb).values.tolist(), columns=['R', 'G', 'B'])
                      ], axis=1)
    color["percentage"] /= 100
    for c in ["R", "G", "B"]:
        color[c] /= 255
    
    df_out = [_agg_df_for_color(df_input, color, colorsys.rgb_to_hls, "HLS_"),
              _agg_df_for_color(df_input, color, colorsys.rgb_to_hsv, "HSV_"),
              _agg_df_for_color(df_input, color, colorsys.rgb_to_yiq, "YIQ_")]
    df_out = pd.concat(df_out, axis=1)
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

In [13]:
def color_agg_feature(df_input):
    """
    color.csvの単純なobject_id単位での統計量
    """
    df_out = pd.DataFrame()
    color = pd.read_csv(DATADIR + "color.csv")
    agg_df = color.groupby("object_id")["percentage"].agg(["max", "std", "min", "nunique"])
    for c in agg_df.columns:
        dic = agg_df[c].to_dict()
        df_out["color" + c] =  df_input["object_id"].map(dic)
    
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

In [14]:
def color_ExG_agg_feature(df_input):
    """ 
    ExG_agg_featureをcolor.csvに適応
    df_out1 : percentage掛けず統計量
    df_out2 : percentage掛けて統計量
    """
    color = pd.read_csv(DATADIR + "color.csv")
    color["ratio"] = color["percentage"] / 100
    palette = pd.concat([color,
                       pd.DataFrame(color['hex'].str.strip().map(ImageColor.getrgb).values.tolist(), columns=['color_r', 'color_g', 'color_b'])
                        ], axis=1)
    df_out = pd.DataFrame()
    
    palette["RGBsum"] = palette.filter(regex="color_").sum(axis=1)
    palette["ExR"] = (2*palette["color_r"] - palette["color_g"] - palette["color_b"]) / palette["RGBsum"]
    palette["ExG"] = (-palette["color_r"] + 2*palette["color_g"] - palette["color_b"]) / palette["RGBsum"]
    palette["ExB"] = (-palette["color_r"] - palette["color_g"] + 2*palette["color_b"]) / palette["RGBsum"]
    
    cols = ["ExR", "ExG", "ExB"]
    df_out1 = palette.groupby("object_id").agg(["mean", "std", "max", "min"])[["RGBsum", "ExR", "ExG", "ExB"]]
    df_out1.columns = ["_".join(c) for c in df_out1.columns]
    
    tmp_df = palette[["ratio", "object_id", "RGBsum", "ExR", "ExG", "ExB"]].copy()
    for c in ["RGBsum", "ExR", "ExG", "ExB"]:
        tmp_df[c] *= tmp_df["ratio"]
    df_out2 = tmp_df.groupby("object_id").agg(["mean", "std", "max", "min"])[["RGBsum", "ExR", "ExG", "ExB"]]
    df_out2.columns = ["_".join(c) + "_prod_ratio" for c in df_out2.columns]
    df_out = pd.concat([pd.merge(df_input, df_out1, on="object_id", how="left")[df_out1.columns],
                        pd.merge(df_input, df_out2, on="object_id", how="left")[df_out2.columns],
                       ], axis=1)
    
    assert df_input.shape[0] == df_out.shape[0]
    return df_out.add_suffix("_color")

# text

In [15]:
from fasttext import load_model

def lang_detect(df_input):
    """
    Arai-san discussionのfasttext言語判定モデル
    lid.176.bin は https://fasttext.cc/docs/en/language-identification.html
    """
    model = load_model("../feature/lid.176.bin")
    df_out = pd.DataFrame()
    for c in ["title", "principal_maker", "description", "long_title"]:
        null_idx = df_input[c].isnull()
        
        df_out[f"{c}_langlabel"] = np.where(null_idx,
                                            np.nan,
                                            df_input.fillna("NULL")[c].apply(lambda x:x.replace("\n", " ")).apply(lambda x:model.predict(x)[0][0])
                                           )
        
        df_out[f"{c}_langlabel_prob"] = np.where(null_idx,
                                            np.nan,
                                            df_input.fillna("NULL")[c].apply(lambda x:x.replace("\n", " ")).apply(lambda x:model.predict(x)[1][0])
                                           )
        
        dic = df_out[f"{c}_langlabel"].value_counts().to_dict()
        df_out[f"{c}_langlabel"] = df_out[f"{c}_langlabel"].map(dic)
        
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

In [17]:
def bert_description(df_input):
    bert_feat = pd.read_csv("../feature/nl_en_BERT_transed_description.csv")
    feat_df = pd.merge(df_input, bert_feat, on="object_id", how="left")

    cols = [c for c in bert_feat.columns if c != 'object_id']
    feat_df = feat_df[cols]
    pca = PCA(n_components=50, random_state=42)
    
    arr = pca.fit_transform(feat_df.values)
    df_out1 = pd.DataFrame(arr).add_prefix("nl_en_bert_desc_pca")
    
    
    bert_feat = pd.read_csv("../feature/nl_en_BERT_transed_long_title.csv")
    feat_df = pd.merge(df_input, bert_feat, on="object_id", how="left")
    
    cols = [c for c in bert_feat.columns if c != 'object_id']
    feat_df = feat_df[cols]
    pca = PCA(n_components=64, random_state=42)
    arr = pca.fit_transform(feat_df.values)
    df_out2 = pd.DataFrame(arr).add_prefix("nl_en_bert_long_title_pca")
    df_out = pd.concat([df_out1, df_out2], axis=1)
    
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

In [18]:
from sklearn.decomposition import PCA, TruncatedSVD, SparsePCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

def maker_countvectorizer(df_input):
    """
    principal_makerでCountVectorizer
    
    ngramは(1,1)が良い
    max_feature指定したものの精度が悪化した．出現回数が少ないことの表現を奪った説がある(大丈夫な説もある)
    countなので[train, test]でfitしても良いかも
    """
    #vectorizer = CountVectorizer(max_features=3200, ngram_range=(1, 2)) #max_feature指定したものの数が少ないことの表現を奪った説がある(大丈夫な説もある)
    vectorizer = CountVectorizer(max_features=None, ngram_range=(1, 1))
    arr = vectorizer.fit_transform(df_input["principal_maker"].dropna()).toarray()
    pca = PCA(n_components=50, random_state=42) #pcaでした
    arr = pca.fit_transform(arr)
    df_out = pd.DataFrame(arr).add_prefix("maker_svd")
    assert df_input.shape[0] == df_out.shape[0]
    return df_out


def description_tfidfvectorizer(df_input):
    """
    descriptionのTfidf (text cleaningはしない)
    """
    pipeline_ = Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2))), #(1,2) #n_gram足しちゃいました
        ('svd', TruncatedSVD(n_components=64, n_iter=10, random_state=42)),
    ])
    c = "description"
    pipeline_.fit(df_input[c][:len(train_data)].fillna("NULL"))
    arr = pipeline_.transform(df_input[c].fillna("NULL"))
    df_out = pd.DataFrame(arr).add_prefix(f"{c}_svd") #pca
    assert df_input.shape[0] == df_out.shape[0]
    return df_out


def long_title_tfidfvectorizer(df_input):
    """
    long_titleのTfidf(text cleaningはしない)
    """
    pipeline_ = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('svd', TruncatedSVD(n_components=50, n_iter=10, random_state=42)),
    ])
    c = "long_title"
    pipeline_.fit(df_input[c][:len(train_data)].fillna("NULL"))
    arr = pipeline_.transform(df_input[c].fillna("NULL"))
    #arr = arr.toarray()
    df_out = pd.DataFrame(arr).add_prefix(f"{c}_svd")
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

In [19]:
%%time
def remove_pad_token(s):
    """
    翻訳後のtextは頭に<pad> tokenが付いているので消す
    """
    if s is np.nan:
        return np.nan
    
    clean_s = re.sub("<pad>", "", s)
    if clean_s[0] == " ":
        clean_s = clean_s[1:]
    return clean_s


def transed_text_series(df_input, p):
    """
    翻訳後textのcsvを読み込みdf_inputにmergeさせる
    """
    _df = pd.read_csv(p)
    if "transed_long_title" in p:
        c = "transed_long_title"
        
    if "transed_description" in p:
        c = "transed_description"
        
    _df[c] = _df[c].apply(lambda x:remove_pad_token(x))
    text_series = pd.merge(df_input, _df, on="object_id", how="left")[c]
    
    return text_series



def transed_long_title_tfidf(df_input):
    """
    long_titleをDutch->Englishに翻訳したtextに対するTfidf
    """
    l = []
    for p in ["transed_long_title"]: 
        series = transed_text_series(df_input, f"../feature/{p}.csv")
        series = series.apply(lambda x:texthero_preprocessing(x))
        
        pipeline_ = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('svd', TruncatedSVD(n_components=50, n_iter=10, random_state=42)),
        ])
        pipeline_.fit(series[:len(train_data)])
        arr = pipeline_.transform(series)
        df_out = pd.DataFrame(arr).add_prefix(f"{p}_tfidf_svd")
        l.append(df_out)
        
    df_out = pd.concat(l, axis=1)
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 6.91 µs


In [20]:
import string
import unicodedata
import nltk
def texthero_preprocessing(s):
    """
    textheroが手元ではinstallできなかったので愚直にcleaningを関数化したもの
    """
    
    if s is np.nan:
        return "NULL"
    s = s.lower()
    pattern = r"\b\d+\b"
    s = re.sub(pattern, "X", s) #remove_logits
    s = re.sub(rf"([{string.punctuation}])+", " ", s)
    nfkd_form = unicodedata.normalize("NFKD", s)
    s = "".join([char for char in nfkd_form if not unicodedata.combining(char)])
    
    custom_stopwords = nltk.corpus.stopwords.words('dutch') + nltk.corpus.stopwords.words('english')
    # Set flag to allow verbose regexps
    # Words with optional internal hyphens 
    # Any space
    # Any symbol 
    pattern = r"""(?x)                          
      \w+(?:-\w+)*                              
      | \s*                                     
      | [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~]   
    """
    s = "".join(t if t not in custom_stopwords else " " for t in re.findall(pattern, s))
    
    s = " ".join(re.sub("\xa0", " ", s).split())
    return s



def clean_tfidf_title_desc(df_input):
    """
    titleとdescriptionをcleaningし結合．それに対してTfidf
    TruncatedSVDのrandom_stateを指定し忘れており再現を妨げている
    """
    text_series = df_input["title"] + " " + df_input["description"].fillna("NULL")
    clean_series = text_series.apply(lambda x:texthero_preprocessing(x))
    
    pipeline_ = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('svd', TruncatedSVD(n_components=64, n_iter=10, random_state=42)),
    ])
    pipeline_.fit(clean_series[:len(train_data)])
    arr = pipeline_.transform(clean_series)
    df_out = pd.DataFrame(arr).add_prefix(f"title_desc_svd") #pca
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

In [21]:
def lazy_text_feature(_df_input):
    """
    文字長さ関係の特徴量
    """
    df_input = _df_input.copy()
    df_out = pd.DataFrame()
    for c in ["title", "description", "long_title", "more_title", "sub_title"]:
        df_input[c] = np.where(df_input[c].isnull(), "", df_input[c])
        df_out[c + "_len"] = df_input[c].apply(lambda x:len(x))
        df_out["clean_" + c + "_len"] = df_input[c].apply(lambda x:texthero_preprocessing(x)).str.len()
        
        df_out[c + "_len_percentage"] = df_out["clean_" + c + "_len"] / df_out[c + "_len"]
        #df_out[c + "_Nwords"] = df_input[c].apply(lambda x:len(x.split()))
        
    df_out["long_div_title_len"] = df_out["long_title_len"] / df_out["title_len"]
    df_out["more_div_title_len"] = df_out["more_title_len"] / df_out["title_len"]
    
    df_out["year_is_circa"] = df_input["dating_presenting_date"].str.contains("c").astype(float)
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

In [22]:
from scipy.special import softmax
def sentiment_feature(df_input):
    """
    twitter sentimentのcsvを結合
    """
    outputs_list = []
    for name in ["long_title", "principal_maker", "title"]:
        name = name + "_tweet_sentiment"
        feat = pd.read_csv(f"../feature/{name}.csv")
        cols = [c for c in feat.columns if c!="object_id"]
        tmp_out = pd.merge(df_input, feat, on="object_id", how="left")[cols]
        tmp_out = tmp_out.add_prefix(f"{name}_")

        softmax_arr = softmax(tmp_out.values, axis=1)
        tmp_out = pd.concat([tmp_out,
                            pd.DataFrame(softmax_arr, columns=cols).add_prefix(f"{name}_SF_")], axis=1)
        outputs_list.append(tmp_out)
        
    for name in ["long_title", "description"]:
        name = "transed_" + name + "_tweet_sentiment"
        feat = pd.read_csv(f"../feature/{name}.csv")
        cols = [c for c in feat.columns if c!="object_id"]
        tmp_out = pd.merge(df_input, feat, on="object_id", how="left")[cols]
        tmp_out = tmp_out.add_prefix(f"{name}_")

        softmax_arr = softmax(tmp_out.values, axis=1)
        tmp_out = pd.concat([tmp_out,
                            pd.DataFrame(softmax_arr, columns=cols).add_prefix(f"{name}_SF_")], axis=1)
        outputs_list.append(tmp_out)
        
    df_out = pd.concat(outputs_list, axis=1)
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

In [23]:
def more_title_diff_feature(df_input):
    """
    more_titleとtitleの差分を取ってCountVector
    """
    df_out = pd.DataFrame()
    df_out["more_minus_title"] = df_input["more_title"].copy()
    null_idx = df_input["more_title"].isnull()
    
    df_out["more_minus_title"][~null_idx] = df_input[["title", "more_title"]].dropna().apply(lambda x:x[1].replace(x[0], ''), axis=1)                 
    df_out = count_encoder(df_out, df_out.columns)
    assert df_input.shape[0] == df_out.shape[0]
    return df_out


def more_title_tfidf(df_input):
    """
    more_titleのtfidf
    TruncatedSVDのrandom_stateを指定し忘れており再現の妨げになっている
    """
    df_out = pd.DataFrame()
    tmp_df = df_input[["title", "more_title"]].copy()
    tmp_df["title"] = tmp_df["title"].apply(lambda x:texthero_preprocessing(x))
    tmp_df["more_title"] = tmp_df["more_title"].fillna("").apply(lambda x:texthero_preprocessing(x))
    
    more_minus_title = tmp_df[["title", "more_title"]].dropna().apply(lambda x:x[1].replace(x[0], ''), axis=1)
    pipeline_ = Pipeline([
        ('tfidf', TfidfVectorizer(min_df=5, ngram_range=(1,3))),
        ('svd', TruncatedSVD(n_components=12, n_iter=10, random_state=42)),
    ])
    pipeline_.fit(more_minus_title[:len(train_data)])
    arr = pipeline_.transform(more_minus_title)
    df_out = pd.DataFrame(arr).add_prefix(f"more_title_svd") #pca
    
    return df_out

In [24]:
"""
SWEM用の単語ベクトル
"""
from gensim.models import word2vec, KeyedVectors
import gensim.downloader as api
model = api.load("glove-wiki-gigaword-100")

In [25]:
def remove_whitespace(s):
    s = " ".join(re.sub("\xa0", " ", s).split())
    return s


def extract_word2vec_arr(df_input, cols):
    """
    追加テーブル(material.csv等)に対してSWEM
    """
    feat = []
    for idx in df_input.index:
        sentence_vec = []
        for c in cols:
            sentence = df_input.loc[idx, c]
            if sentence is np.nan:
                #feat.append([np.nan])
                continue
            
            text_list = sentence.split(",")
            for w in text_list:
                w = remove_whitespace(w)
                try:
                    vec_arr = model[w]
                    sentence_vec.append(vec_arr)
                    continue
                except:
                    pass

                for _w in w.split(" "):
                    _w = remove_whitespace(_w)
                    try:
                        vec_arr = model[_w]
                        sentence_vec.append(vec_arr)
                    except:
                        pass

            
        if len(sentence_vec)==0:
            sentence_vec = [np.ones(100)*np.nan]

        sentence_vec = np.array(sentence_vec).mean(axis=0)
        feat.append(sentence_vec)

    feat_df = pd.DataFrame(feat)
    return feat_df


def pretrained_w2v_feature(df_input):
    _df = merge_tables(df_input)
    df_out = extract_word2vec_arr(_df, ["tech", "material"]).add_prefix("tech_mate_w2v_")
    return df_out

In [26]:
from gensim.models import word2vec, KeyedVectors
from tqdm import tqdm
def tables_word2vec_feature(df_input):
    """
    Arai-san discussion w2v
    """
    material = pd.read_csv(DATADIR + "material.csv")
    technique = pd.read_csv(DATADIR + "technique.csv")
    collection = pd.read_csv(DATADIR + "object_collection.csv")
    person = pd.read_csv(DATADIR + "historical_person.csv")
    place = pd.read_csv(DATADIR + "production_place.csv")

    mat_col = pd.concat([material, collection], axis=0).reset_index(drop=True)
    mat_tec = pd.concat([material, technique], axis=0).reset_index(drop=True)
    col_tec = pd.concat([collection, technique], axis=0).reset_index(drop=True)
    mat_col_tec = pd.concat([material, collection, technique], axis=0).reset_index(drop=True)


    model_size = {
        "material": 10,
        "technique": 8,
        "collection": 3,
        "material_collection": 20,
        "material_technique": 20,
        "collection_technique": 10,
        "material_collection_technique": 25
    }

    n_iter = 100


    w2v_dfs = []
    for df, df_name in tqdm(zip(
            [
                material, collection, technique,
                mat_col, mat_tec, col_tec, mat_col_tec
            ], [
                "material", "collection", "technique",
                "material_collection",
                "material_technique",
                "collection_technique",
                "material_collection_technique"
            ])):
        df_group = df.groupby("object_id")["name"].apply(list).reset_index()
        # Word2Vecの学習
        w2v_model = word2vec.Word2Vec(df_group["name"].values.tolist(),
                                      size=model_size[df_name],
                                      min_count=1,
                                      window=1,
                                      iter=n_iter)

        # 各文章ごとにそれぞれの単語をベクトル表現に直し、平均をとって文章ベクトルにする
        sentence_vectors = df_group["name"].apply(
            lambda x: np.mean([w2v_model.wv[e] for e in x], axis=0))
        sentence_vectors = np.vstack([x for x in sentence_vectors])
        sentence_vector_df = pd.DataFrame(sentence_vectors,
                                          columns=[f"{df_name}_w2v_{i}"
                                                   for i in range(model_size[df_name])])
        sentence_vector_df.index = df_group["object_id"]
        w2v_dfs.append(sentence_vector_df)
        
    feat_df = pd.concat(w2v_dfs, axis=1).reset_index().rename(columns={"index":"object_id"})
    cols = [c for c in feat_df.columns if c!="object_id"]
    df_out = pd.merge(df_input, feat_df, on="object_id", how="left")
    return df_out[cols]

# maintable_numeric

In [27]:
def search_index(s, word):
    try:
        return s[int(s.index(word)+1)]
    except:
        return np.nan
    
def cm2mm(s):
    try:
        num = float(re.sub("[a-zA-Z]", "", s))
        if len(str(num))==0:
            return np.nan
        elif "cm" in s:
            return num * 10
        else:# "mm" in s:
            return num
    except:
        return np.nan
    
    
def subtitle_feature(df_input):
    """
    sub_titleから数値抽出
    """
    df_out = pd.DataFrame()
    _df = df_input[["sub_title"]].copy()
    _df["sub_title"] = np.where(_df["sub_title"].isnull(), "NULL", _df["sub_title"])
    _df["sub_title"] = _df["sub_title"].apply(lambda x:[w for w in x.split(" ") if w !="×"])
    
    for w in ["h", "w", "t", "d", "l"]:
        df_out[w] = _df["sub_title"].apply(lambda x:search_index(x, w))
        df_out[w] = df_out[w].apply(lambda x:cm2mm(x))
        
    df_out["num_unit"] = df_input["sub_title"].str.split(" × ", expand=True).isnull().sum(axis=1)
    
    #df_out["weight"] = df_input["sub_title"].apply(lambda x:"".join([w for w in x.split(" ") if "g" in w]))
    df_out["weight"] = _df["sub_title"].apply(lambda x:"".join([w for w in x if "g" in w]))
    df_out["weight"] = df_out["weight"].apply(lambda x:re.sub("[^0-9]", "", x))
    df_out["weight"] = np.where(df_out["weight"].apply(lambda x:len(x))==0, np.nan, df_out["weight"]).astype(float)
    
    df_out["size"] = df_out["h"] * df_out["w"]
    df_out["volume"] = df_out["size"] * df_out["d"]
    
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

In [28]:
def jumble_feature(df_input):
    """
    main table周りで思いついた特徴の寄せ集め
    """
    df_out = pd.DataFrame()
    
    df_out["acquisition_year"] = pd.to_datetime(df_input["acquisition_date"]).dt.year
    df_out["dating_year_diff"] = df_input["dating_year_late"] - df_input["dating_year_early"]
    
    df_out["principal_maker_mismatch"] = (df_input["principal_maker"] != df_input["principal_or_first_maker"]).astype(int)
    df_out["c_in_dating_presenting_date"] = df_input["dating_presenting_date"].str.contains("c").astype(float)
    
    _df = df_input.groupby("principal_maker")["dating_sorting_date"].agg(["size", "nunique"])
    dic = (_df["nunique"] / _df["size"]).to_dict()
    df_out["principal_maker_info"] = df_input["principal_maker"].map(dic)
    
    df_out["acquisition_year_from_made"] = pd.to_datetime(df_input["acquisition_date"]).dt.year - df_input["dating_year_early"]
    
    assert df_input.shape[0] == df_out.shape[0]
    return df_out

In [29]:
from sklearn.mixture import GaussianMixture

def groupby_feature(df_input):
    tables = merge_tables(all_df)
    for name in ["color", "palette"]:
        color_df = pd.read_csv(DATADIR + f"{name}.csv")
        color_ids = color_df["object_id"].unique()
        tables[name] = np.where(all_df["object_id"].isin(color_ids),
                                1,
                                np.nan)
    
    tables = pd.concat([tables, 
                        subtitle_feature(all_df)[["h", "w", "d", "l", "weight"]]], axis=1)
    
    cols = tables.columns[tables.isnull().astype(int).sum(axis=0) > 10]
    cols = [c for c in cols if c not in ["dating_sorting_date", "dating_year_early"]]
    null_df = tables[cols].isnull().astype(int)
    
    gmm = GaussianMixture(n_components=10, random_state=0)
    arr = gmm.fit_predict(null_df)
    null_group = pd.Series(arr, name="null_group")
    
    tmp_df = pd.concat([all_df, df_input, null_group], axis=1)
    
    df_out = pd.DataFrame()
    for key in ["principal_maker", "acquisition_method", "null_group", "title"]:
        for val in ["dating_sorting_date", "h", "RGBsum_std", "RGBsum_min_prod_ratio", "random_resnet0", "min_ratio"]:
            dic = tmp_df.groupby(key)[val].mean().to_dict()
            df_out[key+"_GPB_"+val] = tmp_df[val] - tmp_df[key].map(dic)
    
    return df_out

In [30]:
def raw_data(df_input):
    cols = ["dating_period", "dating_year_early", "dating_year_late"]
    df_out = df_input[cols].copy()
    return df_out


def make_feature(df_input):
    functions = [
        raw_data,
        count_feature,
        jumble_feature,
        lazy_text_feature,
        
        lazy_ratio_agg,
        color_agg_feature,
        color_palette_agg,
        palette_feature,
        color_feature,
        
        ExG_agg_feature,
        color_ExG_agg_feature,
        
        
        subtitle_feature,
        lang_detect,
        maker_countvectorizer,
        description_tfidfvectorizer,
        long_title_tfidfvectorizer,
        clean_tfidf_title_desc,
        
        transed_long_title_tfidf,
        bert_description,
        sentiment_feature,
        
        #palette_HSV, #時短用, #あってもなくてもみたいなところある
        
        tables_onehot,
        resnet50_feature,
        
        
        null_features,
        more_title_diff_feature,
        more_title_tfidf, #これはいらんかも
        lazy_merge_tables_feature,
        
        pretrained_w2v_feature,
    ]
    features = [f(df_input) for f in tqdm(functions)]
    features = pd.concat(features, axis=1)
    
    features = pd.concat([features, groupby_feature(features)], axis=1)
    return features

In [31]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2 
    dfs = []
    for col in df.columns: #columns毎に処理
        col_type = df[col].dtypes
        if col_type in numerics: #numericsのデータ型の範囲内のときに処理を実行. データの最大最小値を元にデータ型を効率的なものに変更
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    dfs.append(df[col].astype(np.int8))
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    dfs.append(df[col].astype(np.int16))
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    dfs.append(df[col].astype(np.int32))
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    dfs.append(df[col].astype(np.int64) ) 
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    dfs.append(df[col].astype(np.float16))
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    dfs.append(df[col].astype(np.float32))
                else:
                    dfs.append(df[col].astype(np.float64))
        else:
            dfs.append(df[col])
    
    df_out = pd.concat(dfs, axis=1)
    if verbose:
        end_mem = df_out.memory_usage().sum() / 1024**2
        num_reduction = str(100 * (start_mem - end_mem) / start_mem)
        print(f'Mem. usage decreased to {str(end_mem)[:3]}Mb:  {num_reduction[:2]}% reduction')
    return df_out

In [32]:
features = make_feature(all_df)

 81%|████████▏ | 22/27 [01:53<00:18,  3.75s/it]

26


100%|██████████| 27/27 [02:27<00:00,  5.44s/it]


In [33]:
features = reduce_mem_usage(features)

Mem. usage decreased to 39.Mb:  75% reduction


In [42]:
print(features.shape)
train = features[:len(train_data)].reset_index(drop=True)
test = features[len(train_data):].reset_index(drop=True)

(24034, 865)


In [43]:
def calc_rmse(y, y_oof):
    score = ((y.values.reshape(1,-1) - y_oof)**2).mean()**0.5
    return score

params = {
    "boosting_type":'gbdt',
    "num_leaves":34,  #18, #34
    "max_depth":-1,
    "learning_rate":0.01, #0.1
    "n_estimators":20000,
    
    "objective":"regression",
    
    "metric":"rmse", #rmse mae
    "force_col_wise":True,
    "bin_construct_sample_cnt":2000,
    
    "bagging_freq": 3,  #3
    "subsample":0.7,#0.7
    
    "colsample_bytree":0.5,
    "reg_alpha":.7, 
    "reg_lambda":.1, #l2
    "random_state":42,
    "n_jobs":-1,
}

In [44]:
def run_lgbm(_X, y, _test, splits, params, y_max, feature_selection=False):
    X = _X.copy()
    test = _test.copy()
    if feature_selection:
        feature_df = pd.read_csv(OUTPUT_DIR + "feature_importances_lgbm_False.csv")
        feature_df["average"].hist()
        plt.show()
        feature_arr = feature_df[feature_df["average"]>150]["feature"]
        print(_X.columns, "->", feature_arr.shape)
        X = _X[feature_arr]
        test = _test[feature_arr]
        
    X.to_pickle(OUTPUT_DIR + "features.pkl")
    feature_importances = pd.DataFrame(data=X.columns, columns=["feature"])
    
    y_oof = np.zeros(X.shape[0])
    y_pred = np.zeros(test.shape[0])
    scores = []
    for i, (train_idx, valid_idx) in enumerate(splits):
        print("fold", i)
        X_train, X_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]

        idx_use = y_train < y_max
        y_train = np.where(idx_use, y_train, y_max)

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid)
        lgb_clf = lgb.train(params, lgb_train,
                            valid_names=["train", "valid"], 
                            valid_sets=[lgb_train, lgb_valid], 
                            early_stopping_rounds=500, #100, 0311
                            verbose_eval=500,
                           )
        
        oof = lgb_clf.predict(X_valid)
        y_oof[valid_idx] = oof
        
        y_pred_per_fold = lgb_clf.predict(test)
        y_pred += y_pred_per_fold / N_SPLIT
        
        feature_importances[i] = lgb_clf.feature_importance()
        
        score = calc_rmse(y_valid, oof)
        print(f"score:{score}")
        scores.append(score)
        pickle.dump(lgb_clf, open(OUTPUT_DIR + f'lgbm_model_fold{i}_{feature_selection}.pkl', 'wb'))
        
        sample = pd.read_csv(DATADIR + "atmacup10__sample_submission.csv")
        sample["likes"] = y_pred_per_fold
        sample.to_csv(OUTPUT_DIR + f"submission_lgbm{i}_{feature_selection}_{score}.csv", index=False)
        

    score = calc_rmse(y, y_oof)
    print(scores)
    print(f"score:{score}")
    scores.append(score)
    scores = np.array(scores)
    
    feature_importances['average'] = feature_importances.mean(axis=1)
    feature_importances = feature_importances.sort_values("average", ascending=False)
    
    y_oof = np.expm1(y_oof)
    y_oof = np.where(y_oof < 0, 0, y_oof)
    y_pred = np.expm1(y_pred)
    y_pred = np.where(y_pred < 0, 0, y_pred)
        
    sample = pd.read_csv(DATADIR + "atmacup10__sample_submission.csv")
    print(y_pred.shape, sample.shape)
    sample["likes"] = y_pred
    sample.to_csv(OUTPUT_DIR + f"submission_lgbm_{feature_selection}.csv", index=False)
    
    pd.concat([pd.Series(y_oof), pd.Series(y_pred)], axis=0).reset_index(drop=True).to_csv(OUTPUT_DIR + f"oof_and_pred_lgbm_{feature_selection}.csv", index=False)
    feature_importances.to_csv(OUTPUT_DIR + f"feature_importances_lgbm_{feature_selection}.csv",index=False)
    return y_oof, y_pred, scores, feature_importances

In [45]:
def calc_class_weight(target):
    weight = np.where(target==0, 
                      np.ones(target.shape[0])*((target==1).sum() / (target==0).sum()), 
                      np.ones(target.shape[0]))
    return weight


def run_lgbm_pseudo(_X, _y, _test, splits, params, y_max, feature_selection=False):
    X = _X.copy()
    test = _test.copy()
    y = _y.copy()
    X.to_pickle(OUTPUT_DIR + "features_pseudo.pkl")
    feature_importances = pd.DataFrame(data=X.columns, columns=["feature"])
    
    y_oof = np.zeros(X.shape[0])
    y_pred = np.zeros(test.shape[0])
    scores = []
    
    weight_series = pd.concat([pd.Series(np.ones(len(train_data))),
                           pd.Series(np.ones(len(test_data)) * 0.5)], axis=0).reset_index(drop=True)
    weight_series = weight_series.rename("weight")
    
    _params = {
    "boosting_type":'gbdt',
    "num_leaves":34,  #18, #34
    "max_depth":-1,
    "learning_rate":0.1, #0.1
    "n_estimators":20000,
    
    "objective":"regression",
    
    "metric":"rmse", #rmse mae
    "force_col_wise":True,
    "bin_construct_sample_cnt":2000,
    
    "bagging_freq": 3,  #3
    "subsample":0.7,#0.7
    
    "colsample_bytree":0.5, #0.7 
    "reg_alpha":.7, #1. #l1 (.7, .1)
    "reg_lambda":.1, #l2
    "random_state":42,
    "n_jobs":-1,
    "verbose":-1,
    }
    
    
    for i, (train_idx, valid_idx) in enumerate(splits):
        print("fold", i)
        print(X.shape, test.shape)
        X_train, X_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]

        idx_use = y_train < y_max
        y_train = np.where(idx_use, y_train, y_max)
        
        weight_train = weight_series.loc[train_idx]
        weight_test = weight_series.loc[valid_idx]

        lgb_train = lgb.Dataset(X_train, y_train, weight=weight_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid, weight=weight_test)
                               
        lgb_clf = lgb.train(_params, lgb_train,
                            valid_names=["train", "valid"], 
                            valid_sets=[lgb_train, lgb_valid], 
                            early_stopping_rounds=500, #100, 0311
                            verbose_eval=500,
                           )
        
        oof = lgb_clf.predict(X_valid)
        y_oof[valid_idx] = oof
        
        y_pred_per_fold = lgb_clf.predict(test)
        y_pred += y_pred_per_fold / N_SPLIT
        
        feature_importances[i] = lgb_clf.feature_importance()
        
        score = calc_rmse(y_valid, oof) #log_loss
        print(f"score:{score}")
        scores.append(score)
        pickle.dump(lgb_clf, open(OUTPUT_DIR + f'lgbm_model_fold{i}_{feature_selection}_pseudo.pkl', 'wb'))
        
        sample = pd.read_csv(DATADIR + "atmacup10__sample_submission.csv")
        sample["likes"] = y_pred_per_fold
        sample.to_csv(OUTPUT_DIR + f"submission_lgbm{i}_{feature_selection}_{score}_pseudo.csv", index=False)
        

    score = calc_rmse(y, y_oof)
    print(scores)
    print(f"score:{score}")
    scores.append(score)
    scores = np.array(scores)
    
    feature_importances['average'] = feature_importances.mean(axis=1)
    feature_importances = feature_importances.sort_values("average", ascending=False)
    
    y_oof = np.expm1(y_oof)
    y_oof = np.where(y_oof < 0, 0, y_oof)
    y_pred = np.expm1(y_pred)
    y_pred = np.where(y_pred < 0, 0, y_pred)
        
    sample = pd.read_csv(DATADIR + "atmacup10__sample_submission.csv")
    print(y_pred.shape, sample.shape)
    sample["likes"] = y_pred
    sample.to_csv(OUTPUT_DIR + f"submission_lgbm_{feature_selection}_pseudo.csv", index=False)
    
    pd.concat([pd.Series(y_oof), pd.Series(y_pred)], axis=0).reset_index(drop=True).to_csv(OUTPUT_DIR + f"oof_and_pred_lgbm_{feature_selection}_pseudo.csv", index=False)
    feature_importances.to_csv(OUTPUT_DIR + f"feature_importances_lgbm_{feature_selection}_pseudo.csv",index=False)
    return y_oof, y_pred, scores, feature_importances

In [46]:
def make_kfolds(df_input, N_SPLIT, SEED):
    folds = KFold(n_splits=N_SPLIT, random_state=SEED, shuffle=True)
    splits = folds.split(df_input)
    return splits


def make_stratifirdkfolds(_y, N_SPLIT, SEED):
    folds = StratifiedKFold(n_splits=N_SPLIT, random_state=SEED, shuffle=True)
    splits = folds.split(_y, pd.cut(_y, 8, labels=False))
    return splits

In [47]:
train_data = pd.read_csv("../input/train.csv")
y = train_data["likes"].copy()
y = np.log1p(y)

In [48]:

N_SPLIT = 5
target_col = "likes"

_train = train.copy()
_test = test.copy()

_y = y.copy()

assert _train.shape[0]==train.shape[0]
assert _test.shape[0]==test.shape[0]

splits = make_stratifirdkfolds(_y, N_SPLIT, SEED)
y_oof, y_pred, scores, feature_importances = run_lgbm(_train, _y, _test, splits, params, 11, feature_selection=False)


fold 0




[LightGBM] [Info] Total Bins 155899
[LightGBM] [Info] Number of data points in the train set: 9620, number of used features: 864
[LightGBM] [Info] Start training from score 1.654172
Training until validation scores don't improve for 500 rounds
[500]	train's rmse: 0.771079	valid's rmse: 1.02143
[1000]	train's rmse: 0.604058	valid's rmse: 0.999074
[1500]	train's rmse: 0.492135	valid's rmse: 0.989559
[2000]	train's rmse: 0.407586	valid's rmse: 0.98425
[2500]	train's rmse: 0.34135	valid's rmse: 0.981857
[3000]	train's rmse: 0.288485	valid's rmse: 0.980461
[3500]	train's rmse: 0.24575	valid's rmse: 0.97949
[4000]	train's rmse: 0.210453	valid's rmse: 0.979082
[4500]	train's rmse: 0.181346	valid's rmse: 0.978444
[5000]	train's rmse: 0.157311	valid's rmse: 0.9783
[5500]	train's rmse: 0.137089	valid's rmse: 0.978051
[6000]	train's rmse: 0.120099	valid's rmse: 0.977854
[6500]	train's rmse: 0.105705	valid's rmse: 0.977642
[7000]	train's rmse: 0.0934314	valid's rmse: 0.977707
Early stopping, best 



[LightGBM] [Info] Total Bins 156293
[LightGBM] [Info] Number of data points in the train set: 9621, number of used features: 865
[LightGBM] [Info] Start training from score 1.655197
Training until validation scores don't improve for 500 rounds
[500]	train's rmse: 0.775598	valid's rmse: 1.00806
[1000]	train's rmse: 0.606262	valid's rmse: 0.981564
[1500]	train's rmse: 0.49229	valid's rmse: 0.972541
[2000]	train's rmse: 0.407564	valid's rmse: 0.967993
[2500]	train's rmse: 0.342115	valid's rmse: 0.965574
[3000]	train's rmse: 0.289206	valid's rmse: 0.963747
[3500]	train's rmse: 0.246418	valid's rmse: 0.963113
[4000]	train's rmse: 0.211525	valid's rmse: 0.96209
[4500]	train's rmse: 0.182365	valid's rmse: 0.961723
[5000]	train's rmse: 0.158327	valid's rmse: 0.961445
[5500]	train's rmse: 0.138114	valid's rmse: 0.961293
Early stopping, best iteration is:
[5274]	train's rmse: 0.146846	valid's rmse: 0.961192
score:0.9611923684520962
fold 2




[LightGBM] [Info] Total Bins 155831
[LightGBM] [Info] Number of data points in the train set: 9621, number of used features: 865
[LightGBM] [Info] Start training from score 1.654526
Training until validation scores don't improve for 500 rounds
[500]	train's rmse: 0.776183	valid's rmse: 0.993414
[1000]	train's rmse: 0.606775	valid's rmse: 0.971232
[1500]	train's rmse: 0.493659	valid's rmse: 0.962977
[2000]	train's rmse: 0.40885	valid's rmse: 0.959549
[2500]	train's rmse: 0.342834	valid's rmse: 0.957967
[3000]	train's rmse: 0.289701	valid's rmse: 0.957499
[3500]	train's rmse: 0.246871	valid's rmse: 0.956934
[4000]	train's rmse: 0.211728	valid's rmse: 0.956263
[4500]	train's rmse: 0.182739	valid's rmse: 0.956328
Early stopping, best iteration is:
[4288]	train's rmse: 0.194256	valid's rmse: 0.956059
score:0.9560592254427311
fold 3




[LightGBM] [Info] Total Bins 156352
[LightGBM] [Info] Number of data points in the train set: 9621, number of used features: 864
[LightGBM] [Info] Start training from score 1.655344
Training until validation scores don't improve for 500 rounds
[500]	train's rmse: 0.775994	valid's rmse: 0.998373
[1000]	train's rmse: 0.606026	valid's rmse: 0.975948
[1500]	train's rmse: 0.492519	valid's rmse: 0.967056
[2000]	train's rmse: 0.408113	valid's rmse: 0.963188
[2500]	train's rmse: 0.342137	valid's rmse: 0.960951
[3000]	train's rmse: 0.289307	valid's rmse: 0.959508
[3500]	train's rmse: 0.246546	valid's rmse: 0.958349
[4000]	train's rmse: 0.211615	valid's rmse: 0.95759
[4500]	train's rmse: 0.182615	valid's rmse: 0.957469
[5000]	train's rmse: 0.15852	valid's rmse: 0.957152
[5500]	train's rmse: 0.138297	valid's rmse: 0.956848
[6000]	train's rmse: 0.121386	valid's rmse: 0.956801
[6500]	train's rmse: 0.107009	valid's rmse: 0.956612
[7000]	train's rmse: 0.0946986	valid's rmse: 0.956598
Early stopping, 



[LightGBM] [Info] Total Bins 155959
[LightGBM] [Info] Number of data points in the train set: 9621, number of used features: 864
[LightGBM] [Info] Start training from score 1.655451
Training until validation scores don't improve for 500 rounds
[500]	train's rmse: 0.778322	valid's rmse: 0.997716
[1000]	train's rmse: 0.608078	valid's rmse: 0.969847
[1500]	train's rmse: 0.494259	valid's rmse: 0.958981
[2000]	train's rmse: 0.409535	valid's rmse: 0.953631
[2500]	train's rmse: 0.343118	valid's rmse: 0.951094
[3000]	train's rmse: 0.290325	valid's rmse: 0.949989
[3500]	train's rmse: 0.247419	valid's rmse: 0.948781
[4000]	train's rmse: 0.211879	valid's rmse: 0.948186
[4500]	train's rmse: 0.182671	valid's rmse: 0.948132
[5000]	train's rmse: 0.158415	valid's rmse: 0.947827
[5500]	train's rmse: 0.138187	valid's rmse: 0.947635
[6000]	train's rmse: 0.121036	valid's rmse: 0.947421
[6500]	train's rmse: 0.10661	valid's rmse: 0.947235
Early stopping, best iteration is:
[6492]	train's rmse: 0.106829	vali

In [49]:
"""
N_SPLIT = 5
target_col = "likes"

_train = pd.concat([train.copy(), test.copy()], axis=0).reset_index(drop=True)
_test = test.copy()
_y = y.copy()
_y = pd.concat([_y, np.log1p(pd.read_csv("../output/start16/submission_lgbm_False.csv")["likes"])], axis=0).reset_index(drop=True)

assert _train.shape[0]==(train.shape[0] + test.shape[0])
assert _test.shape[0]==test.shape[0]

print(_train.shape, _y.shape)

splits = make_stratifirdkfolds(_y, N_SPLIT, SEED)
y_oof, y_pred, scores, feature_importances = run_lgbm_pseudo(_train, _y, _test, splits, params, 11, feature_selection=False)
"""

'\nN_SPLIT = 5\ntarget_col = "likes"\n\n_train = pd.concat([train.copy(), test.copy()], axis=0).reset_index(drop=True)\n_test = test.copy()\n_y = y.copy()\n_y = pd.concat([_y, np.log1p(pd.read_csv("../output/start16/submission_lgbm_False.csv")["likes"])], axis=0).reset_index(drop=True)\n\nassert _train.shape[0]==(train.shape[0] + test.shape[0])\nassert _test.shape[0]==test.shape[0]\n\nprint(_train.shape, _y.shape)\n\nsplits = make_stratifirdkfolds(_y, N_SPLIT, SEED)\ny_oof, y_pred, scores, feature_importances = run_lgbm_pseudo(_train, _y, _test, splits, params, 11, feature_selection=False)\n'

In [50]:
# trainでfit, testでpredict
# [0.9937712180369499, 0.9879254590058039, 0.9878856826781501, 0.9875887449174304, 0.986000326337547]
# score:0.988638303123017

# count encoderの追加とか, 変なlabel encoderの修正
# [0.99512793349549, 0.9760252704277805, 0.9839311150065218, 0.982252494283172, 0.9875356866796321]
# score:0.9849954729872427

## onehot tables
# [1.0045167537759587, 0.9772717821971039, 0.979420767098949, 0.9703726546630224, 0.9735351838633651]
# score:0.9811006329513632

# LGBM upgrade, 不適切なtableのcount encoding削除
# [0.9930255688202387, 0.9753716568338878, 0.9802319472423626, 0.974020652037711, 0.9700600237968715]
# score:0.9785753773934237

# colorでExGとか, some_feature()とか,
# principal makerでcount encoder ngram(1,1)大事, principal_firstに変えた途端CVは微改善でLB大幅down
# [0.9876207284842683, 0.9749697685627453, 0.9663708610364038, 0.9683445892252829, 0.9678386797339881]
# score:0.9730619935566562

# null feature, sentimentとか_len_percentageとか
# [0.9834260348382874, 0.9701033138631951, 0.9694680407449775, 0.9612780741465334, 0.9572818995892636]
#score:0.9683544691954975

# bagging 効きました
# [0.9800986986278741, 0.9620824474869699, 0.9583931837498129, 0.9591031389070643, 0.9521792020128582]
# score:0.9624190286090247

# ["dating_year_early", "dating_year_late"]のcount encoding ,  lazy_merge_tables_feature
# [0.9839995220002874, 0.9605324292607755, 0.9542514170305185, 0.9587683997897447, 0.9483622094005497]
# score:0.9612616178598731

"""
params2 = params.copy()
params2["objective"] = "regression"
splits = make_stratifirdkfolds(train_data, N_SPLIT, SEED)
y_oof, y_pred, scores, feature_importances = run_lgbm(train, y, test, splits, params2)
"""

'\nparams2 = params.copy()\nparams2["objective"] = "regression"\nsplits = make_stratifirdkfolds(train_data, N_SPLIT, SEED)\ny_oof, y_pred, scores, feature_importances = run_lgbm(train, y, test, splits, params2)\n'

In [None]:
plt.scatter(y, np.log1p(y_oof))

In [None]:
feature_importances

In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(12,4))
pd.Series(np.log1p(y_oof)).hist(ax=ax[0])
pd.Series(y).hist(ax=ax[1])

In [None]:
import seaborn as sns
DIS = 40
order = list(feature_importances["feature"])

plt.figure(figsize=(10, DIS * 4 / 10))
sns.barplot(x="average", y="feature", data=feature_importances.reset_index().head(DIS), order=order[:DIS])

In [None]:
feature_importances[feature_importances["feature"].str.contains("title", 
                                                                na=False
                                                               )]

In [None]:
feature_importances[feature_importances["average"] == 0]