In [1]:
import warnings

warnings.filterwarnings('ignore')

import numpy as  np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
import string
import xgboost as xgb
import gc

nrows = None
sub = pd.read_csv("sample_submission.csv", nrows=nrows)
len_sub = len(sub)
print("Sample submission len {}".format(len_sub))

##############################################################################
NFOLDS = 5
SEED = 42

stopwords = {x: 1 for x in stopwords.words('russian')}

non_alphanums = re.compile(u'[^A-Za-z0-9]+')

non_alphanumpunct = re.compile(u'[^A-Za-z0-9\.?!,; \(\)\[\]\'\"\$]+')

RE_PUNCTUATION = '|'.join([re.escape(x) for x in string.punctuation])

import string

def normalize_text(text):
    text = text.lower().strip()
    for s in string.punctuation:
        text = text.replace(s, ' ')
    text = text.strip().split(' ')
    return u' '.join(x for x in text if len(x) > 1 and x not in stopwords)


# Load csv
print("\n[+] Load csv ")
train_df = pd.read_csv("train_with_image_features.csv", parse_dates=["activation_date"], nrows=nrows)

train_id = train_df["item_id"].values

test_df = pd.read_csv("test_with_image_features.csv", parse_dates=["activation_date"], nrows=nrows)

user_df = pd.read_csv('aggregated_features.csv', nrows=nrows)

train_df["description_norm"] = train_df["description"].apply(lambda x : normalize_text(str(x)))
test_df["description_norm"] = test_df["description"].apply(lambda x : normalize_text(str(x)))

train_df["title_norm"] = train_df["title"].apply(lambda x : normalize_text(str(x)))
test_df["title_norm"] = test_df["title"].apply(lambda x : normalize_text(str(x)))

y = train_df.deal_probability.copy()

ntr = len(train_df)
nte = len(test_df)

# Merge two dataframes
n_train = len(train_df)

oof_train = np.zeros((n_train,))

train_df['deal_probability'] = train_df['deal_probability'].astype('float64')

df = pd.concat([train_df, test_df], axis=0).set_index("item_id")

df['item_seq_number'] = df['item_seq_number'].astype('int64')
df['image_brightness'] = df['image_brightness'].astype('float64')
df['image_file_size'] = df['image_file_size'].astype('float64')
df['image_focus_measure'] = df['image_focus_measure'].astype('float64')
df['height'] = df['height'].astype('float64')
df['width'] = df['width'].astype('float64')

df = pd.merge(left=df, right=user_df, how="left", on=["user_id"])

del train_df, test_df
gc.collect()

cols_to_fill = ['description', 'param_1', 'param_2', 'param_3', 'description_norm']
df[cols_to_fill] = df[cols_to_fill].fillna(' ')

eps = 0.001

df['city'] = df['city'] + '_' + df['region']
df["price"] = np.log(df["price"].astype('float32') + eps)
df["price"].fillna(df["price"].mean(), inplace=True)
df["image_top_1"].fillna(df["image_top_1"].mean(), inplace=True)
df['avg_days_up_user'].fillna(-1, inplace=True)
df['avg_times_up_user'].fillna(-1, inplace=True)
df['n_user_items'].fillna(-1, inplace=True)

df['no_img'] = pd.isnull(df.image).astype(int)
df['no_dsc'] = pd.isnull(df.description).astype(int)
df['no_p1'] = pd.isnull(df.param_1).astype(int)
df['no_p2'] = pd.isnull(df.param_2).astype(int)
df['no_p3'] = pd.isnull(df.param_3).astype(int)
df['weekday'] = df['activation_date'].dt.weekday
df["item_seq_bin"] = df["item_seq_number"] // 100
df["ads_count"] = df.groupby("user_id", as_index=False)["user_id"].transform(lambda s: s.count())

textfeats1 = ['description', "title", 'param_1', 'param_2', 'param_3', 'description_norm', "title_norm"]
for col in textfeats1:
    df[col] = df[col].astype(str)
    df[col] = df[col].astype(str).fillna(' ')
    df[col] = df[col].str.lower()

textfeats = ['description', "title"]
for col in textfeats:
    df[col + '_num_words'] = df[col].apply(lambda s: len(s.split()))
    df[col + '_num_unique_words'] = df[col].apply(lambda s: len(set(w for w in s.split())))
    df[col + '_words_vs_unique'] = df[col + '_num_unique_words'] / df[col + '_num_words'] * 100
    df[col + '_num_lowE'] = df[col].str.count("[a-z]")
    df[col + '_num_lowR'] = df[col].str.count("[а-я]")
    df[col + '_num_pun'] = df[col].str.count("[[:punct:]]")
    df[col + '_num_dig'] = df[col].str.count("[[:digit:]]")

df['param_2'] = df['param_1'] + ' ' + df['param_2']
df['param_3'] = df['param_2'] + ' ' + df['param_3']

df['params'] = df['param_3'] + ' ' + df['title_norm']
df['text'] = df['description_norm'] + ' ' + df['title_norm']

names = ["city", "param_1", "user_id"]
for i in names:
    df.loc[df[i].value_counts()[df[i]].values < 100, i] = "Rare_value"

df.loc[df["image_top_1"].value_counts()[df["image_top_1"]].values < 200, "image_top_1"] = -1
df.loc[df["item_seq_number"].value_counts()[df["item_seq_number"]].values < 150, "item_seq_number"] = -1

cat_cols = ['user_id', 'region', 'city', 'category_name', "parent_category_name",
        'param_1', 'param_2', 'param_3', 'user_type',
        'weekday', 'ads_count']

for c in cat_cols:
    le = LabelEncoder()
    allvalues = np.unique(df[c].values).tolist()
    le.fit(allvalues)
    df[c] = le.transform(df[c].values)

df['image_top_1'] = df['image_top_1'].astype('float64')

X_train = df[:n_train]
X_test = df[n_train:]

del df
gc.collect()

class MakeBasicFeatures():
    def __init__(self, cols):
        self._stats = None
        self._agg_cols = cols

    def fit(self, df):

        self._stats = {}

        for c in tqdm(self._agg_cols, total=len(self._agg_cols)):

            gp = df.groupby(c)[['deal_probability', 'price']]
            desc = gp.describe()
            self._stats[c] = desc[[('deal_probability', 'mean'), ('deal_probability', 'std'),
                                   ('price', 'mean'), ('price', 'std')]]

    def transform(self, df):

        for c in tqdm(self._agg_cols, total=len(self._agg_cols)):

            df[c + '_dp_mean'] = df[c].map(self._stats[c][('deal_probability', 'mean')])
            df[c + '_dp_std'] = df[c].map(self._stats[c][('deal_probability', 'std')])
            df[c + '_price_mean'] = df[c].map(self._stats[c][('price', 'mean')])
            df[c + '_price_std'] = df[c].map(self._stats[c][('price', 'std')])

            df[c + '_to_price'] = df.price / df[c + '_price_mean']
            df[c + '_to_price'] = df[c + '_to_price'].fillna(1.0)

    def fit_transform(self, df):
        self.fit(df)
        self.transform(df)


fStats = MakeBasicFeatures(['region', 'city', 'parent_category_name', 'category_name',
                             'param_1', 'param_2', 'param_3', 'user_type', 'image_top_1',
                             'ads_count', 'weekday'])

###############################################################################

titles_tfidf = TfidfVectorizer(
    stop_words=stopwords,
    max_features=20000,
    norm='l2',
    sublinear_tf=True,
    smooth_idf=False,
    dtype=np.float32,
)

tr_titles = titles_tfidf.fit_transform(X_train.text)
te_titles = titles_tfidf.transform(X_test.text)

desc_tfidf = TfidfVectorizer(
    stop_words=stopwords,
    max_features=15000,
    norm='l2',
    sublinear_tf=True,
    smooth_idf=False,
    dtype=np.float32,
)

tr_desc = desc_tfidf.fit_transform(X_train.params)
te_desc = desc_tfidf.transform(X_test.params)

gc.collect()

from sklearn.model_selection import KFold

skf = KFold(n_splits=10, random_state=5555513, shuffle=True)

preds_all = []

for fold, (train_index, val_index) in enumerate(skf.split(X_train)):

    print('[+] Fold ', fold)

    train_data = X_train.iloc[train_index]
    validation_data = X_train.iloc[val_index]

    fStats.fit_transform(train_data)
    fStats.transform(validation_data)
    fStats.transform(X_test)

    tr_titles = titles_tfidf.fit_transform(train_data.text)
    va_titles = titles_tfidf.transform(validation_data.text)
    te_titles = titles_tfidf.transform(X_test.text)

    tr_desc = desc_tfidf.fit_transform(train_data.params)
    va_desc = desc_tfidf.transform(validation_data.params)
    te_desc = desc_tfidf.transform(X_test.params)

    columns_to_drop = ['title', 'description', 'params', 'image',
                   'activation_date', 'deal_probability', 'title_norm', 'description_norm', 'text']

    X_tr = hstack([csr_matrix(train_data.drop(columns_to_drop, axis=1)), tr_titles, tr_desc])
    y_tr = train_data['deal_probability']
    del tr_titles, tr_desc, train_data

    gc.collect()
    X_va = hstack([csr_matrix(validation_data.drop(columns_to_drop, axis=1)), va_titles, va_desc])
    y_va = validation_data['deal_probability']
    del va_titles, va_desc, validation_data
    gc.collect()
    X_te = hstack([csr_matrix(X_test.drop(columns_to_drop, axis=1)), te_titles, te_desc])

    del te_titles, te_desc

    gc.collect()

    params = {
        'objective': 'reg:logistic',
        'booster': "gbtree",
        'eval_metric': "rmse",
        'gpu_id': 0,
        'max_depth': 21,
        'eta': 0.05,
        'min_child_weight': 11,
        'gamma': 0,
        'subsample': 0.85,
        'colsample_bytree': 0.7,
        'silent': True,
        'alpha': 2.0,
        'lambda': 0,
        'nthread': 32,
    }

    xg_train = xgb.DMatrix(X_tr, label=y_tr)
    xg_val = xgb.DMatrix(X_va, label=y_va)
    xg_test = xgb.DMatrix(X_te)
    acc_res = {}
    watchlist = [(xg_train, 'train'), (xg_val, 'val')]
    num_round = 500000
    bst = xgb.train(params, xg_train, num_round, evals=watchlist, early_stopping_rounds=200, evals_result=acc_res,verbose_eval=50)
    pred = bst.predict(xg_test)
    oof_train[val_index] = bst.predict(xg_val)

    preds_all.append(pred)

preds_all = np.array(preds_all)
preds_avg = np.mean(preds_all, axis=0)

sub['deal_probability'] = preds_avg
sub['deal_probability'] = sub['deal_probability'].clip(0.0, 1.0)
sub.to_csv('output/xgboost_submission.csv', index=False)

oof_train = oof_train.reshape(-1, 1)
#oof_train = np.clip(oof_train, 0.0, 1.0)

sub_df_train = pd.DataFrame({"item_id": train_id})
sub_df_train["deal_probability"] = oof_train
sub_df_train.to_csv("xgboost_oof.csv", index=False)

[nltk_data] Downloading package stopwords to /home/prune/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Sample submission len 508438

[+] Load csv 
[+] Fold  0


100%|██████████| 11/11 [00:43<00:00,  3.91s/it]
100%|██████████| 11/11 [00:13<00:00,  1.19s/it]
100%|██████████| 11/11 [00:07<00:00,  1.41it/s]
100%|██████████| 11/11 [00:02<00:00,  4.97it/s]


[0]	train-rmse:0.428244	val-rmse:0.428499
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.206374	val-rmse:0.22246
[100]	train-rmse:0.192583	val-rmse:0.218663
[150]	train-rmse:0.188175	val-rmse:0.218069
[200]	train-rmse:0.185963	val-rmse:0.217744
[250]	train-rmse:0.184399	val-rmse:0.217519
[300]	train-rmse:0.183068	val-rmse:0.217324
[350]	train-rmse:0.181758	val-rmse:0.217183
[400]	train-rmse:0.180831	val-rmse:0.217053
[450]	train-rmse:0.180019	val-rmse:0.216947
[500]	train-rmse:0.179164	val-rmse:0.216852
[550]	train-rmse:0.178012	val-rmse:0.21674
[600]	train-rmse:0.177318	val-rmse:0.216654
[650]	train-rmse:0.17646	val-rmse:0.216584
[700]	train-rmse:0.175666	val-rmse:0.216513
[750]	train-rmse:0.175084	val-rmse:0.216455
[800]	train-rmse:0.174488	val-rmse:0.216384
[850]	train-rmse:0.173727	val-rmse:0.216326
[900]	train-rmse:0.173125	val-rmse:0.216274
[950]	train-rmse:0.172526	va

100%|██████████| 11/11 [00:42<00:00,  3.87s/it]
100%|██████████| 11/11 [00:12<00:00,  1.14s/it]
100%|██████████| 11/11 [00:07<00:00,  1.49it/s]
100%|██████████| 11/11 [00:02<00:00,  5.30it/s]


[0]	train-rmse:0.428268	val-rmse:0.428268
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.206206	val-rmse:0.223045
[100]	train-rmse:0.19193	val-rmse:0.219208
[150]	train-rmse:0.187961	val-rmse:0.218681
[200]	train-rmse:0.186008	val-rmse:0.218349
[250]	train-rmse:0.184404	val-rmse:0.218097
[300]	train-rmse:0.183261	val-rmse:0.217901
[350]	train-rmse:0.182138	val-rmse:0.217748
[400]	train-rmse:0.180899	val-rmse:0.21758
[450]	train-rmse:0.17988	val-rmse:0.217483
[500]	train-rmse:0.179148	val-rmse:0.217381
[550]	train-rmse:0.178207	val-rmse:0.217289
[600]	train-rmse:0.177403	val-rmse:0.217202
[650]	train-rmse:0.176682	val-rmse:0.217127
[700]	train-rmse:0.17603	val-rmse:0.217054
[750]	train-rmse:0.175367	val-rmse:0.216981
[800]	train-rmse:0.174707	val-rmse:0.216911
[850]	train-rmse:0.174051	val-rmse:0.216839
[900]	train-rmse:0.173532	val-rmse:0.216807
[950]	train-rmse:0.172811	val

100%|██████████| 11/11 [00:42<00:00,  3.90s/it]
100%|██████████| 11/11 [00:12<00:00,  1.15s/it]
100%|██████████| 11/11 [00:07<00:00,  1.45it/s]
100%|██████████| 11/11 [00:02<00:00,  5.18it/s]


[0]	train-rmse:0.428254	val-rmse:0.428327
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.206268	val-rmse:0.22273
[100]	train-rmse:0.192193	val-rmse:0.218991
[150]	train-rmse:0.188059	val-rmse:0.2184
[200]	train-rmse:0.186035	val-rmse:0.218094
[250]	train-rmse:0.184321	val-rmse:0.217883
[300]	train-rmse:0.183031	val-rmse:0.217659
[350]	train-rmse:0.181902	val-rmse:0.217504
[400]	train-rmse:0.180929	val-rmse:0.217396
[450]	train-rmse:0.179703	val-rmse:0.217275
[500]	train-rmse:0.178733	val-rmse:0.217183
[550]	train-rmse:0.177949	val-rmse:0.21709
[600]	train-rmse:0.177008	val-rmse:0.217012
[650]	train-rmse:0.176196	val-rmse:0.216949
[700]	train-rmse:0.175618	val-rmse:0.216894
[750]	train-rmse:0.175087	val-rmse:0.216836
[800]	train-rmse:0.174465	val-rmse:0.216766
[850]	train-rmse:0.173875	val-rmse:0.216715
[900]	train-rmse:0.173376	val-rmse:0.216662
[950]	train-rmse:0.172844	val

100%|██████████| 11/11 [00:42<00:00,  3.91s/it]
100%|██████████| 11/11 [00:13<00:00,  1.20s/it]
100%|██████████| 11/11 [00:07<00:00,  1.42it/s]
100%|██████████| 11/11 [00:02<00:00,  5.12it/s]


[0]	train-rmse:0.428258	val-rmse:0.428331
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.206168	val-rmse:0.222629
[100]	train-rmse:0.191711	val-rmse:0.218946
[150]	train-rmse:0.187657	val-rmse:0.218384
[200]	train-rmse:0.185959	val-rmse:0.218116
[250]	train-rmse:0.18381	val-rmse:0.21785
[300]	train-rmse:0.182483	val-rmse:0.21767
[350]	train-rmse:0.181178	val-rmse:0.217509
[400]	train-rmse:0.180348	val-rmse:0.217396
[450]	train-rmse:0.179438	val-rmse:0.217261
[500]	train-rmse:0.178641	val-rmse:0.217157
[550]	train-rmse:0.177938	val-rmse:0.217072
[600]	train-rmse:0.176896	val-rmse:0.216975
[650]	train-rmse:0.176243	val-rmse:0.216907
[700]	train-rmse:0.175432	val-rmse:0.216829
[750]	train-rmse:0.174636	val-rmse:0.21675
[800]	train-rmse:0.17393	val-rmse:0.216705
[850]	train-rmse:0.17312	val-rmse:0.21665
[900]	train-rmse:0.172619	val-rmse:0.216603
[950]	train-rmse:0.172022	val-rm

100%|██████████| 11/11 [00:43<00:00,  3.94s/it]
100%|██████████| 11/11 [00:12<00:00,  1.15s/it]
100%|██████████| 11/11 [00:07<00:00,  1.44it/s]
100%|██████████| 11/11 [00:02<00:00,  5.11it/s]


[0]	train-rmse:0.428218	val-rmse:0.428836
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.206463	val-rmse:0.222076
[100]	train-rmse:0.19277	val-rmse:0.218203
[150]	train-rmse:0.188083	val-rmse:0.217597
[200]	train-rmse:0.185905	val-rmse:0.21723
[250]	train-rmse:0.184568	val-rmse:0.216992
[300]	train-rmse:0.183321	val-rmse:0.216813
[350]	train-rmse:0.182219	val-rmse:0.216646
[400]	train-rmse:0.180875	val-rmse:0.216477
[450]	train-rmse:0.180241	val-rmse:0.216384
[500]	train-rmse:0.179412	val-rmse:0.216261
[550]	train-rmse:0.178807	val-rmse:0.216161
[600]	train-rmse:0.177962	val-rmse:0.216061
[650]	train-rmse:0.177215	val-rmse:0.216003
[700]	train-rmse:0.176318	val-rmse:0.215907
[750]	train-rmse:0.17538	val-rmse:0.215832
[800]	train-rmse:0.174798	val-rmse:0.215783
[850]	train-rmse:0.174182	val-rmse:0.21573
[900]	train-rmse:0.173575	val-rmse:0.215659
[950]	train-rmse:0.172827	val

100%|██████████| 11/11 [00:43<00:00,  3.91s/it]
100%|██████████| 11/11 [00:12<00:00,  1.15s/it]
100%|██████████| 11/11 [00:07<00:00,  1.45it/s]
100%|██████████| 11/11 [00:02<00:00,  5.19it/s]


[0]	train-rmse:0.428268	val-rmse:0.428518
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.206425	val-rmse:0.22187
[100]	train-rmse:0.192513	val-rmse:0.217961
[150]	train-rmse:0.187925	val-rmse:0.217333
[200]	train-rmse:0.186154	val-rmse:0.217022
[250]	train-rmse:0.184939	val-rmse:0.216797
[300]	train-rmse:0.183712	val-rmse:0.216613
[350]	train-rmse:0.182783	val-rmse:0.216461
[400]	train-rmse:0.181792	val-rmse:0.216334
[450]	train-rmse:0.180974	val-rmse:0.216206
[500]	train-rmse:0.179983	val-rmse:0.216087
[550]	train-rmse:0.17938	val-rmse:0.216012
[600]	train-rmse:0.178591	val-rmse:0.215907
[650]	train-rmse:0.177698	val-rmse:0.215827
[700]	train-rmse:0.17688	val-rmse:0.215739
[750]	train-rmse:0.176227	val-rmse:0.215675
[800]	train-rmse:0.175597	val-rmse:0.215614
[850]	train-rmse:0.174961	val-rmse:0.215561
[900]	train-rmse:0.174388	val-rmse:0.215488
[950]	train-rmse:0.173793	va

100%|██████████| 11/11 [00:43<00:00,  3.98s/it]
100%|██████████| 11/11 [00:13<00:00,  1.23s/it]
100%|██████████| 11/11 [00:08<00:00,  1.32it/s]
100%|██████████| 11/11 [00:02<00:00,  5.15it/s]


[0]	train-rmse:0.428236	val-rmse:0.428767
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.206363	val-rmse:0.223255
[100]	train-rmse:0.192071	val-rmse:0.219386
[150]	train-rmse:0.18747	val-rmse:0.2188
[200]	train-rmse:0.185373	val-rmse:0.21852
[250]	train-rmse:0.183922	val-rmse:0.218308
[300]	train-rmse:0.182773	val-rmse:0.21812
[350]	train-rmse:0.181749	val-rmse:0.21799
[400]	train-rmse:0.18054	val-rmse:0.217876
[450]	train-rmse:0.179772	val-rmse:0.217786
[500]	train-rmse:0.178776	val-rmse:0.217673
[550]	train-rmse:0.178085	val-rmse:0.217581
[600]	train-rmse:0.177165	val-rmse:0.217509
[650]	train-rmse:0.176396	val-rmse:0.217447
[700]	train-rmse:0.175599	val-rmse:0.217403
[750]	train-rmse:0.174911	val-rmse:0.217349
[800]	train-rmse:0.174062	val-rmse:0.217296
[850]	train-rmse:0.173379	val-rmse:0.217253
[900]	train-rmse:0.172748	val-rmse:0.217202
[950]	train-rmse:0.172211	val-rm

100%|██████████| 11/11 [00:43<00:00,  3.96s/it]
100%|██████████| 11/11 [00:12<00:00,  1.14s/it]
100%|██████████| 11/11 [00:07<00:00,  1.46it/s]
100%|██████████| 11/11 [00:02<00:00,  5.23it/s]


[0]	train-rmse:0.4283	val-rmse:0.428157
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.206337	val-rmse:0.222755
[100]	train-rmse:0.192268	val-rmse:0.218973
[150]	train-rmse:0.1876	val-rmse:0.218336
[200]	train-rmse:0.185835	val-rmse:0.218014
[250]	train-rmse:0.18434	val-rmse:0.217731
[300]	train-rmse:0.183043	val-rmse:0.217542
[400]	train-rmse:0.181327	val-rmse:0.217293
[450]	train-rmse:0.180452	val-rmse:0.217208
[500]	train-rmse:0.179502	val-rmse:0.217099
[600]	train-rmse:0.177711	val-rmse:0.216919
[650]	train-rmse:0.176866	val-rmse:0.216843
[700]	train-rmse:0.176047	val-rmse:0.216768
[750]	train-rmse:0.175379	val-rmse:0.216706
[800]	train-rmse:0.174836	val-rmse:0.21665
[850]	train-rmse:0.174227	val-rmse:0.216601
[900]	train-rmse:0.173583	val-rmse:0.216555
[950]	train-rmse:0.172933	val-rmse:0.216502
[1000]	train-rmse:0.172368	val-rmse:0.216472
[1050]	train-rmse:0.171724	val

100%|██████████| 11/11 [00:42<00:00,  3.91s/it]
100%|██████████| 11/11 [00:12<00:00,  1.14s/it]
100%|██████████| 11/11 [00:07<00:00,  1.46it/s]
100%|██████████| 11/11 [00:02<00:00,  5.31it/s]


[0]	train-rmse:0.428288	val-rmse:0.428308
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.206384	val-rmse:0.22296
[100]	train-rmse:0.192587	val-rmse:0.219144
[150]	train-rmse:0.188344	val-rmse:0.218485
[200]	train-rmse:0.186425	val-rmse:0.21819
[250]	train-rmse:0.184638	val-rmse:0.217904
[300]	train-rmse:0.183243	val-rmse:0.217717
[350]	train-rmse:0.182298	val-rmse:0.217584
[400]	train-rmse:0.181192	val-rmse:0.21744
[450]	train-rmse:0.180342	val-rmse:0.217329
[500]	train-rmse:0.179383	val-rmse:0.217226
[550]	train-rmse:0.17843	val-rmse:0.217143
[600]	train-rmse:0.177725	val-rmse:0.217085
[650]	train-rmse:0.17697	val-rmse:0.217003
[700]	train-rmse:0.176271	val-rmse:0.216925
[750]	train-rmse:0.175618	val-rmse:0.216869
[800]	train-rmse:0.174876	val-rmse:0.216811
[850]	train-rmse:0.17397	val-rmse:0.216744
[900]	train-rmse:0.173394	val-rmse:0.216691
[950]	train-rmse:0.172704	val-r

100%|██████████| 11/11 [00:43<00:00,  3.94s/it]
100%|██████████| 11/11 [00:12<00:00,  1.15s/it]
100%|██████████| 11/11 [00:07<00:00,  1.45it/s]
100%|██████████| 11/11 [00:02<00:00,  5.21it/s]


[0]	train-rmse:0.428241	val-rmse:0.428613
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[50]	train-rmse:0.206389	val-rmse:0.222869
[100]	train-rmse:0.192405	val-rmse:0.21906
[150]	train-rmse:0.188139	val-rmse:0.218452
[200]	train-rmse:0.185966	val-rmse:0.218092
[250]	train-rmse:0.184793	val-rmse:0.2179
[300]	train-rmse:0.183567	val-rmse:0.217713
[350]	train-rmse:0.182555	val-rmse:0.217557
[400]	train-rmse:0.181424	val-rmse:0.217414
[450]	train-rmse:0.180583	val-rmse:0.217292
[500]	train-rmse:0.17974	val-rmse:0.217179
[550]	train-rmse:0.179059	val-rmse:0.217113
[600]	train-rmse:0.178158	val-rmse:0.217011
[650]	train-rmse:0.177477	val-rmse:0.21693
[700]	train-rmse:0.176707	val-rmse:0.21686
[750]	train-rmse:0.176148	val-rmse:0.216802
[800]	train-rmse:0.175549	val-rmse:0.216747
[850]	train-rmse:0.174684	val-rmse:0.216695
[900]	train-rmse:0.173749	val-rmse:0.216635
[950]	train-rmse:0.173155	val-r

In [5]:
train_df = pd.read_csv("train_with_image_features.csv", parse_dates=["activation_date"], nrows=nrows)


In [10]:
desc_1 = train_df.groupby('region')[['deal_probability', 'price']].describe()

In [14]:
train_df['region'].map(desc_1[('deal_probability', 'mean')])

0          0.122004
1          0.136721
2          0.135944
3          0.142602
4          0.145908
5          0.142602
6          0.146608
7          0.143685
8          0.155921
9          0.146608
10         0.143696
11         0.124193
12         0.155921
13         0.135944
14         0.148859
15         0.142602
16         0.147066
17         0.124689
18         0.129185
19         0.145603
20         0.124689
21         0.122004
22         0.143229
23         0.135944
24         0.146608
25         0.141007
26         0.136721
27         0.135480
28         0.145991
29         0.146608
             ...   
1503394    0.135944
1503395    0.139679
1503396    0.143882
1503397    0.128844
1503398    0.120357
1503399    0.145908
1503400    0.141007
1503401    0.143696
1503402    0.136721
1503403    0.146608
1503404    0.136721
1503405    0.131574
1503406    0.147066
1503407    0.122004
1503408    0.145991
1503409    0.141007
1503410    0.147066
1503411    0.136257
1503412    0.120357


In [15]:
desc_1

Unnamed: 0_level_0,deal_probability,deal_probability,deal_probability,deal_probability,deal_probability,deal_probability,deal_probability,deal_probability,price,price,price,price,price,price,price,price
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Алтайский край,41520.0,0.136257,0.256975,0.0,0.0,0.0,0.14983,1.0,39046.0,138424.7,1356891.0,0.0,400.0,1000.0,4000.0,150000000.0
Башкортостан,68291.0,0.148859,0.260909,0.0,0.0,0.0,0.19291,1.0,62700.0,197320.1,1938338.0,0.0,500.0,1200.0,7000.0,370000000.0
Белгородская область,28868.0,0.145603,0.263102,0.0,0.0,0.0,0.17092,1.0,26599.0,262343.9,4433644.0,0.0,500.0,1490.0,8000.0,400500700.0
Владимирская область,26741.0,0.140988,0.264097,0.0,0.0,0.0,0.14983,1.0,25189.0,214394.8,2967067.0,0.0,450.0,1100.0,8000.0,400100300.0
Волгоградская область,48998.0,0.145908,0.267685,0.0,0.0,0.0,0.16155,1.0,45405.0,155699.4,1204021.0,0.0,400.0,1000.0,6000.0,152000000.0
Воронежская область,44116.0,0.141007,0.265426,0.0,0.0,0.0,0.14689,1.0,41613.0,159444.3,1060954.0,0.0,500.0,1500.0,8000.0,120000000.0
Иркутская область,44030.0,0.139679,0.255214,0.0,0.0,0.0,0.160187,1.0,41363.0,2141739.0,390907100.0,0.0,500.0,1500.0,6500.0,79501010000.0
Калининградская область,32756.0,0.131574,0.262747,0.0,0.0,0.0,0.1243,1.0,31138.0,203655.8,1506144.0,0.0,500.0,1200.0,5700.0,100015000.0
Кемеровская область,44635.0,0.13548,0.25708,0.0,0.0,0.0,0.14502,1.0,42137.0,180450.3,7838382.0,0.0,450.0,1000.0,5000.0,1600025000.0
Краснодарский край,141416.0,0.147066,0.258484,0.0,0.0,0.0,0.18059,1.0,132313.0,846663.9,8417655.0,0.0,600.0,3000.0,32000.0,1170000000.0


In [4]:
class MakeBasicFeatures():
    def __init__(self, cols):
        self._stats = None
        self._agg_cols = cols

    def fit(self, df):

        self._stats = {}

        for c in tqdm(self._agg_cols, total=len(self._agg_cols)):

            gp = df.groupby(c)[['deal_probability', 'price']]
            desc = gp.describe()
            self._stats[c] = desc[[('deal_probability', 'mean'), ('deal_probability', 'std'),
                                   ('price', 'mean'), ('price', 'std')]]

    def transform(self, df):

        for c in tqdm(self._agg_cols, total=len(self._agg_cols)):

            df[c + '_dp_mean'] = df[c].map(self._stats[c][('deal_probability', 'mean')])
            df[c + '_dp_std'] = df[c].map(self._stats[c][('deal_probability', 'std')])
            df[c + '_price_mean'] = df[c].map(self._stats[c][('price', 'mean')])
            df[c + '_price_std'] = df[c].map(self._stats[c][('price', 'std')])

            df[c + '_to_price'] = df.price / df[c + '_price_mean']
            df[c + '_to_price'] = df[c + '_to_price'].fillna(1.0)

    def fit_transform(self, df):
        self.fit(df)
        self.transform(df)


fStats = MakeBasicFeatures(['region', 'city', 'parent_category_name', 'category_name',
                             'param_1', 'param_2', 'param_3', 'user_type', 'image_top_1',
                             'ads_count', 'weekday'])