In [29]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection, metrics
import lightgbm as lgb


import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

color = sns.color_palette()
%matplotlib inline
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
train_df = pd.read_csv("data/train.csv", parse_dates=["activation_date"])
test_df = pd.read_csv("data/test.csv", parse_dates=["activation_date"])
print("Train file rows and columns are : ", train_df.shape)
print("Test file rows and columns are : ", test_df.shape)

Train file rows and columns are :  (1503424, 18)
Test file rows and columns are :  (508438, 17)


In [8]:
train_y = train_df["deal_probability"].values
test_id = test_df["item_id"].values

# New variable on weekday #
train_df["activation_weekday"] = train_df["activation_date"].dt.weekday
test_df["activation_weekday"] = test_df["activation_date"].dt.weekday

# Label encode the categorical variables #
cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3"]
for col in cat_vars:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
    train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
    test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))



In [11]:
cols_to_drop = ["item_id", "user_id", "title", "activation_date", "image"]
train_X = train_df.drop(cols_to_drop + ["deal_probability"], axis=1)
test_X = test_df.drop(cols_to_drop, axis=1)

In [36]:
#Creating Word Vector for Description
EMBEDDING_FILE = f'wiki.ru.vec'
embed_size = 300
max_features = 20000
maxlen = 300

In [42]:
list_sentences_train = train_X['description'].fillna('_na_').values
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_train = pad_sequences(list_tokenized_train, maxlen = maxlen)

In [53]:
#def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
#embeddings_index = dict(get_coefs(o.strip().split()[0], *o.strip().split()[1:]) for o in open(EMBEDDING_FILE))
embedding_index = {}
for o in open(EMBEDDING_FILE):
    vec = o.strip().split(' ')
    word = vec[0]
    arr = np.asarray(vec[1:], dtype='float32')
    embedding_index[word] = arr

In [51]:
i = 1
for o in open(EMBEDDING_FILE):
    i += 1
    print(o.strip().split())
    if i == 5:
        break

['1888423', '300']
['</s>', '0.0052262', '0.20497', '0.096731', '0.047762', '0.04126', '0.055935', '-0.039348', '-0.14709', '0.15184', '0.17101', '-0.018269', '0.096936', '0.22801', '0.10778', '-0.09184', '-0.36971', '-0.15413', '-0.32771', '0.26368', '0.42271', '0.25759', '0.012656', '-0.26443', '-0.07946', '0.032226', '-0.42462', '0.12959', '0.015581', '-0.1945', '-0.16886', '-0.08058', '-0.11546', '0.254', '-0.10331', '-0.04658', '0.092436', '-0.34151', '-0.10176', '-0.077935', '0.26197', '-0.12292', '0.097782', '0.012892', '0.015233', '0.07506', '0.013053', '-0.10969', '0.053725', '0.04215', '0.15373', '0.052467', '0.27102', '0.20063', '-0.17201', '0.42024', '0.20941', '0.17578', '0.060628', '0.17401', '-0.20136', '-0.10175', '-0.27376', '0.021373', '0.055022', '0.20643', '0.31438', '-0.18813', '-0.18306', '0.23244', '-0.099184', '0.03191', '0.010869', '0.1723', '-0.11144', '-0.0047661', '-0.12414', '-0.079561', '0.16612', '0.40702', '-0.14031', '0.25533', '0.22604', '0.24683', '0.

In [17]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        "bagging_frequency" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20, evals_result=evals_result)
    
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

In [18]:
# Splitting the data for model training#
dev_X = train_X.iloc[:-200000,:]
val_X = train_X.iloc[-200000:,:]
dev_y = train_y[:-200000]
val_y = train_y[-200000:]
print(dev_X.shape, val_X.shape, test_X.shape)

# Training the model #
pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, test_X)

# Making a submission file #
pred_test[pred_test>1] = 1
pred_test[pred_test<0] = 0
sub_df = pd.DataFrame({"item_id":test_id})
sub_df["deal_probability"] = pred_test
sub_df.to_csv("baseline_lgb.csv", index=False)

(1303424, 12) (200000, 12) (508438, 12)
Training until validation scores don't improve for 100 rounds.
[20]	valid_0's rmse: 0.236221
[40]	valid_0's rmse: 0.233507
[60]	valid_0's rmse: 0.232136
[80]	valid_0's rmse: 0.231493
[100]	valid_0's rmse: 0.231028
[120]	valid_0's rmse: 0.230668
[140]	valid_0's rmse: 0.230338
[160]	valid_0's rmse: 0.230086
[180]	valid_0's rmse: 0.229867
[200]	valid_0's rmse: 0.22968
[220]	valid_0's rmse: 0.229557
[240]	valid_0's rmse: 0.2294
[260]	valid_0's rmse: 0.229287
[280]	valid_0's rmse: 0.229155
[300]	valid_0's rmse: 0.229047
[320]	valid_0's rmse: 0.228926
[340]	valid_0's rmse: 0.228818
[360]	valid_0's rmse: 0.228743
[380]	valid_0's rmse: 0.228671
[400]	valid_0's rmse: 0.22859
[420]	valid_0's rmse: 0.228505
[440]	valid_0's rmse: 0.228418
[460]	valid_0's rmse: 0.228355
[480]	valid_0's rmse: 0.228291
[500]	valid_0's rmse: 0.228261
[520]	valid_0's rmse: 0.22821
[540]	valid_0's rmse: 0.22815
[560]	valid_0's rmse: 0.228073
[580]	valid_0's rmse: 0.228022
[600]	va