In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection, metrics
import lightgbm as lgb


import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

color = sns.color_palette()
%matplotlib inline
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999

In [9]:
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [10]:
train_df = pd.read_csv("/Users/shuvrajit/ComputerScience/Dev/KaggleChallanges/downloads/DemandPrediction/train.csv", parse_dates=["activation_date"])
test_df = pd.read_csv("/Users/shuvrajit/ComputerScience/Dev/KaggleChallanges/downloads/DemandPrediction/test.csv", parse_dates=["activation_date"])
print("Train file rows and columns are : ", train_df.shape)
print("Test file rows and columns are : ", test_df.shape)

Train file rows and columns are :  (1503424, 18)
Test file rows and columns are :  (508438, 17)


In [8]:
train_y = train_df["deal_probability"].values
test_id = test_df["item_id"].values

# New variable on weekday #
train_df["activation_weekday"] = train_df["activation_date"].dt.weekday
test_df["activation_weekday"] = test_df["activation_date"].dt.weekday

# Label encode the categorical variables #
cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3"]
for col in cat_vars:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
    train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
    test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))



In [11]:
cols_to_drop = ["item_id", "user_id", "title", "activation_date", "image"]
train_X = train_df.drop(cols_to_drop + ["deal_probability"], axis=1)
test_X = test_df.drop(cols_to_drop, axis=1)

In [12]:
#Creating Word Vector for Description
EMBEDDING_FILE = f'/Users/shuvrajit/ComputerScience/Dev/KaggleChallanges/downloads/DemandPrediction/wiki.ru.vec'
embed_size = 300
max_features = 20000
maxlen = 300

In [13]:
list_sentences_train = train_X['description'].fillna('_na_').values
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_train = pad_sequences(list_tokenized_train, maxlen = maxlen)

In [15]:
#def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
#embeddings_index = dict(get_coefs(o.strip().split()[0], *o.strip().split()[1:]) for o in open(EMBEDDING_FILE))
embedding_index = {}
for o in open(EMBEDDING_FILE):
    vec = o.strip().split(' ')
    word = vec[0]
    arr = np.asarray(vec[1:], dtype='float32')
    if len(vec[1:]) == 300:
        embedding_index[word] = arr

In [17]:
i = 0
for key in embedding_index:
    i += 1
    if len(embedding_index[key]) != 300:
        print(key)
        break
#print(i, l)

In [18]:
all_embs = np.stack(embedding_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.003528327, 0.2954682)

In [19]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [22]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(300, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(300, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="sigmoid", name = 'my_layer')(x)
#layer_name = 'my_layer'
model = Model(inputs=inp, outputs=x)
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer(layer_name).output)

model.compile(loss='mean_squared_error', optimizer='adam')


In [35]:
intermediate_output = intermediate_layer_model.predict(X_train[1:5])
#X_train[0].shape

In [36]:
intermediate_output

array([[0.5145814 , 0.45168105, 0.44174707, 0.53815275, 0.49735138,
        0.5191435 , 0.438084  , 0.49662465, 0.43768388, 0.45061246,
        0.4574356 , 0.4744294 , 0.4641357 , 0.53288555, 0.5424755 ,
        0.43678963, 0.4772676 , 0.532655  , 0.55526876, 0.48597708,
        0.44658732, 0.50368047, 0.4931569 , 0.5274568 , 0.51022637,
        0.49541217, 0.5171523 , 0.42435357, 0.50572634, 0.47048974,
        0.45545664, 0.48040065, 0.49365968, 0.4708676 , 0.5717914 ,
        0.49891979, 0.48603195, 0.43684295, 0.50723636, 0.5246791 ,
        0.49917704, 0.4746427 , 0.4772224 , 0.52373767, 0.50451505,
        0.5182245 , 0.46885806, 0.52016264, 0.5173303 , 0.48977652],
       [0.48377913, 0.4701091 , 0.44298217, 0.53482664, 0.51578003,
        0.5222676 , 0.44861495, 0.49661312, 0.44453806, 0.46103522,
        0.4466633 , 0.46372524, 0.4509338 , 0.52669954, 0.53323054,
        0.43376154, 0.46445298, 0.5430966 , 0.5510078 , 0.49908578,
        0.45838836, 0.47341922, 0.51408046, 0.5

In [101]:
model.fit(X_train[:2400], train_y[:2400], epochs = 2, batch_size=16)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1ad88a2e10>

In [None]:
model.evaluate(X_train[3001:4600], train_y[3001:4600], batch_size = 16)

In [17]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        "bagging_frequency" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20, evals_result=evals_result)
    
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

In [18]:
# Splitting the data for model training#
dev_X = train_X.iloc[:-200000,:]
val_X = train_X.iloc[-200000:,:]
dev_y = train_y[:-200000]
val_y = train_y[-200000:]
print(dev_X.shape, val_X.shape, test_X.shape)

# Training the model #
pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, test_X)

# Making a submission file #
pred_test[pred_test>1] = 1
pred_test[pred_test<0] = 0
sub_df = pd.DataFrame({"item_id":test_id})
sub_df["deal_probability"] = pred_test
sub_df.to_csv("baseline_lgb.csv", index=False)

(1303424, 12) (200000, 12) (508438, 12)
Training until validation scores don't improve for 100 rounds.
[20]	valid_0's rmse: 0.236221
[40]	valid_0's rmse: 0.233507
[60]	valid_0's rmse: 0.232136
[80]	valid_0's rmse: 0.231493
[100]	valid_0's rmse: 0.231028
[120]	valid_0's rmse: 0.230668
[140]	valid_0's rmse: 0.230338
[160]	valid_0's rmse: 0.230086
[180]	valid_0's rmse: 0.229867
[200]	valid_0's rmse: 0.22968
[220]	valid_0's rmse: 0.229557
[240]	valid_0's rmse: 0.2294
[260]	valid_0's rmse: 0.229287
[280]	valid_0's rmse: 0.229155
[300]	valid_0's rmse: 0.229047
[320]	valid_0's rmse: 0.228926
[340]	valid_0's rmse: 0.228818
[360]	valid_0's rmse: 0.228743
[380]	valid_0's rmse: 0.228671
[400]	valid_0's rmse: 0.22859
[420]	valid_0's rmse: 0.228505
[440]	valid_0's rmse: 0.228418
[460]	valid_0's rmse: 0.228355
[480]	valid_0's rmse: 0.228291
[500]	valid_0's rmse: 0.228261
[520]	valid_0's rmse: 0.22821
[540]	valid_0's rmse: 0.22815
[560]	valid_0's rmse: 0.228073
[580]	valid_0's rmse: 0.228022
[600]	va