In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection, metrics
import lightgbm as lgb

color = sns.color_palette()
%matplotlib inline

#import plotly.offline as py
#py.init_notebook_mode(connected=True)
#import plotly.graph_objs as go
#import plotly.tools as tls

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999

In [4]:
train_df = pd.read_csv("data/train.csv", parse_dates=["activation_date"])
test_df = pd.read_csv("data/test.csv", parse_dates=["activation_date"])
print("Train file rows and columns are : ", train_df.shape)
print("Test file rows and columns are : ", test_df.shape)

Train file rows and columns are :  (1503424, 18)
Test file rows and columns are :  (508438, 17)


In [9]:
train_y = train_df["deal_probability"].values
test_id = test_df["item_id"].values

# New variable on weekday #
train_df["activation_weekday"] = train_df["activation_date"].dt.weekday
test_df["activation_weekday"] = test_df["activation_date"].dt.weekday

# Label encode the categorical variables #
cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3"]
for col in cat_vars:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
    train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
    test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))

cols_to_drop = ["item_id", "user_id", "title", "description", "activation_date", "image"]


In [12]:
train_X = train_df.drop(cols_to_drop + ["deal_probability"], axis=1)
test_X = test_df.drop(cols_to_drop, axis=1)

In [17]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        "bagging_frequency" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20, evals_result=evals_result)
    
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

In [18]:
# Splitting the data for model training#
dev_X = train_X.iloc[:-200000,:]
val_X = train_X.iloc[-200000:,:]
dev_y = train_y[:-200000]
val_y = train_y[-200000:]
print(dev_X.shape, val_X.shape, test_X.shape)

# Training the model #
pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, test_X)

# Making a submission file #
pred_test[pred_test>1] = 1
pred_test[pred_test<0] = 0
sub_df = pd.DataFrame({"item_id":test_id})
sub_df["deal_probability"] = pred_test
sub_df.to_csv("baseline_lgb.csv", index=False)

(1303424, 12) (200000, 12) (508438, 12)
Training until validation scores don't improve for 100 rounds.
[20]	valid_0's rmse: 0.236221
[40]	valid_0's rmse: 0.233507
[60]	valid_0's rmse: 0.232136
[80]	valid_0's rmse: 0.231493
[100]	valid_0's rmse: 0.231028
[120]	valid_0's rmse: 0.230668
[140]	valid_0's rmse: 0.230338
[160]	valid_0's rmse: 0.230086
[180]	valid_0's rmse: 0.229867
[200]	valid_0's rmse: 0.22968
[220]	valid_0's rmse: 0.229557
[240]	valid_0's rmse: 0.2294
[260]	valid_0's rmse: 0.229287
[280]	valid_0's rmse: 0.229155
[300]	valid_0's rmse: 0.229047
[320]	valid_0's rmse: 0.228926
[340]	valid_0's rmse: 0.228818
[360]	valid_0's rmse: 0.228743
[380]	valid_0's rmse: 0.228671
[400]	valid_0's rmse: 0.22859
[420]	valid_0's rmse: 0.228505
[440]	valid_0's rmse: 0.228418
[460]	valid_0's rmse: 0.228355
[480]	valid_0's rmse: 0.228291
[500]	valid_0's rmse: 0.228261
[520]	valid_0's rmse: 0.22821
[540]	valid_0's rmse: 0.22815
[560]	valid_0's rmse: 0.228073
[580]	valid_0's rmse: 0.228022
[600]	va