In [1]:
# coding:utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, metrics
from datetime import datetime as dt
from sklearn.preprocessing import LabelEncoder

# pandas
import pandas as pd
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", 1000)

# LightGBM
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 交差検証
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# AUC
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

#catboost
from catboost import CatBoostClassifier, Pool

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
purchase_df = pd.read_csv('../input/purchase_record.csv')
user_df = pd.read_csv('../input/user_info.csv')
test_df = pd.read_csv('../input/purchase_record_test.csv')

In [4]:
purchase_df.fillna(0, inplace=True)

In [5]:
### FREQUENCY ENCODING

encoding = purchase_df.groupby('product_id').size()
encoding = encoding/len(purchase_df)
purchase_df['enc_product_id'] = purchase_df.product_id.map(encoding)

encoding = test_df.groupby('product_id').size()
encoding = encoding/len(test_df)
test_df['enc_product_id'] = test_df.product_id.map(encoding)

In [6]:
encoding = user_df.groupby('attribute_1').size()
encoding = encoding/len(user_df)
user_df['enc_attribute_1'] = user_df.attribute_1.map(encoding)

encoding = user_df.groupby('attribute_2').size()
encoding = encoding/len(user_df)
user_df['enc_attribute_2'] = user_df.attribute_2.map(encoding)

encoding = user_df.groupby('attribute_3').size()
encoding = encoding/len(user_df)
user_df['enc_attribute_3'] = user_df.attribute_3.map(encoding)

In [7]:
train_df = pd.merge(purchase_df, user_df, how='left', on='user_id')

In [8]:
train_df['date_diff'] = (pd.to_datetime(train_df['date_x']) - pd.to_datetime(train_df['date_y'])).dt.days

In [9]:
train_df['date_x_dt'] = pd.to_datetime(train_df['date_x'])
train_df['date_y_dt'] = pd.to_datetime(train_df['date_y'])

train_df['year_x'] = train_df['date_x_dt'].dt.to_period('Y').astype(int).astype(float)
train_df['month_x'] = train_df['date_x_dt'].dt.to_period('M').astype(int).astype(float)
train_df['day_x'] = train_df['date_x_dt'].dt.to_period('D').astype(int).astype(float)
train_df['year_y'] = train_df['date_y_dt'].dt.to_period('Y').astype(int).astype(float)
train_df['month_y'] = train_df['date_y_dt'].dt.to_period('M').astype(int).astype(float)
train_df['day_y'] = train_df['date_y_dt'].dt.to_period('D').astype(int).astype(float)

In [10]:
train_df.drop(columns = 'date_x_dt', inplace=True)
train_df.drop(columns = 'date_y_dt', inplace=True)

In [11]:
test_df.fillna(0, inplace=True)

In [12]:
test_df = pd.merge(test_df, user_df, how='left', on='user_id')

In [13]:
test_df['date_diff'] = (pd.to_datetime(test_df['date_x']) - pd.to_datetime(test_df['date_y'])).dt.days

In [14]:
test_df['date_x_dt'] = pd.to_datetime(test_df['date_x'])
test_df['date_y_dt'] = pd.to_datetime(test_df['date_y'])

test_df['year_x'] = test_df['date_x_dt'].dt.to_period('Y').astype(int).astype(float)
test_df['month_x'] = test_df['date_x_dt'].dt.to_period('M').astype(int).astype(float)
test_df['day_x'] = test_df['date_x_dt'].dt.to_period('D').astype(int).astype(float)
test_df['year_y'] = test_df['date_y_dt'].dt.to_period('Y').astype(int).astype(float)
test_df['month_y'] = test_df['date_y_dt'].dt.to_period('M').astype(int).astype(float)
test_df['day_y'] = test_df['date_y_dt'].dt.to_period('D').astype(int).astype(float)

In [15]:
test_df.drop(columns = 'date_x_dt', inplace=True)
test_df.drop(columns = 'date_y_dt', inplace=True)

In [16]:
X = train_df.drop(columns='purchase')
y = train_df.purchase

In [17]:
# 必要ない列を削除
X = X.drop('user_id', axis=1)
X = X.drop('purchase_id', axis=1)
X = X.drop('date_x', axis=1)
X = X.drop('date_y', axis=1)

In [18]:
X.head(500)

Unnamed: 0,date_x,product_id,parts_1,parts_2,parts_3,parts_4,parts_5,parts_6,parts_7,parts_8,parts_9,enc_product_id,date_y,attribute_1,attribute_2,attribute_3,attribute_4,attribute_5,attribute_6,attribute_7,attribute_8,attribute_9,attribute_10,attribute_11,attribute_12,attribute_13,attribute_14,attribute_15,attribute_16,attribute_17,attribute_18,attribute_19,attribute_20,attribute_21,attribute_22,attribute_23,attribute_24,attribute_25,attribute_26,attribute_27,attribute_28,attribute_29,attribute_30,enc_attribute_1,enc_attribute_2,enc_attribute_3,date_diff,year_x,month_x,day_x,year_y,month_y,day_y
0,2017-09-27,product 2,0,0,0,0,0,0,0,0,0,0.450107,2016-06-29,id 5,id 5,id 5,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,0.073892,0.076166,0.152797,455,47.0,572.0,17436.0,46.0,557.0,16981.0
1,2017-09-27,product 2,0,0,0,0,0,0,0,0,0,0.450107,2016-06-29,id 5,id 5,id 5,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,0.073892,0.076166,0.152797,455,47.0,572.0,17436.0,46.0,557.0,16981.0
2,2017-11-23,product 2,0,0,0,0,0,0,0,0,0,0.450107,2016-01-06,id 28,id 9,id 5,False,False,True,True,False,False,False,True,False,False,False,False,False,True,False,True,True,False,False,True,True,True,True,True,True,True,False,0.003569,0.01222,0.152797,687,47.0,574.0,17493.0,46.0,552.0,16806.0
3,2017-11-23,product 2,0,0,0,0,0,0,0,0,0,0.450107,2016-01-06,id 28,id 9,id 5,False,False,True,True,False,False,False,True,False,False,False,False,False,True,False,True,True,False,False,True,True,True,True,True,True,True,False,0.003569,0.01222,0.152797,687,47.0,574.0,17493.0,46.0,552.0,16806.0
4,2018-02-07,product 2,0,0,0,0,0,0,0,0,0,0.450107,2017-06-10,id 4,id 8,id 5,True,True,True,True,True,True,False,True,False,True,False,True,True,True,True,True,True,True,False,False,True,True,True,True,False,True,True,0.14854,0.039706,0.152797,242,48.0,577.0,17569.0,47.0,569.0,17327.0
5,2018-06-28,product 2,0,0,0,0,0,0,0,0,0,0.450107,2017-06-10,id 4,id 8,id 5,True,True,True,True,True,True,False,True,False,True,False,True,True,True,True,True,True,True,False,False,True,True,True,True,False,True,True,0.14854,0.039706,0.152797,383,48.0,581.0,17710.0,47.0,569.0,17327.0
6,2017-08-10,product 4,0,0,0,0,0,0,0,0,0,0.094757,2017-06-10,id 4,id 8,id 5,True,True,True,True,True,True,False,True,False,True,False,True,True,True,True,True,True,True,False,False,True,True,True,True,False,True,True,0.14854,0.039706,0.152797,61,47.0,571.0,17388.0,47.0,569.0,17327.0
7,2018-03-02,product 4,0,0,0,0,0,0,0,0,0,0.094757,2017-06-10,id 4,id 8,id 5,True,True,True,True,True,True,False,True,False,True,False,True,True,True,True,True,True,True,False,False,True,True,True,True,False,True,True,0.14854,0.039706,0.152797,265,48.0,578.0,17592.0,47.0,569.0,17327.0
8,2017-09-13,product 2,0,0,0,0,0,0,0,0,0,0.450107,2017-06-10,id 4,id 8,id 5,True,True,True,True,True,True,False,True,False,True,False,True,True,True,True,True,True,True,False,False,True,True,True,True,False,True,True,0.14854,0.039706,0.152797,95,47.0,572.0,17422.0,47.0,569.0,17327.0
9,2018-02-10,product 4,0,0,0,0,0,0,0,0,0,0.094757,2017-06-10,id 4,id 8,id 5,True,True,True,True,True,True,False,True,False,True,False,True,True,True,True,True,True,True,False,False,True,True,True,True,False,True,True,0.14854,0.039706,0.152797,245,48.0,577.0,17572.0,47.0,569.0,17327.0


In [19]:
# カテゴリカル変数として扱うカラム指定
categorical_features_indices = np.where(X.dtypes != np.float)[0]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
# データセットを生成する
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [22]:
# パラメータを定義
lgbm_params = {'objective': 'binary',
                           'metric': 'auc',}       
# 学習
model = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval)

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields date_x, product_id, parts_1, parts_2, parts_3, parts_4, parts_5, parts_6, parts_7, parts_8, parts_9, date_y, attribute_1, attribute_2, attribute_3

In [None]:
%%time
model.fit(X_train, y_train, eval_set=(X_test, y_test), plot=True)

In [None]:
%%time
preds_class = model.predict(test_df)
preds_proba = model.predict_proba(test_df)

In [None]:
fi = model.feature_importances_

In [None]:
feature_importance_df = pd.DataFrame(X.columns)

In [None]:
feature_importance_df['feature_importances'] = fi

In [None]:
feature_importance_df.sort_values(by='feature_importances', ascending=False)

In [None]:
df_preds_proba = pd.DataFrame(preds_proba)

In [None]:
test_df['probability'] = df_preds_proba[1]

In [None]:
submit_df = test_df[['purchase_id', 'probability']]

In [None]:
submit_df.to_csv('../output/submit.csv', header=False, index=False)