In [1]:
# coding:utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, metrics
from datetime import datetime as dt
from sklearn.preprocessing import LabelEncoder


# pandas
import pandas as pd

# LightGBM
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 交差検証
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the three input CSVs (purchase records, user info, test purchase
# records) in a fixed order and bind each to its own DataFrame.
purchase_df, user_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/purchase_record.csv',
        '../input/user_info.csv',
        '../input/purchase_record_test.csv',
    )
)

In [3]:
# Display the (rows, columns) dimensions of the purchase records frame.
purchase_df.shape

(1803574, 14)

In [4]:
# Print a summary of purchase_df: column names, dtypes and memory usage.
purchase_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1803574 entries, 0 to 1803573
Data columns (total 14 columns):
user_id        object
purchase_id    object
date           object
product_id     object
parts_1        object
parts_2        object
parts_3        object
parts_4        object
parts_5        object
parts_6        object
parts_7        object
parts_8        object
parts_9        object
purchase       int64
dtypes: int64(1), object(13)
memory usage: 192.6+ MB


In [5]:
# Preview the first rows of purchase_df.
purchase_df.head()

Unnamed: 0,user_id,purchase_id,date,product_id,parts_1,parts_2,parts_3,parts_4,parts_5,parts_6,parts_7,parts_8,parts_9,purchase
0,uid_100,pid_0000000000,2017-09-27,product 2,,,,,,,,,,0
1,uid_100,pid_0000000001,2017-09-27,product 2,,,,,,,,,,0
2,uid_100002,pid_0000000002,2017-11-23,product 2,,,,,,,,,,1
3,uid_100002,pid_0000000003,2017-11-23,product 2,,,,,,,,,,1
4,uid_100003,pid_0000000004,2018-02-07,product 2,,,,,,,,,,1


In [6]:
# Replace every missing value in purchase_df with 0 and rebind the name to
# the filled frame.
# NOTE(review): the parts_* columns are object dtype, so filling with the
# integer 0 leaves mixed str/int values in them — confirm that downstream
# encoding handles this.
purchase_df = purchase_df.fillna(0)

In [7]:
# Count how many rows carry each value of the `purchase` column.
purchase_df['purchase'].value_counts()

0    928044
1    875530
Name: purchase, dtype: int64

In [8]:
# Display the (rows, columns) dimensions of the user info frame.
user_df.shape

(189114, 32)

In [9]:
# Print a summary of user_df: column names, non-null counts and dtypes.
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189114 entries, 0 to 189113
Data columns (total 32 columns):
user_id         189114 non-null object
date            189114 non-null object
attribute_1     189114 non-null object
attribute_2     189114 non-null object
attribute_3     189114 non-null object
attribute_4     189114 non-null bool
attribute_5     189114 non-null bool
attribute_6     189114 non-null bool
attribute_7     189114 non-null bool
attribute_8     189114 non-null bool
attribute_9     189114 non-null bool
attribute_10    189114 non-null bool
attribute_11    189114 non-null bool
attribute_12    189114 non-null bool
attribute_13    189114 non-null bool
attribute_14    189114 non-null bool
attribute_15    189114 non-null bool
attribute_16    189114 non-null bool
attribute_17    189114 non-null bool
attribute_18    189114 non-null bool
attribute_19    189114 non-null bool
attribute_20    189114 non-null bool
attribute_21    189114 non-null bool
attribute_22    189114 non-n

In [10]:
# Preview the first rows of user_df.
user_df.head()

Unnamed: 0,user_id,date,attribute_1,attribute_2,attribute_3,attribute_4,attribute_5,attribute_6,attribute_7,attribute_8,...,attribute_21,attribute_22,attribute_23,attribute_24,attribute_25,attribute_26,attribute_27,attribute_28,attribute_29,attribute_30
0,uid_100,2016-06-29,id 5,id 5,id 5,True,False,False,True,True,...,True,False,True,True,False,False,True,True,True,False
1,uid_100002,2016-01-06,id 28,id 9,id 5,False,False,True,True,False,...,False,False,True,True,True,True,True,True,True,False
2,uid_100003,2017-06-10,id 4,id 8,id 5,True,True,True,True,True,...,True,False,False,True,True,True,True,False,True,True
3,uid_100004,2017-07-20,id 40,id 25,id 9,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,uid_100006,2017-07-27,id 40,id 25,id 9,False,False,False,False,False,...,False,False,False,True,False,False,False,True,True,False


In [11]:
# Left-join the user attributes onto every purchase row, matching on user_id.
# Both frames have a `date` column, so pandas applies its default suffixes
# and the purchase-side date becomes `date_x` in the merged frame.
train_df = purchase_df.merge(user_df, on='user_id', how='left')

In [12]:
# Preview the first rows of the merged training frame.
train_df.head()

Unnamed: 0,user_id,purchase_id,date_x,product_id,parts_1,parts_2,parts_3,parts_4,parts_5,parts_6,...,attribute_21,attribute_22,attribute_23,attribute_24,attribute_25,attribute_26,attribute_27,attribute_28,attribute_29,attribute_30
0,uid_100,pid_0000000000,2017-09-27,product 2,0,0,0,0,0,0,...,True,False,True,True,False,False,True,True,True,False
1,uid_100,pid_0000000001,2017-09-27,product 2,0,0,0,0,0,0,...,True,False,True,True,False,False,True,True,True,False
2,uid_100002,pid_0000000002,2017-11-23,product 2,0,0,0,0,0,0,...,False,False,True,True,True,True,True,True,True,False
3,uid_100002,pid_0000000003,2017-11-23,product 2,0,0,0,0,0,0,...,False,False,True,True,True,True,True,True,True,False
4,uid_100003,pid_0000000004,2018-02-07,product 2,0,0,0,0,0,0,...,True,False,False,True,True,True,True,False,True,True


In [13]:
# Build a pandas_profiling report for the merged training frame; as the
# cell's last expression, the report object is what the notebook displays.
# NOTE(review): this import sits mid-notebook rather than in the top import
# cell — consider moving it there.
import pandas_profiling as pdp # import pandas_profiling
pdp.ProfileReport(train_df) # create the report



In [14]:
# Remove the purchase date (`date_x`) and the `purchase_id` column from
# train_df.
#
# `date_x` is dropped once and the result reused for the preview, instead of
# computing the same drop twice on the full frame. errors='ignore' makes the
# cell safe to re-run: on a second execution the columns are already gone and
# a plain drop would raise KeyError.
train_df = train_df.drop(columns='date_x', errors='ignore')

# Preview taken after removing date_x but before removing purchase_id, so the
# printed frame still shows purchase_id (same output as before on first run).
print(train_df.head())

train_df = train_df.drop(columns='purchase_id', errors='ignore')

      user_id     purchase_id product_id parts_1 parts_2 parts_3 parts_4  \
0     uid_100  pid_0000000000  product 2       0       0       0       0   
1     uid_100  pid_0000000001  product 2       0       0       0       0   
2  uid_100002  pid_0000000002  product 2       0       0       0       0   
3  uid_100002  pid_0000000003  product 2       0       0       0       0   
4  uid_100003  pid_0000000004  product 2       0       0       0       0   

  parts_5 parts_6 parts_7  ... attribute_21 attribute_22  attribute_23  \
0       0       0       0  ...         True        False          True   
1       0       0       0  ...         True        False          True   
2       0       0       0  ...        False        False          True   
3       0       0       0  ...        False        False          True   
4       0       0       0  ...         True        False         False   

  attribute_24 attribute_25 attribute_26 attribute_27  attribute_28  \
0         True        False