In [1]:
# coding:utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, metrics
from datetime import datetime as dt
from sklearn.preprocessing import LabelEncoder


# pandas
import pandas as pd

# LightGBM
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 交差検証
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [2]:
purchase_df = pd.read_csv('../input/purchase_record.csv')
user_df = pd.read_csv('../input/user_info.csv')
test_df = pd.read_csv('../input/purchase_record_test.csv')

In [3]:
purchase_df.shape

(1803574, 14)

In [4]:
purchase_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1803574 entries, 0 to 1803573
Data columns (total 14 columns):
user_id        object
purchase_id    object
date           object
product_id     object
parts_1        object
parts_2        object
parts_3        object
parts_4        object
parts_5        object
parts_6        object
parts_7        object
parts_8        object
parts_9        object
purchase       int64
dtypes: int64(1), object(13)
memory usage: 192.6+ MB


In [5]:
purchase_df.head()

Unnamed: 0,user_id,purchase_id,date,product_id,parts_1,parts_2,parts_3,parts_4,parts_5,parts_6,parts_7,parts_8,parts_9,purchase
0,uid_100,pid_0000000000,2017-09-27,product 2,,,,,,,,,,0
1,uid_100,pid_0000000001,2017-09-27,product 2,,,,,,,,,,0
2,uid_100002,pid_0000000002,2017-11-23,product 2,,,,,,,,,,1
3,uid_100002,pid_0000000003,2017-11-23,product 2,,,,,,,,,,1
4,uid_100003,pid_0000000004,2018-02-07,product 2,,,,,,,,,,1


In [6]:
purchase_df.fillna(0, inplace=True)

In [7]:
purchase_df.purchase.value_counts()

0    928044
1    875530
Name: purchase, dtype: int64

In [8]:
user_df.shape

(189114, 32)

In [9]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189114 entries, 0 to 189113
Data columns (total 32 columns):
user_id         189114 non-null object
date            189114 non-null object
attribute_1     189114 non-null object
attribute_2     189114 non-null object
attribute_3     189114 non-null object
attribute_4     189114 non-null bool
attribute_5     189114 non-null bool
attribute_6     189114 non-null bool
attribute_7     189114 non-null bool
attribute_8     189114 non-null bool
attribute_9     189114 non-null bool
attribute_10    189114 non-null bool
attribute_11    189114 non-null bool
attribute_12    189114 non-null bool
attribute_13    189114 non-null bool
attribute_14    189114 non-null bool
attribute_15    189114 non-null bool
attribute_16    189114 non-null bool
attribute_17    189114 non-null bool
attribute_18    189114 non-null bool
attribute_19    189114 non-null bool
attribute_20    189114 non-null bool
attribute_21    189114 non-null bool
attribute_22    189114 non-n

In [10]:
user_df.head()

Unnamed: 0,user_id,date,attribute_1,attribute_2,attribute_3,attribute_4,attribute_5,attribute_6,attribute_7,attribute_8,...,attribute_21,attribute_22,attribute_23,attribute_24,attribute_25,attribute_26,attribute_27,attribute_28,attribute_29,attribute_30
0,uid_100,2016-06-29,id 5,id 5,id 5,True,False,False,True,True,...,True,False,True,True,False,False,True,True,True,False
1,uid_100002,2016-01-06,id 28,id 9,id 5,False,False,True,True,False,...,False,False,True,True,True,True,True,True,True,False
2,uid_100003,2017-06-10,id 4,id 8,id 5,True,True,True,True,True,...,True,False,False,True,True,True,True,False,True,True
3,uid_100004,2017-07-20,id 40,id 25,id 9,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,uid_100006,2017-07-27,id 40,id 25,id 9,False,False,False,False,False,...,False,False,False,True,False,False,False,True,True,False


In [11]:
train_df = pd.merge(purchase_df, user_df, how='left', on='user_id')

In [12]:
train_df.head()

Unnamed: 0,user_id,purchase_id,date_x,product_id,parts_1,parts_2,parts_3,parts_4,parts_5,parts_6,...,attribute_21,attribute_22,attribute_23,attribute_24,attribute_25,attribute_26,attribute_27,attribute_28,attribute_29,attribute_30
0,uid_100,pid_0000000000,2017-09-27,product 2,0,0,0,0,0,0,...,True,False,True,True,False,False,True,True,True,False
1,uid_100,pid_0000000001,2017-09-27,product 2,0,0,0,0,0,0,...,True,False,True,True,False,False,True,True,True,False
2,uid_100002,pid_0000000002,2017-11-23,product 2,0,0,0,0,0,0,...,False,False,True,True,True,True,True,True,True,False
3,uid_100002,pid_0000000003,2017-11-23,product 2,0,0,0,0,0,0,...,False,False,True,True,True,True,True,True,True,False
4,uid_100003,pid_0000000004,2018-02-07,product 2,0,0,0,0,0,0,...,True,False,False,True,True,True,True,False,True,True


In [13]:
# train_dfから日付を削除する
print( train_df.drop('date_x', axis=1).head())

      user_id     purchase_id product_id parts_1 parts_2 parts_3 parts_4  \
0     uid_100  pid_0000000000  product 2       0       0       0       0   
1     uid_100  pid_0000000001  product 2       0       0       0       0   
2  uid_100002  pid_0000000002  product 2       0       0       0       0   
3  uid_100002  pid_0000000003  product 2       0       0       0       0   
4  uid_100003  pid_0000000004  product 2       0       0       0       0   

  parts_5 parts_6 parts_7  ... attribute_21 attribute_22  attribute_23  \
0       0       0       0  ...         True        False          True   
1       0       0       0  ...         True        False          True   
2       0       0       0  ...        False        False          True   
3       0       0       0  ...        False        False          True   
4       0       0       0  ...         True        False         False   

  attribute_24 attribute_25 attribute_26 attribute_27  attribute_28  \
0         True        False

In [14]:
# test_dfから日付を削除する
print( train_df.drop('date_x', axis=1).head())

      user_id     purchase_id product_id parts_1 parts_2 parts_3 parts_4  \
0     uid_100  pid_0000000000  product 2       0       0       0       0   
1     uid_100  pid_0000000001  product 2       0       0       0       0   
2  uid_100002  pid_0000000002  product 2       0       0       0       0   
3  uid_100002  pid_0000000003  product 2       0       0       0       0   
4  uid_100003  pid_0000000004  product 2       0       0       0       0   

  parts_5 parts_6 parts_7  ... attribute_21 attribute_22  attribute_23  \
0       0       0       0  ...         True        False          True   
1       0       0       0  ...         True        False          True   
2       0       0       0  ...        False        False          True   
3       0       0       0  ...        False        False          True   
4       0       0       0  ...         True        False         False   

  attribute_24 attribute_25 attribute_26 attribute_27  attribute_28  \
0         True        False

In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1803574 entries, 0 to 1803573
Data columns (total 45 columns):
user_id         object
purchase_id     object
date_x          object
product_id      object
parts_1         object
parts_2         object
parts_3         object
parts_4         object
parts_5         object
parts_6         object
parts_7         object
parts_8         object
parts_9         object
purchase        int64
date_y          object
attribute_1     object
attribute_2     object
attribute_3     object
attribute_4     bool
attribute_5     bool
attribute_6     bool
attribute_7     bool
attribute_8     bool
attribute_9     bool
attribute_10    bool
attribute_11    bool
attribute_12    bool
attribute_13    bool
attribute_14    bool
attribute_15    bool
attribute_16    bool
attribute_17    bool
attribute_18    bool
attribute_19    bool
attribute_20    bool
attribute_21    bool
attribute_22    bool
attribute_23    bool
attribute_24    bool
attribute_25    bool
attribute_26

In [16]:
train_df.shape

(1803574, 45)

In [17]:
train_df.isnull().sum().sum()

0

In [18]:
# %%time
# pdp.ProfileReport(train_df)

In [19]:
test_df = pd.merge(test_df, user_df, how='left', on='user_id')

In [20]:
test_df.shape

(215041, 44)

In [21]:
# sample_train_df = train_df.sample(n=len(train_df)//500)

In [22]:
# sample_test_df = test_df.sample(n=len(test_df)//500)

In [23]:
# train_df = sample_train_df
# test_df = sample_test_df

In [24]:
train = pd.get_dummies(train_df, drop_first=True)

MemoryError: Unable to allocate array with shape (1803574, 1803574) and data type uint8

In [None]:
test = pd.get_dummies(test_df, drop_first=True)

In [None]:
train = train.drop(columns='purchase')

In [None]:
def fill_missing_columns(df_a, df_b):
    columns_for_b = set(df_a.columns) - set(df_b.columns)
    for column in columns_for_b:
        df_b[column] = 0
    columns_for_a = set(df_b.columns) - set(df_a.columns)
    for column in columns_for_a:
        df_a[column] = 0

In [None]:
%%time
fill_missing_columns(train, test)

In [None]:
X = train
y = train_df.purchase

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg.score(X_train, y_train)

In [None]:
logreg.score(X_test, y_test)

In [None]:
logreg.classes_

In [None]:
proba = logreg.predict_proba(test)[:, 1]

In [None]:
sample_test_df['probability'] = proba

In [None]:
submit_df = sample_test_df[['purchase_id', 'probability']]

In [None]:
submit_df.head()

In [None]:
submit_df.to_csv('./submit_01.csv', header=False, index=False, )