In [2]:
# Google Driveのマウント
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd ./drive/"My Drive"/AIquest/pbl4

/content/drive/My Drive/AIquest/pbl4


In [1]:
!pip install catboost



In [0]:
import re
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from catboost import CatBoostClassifier, Pool


# ファイル読み込み

In [0]:
raw_purchase_record_df = pd.read_csv("purchase_record.csv")
raw_purchase_record_test_df = pd.read_csv("purchase_record_test.csv")
raw_user_info_df = pd.read_csv("user_info.csv")

purchase_record_df = raw_purchase_record_df.copy()
purchase_record_test_df = raw_purchase_record_test_df.copy()
user_info_df = raw_user_info_df.copy()

# 前処理

In [0]:
# user_info_dfのアカウント作成日時に変更
user_info_df = user_info_df.rename(columns={"date": "acount_created_date"})

# user_infoとpurchase_recordのマージ
train_df = pd.merge(purchase_record_df, user_info_df, on='user_id')
test_df = pd.merge(purchase_record_test_df, user_info_df, on='user_id')

In [0]:
# NaNを0で埋める
train_df = train_df.fillna("0")
test_df = test_df.fillna("0")

# 時間データを経過年月に変更
train_df["year"] = train_df["date"].apply(lambda x : int(x.split("-")[0]))
train_df["month"] = train_df["date"].apply(lambda x : int(x.split("-")[1]))
train_df["day"] = train_df["date"].apply(lambda x : int(x.split("-")[2]))
test_df["year"] = test_df["date"].apply(lambda x : int(x.split("-")[0]))
test_df["month"] = test_df["date"].apply(lambda x : int(x.split("-")[1]))
test_df["day"] = test_df["date"].apply(lambda x : int(x.split("-")[2]))

train_df["elapsed_month"] = (train_df["year"] - 2017) * 12 + (train_df["month"] - 7) # 最小値が2017年7月
test_df["elapsed_month"] = (test_df["year"] - 2017) * 12 + (test_df["month"] - 7) # 最小値が2017年7月
train_df["elapsed_day"] = (train_df["year"] - 2017) * 12 * 30 + (train_df["month"] - 7) * 30 + train_df["day"] # 最小値が2017年7月
test_df["elapsed_day"] = (test_df["year"] - 2017) * 12 * 30 + (test_df["month"] - 7) * 30 + test_df["day"] # 最小値が2017年7月

train_df["account_year"] = train_df["acount_created_date"].apply(lambda x : int(x.split("-")[0]))
train_df["account_month"] = train_df["acount_created_date"].apply(lambda x : int(x.split("-")[1]))
test_df["account_year"] = test_df["acount_created_date"].apply(lambda x : int(x.split("-")[0]))
test_df["account_month"] = test_df["acount_created_date"].apply(lambda x : int(x.split("-")[1]))

train_df["elapsed_account_month"] = (train_df["account_year"] - 2016) * 12 + (train_df["account_month"] - 1)
test_df["elapsed_account_month"] = (test_df["account_year"] - 2016) * 12 + (test_df["account_month"] - 1)

# user_infoとpurchase_recordのdateの差分
train_df["elapsed_month_user_and_purchase"] = (train_df["year"] - train_df["account_year"]) * 12 + (train_df["month"] - train_df["account_month"])
test_df["elapsed_month_user_and_purchase"] = (test_df["year"] - test_df["account_year"]) * 12 + (test_df["month"] - test_df["account_month"])

# 不要な変数を削除
train_df = train_df.drop(["date", "acount_created_date"], axis=1)
test_df = test_df.drop(["date", "acount_created_date"], axis=1)

In [0]:
# カテゴリ変数の置換
for i in range(train_df.shape[1]):
    if train_df.iloc[:,i].dtypes == object:
      column_name = train_df.columns[i]
      train_df[column_name] = train_df[column_name].apply(lambda x : int(re.sub("\\D", "", x)))
      test_df[column_name] = test_df[column_name].apply(lambda x : int(re.sub("\\D", "", x)))
    elif train_df.iloc[:,i].dtypes == bool:
      column_name = train_df.columns[i]
      train_df[column_name] = train_df[column_name].astype('int')
      test_df[column_name] = test_df[column_name].astype('int')

In [0]:
# カスタマイズを行っているかどうか
train_df["custom"] = train_df["parts_1"] + train_df["parts_2"] \
                    + train_df["parts_3"] + train_df["parts_4"] \
                    + train_df["parts_5"] + train_df["parts_6"] \
                    + train_df["parts_7"] + train_df["parts_8"] \
                    + train_df["parts_9"]
test_df["custom"] = test_df["parts_1"] + test_df["parts_2"] \
                    + test_df["parts_3"] + test_df["parts_4"] \
                    + test_df["parts_5"] + test_df["parts_6"] \
                    + test_df["parts_7"] + test_df["parts_8"] \
                    + test_df["parts_9"]
train_df["flag_custom"] = train_df["custom"].apply(lambda x : int(x>0))
test_df["flag_custom"] = test_df["custom"].apply(lambda x : int(x>0))

# AttributeにおいてNaNでない数
train_df["Attribute"] = train_df["attribute_4"] + train_df["attribute_5"] \
                            + train_df["attribute_6"] + train_df["attribute_7"] \
                            + train_df["attribute_8"] + train_df["attribute_9"]  \
                            + train_df["attribute_10"] + train_df["attribute_11"]  \
                            + train_df["attribute_12"] + train_df["attribute_13"] \
                            + train_df["attribute_14"] + train_df["attribute_15"]  \
                            + train_df["attribute_16"] + train_df["attribute_17"]  \
                            + train_df["attribute_18"] + train_df["attribute_19"]  \
                            + train_df["attribute_20"] + train_df["attribute_21"]  \
                            + train_df["attribute_22"] + train_df["attribute_23"] \
                            + train_df["attribute_24"] + train_df["attribute_25"]  \
                            + train_df["attribute_26"] + train_df["attribute_27"]  \
                            + train_df["attribute_28"] + train_df["attribute_29"]  \
                            + train_df["attribute_30"]
test_df["Attribute"] = test_df["attribute_4"] + test_df["attribute_5"] \
                          + test_df["attribute_6"] + test_df["attribute_7"] \
                          + test_df["attribute_8"] + test_df["attribute_9"]  \
                          + test_df["attribute_10"] + test_df["attribute_11"]  \
                          + test_df["attribute_12"] + test_df["attribute_13"] \
                          + test_df["attribute_14"] + test_df["attribute_15"]  \
                          + test_df["attribute_16"] + test_df["attribute_17"]  \
                          + test_df["attribute_18"] + test_df["attribute_19"]  \
                          + test_df["attribute_20"] + test_df["attribute_21"]  \
                          + test_df["attribute_22"] + test_df["attribute_23"] \
                          + test_df["attribute_24"] + test_df["attribute_25"]  \
                          + test_df["attribute_26"] + test_df["attribute_27"]  \
                          + test_df["attribute_28"] + test_df["attribute_29"]  \
                          + test_df["attribute_30"]

In [0]:
# Frequency Encoding
# 商品ラインナップ別
grouped_df = train_df.groupby(['product_id', 'parts_1', 'parts_2', 'parts_3', 'parts_4', 'parts_5', 'parts_6', 'parts_7', 'parts_8', 'parts_9']).size().reset_index(name="product_counts")

train_df = train_df.merge(grouped_df, how = "left", on = ['product_id', 'parts_1', 'parts_2', 'parts_3', 'parts_4', 'parts_5', 'parts_6', 'parts_7', 'parts_8', 'parts_9'])
train_df["product_frequency"] = train_df["product_counts"] / train_df["product_counts"].count()
test_df = test_df.merge(grouped_df, how = "left", on = ['product_id', 'parts_1', 'parts_2', 'parts_3', 'parts_4', 'parts_5', 'parts_6', 'parts_7', 'parts_8', 'parts_9'])
test_df["product_frequency"] = test_df["product_counts"] / test_df["product_counts"].count()

In [0]:
# Attribute_1
encoding = train_df.groupby('attribute_1').size()
encoding = encoding/len(train_df)
train_df['attribute_1_freq'] = train_df.product_id.map(encoding)

encoding = test_df.groupby('attribute_1').size()
encoding = encoding/len(test_df)
test_df['attribute_1_freq'] = test_df.product_id.map(encoding)

# Attribute_2
encoding = train_df.groupby('attribute_2').size()
encoding = encoding/len(train_df)
train_df['attribute_2_freq'] = train_df.product_id.map(encoding)

encoding = test_df.groupby('attribute_2').size()
encoding = encoding/len(test_df)
test_df['attribute_2_freq'] = test_df.product_id.map(encoding)

# Attribute_3
encoding = train_df.groupby('attribute_3').size()
encoding = encoding/len(train_df)
train_df['attribute_3_freq'] = train_df.product_id.map(encoding)

encoding = test_df.groupby('attribute_3').size()
encoding = encoding/len(test_df)
test_df['attribute_3_freq'] = test_df.product_id.map(encoding)

In [0]:
# 型の変換 (Catboost用) ※ もっと賢い方法があるはず
train_df["elapsed_month"] = train_df["elapsed_month"].astype(float)
test_df["elapsed_month"] = test_df["elapsed_month"].astype(float)
train_df["elapsed_day"] = train_df["elapsed_day"].astype(float)
test_df["elapsed_day"] = test_df["elapsed_day"].astype(float)
train_df["elapsed_account_month"] = train_df["elapsed_account_month"].astype(float)
test_df["elapsed_account_month"] = test_df["elapsed_account_month"].astype(float)
train_df["elapsed_month_user_and_purchase"] = train_df["elapsed_month_user_and_purchase"].astype(float)
test_df["elapsed_month_user_and_purchase"] = test_df["elapsed_month_user_and_purchase"].astype(float)
train_df["product_counts"] = train_df["product_counts"].astype(float)
test_df["product_counts"] = test_df["product_counts"].astype(float)
train_df["product_frequency"] = train_df["product_frequency"].astype(float)
test_df["product_frequency"] = test_df["product_frequency"].astype(float)
train_df["attribute_1_freq"] = train_df["attribute_1_freq"].astype(float)
test_df["attribute_1_freq"] = test_df["attribute_1_freq"].astype(float)
train_df["attribute_2_freq"] = train_df["attribute_2_freq"].astype(float)
test_df["attribute_2_freq"] = test_df["attribute_2_freq"].astype(float)
train_df["attribute_3_freq"] = train_df["attribute_3_freq"].astype(float)
test_df["attribute_3_freq"] = test_df["attribute_3_freq"].astype(float)

# 学習

In [0]:
# 2017年7月〜2018年4月までを訓練データとして、2018年5月〜6月をテストデータとして使用。
# 日付でソート
sort_train_df = train_df.sort_values("elapsed_day")

# 学習データと訓練データに分割
train_X = sort_train_df[(sort_train_df["month"] != 5 ) & (sort_train_df["month"] != 6 )].drop(["purchase", "purchase_id"], axis = 1)
train_y = sort_train_df[(sort_train_df["month"] != 5 ) & (sort_train_df["month"] != 6 )]["purchase"]
test_X = sort_train_df[(sort_train_df["month"] == 5 ) | (sort_train_df["month"] == 6 )].drop(["purchase", "purchase_id"], axis = 1)
test_y = sort_train_df[(sort_train_df["month"] == 5 ) | (sort_train_df["month"] == 6 )]["purchase"]

x = test_df.drop(["purchase_id"], axis = 1)

In [14]:
# product_idやparts、Attribuはカテゴリ変数として学習させる
categorical_features_indices = np.where(train_X.dtypes != np.float)[0]

# ハイパーパラメータ (Optunaで調整済み)
params = {
    'iterations': 119, 
    'depth': 8, 
    'learning_rate': 0.27, 
    'random_strength': 75,
     'bagging_temperature': 0.057, 
     'od_type': 'Iter', 
     'od_wait': 18,
     'iterations' : 1000, 
     'eval_metric': "AUC", 
     'cat_features': categorical_features_indices,
     'random_seed' : 42,
     }

model = CatBoostClassifier(**params)
model.fit(train_X, train_y, eval_set=(test_X, test_y))

0:	test: 0.8348441	best: 0.8348441 (0)	total: 8.93s	remaining: 2h 28m 43s
1:	test: 0.9135861	best: 0.9135861 (1)	total: 19s	remaining: 2h 38m 15s
2:	test: 0.9165791	best: 0.9165791 (2)	total: 27.1s	remaining: 2h 29m 49s
3:	test: 0.9159297	best: 0.9165791 (2)	total: 30.8s	remaining: 2h 7m 41s
4:	test: 0.9245143	best: 0.9245143 (4)	total: 34s	remaining: 1h 52m 51s
5:	test: 0.9273313	best: 0.9273313 (5)	total: 44.6s	remaining: 2h 3m 12s
6:	test: 0.9265628	best: 0.9273313 (5)	total: 53.3s	remaining: 2h 5m 55s
7:	test: 0.9272983	best: 0.9273313 (5)	total: 1m 3s	remaining: 2h 11m 10s
8:	test: 0.9279579	best: 0.9279579 (8)	total: 1m 14s	remaining: 2h 16m 23s
9:	test: 0.9246210	best: 0.9279579 (8)	total: 1m 24s	remaining: 2h 19m 38s
10:	test: 0.9247578	best: 0.9279579 (8)	total: 1m 36s	remaining: 2h 24m 38s
11:	test: 0.9255727	best: 0.9279579 (8)	total: 1m 45s	remaining: 2h 24m 50s
12:	test: 0.9255572	best: 0.9279579 (8)	total: 1m 50s	remaining: 2h 19m 41s
13:	test: 0.9254712	best: 0.9279579 (

<catboost.core.CatBoostClassifier at 0x7f1b69cd1630>

# 評価

In [17]:
y_pred = model.predict(test_X, prediction_type='Probability')[:, 1]

# AUC (Area Under the Curve) を計算する
fpr, tpr, thresholds = metrics.roc_curve(test_y, y_pred)
auc = metrics.auc(fpr, tpr)
print(auc)

0.927957939702426


# 提出用データの作成

In [0]:
y_pred = model.predict(x, prediction_type='Probability')[:, 1]
raw_purchase_record_test_df["y"] = y_pred

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 提出用ファイルの作成
submittion_file = raw_purchase_record_test_df[['purchase_id', 'y']]
submittion_file.to_csv('submittion9.csv', index=False, header=False)