02で前処理したデータを読み込み、03で学習したモデルを使って推論を行うためのnotebookです。  

## 必要なライブラリのimport

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import pickle

In [2]:
# Load the data we want predictions for.
test_csv_path = '../data/processed/processed20240621_test.csv'
test = pd.read_csv(test_csv_path)

## 保存されたモデルを使って予測を行う

In [3]:
# Columns of the test frame that are passed to the models, in the order used
# for prediction. Grouped here by column-name prefix for readability only.
features = [
    'elapsed_time',
    # hist_* columns
    'hist_category_1_sum',
    'hist_installments_sum',
    'hist_purchase_date_ptp',
    'hist_purchase_date_min',
    'hist_month_diff_mean',
    'hist_merchant_avg_purchases_lag3_sum',
    'hist_merchant_active_months_lag12_sum',
    # auth_* columns
    'auth_category_1_sum',
    'auth_category_1_mean',
    'auth_purchase_amount_sum',
    'auth_installments_sum',
    'auth_purchase_month_std',
    'auth_purchase_date_ptp',
    'auth_purchase_date_min',
    'auth_purchase_date_max',
    'auth_month_lag_mean',
    'auth_month_diff_mean',
    'auth_merchant_group_id_nunique',
    'auth_merchant_avg_purchases_lag3_min',
    # new_* columns
    'new_category_1_sum',
    'new_category_1_mean',
    'new_purchase_amount_sum',
    'new_purchase_amount_mean',
    'new_purchase_amount_max',
    'new_purchase_amount_std',
    'new_purchase_month_mean',
    'new_purchase_date_ptp',
    'new_purchase_date_min',
    'new_purchase_date_max',
    'new_month_lag_mean',
    'new_merchant_avg_purchases_lag6_sum',
    # remaining un-prefixed columns
    'month_lag_std',
    'purchase_amount_count_mean',
    'purchase_amount_mean_mean',
    'purchase_amount_max_mean',
    'authorized_flag_mean',
]

In [4]:
# Average the predictions of the per-fold models over the test data.
n_folds = 5
predictions = np.zeros(len(test))

# The feature matrix is identical for every fold, so select it once.
X_test = test[features]

for fold_ in range(n_folds):
    # Load the model saved for this fold.
    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # only load model files produced by this project's own training run.
    with open(f'../src/models/model_fold_{fold_}.pkl', 'rb') as f:
        clf = pickle.load(f)

    # Predict with this fold's model.
    fold_predictions = clf.predict(X_test)

    # Accumulate; the sum is divided by n_folds after the loop.
    predictions += fold_predictions

# Take the mean over folds.
predictions /= n_folds

# Compute RMSE only when ground-truth values (y_test) exist in this session.
if 'y_test' in locals():
    # Use sqrt(MSE) rather than squared=False: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
    score = np.sqrt(mean_squared_error(y_test, predictions))
    print(f'RMSE={score}')

In [5]:
# Outliers: overwrite every prediction below -17 with a fixed value, leaving
# the original `predictions` array untouched.
# NOTE(review): -33.21928095 is presumably the target value of the outlier
# rows in the training data — confirm against the training notebook.
is_outlier = predictions < -17
predictions_out = predictions.copy()
predictions_out[is_outlier] = -33.21928095

## 提出用ファイルの作成

In [6]:
# Assemble the submission table (card_id plus its outlier-adjusted prediction)
# and write it to CSV without the index column.
submission_columns = {
    "card_id": test["card_id"].values,
    "target": predictions_out,
}
sub_df = pd.DataFrame(submission_columns)
sub_df.to_csv("submit3.csv", index=False)