データの読み込みと前処理を行うためのnotebookです。  
モデルの学習と予測にはここで処理をかけたデータを利用するようにして下さい。

## 必要なライブラリのimport

In [1]:
import warnings
import time
import sys
import datetime

import numpy as np
import pandas as pd

warnings.simplefilter(action='ignore', category=FutureWarning)

## データの読み込み

In [2]:
def reduce_mem_usage(df, verbose=True):
    """
    データフレームのメモリ使用量を減らす。

    Parameters
    ----------
    df : pd.DataFrame
        メモリ使用量を削減したいデータフレーム。
    verbose : bool, optional
        メモリ使用量の削減結果を出力するかどうか（デフォルトは True）。

    Returns
    -------
    pd.DataFrame
        メモリ使用量が削減されたデータフレーム。
    """

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


def binarize(df):
    """
    指定された列を二値化する。

    Parameters
    ----------
    df : pd.DataFrame
        二値化対象のデータフレーム。

    Returns
    -------
    pd.DataFrame
        二値化されたデータフレーム。
    """

    for col in ['authorized_flag', 'category_1']:
        df[col] = df[col].map({'Y': 1, 'N': 0})
    return df


def read_data(input_file):
    """
    指定されたファイルからデータを読み込み、前処理を行う。

    Parameters
    ----------
    input_file : str
        読み込むデータファイルのパス。

    Returns
    -------
    pd.DataFrame
        前処理されたデータフレーム。
    """
    
    df = pd.read_csv(input_file)
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['elapsed_time'] = (pd.Timestamp('2018-02-01') - df['first_active_month']).dt.days
    return df

In [3]:
new_transactions = pd.read_csv('../data/row/new_merchant_transactions.csv',
                               parse_dates=['purchase_date'])

historical_transactions = pd.read_csv('../data/row/historical_transactions.csv',
                                      parse_dates=['purchase_date'])

historical_transactions = binarize(historical_transactions)
new_transactions = binarize(new_transactions)

In [4]:
historical_transactions = historical_transactions[['authorized_flag', 'card_id', 'purchase_amount', 'purchase_date']]
new_transactions = new_transactions[['authorized_flag', 'card_id', 'purchase_amount', 'purchase_date']]

## 特徴量作成

In [5]:
def reduce_mem_usage(df, verbose=True):
    """
    データフレームのメモリ使用量を減らす。

    Parameters
    ----------
    df : pd.DataFrame
        メモリ使用量を削減したいデータフレーム。
    verbose : bool, optional
        メモリ使用量の削減結果を出力するかどうか（デフォルトは True）。

    Returns
    -------
    pd.DataFrame
        メモリ使用量が削減されたデータフレーム。
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [6]:
# データ準備
historical_transactions['purchase_date'] = pd.to_datetime(historical_transactions['purchase_date'])
new_transactions['purchase_date'] = pd.to_datetime(new_transactions['purchase_date'])

# メモリ使用量の削減
historical_transactions = reduce_mem_usage(historical_transactions)
new_transactions = reduce_mem_usage(new_transactions)

Mem. usage decreased to 583.04 Mb (34.4% reduction)
Mem. usage decreased to 35.57 Mb (40.6% reduction)


In [8]:
import calendar
from datetime import datetime

# 日付を三角関数変換し、新しいデータフレームとして取得する関数
def get_trigonometrical_date(df):
    # df['purchase_date'] = pd.to_datetime(df['purchase_date'])

    # 年ごとの日数を計算
    years = df['purchase_date'].dt.year
    unique_years = years.unique()
    days_in_year_dict = {year: (datetime(year + 1, 1, 1) - datetime(year, 1, 1)).days for year in unique_years}
    days_in_year = years.map(days_in_year_dict).to_numpy()

    # 年の日数に基づいて日をエンコード
    day_of_year = df['purchase_date'].dt.dayofyear.to_numpy()
    day_of_year_sin = np.sin(2 * np.pi * day_of_year / days_in_year)
    day_of_year_cos = np.cos(2 * np.pi * day_of_year / days_in_year)

    # 曜日をエンコード
    day_of_week = df['purchase_date'].dt.weekday.to_numpy()
    day_of_week_sin = np.sin(2 * np.pi * day_of_week / 7)
    day_of_week_cos = np.cos(2 * np.pi * day_of_week / 7)

    # 一か月の周期をエンコード（各月の最大日数を使用）
    days_in_month = df['purchase_date'].apply(lambda date: calendar.monthrange(date.year, date.month)[1]).to_numpy()
    day_of_month = df['purchase_date'].dt.day.to_numpy()
    day_of_month_sin = np.sin(2 * np.pi * day_of_month / days_in_month)
    day_of_month_cos = np.cos(2 * np.pi * day_of_month / days_in_month)

    # 一日の周期（時刻）をエンコード
    hour_of_day = df['purchase_date'].dt.hour + df['purchase_date'].dt.minute / 60.0
    hour_of_day_sin = np.sin(2 * np.pi * hour_of_day / 24)
    hour_of_day_cos = np.cos(2 * np.pi * hour_of_day / 24)

    # 結果をデータフレームに追加
    date_trigonometry = pd.DataFrame({
        'day_of_year': day_of_year,
        'day_of_year_sin': day_of_year_sin,
        'day_of_year_cos': day_of_year_cos,
        'day_of_month': day_of_month,
        'day_of_month_sin': day_of_month_sin,
        'day_of_month_cos': day_of_month_cos,
        'day_of_week': day_of_week,
        'day_of_week_sin': day_of_week_sin,
        'day_of_week_cos': day_of_week_cos,
        'hour_of_day': hour_of_day,
        'hour_of_day_sin': hour_of_day_sin,
        'hour_of_day_cos': hour_of_day_cos
    })

    return date_trigonometry

In [9]:
new_transactions_sin = new_transactions.join(get_trigonometrical_date(new_transactions))
historical_transactions_sin = historical_transactions.join(get_trigonometrical_date(historical_transactions))

In [10]:
# メモリ使用量の削減
historical_transactions_sin = reduce_mem_usage(historical_transactions_sin)
new_transactions_sin = reduce_mem_usage(new_transactions_sin)

Mem. usage decreased to 1193.84 Mb (59.0% reduction)
Mem. usage decreased to 76.76 Mb (60.2% reduction)


In [11]:
# purchase_month列を追加

historical_transactions_sin['purchase_month'] = historical_transactions_sin['purchase_date'].dt.to_period('M').dt.to_timestamp()
new_transactions_sin['purchase_month'] = new_transactions_sin['purchase_date'].dt.to_period('M').dt.to_timestamp()
historical_transactions_sin['purchase_year'] = historical_transactions_sin['purchase_date'].dt.year
new_transactions_sin['purchase_year'] = new_transactions_sin['purchase_date'].dt.year
historical_transactions_sin['epoch'] = pd.DatetimeIndex(historical_transactions_sin['purchase_date']).astype(np.int64) * 1e-9
new_transactions_sin['epoch'] = pd.DatetimeIndex(new_transactions_sin['purchase_date']).astype(np.int64) * 1e-9


all_transactions = pd.concat([historical_transactions_sin, new_transactions_sin], axis=0).reset_index(drop=True)
# authorized_flagに基づいてデータを分割
authorized_transactions = historical_transactions_sin[historical_transactions_sin['authorized_flag'] == 1]
historical_transactions = historical_transactions_sin[historical_transactions_sin['authorized_flag'] == 0]
new_transactions = new_transactions_sin.copy()


In [12]:
all_transactions = reduce_mem_usage(all_transactions)
authorized_transactions = reduce_mem_usage(authorized_transactions)
historical_transactions = reduce_mem_usage(historical_transactions)
new_transactions = reduce_mem_usage(new_transactions)

Mem. usage decreased to 1689.24 Mb (9.5% reduction)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float16)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.int16)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float16)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,co

Mem. usage decreased to 1597.89 Mb (11.3% reduction)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.int16)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float16)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,co

Mem. usage decreased to 156.02 Mb (8.5% reduction)
Mem. usage decreased to 102.97 Mb (9.8% reduction)


In [13]:
authorized_transactions['purchase_amount'] = authorized_transactions['purchase_amount'].astype(np.float32)
new_transactions['purchase_amount'] = new_transactions['purchase_amount'].astype(np.float32)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  authorized_transactions['purchase_amount'] = authorized_transactions['purchase_amount'].astype(np.float32)


In [14]:
del new_transactions_sin
del historical_transactions_sin

In [16]:
def value1(df, suf=None):
    valuelist = ['day_of_year', 'day_of_year_sin', 'day_of_year_cos',
            'day_of_month', 'day_of_month_sin', 'day_of_month_cos',
            'day_of_week', 'day_of_week_sin', 'day_of_week_cos',
            'hour_of_day', 'hour_of_day_sin', 'hour_of_day_cos', 'purchase_year', 'epoch']
    valuelistwc = ['card_id', 'day_of_year', 'day_of_year_sin', 'day_of_year_cos',
            'day_of_month', 'day_of_month_sin', 'day_of_month_cos',
            'day_of_week', 'day_of_week_sin', 'day_of_week_cos',
            'hour_of_day', 'hour_of_day_sin', 'hour_of_day_cos', 'purchase_year', 'epoch']
    kari_add_df = all_transactions.card_id.drop_duplicates()
    kari_add_df = pd.DataFrame(kari_add_df.reset_index(drop=True))
    kari_add_df[valuelist] = 0
    for mode in ['max', 'min', 'old', 'new']:
        if mode == 'max':
            amt_idx = df.groupby('card_id')['purchase_amount'].idxmax()
        elif mode == 'min':
            amt_idx = df.groupby('card_id')['purchase_amount'].idxmin()
        elif mode == 'old':
            amt_idx = df.groupby('card_id')['purchase_date'].idxmin()
        elif mode == 'new':
            amt_idx = df.groupby('card_id')['purchase_date'].idxmax()
        kari_add_df = pd.merge(kari_add_df, df.loc[amt_idx, valuelistwc], how='left', on='card_id', suffixes=['', ('_' + suf + mode)])
    return kari_add_df.drop(columns=valuelist)

In [17]:
# ユーザーごとの最高額、最低額、最初の決済、最後の決済の日時情報
add_df = all_transactions.card_id.drop_duplicates()
add_df = pd.merge(add_df,value1(all_transactions, 'all'), on='card_id', how='inner')
add_df = pd.merge(add_df,value1(authorized_transactions, 'auth'), on='card_id', how='inner')
add_df = pd.merge(add_df,value1(historical_transactions, 'hist'), on='card_id', how='inner')
add_df = pd.merge(add_df,value1(new_transactions, 'new'), on='card_id', how='inner')


In [18]:
def value2(df, suf=None):
    valuelist = ['day_of_month', 'day_of_month_sin', 'day_of_month_cos',
            'day_of_week', 'day_of_week_sin', 'day_of_week_cos',
            'hour_of_day', 'hour_of_day_sin', 'hour_of_day_cos', 'epoch']
    valuelistwc = ['card_id','day_of_month', 'day_of_month_sin', 'day_of_month_cos',
            'day_of_week', 'day_of_week_sin', 'day_of_week_cos',
            'hour_of_day', 'hour_of_day_sin', 'hour_of_day_cos', 'epoch']
    kari_add_df = all_transactions.card_id.drop_duplicates()
    kari_add_df = pd.DataFrame(kari_add_df.reset_index(drop=True))
    kari_add_df[valuelist] = 0

    for mode in ['allmean', 'newmean', 'maxmean']:
        if mode == 'allmean':
            dfagg = df.groupby('card_id').mean()[valuelist]
        elif mode == 'newmean':
            a = df.groupby('card_id')['purchase_month'].max()
            dfagg = df.merge(a, how='inner', on=['card_id', 'purchase_month']).groupby('card_id').mean()[valuelist]
        elif mode == 'maxmean':
            a = df.groupby(['card_id', 'purchase_month'])['purchase_amount'].sum().reset_index()
            dfagg = df.merge(a.loc[a.groupby('card_id')['purchase_amount'].idxmax()][['card_id', 'purchase_month']], how='inner', on=['card_id', 'purchase_month']).groupby('card_id').mean()[valuelist]
        # elif mode == 'allmode':
        #     amt_idx = df.groupby('card_id')['purchase_date'].idxmax()
        kari_add_df = pd.merge(kari_add_df, dfagg, how='left', on='card_id', suffixes=['', ('_' + suf + mode)])
    return kari_add_df.drop(columns=valuelist)

In [19]:
# ユーザーごとの全期間、最新月、決済金額合計が最大となる月それぞれの平均日時情報
add_df = pd.merge(add_df,value2(all_transactions, 'all'), on='card_id', how='inner')
add_df = pd.merge(add_df,value2(authorized_transactions, 'auth'), on='card_id', how='inner')
add_df = pd.merge(add_df,value2(historical_transactions, 'hist'), on='card_id', how='inner')
add_df = pd.merge(add_df,value2(new_transactions, 'new'), on='card_id', how='inner')


In [20]:
def value3(df, suf=None):
    valuelist = ['day_of_week', 'purchase_year', 'purchase_month']
    kari_add_df = all_transactions.card_id.drop_duplicates()
    kari_add_df = pd.DataFrame(kari_add_df.reset_index(drop=True))
    kari_add_df[valuelist] = 0
    dfagg = df.groupby('card_id')[valuelist].apply(lambda x: x.mode()).reset_index().groupby('card_id')[valuelist].max()
    kari_add_df = pd.merge(kari_add_df, dfagg, how='left', on='card_id', suffixes=['', ('_' + suf + 'allmode')])
    return kari_add_df.drop(columns=valuelist)

In [21]:
add_df = pd.merge(add_df,value3(all_transactions, 'all'), on='card_id', how='inner')
add_df = pd.merge(add_df,value3(authorized_transactions, 'auth'), on='card_id', how='inner')
add_df = pd.merge(add_df,value3(historical_transactions, 'hist'), on='card_id', how='inner')
add_df = pd.merge(add_df,value3(new_transactions, 'new'), on='card_id', how='inner')

In [22]:
def categorize(row):
    if row['purchase_amount'] < row['Q1']:
        return 'under25_mean'
    elif row['purchase_amount'] > row['Q3']:
        return 'up25_mean'
    else:
        return '25-75_mean'

def value4(df, suf=None):

    kari_add_df = df.groupby('card_id').purchase_date.count().rename('purchase_count')

    percentiles = df.groupby('card_id')['purchase_amount'].quantile([0.25, 0.75]).unstack(level=1)
    percentiles.columns = ['Q1', 'Q3']
    dff = pd.merge(df, percentiles, on='card_id', how='left')
    dff['amount_rank'] = dff.apply(categorize, axis=1)

    kari_add_df = pd.merge(kari_add_df, dff.groupby(['card_id', 'amount_rank'])['purchase_amount'].mean().unstack(), on='card_id', how='left')
    kari_add_df['diff_up25-2575'] = kari_add_df['up25_mean'] - kari_add_df['25-75_mean']
    kari_add_df['diff_2575-under25'] = kari_add_df['25-75_mean'] - kari_add_df['under25_mean']
    kari_add_df['diff_up25-under25'] = kari_add_df['up25_mean'] - kari_add_df['under25_mean']
    kari_add_df['ratio_up25/2575'] = kari_add_df['up25_mean'] / kari_add_df['25-75_mean']
    kari_add_df['ratio_under25/2575'] = kari_add_df['under25_mean'] / kari_add_df['25-75_mean']
    return kari_add_df.add_suffix('_' + suf)

In [23]:
add_df = pd.merge(add_df,value4(all_transactions, 'all'), on='card_id', how='inner')
add_df = pd.merge(add_df,value4(authorized_transactions, 'auth'), on='card_id', how='left')
add_df = pd.merge(add_df,value4(historical_transactions, 'hist'), on='card_id', how='left')
add_df = pd.merge(add_df,value4(new_transactions, 'new'), on='card_id', how='left')

In [25]:
# データの結合
train = read_data('../data/processed/processed20240625_train.csv')
test = read_data('../data/processed/processed20240625_test.csv')


train = pd.merge(train, add_df, on='card_id', how='left')
test = pd.merge(test, add_df, on='card_id', how='left')


In [27]:
add_df

Unnamed: 0,card_id,day_of_year_allmax,day_of_year_sin_allmax,day_of_year_cos_allmax,day_of_month_allmax,day_of_month_sin_allmax,day_of_month_cos_allmax,day_of_week_allmax,day_of_week_sin_allmax,day_of_week_cos_allmax,...,ratio_under25/2575_hist,purchase_count_new,25-75_mean_new,under25_mean_new,up25_mean_new,diff_up25-2575_new,diff_2575-under25_new,diff_up25-under25_new,ratio_up25/2575_new,ratio_under25/2575_new
0,C_ID_4e6213e9bc,15,0.255371,0.966797,15,0.101196,-0.994629,0,0.000000,1.000000,...,1.102842,2.0,-0.649414,,-0.703125,-0.053711,,,1.082707,
1,C_ID_5037ff576e,208,-0.425049,-0.905273,27,-0.724609,0.688965,3,0.433838,-0.900879,...,0.542981,2.0,1.356445,,0.334961,-1.021484,,,0.246940,
2,C_ID_0e171c1b48,100,0.988770,-0.150024,10,0.866211,-0.500000,1,0.781738,0.623535,...,1.420394,16.0,-0.141846,-0.699097,-0.733765,-0.591919,0.557251,-0.034668,5.172977,4.928571
3,C_ID_48fb13e70f,158,0.409424,-0.912598,7,0.994629,0.104553,2,0.975098,-0.222534,...,,1.0,,-0.446289,,,,,,
4,C_ID_fc8e41b9cf,192,-0.162842,-0.986816,11,0.791016,-0.612305,1,0.781738,0.623535,...,-0.836690,21.0,0.343259,-0.586093,-0.703809,-1.047067,0.929352,-0.117716,-2.050374,-1.707438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325535,C_ID_803aa0aed4,66,0.907227,0.421143,7,0.988281,0.151367,2,0.975098,-0.222534,...,,9.0,-0.549316,-0.663965,-0.727539,-0.178223,0.114648,-0.063574,1.324444,1.208711
325536,C_ID_62df280b20,261,-0.976074,-0.217773,18,-0.587891,-0.809082,0,0.000000,1.000000,...,,4.0,0.110352,-0.572021,-0.641602,-0.751953,0.682373,-0.069580,-5.814159,-5.183628
325537,C_ID_e49b1996b0,72,0.945801,0.325439,13,0.485352,-0.874512,1,0.781738,0.623535,...,,1.0,,-0.617676,,,,,,
325538,C_ID_2863d2fa95,20,0.337402,0.941406,20,-0.791016,-0.612305,4,-0.433838,-0.900879,...,,,,,,,,,,


## 前処理終了後のデータの保存
- 基本的にモデルの学習・ハイパーパラメータチューニングを行う際にはここで作成した同じデータを使い回して下さい。
- 適宜前処理を変更した場合はファイル名を変えるなどして管理して下さい。

In [26]:
# データの保存
train.to_csv('../data/processed/processed20240627_train.csv',index=None)
test.to_csv('../data/processed/processed20240627_test.csv',index=None)