In [1]:
import gc
import numpy as np
import pandas as pd
import lightgbm as lgb
from  datetime import datetime, timedelta

# Prepare Datasets for Training

## Define the correct data type for each column in the datasets

### *calendar.csv*

In [27]:
# Correct data types for "calendar.csv"
calendarDTypes = {"event_name_1": "category", 
                  "event_name_2": "category", 
                  "event_type_1": "category", 
                  "event_type_2": "category", 
                  "weekday": "category", 
                  'wm_yr_wk': 'int16', 
                  "wday": "int16",
                  "month": "int16", 
                  "year": "int16", 
                  "snap_CA": "float32", 
                  'snap_TX': 'float32', 
                  'snap_WI': 'float32' }

# Read csv file
# ここは変える
calendar = pd.read_csv("data/calendar.csv", 
                       dtype = calendarDTypes)

calendar["date"] = pd.to_datetime(calendar["date"])

# Transform categorical features into integers
for col, colDType in calendarDTypes.items():
    if colDType == "category":
        calendar[col] = calendar[col].cat.codes.astype("int16")
        calendar[col] -= calendar[col].min()

calendar.head(50)

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,2,1,1,2011,d_1,0,0,0,0,0.0,0.0,0.0
1,2011-01-30,11101,3,2,1,2011,d_2,0,0,0,0,0.0,0.0,0.0
2,2011-01-31,11101,1,3,1,2011,d_3,0,0,0,0,0.0,0.0,0.0
3,2011-02-01,11101,5,4,2,2011,d_4,0,0,0,0,1.0,1.0,0.0
4,2011-02-02,11101,6,5,2,2011,d_5,0,0,0,0,1.0,0.0,1.0
5,2011-02-03,11101,4,6,2,2011,d_6,0,0,0,0,1.0,1.0,1.0
6,2011-02-04,11101,0,7,2,2011,d_7,0,0,0,0,1.0,0.0,0.0
7,2011-02-05,11102,2,1,2,2011,d_8,0,0,0,0,1.0,1.0,1.0
8,2011-02-06,11102,3,2,2,2011,d_9,27,4,0,0,1.0,1.0,1.0
9,2011-02-07,11102,1,3,2,2011,d_10,0,0,0,0,1.0,1.0,0.0


### *sell_prices.csv*

In [28]:
# Correct data types for "sell_prices.csv"
priceDTypes = {"store_id": "category", 
               "item_id": "category", 
               "wm_yr_wk": "int16",
               "sell_price":"float32"}

# Read csv file
prices = pd.read_csv("data/sell_prices.csv", 
                     dtype = priceDTypes)

# Transform categorical features into integers
for col, colDType in priceDTypes.items():
    if colDType == "category":
        prices[col] = prices[col].cat.codes.astype("int16")
        prices[col] -= prices[col].min()
        
prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,0,0,11325,9.58
1,0,0,11326,9.58
2,0,0,11327,8.26
3,0,0,11328,8.26
4,0,0,11329,8.26


### *sales_train_validation.csv*

### *train.csv*

In [30]:
firstDay = 250
lastDay = 1913

# Use x sales days (columns) for training
numCols = [f"d_{day}" for day in range(firstDay, lastDay+1)]

# Define all categorical columns
catCols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']

# Define the correct data types for "sales_train_validation.csv"
dtype = {numCol: "float32" for numCol in numCols} 
dtype.update({catCol: "category" for catCol in catCols if catCol != "id"})

# Read csv file
# ここ変える
ds = pd.read_csv("data/trainset.csv", 
                 usecols = catCols + numCols, dtype = dtype)

# Transform categorical features into integers
for col in catCols:
    if col != "id":
        ds[col] = ds[col].cat.codes.astype("int16")
        ds[col] -= ds[col].min()
        
ds = pd.melt(ds,
             id_vars = catCols,
             value_vars = [col for col in ds.columns if col.startswith("d_")],
             var_name = "d",
             value_name = "sales")

# Merge "ds" with "calendar" and "prices" dataframe
#ここでカレンダーとくっつけてる
ds = ds.merge(calendar, on = "d", copy = False)
ds = ds.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

ds.head(200)

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_250,0.0,2011-10-05,11136,...,10,2011,0,0,0,0,1.0,1.0,1.0,3.97
1,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_251,0.0,2011-10-06,11136,...,10,2011,0,0,0,0,1.0,1.0,1.0,3.97
2,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_252,0.0,2011-10-07,11136,...,10,2011,0,0,0,0,1.0,1.0,0.0,3.97
3,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_250,0.0,2011-10-05,11136,...,10,2011,0,0,0,0,1.0,1.0,1.0,4.34
4,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_251,4.0,2011-10-06,11136,...,10,2011,0,0,0,0,1.0,1.0,1.0,4.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,HOBBIES_1_127_CA_1_evaluation,122,0,0,0,0,d_250,1.0,2011-10-05,11136,...,10,2011,0,0,0,0,1.0,1.0,1.0,1.00
196,HOBBIES_1_127_CA_1_evaluation,122,0,0,0,0,d_251,0.0,2011-10-06,11136,...,10,2011,0,0,0,0,1.0,1.0,1.0,1.00
197,HOBBIES_1_127_CA_1_evaluation,122,0,0,0,0,d_252,1.0,2011-10-07,11136,...,10,2011,0,0,0,0,1.0,1.0,0.0,1.00
198,HOBBIES_1_128_CA_1_evaluation,123,0,0,0,0,d_250,0.0,2011-10-05,11136,...,10,2011,0,0,0,0,1.0,1.0,1.0,8.77


In [41]:
ds.tail(60)

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
42372622,FOODS_3_798_WI_3_evaluation,3019,6,9,2,2,d_1912,1.0,2016-04-23,11613,...,4,2016,0,0,0,0,0.0,0.0,0.0,1.98
42372623,FOODS_3_798_WI_3_evaluation,3019,6,9,2,2,d_1913,2.0,2016-04-24,11613,...,4,2016,0,0,0,0,0.0,0.0,0.0,1.98
42372624,FOODS_3_799_WI_3_evaluation,3020,6,9,2,2,d_1912,1.0,2016-04-23,11613,...,4,2016,0,0,0,0,0.0,0.0,0.0,2.18
42372625,FOODS_3_799_WI_3_evaluation,3020,6,9,2,2,d_1913,0.0,2016-04-24,11613,...,4,2016,0,0,0,0,0.0,0.0,0.0,2.18
42372626,FOODS_3_800_WI_3_evaluation,3021,6,9,2,2,d_1912,6.0,2016-04-23,11613,...,4,2016,0,0,0,0,0.0,0.0,0.0,1.88
42372627,FOODS_3_800_WI_3_evaluation,3021,6,9,2,2,d_1913,10.0,2016-04-24,11613,...,4,2016,0,0,0,0,0.0,0.0,0.0,1.88
42372628,FOODS_3_801_WI_3_evaluation,3022,6,9,2,2,d_1912,7.0,2016-04-23,11613,...,4,2016,0,0,0,0,0.0,0.0,0.0,1.0
42372629,FOODS_3_801_WI_3_evaluation,3022,6,9,2,2,d_1913,2.0,2016-04-24,11613,...,4,2016,0,0,0,0,0.0,0.0,0.0,1.0
42372630,FOODS_3_802_WI_3_evaluation,3023,6,9,2,2,d_1912,1.0,2016-04-23,11613,...,4,2016,0,0,0,0,0.0,0.0,0.0,1.68
42372631,FOODS_3_802_WI_3_evaluation,3023,6,9,2,2,d_1913,0.0,2016-04-24,11613,...,4,2016,0,0,0,0,0.0,0.0,0.0,1.68


## 追加した特徴量エンジニアリング

In [42]:
print(ds.columns.tolist())

['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd', 'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price']


In [43]:
ds["wday"]

0           5
1           6
2           7
3           5
4           6
           ..
42372677    2
42372678    1
42372679    2
42372680    1
42372681    2
Name: wday, Length: 42372682, dtype: int16

In [44]:
ds["event_name_1"].head(100)

0     0
1     0
2     0
3     0
4     0
     ..
95    0
96    0
97    0
98    0
99    0
Name: event_name_1, Length: 100, dtype: int16

## Create features

### Sales features

In [45]:
dayLags = [7, 28]
lagSalesCols = [f"lag_{dayLag}" for dayLag in dayLags]
for dayLag, lagSalesCol in zip(dayLags, lagSalesCols):
    ds[lagSalesCol] = ds[["id","sales"]].groupby("id")["sales"].shift(dayLag)
    
windows = [7, 28]
for window in windows:
    for dayLag, lagSalesCol in zip(dayLags, lagSalesCols):
        ds[f"rmean_{dayLag}_{window}"] = ds[["id", lagSalesCol]].groupby("id")[lagSalesCol].transform(lambda x: x.rolling(window).mean())

## 追加した特徴量エンジニアリング

In [46]:
# イベントがあるかどうかを判定する列を作成 (既にない場合)
ds['is_event'] = (~ds['event_name_1'].isna() | ~ds['event_name_2'].isna()).astype('int8')

# イベントの翌日を判定する列を作成
ds['is_event_next_day'] = ds['is_event'].shift(1, fill_value=0).astype('int8')

# snapの翌日を判別する列を作成 (CA, TX, WIそれぞれ)
for region in ['CA', 'TX', 'WI']:
    snap_col = f'snap_{region}'  # 現在のスナップ列名
    next_day_col = f'snap_{region}_next_day'  # 翌日スナップ列名
    
    # snap列が1の翌日を判別
    ds[next_day_col] = ds[snap_col].shift(1, fill_value=0).astype('int8')

# 予測対象日から28日前までの売り上げ (lag特徴量)
ds['lag_28'] = ds[["id", "sales"]].groupby("id")["sales"].shift(28)

# 予測対象日から7日、14日、30日、60日の売り上げの sum, min, max, mean
lag_days = [7, 14, 30, 60]
for lag in lag_days:
    ds[f"lag_{lag}_sum"] = ds[["id", "sales"]].groupby("id")["sales"].shift(1).rolling(lag).sum()
    ds[f"lag_{lag}_min"] = ds[["id", "sales"]].groupby("id")["sales"].shift(1).rolling(lag).min()
    ds[f"lag_{lag}_max"] = ds[["id", "sales"]].groupby("id")["sales"].shift(1).rolling(lag).max()
    ds[f"lag_{lag}_mean"] = ds[["id", "sales"]].groupby("id")["sales"].shift(1).rolling(lag).mean()

# 予測対象日から7日おきに遡って4週間の売り上げの平均 (曜日の傾向をとらえる)
ds["weekly_avg_4_weeks"] = (
    ds[["id", "sales"]]
    .groupby("id")["sales"]
    .transform(lambda x: x.shift(1).rolling(4 * 7, step=7).mean())
)

# 予測対象日から7日おきに遡って8週間の売り上げの平均 (曜日の傾向をとらえる)
ds["weekly_avg_8_weeks"] = (
    ds[["id", "sales"]]
    .groupby("id")["sales"]
    .transform(lambda x: x.shift(1).rolling(8 * 7, step=7).mean())
)

# 予測対象日から7日おきに遡って12週間の売り上げの平均 (曜日の傾向をとらえる)
ds["weekly_avg_12_weeks"] = (
    ds[["id", "sales"]]
    .groupby("id")["sales"]
    .transform(lambda x: x.shift(1).rolling(12 * 7, step=7).mean())
)


ds.head(50)

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,...,lag_30_min,lag_30_max,lag_30_mean,lag_60_sum,lag_60_min,lag_60_max,lag_60_mean,weekly_avg_4_weeks,weekly_avg_8_weeks,weekly_avg_12_weeks
0,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_250,0.0,2011-10-05,11136,...,,,,,,,,,,
1,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_251,0.0,2011-10-06,11136,...,,,,,,,,,,
2,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_252,0.0,2011-10-07,11136,...,,,,,,,,,,
3,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_250,0.0,2011-10-05,11136,...,,,,,,,,,,
4,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_251,4.0,2011-10-06,11136,...,,,,,,,,,,
5,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_252,2.0,2011-10-07,11136,...,,,,,,,,,,
6,HOBBIES_1_005_CA_1_evaluation,4,0,0,0,0,d_250,0.0,2011-10-05,11136,...,,,,,,,,,,
7,HOBBIES_1_005_CA_1_evaluation,4,0,0,0,0,d_251,1.0,2011-10-06,11136,...,,,,,,,,,,
8,HOBBIES_1_005_CA_1_evaluation,4,0,0,0,0,d_252,1.0,2011-10-07,11136,...,,,,,,,,,,
9,HOBBIES_1_008_CA_1_evaluation,7,0,0,0,0,d_250,1.0,2011-10-05,11136,...,,,,,,,,,,


### Date features

In [47]:
dateFeatures = {"wday": "weekday",
                "week": "weekofyear",
                "month": "month",
                "quarter": "quarter",
                "year": "year",
                "mday": "day"}

for featName, featFunc in dateFeatures.items():
    if featName in ds.columns:
        ds[featName] = ds[featName].astype("int16")
    else:
        ds[featName] = ds["date"].dt.isocalendar().week.astype("int16")

In [48]:
ds.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,...,lag_60_sum,lag_60_min,lag_60_max,lag_60_mean,weekly_avg_4_weeks,weekly_avg_8_weeks,weekly_avg_12_weeks,week,quarter,mday
0,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_250,0.0,2011-10-05,11136,...,,,,,,,,40,40,40
1,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_251,0.0,2011-10-06,11136,...,,,,,,,,40,40,40
2,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_252,0.0,2011-10-07,11136,...,,,,,,,,40,40,40
3,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_250,0.0,2011-10-05,11136,...,,,,,,,,40,40,40
4,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_251,4.0,2011-10-06,11136,...,,,,,,,,40,40,40


In [49]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42372682 entries, 0 to 42372681
Data columns (total 55 columns):
 #   Column               Dtype         
---  ------               -----         
 0   id                   object        
 1   item_id              int16         
 2   dept_id              int16         
 3   store_id             int16         
 4   cat_id               int16         
 5   state_id             int16         
 6   d                    object        
 7   sales                float32       
 8   date                 datetime64[ns]
 9   wm_yr_wk             int16         
 10  weekday              int16         
 11  wday                 int16         
 12  month                int16         
 13  year                 int16         
 14  event_name_1         int16         
 15  event_type_1         int16         
 16  event_name_2         int16         
 17  event_type_2         int16         
 18  snap_CA              float32       
 19  snap_TX            

### Remove unnecessary rows and columns

In [50]:
print(ds.columns.tolist())

['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd', 'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_7', 'lag_28', 'rmean_7_7', 'rmean_28_7', 'rmean_7_28', 'rmean_28_28', 'is_event', 'is_event_next_day', 'snap_CA_next_day', 'snap_TX_next_day', 'snap_WI_next_day', 'lag_7_sum', 'lag_7_min', 'lag_7_max', 'lag_7_mean', 'lag_14_sum', 'lag_14_min', 'lag_14_max', 'lag_14_mean', 'lag_30_sum', 'lag_30_min', 'lag_30_max', 'lag_30_mean', 'lag_60_sum', 'lag_60_min', 'lag_60_max', 'lag_60_mean', 'weekly_avg_4_weeks', 'weekly_avg_8_weeks', 'weekly_avg_12_weeks', 'week', 'quarter', 'mday']


In [51]:
# NaN を削除
ds.dropna(inplace=True)

# Define columns that need to be removed
unusedCols = ["id","sales", "date" ,"d", "wm_yr_wk", "weekday"]
print(ds.columns.tolist())
trainCols = ds.columns[~ds.columns.isin(unusedCols)]
X_train = ds[trainCols]
y_train = ds["sales"]


['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd', 'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_7', 'lag_28', 'rmean_7_7', 'rmean_28_7', 'rmean_7_28', 'rmean_28_28', 'is_event', 'is_event_next_day', 'snap_CA_next_day', 'snap_TX_next_day', 'snap_WI_next_day', 'lag_7_sum', 'lag_7_min', 'lag_7_max', 'lag_7_mean', 'lag_14_sum', 'lag_14_min', 'lag_14_max', 'lag_14_mean', 'lag_30_sum', 'lag_30_min', 'lag_30_max', 'lag_30_mean', 'lag_60_sum', 'lag_60_min', 'lag_60_max', 'lag_60_mean', 'weekly_avg_4_weeks', 'weekly_avg_8_weeks', 'weekly_avg_12_weeks', 'week', 'quarter', 'mday']


### Split dataset into train and validation set

In [52]:
np.random.seed(777)

# Define categorical features
catFeats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id'] + \
           ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]

validInds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
trainInds = np.setdiff1d(X_train.index.values, validInds)

trainData = lgb.Dataset(X_train.loc[trainInds], label = y_train.loc[trainInds], 
                        categorical_feature = catFeats, free_raw_data = False)
validData = lgb.Dataset(X_train.loc[validInds], label = y_train.loc[validInds],
                        categorical_feature = catFeats, free_raw_data = False)

In [53]:
del ds, X_train, y_train, validInds, trainInds ; gc.collect()

15

# Model

In [56]:
params = {
    "objective": "poisson",
    "metric": "rmse",
    "force_row_wise": True,
    "learning_rate": 0.075,
    "sub_row": 0.75,
    "bagging_freq": 1,
    "lambda_l2": 0.1,
    'verbosity': 1, # ここで出力の詳細を制御します
    # 'num_iterations': 1200,
    'num_iterations': 2400,
    'num_leaves': 128,
    "min_data_in_leaf": 100,
    "verbose": -1 # verbose_evalと同様に出力を抑制・制御する場合の設定
}

In [57]:
# Train LightGBM model
m_lgb = lgb.train(params, trainData, valid_sets=[validData])



[LightGBM] [Info] Total Bins 8175
[LightGBM] [Info] Number of data points in the train set: 3613068, number of used features: 44
[LightGBM] [Info] Start training from score 0.280307


In [58]:
# Save the model
m_lgb.save_model("model.lgb")

<lightgbm.basic.Booster at 0x24d9b15e290>

# Predictions

In [61]:
#もらったやつ
# Last day used for training
# trLast = 1913
trLast = 1927
# Maximum lag day
maxLags = 57

# Create dataset for predictions
def create_ds():
    
    startDay = trLast - maxLags
    
    numCols = [f"d_{day}" for day in range(startDay, trLast + 1)]
    catCols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    
    dtype = {numCol:"float32" for numCol in numCols} 
    dtype.update({catCol: "category" for catCol in catCols if catCol != "id"})
    
    ds = pd.read_csv("data/trainset.csv", 
                     usecols = catCols + numCols, dtype = dtype)
    
    for col in catCols:
        if col != "id":
            ds[col] = ds[col].cat.codes.astype("int16")
            ds[col] -= ds[col].min()
    
    for day in range(trLast + 1, trLast+ 28 +1):
        ds[f"d_{day}"] = np.nan
    
    ds = pd.melt(ds,
                 id_vars = catCols,
                 value_vars = [col for col in ds.columns if col.startswith("d_")],
                 var_name = "d",
                 value_name = "sales")
    
    ds = ds.merge(calendar, on = "d", copy = False)
    ds = ds.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)
    
    return ds




def create_features(ds):          
    dayLags = [7, 28]
    lagSalesCols = [f"lag_{dayLag}" for dayLag in dayLags]
    for dayLag, lagSalesCol in zip(dayLags, lagSalesCols):
        ds[lagSalesCol] = ds[["id","sales"]].groupby("id")["sales"].shift(dayLag)

    windows = [7, 28]
    for window in windows:
        for dayLag, lagSalesCol in zip(dayLags, lagSalesCols):
            ds[f"rmean_{dayLag}_{window}"] = ds[["id", lagSalesCol]].groupby("id")[lagSalesCol].transform(lambda x: x.rolling(window).mean())


    # イベントがあるかどうかを判定する列を作成 (既にない場合)
    ds['is_event'] = (~ds['event_name_1'].isna() | ~ds['event_name_2'].isna()).astype('int8')
    
    # イベントの翌日を判定する列を作成
    ds['is_event_next_day'] = ds['is_event'].shift(1, fill_value=0).astype('int8')
    
    # snapの翌日を判別する列を作成 (CA, TX, WIそれぞれ)
    for region in ['CA', 'TX', 'WI']:
        snap_col = f'snap_{region}'  # 現在のスナップ列名
        next_day_col = f'snap_{region}_next_day'  # 翌日スナップ列名
        
        # snap列が1の翌日を判別
        ds[next_day_col] = ds[snap_col].shift(1, fill_value=0).astype('int8')
    
    # 予測対象日から28日前までの売り上げ (lag特徴量)
    ds['lag_28'] = ds[["id", "sales"]].groupby("id")["sales"].shift(28)
    
    # 予測対象日から7日、14日、30日、60日の売り上げの sum, min, max, mean
    lag_days = [7, 14, 30, 60]
    for lag in lag_days:
        ds[f"lag_{lag}_sum"] = ds[["id", "sales"]].groupby("id")["sales"].shift(1).rolling(lag).sum()
        ds[f"lag_{lag}_min"] = ds[["id", "sales"]].groupby("id")["sales"].shift(1).rolling(lag).min()
        ds[f"lag_{lag}_max"] = ds[["id", "sales"]].groupby("id")["sales"].shift(1).rolling(lag).max()
        ds[f"lag_{lag}_mean"] = ds[["id", "sales"]].groupby("id")["sales"].shift(1).rolling(lag).mean()
    
    # 予測対象日から7日おきに遡って4週間の売り上げの平均 (曜日の傾向をとらえる)
    ds["weekly_avg_4_weeks"] = (
        ds[["id", "sales"]]
        .groupby("id")["sales"]
        .transform(lambda x: x.shift(1).rolling(4 * 7, step=7).mean())
    )
    
    # 予測対象日から7日おきに遡って8週間の売り上げの平均 (曜日の傾向をとらえる)
    ds["weekly_avg_8_weeks"] = (
        ds[["id", "sales"]]
        .groupby("id")["sales"]
        .transform(lambda x: x.shift(1).rolling(8 * 7, step=7).mean())
    )
    
    # 予測対象日から7日おきに遡って12週間の売り上げの平均 (曜日の傾向をとらえる)
    ds["weekly_avg_12_weeks"] = (
        ds[["id", "sales"]]
        .groupby("id")["sales"]
        .transform(lambda x: x.shift(1).rolling(12 * 7, step=7).mean())
    )
          
    dateFeatures = {
        "wday": "weekday",
        "week": "week",               # 修正点：週番号はisocalendar().weekを使用
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day"
    }

    for featName, featFunc in dateFeatures.items():
        if featName in ds.columns:
            ds[featName] = ds[featName].astype("int16")
        else:
            if featFunc == "week":
                # isocalendar()で週番号を取得
                ds[featName] = ds["date"].dt.isocalendar().week.astype("int16")
            else:
                ds[featName] = getattr(ds["date"].dt, featFunc).astype("int16")

In [None]:
#もらったやつ
import pandas as pd
from datetime import timedelta

# 提出ファイルを読み込み
submit_df = pd.read_csv("data/submit.csv")

# 予測開始日と列名の定義
fday = pd.to_datetime("2016-05-09")  # d_1928に対応
prediction_days = 14  # 予測日数
target_days = [f"d_{1928 + i}" for i in range(prediction_days)]  # d_1928からd_1941の列名

# 予測用のデータセットを作成
te = create_ds()

# 各日付について予測を行い、結果を埋め込む
for tdelta in range(prediction_days):
    day = fday + timedelta(days=tdelta)
    tst = te[(te['date'] >= day - timedelta(days=maxLags)) & (te['date'] <= day)].copy()
    create_features(tst)  # 特徴量の生成
    tst = tst.loc[tst['date'] == day, trainCols]  # 当日分のデータを抽出
    
    # LightGBMモデルで予測し、結果を保存
    te.loc[te['date'] == day, "sales"] = m_lgb.predict(tst)
    # 予測結果を四捨五入して整数に変換
    # te.loc[te['date'] == day, "sales"] = m_lgb.predict(tst).round()

# 提出ファイルの0の箇所に予測結果を埋め込む
for i, day_col in enumerate(target_days):
    pred_data = te.loc[te['date'] == fday + timedelta(days=i), ["id", "sales"]]
    submit_df[day_col] = submit_df["id"].map(pred_data.set_index("id")["sales"])

# 最終提出ファイルとして保存
submit_df.to_csv("submission_basic_1210.csv", index=False)
print("予測結果が 'submission.csv' に保存されました。")

In [None]:
#もらったやつを変更
import pandas as pd
from datetime import timedelta

# 提出ファイルを読み込み
submit_df = pd.read_csv("data/submit.csv")

# 予測開始日と列名の定義
fday = pd.to_datetime("2016-05-09")  # d_1928に対応
prediction_days = 14  # 予測日数
target_days = [f"d_{1928 + i}" for i in range(prediction_days)]  # d_1928からd_1941の列名

#変数
alphas = [1.028, 1.023, 1.018]
weights = [1 / len(alphas)] * len(alphas)
sub = 0.

# 予測用のデータセットを作成
te = create_ds()

# 各日付について予測を行い、結果を埋め込む
for icount, (alpha, weight) in enumerate(zip(alphas, weights)):
    te = create_ds()
    cols = [f"d_{i}" for i in range(1928, 1940)]
    for tdelta in range(prediction_days):
        day = fday + timedelta(days=tdelta)
        tst = te[(te['date'] >= day - timedelta(days=maxLags)) & (te['date'] <= day)].copy()
        create_features(tst)  # 特徴量の生成
        tst = tst.loc[tst['date'] == day, trainCols]  # 当日分のデータを抽出
        
        # LightGBMモデルで予測し、結果を保存
        alpha_lab = alpha * m_lgb.predict(tst)
        
        te.loc[te['date'] == day, "sales"] = alpha_lab.round()
        # 予測結果を四捨五入して整数に変換
        # te.loc[te['date'] == day, "sales"] = m_lgb.predict(tst).round()

    # 提出ファイルの0の箇所に予測結果を埋め込む
    for i, day_col in enumerate(target_days):
        pred_data = te.loc[te['date'] == fday + timedelta(days=i), ["id", "sales"]]
        submit_df[day_col] = submit_df["id"].map(pred_data.set_index("id")["sales"])

    submit_df.to_csv(f"submin_1210_alpha{icount}.csv", index=False)
        
    if icount == 0:
        sub = submit_df
        sub[cols] *= weight
    else:
        sub[cols] += submit_df[cols] * weight
    print(icount, alpha, weight)

# 最終提出ファイルとして保存
submit_df.to_csv("submission2_dataengineer_1210.csv", index=False)
print("予測結果が 'submission2.csv' に保存されました。")