# HW3 Forecast Notebook (Clean Example)

In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# --- Load raw data ---------------------------------------------------------
# The unnamed leading column of train.csv is exposed under the name 'id'.
train = pd.read_csv('train.csv').rename(columns={'Unnamed: 0': 'id'})
test = pd.read_csv('test.csv')

# --- Parse period start dates ----------------------------------------------
# train dates are parsed with pandas' default inference; test dates are
# explicitly day.month.year.
train = train.assign(period_start_dt=pd.to_datetime(train['period_start_dt']))
test = test.assign(
    period_start_dt=pd.to_datetime(test['period_start_dt'], format='%d.%m.%Y')
)

# --- Restrict both frames to their date windows ----------------------------
# train: strictly before 2019-12-02; test: up to and including 2019-12-30.
train_window = train['period_start_dt'] < '2019-12-02'
test_window = test['period_start_dt'] <= '2019-12-30'
train = train[train_window]
test = test[test_window]

# --- Stack train on top of test with a fresh 0..n-1 index -------------------
all_data = pd.concat([train, test], ignore_index=True)

# Fill gaps in the promo / price / authorization columns with each column's
# median, computed over the combined train+test frame.
median_filled = ['PROMO1_FLAG', 'PRICE_REGULAR', 'PRICE_AFTER_DISC', 'AUTORIZATION_FLAG']
column_medians = {name: all_data[name].median() for name in median_filled}
all_data = all_data.fillna(value=column_medians)

# Calendar features derived from the period start date.
period = all_data['period_start_dt'].dt
all_data = all_data.assign(
    ind_of_year=period.year,
    ind_of_month=period.month,
    ind_of_day=period.day,
)

# Median-impute every remaining numeric column, leaving the target ('demand')
# and the row identifier ('id') untouched.
not_imputed = ('demand', 'id')
num_cols = [
    name
    for name in all_data.select_dtypes(include=['number']).columns
    if name not in not_imputed
]
numeric_part = all_data[num_cols]
all_data[num_cols] = numeric_part.fillna(numeric_part.median())

# --- Separate labelled training rows from the rows to forecast --------------
# `all_data` was built as concat([train, test], ignore_index=True), so its
# first len(train) rows are the training rows and the rest are the test rows.
# Splitting by position (rather than by "demand is NaN") guarantees that a
# training row with a missing target is never mistaken for a test row and
# written to the submission.
n_train_rows = len(train)
assert len(all_data) == n_train_rows + len(test), (
    "all_data must be train followed by test; re-run the loading cell first"
)

train_part = all_data.iloc[:n_train_rows]
# Rows without a target cannot be used for fitting, so they are left out here.
data_train = train_part[train_part['demand'].notna()].copy()
# Test rows carry no target; the empty column is reused to hold predictions.
data_test = all_data.iloc[n_train_rows:].rename(columns={'demand': 'predicted'})

# --- Features / target ------------------------------------------------------
# The identifier and the raw timestamp are not used as model inputs.
X = data_train.drop(['id', 'demand', 'period_start_dt'], axis=1)
y = data_train['demand']

# NOTE(review): this is a random shuffle split on time-ordered data, so the
# validation MAE is likely optimistic for a forecast of future weeks; consider
# a time-based hold-out. Left unchanged to keep reported numbers comparable.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=1)

# --- Model ------------------------------------------------------------------
reg = GradientBoostingRegressor(
    max_depth=5,
    n_estimators=150,
    learning_rate=0.1,
    random_state=1
)
# NOTE(review): the model that produces the submission is fitted on the
# training part of the split only, not on all labelled rows.
reg.fit(X_train, y_train)

# --- Validation -------------------------------------------------------------
val_pred = reg.predict(X_val)
print("MAE:", mean_absolute_error(y_val, val_pred))

# --- Forecast ---------------------------------------------------------------
# Select exactly the columns the model was fitted on, in the same order.
X_test = data_test[X.columns]
y_test_pred = reg.predict(X_test)

data_test['predicted'] = y_test_pred
# Negative predictions are replaced by zero.
data_test.loc[data_test['predicted'] < 0, 'predicted'] = 0

# --- Submission file --------------------------------------------------------
sub = data_test[['id', 'predicted']]
sub.to_csv('submission_clean_example.csv', index=False)
sub.head()


MAE(en): 4.711842495974284


Unnamed: 0,id,predicted
34144,908,7.189127
34150,914,9.282015
34156,920,20.211774
34162,926,34.18451
34168,932,12.171891


In [9]:
# Distinct period start dates present in the filtered test frame, ascending.
ordered_periods = test['period_start_dt'].sort_values()
ordered_periods.unique()


<DatetimeArray>
['2019-12-02 00:00:00', '2019-12-09 00:00:00', '2019-12-16 00:00:00',
 '2019-12-23 00:00:00', '2019-12-30 00:00:00']
Length: 5, dtype: datetime64[ns]

In [10]:
# (rows, columns) of `test` after the date filter applied when it was loaded.
test.shape

(1200, 5)

In [11]:
# Preview the first 20 rows of the filtered test frame.
test.head(20)


Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand
0,908,40369,317,2019-12-02,
1,909,40370,317,2019-12-02,
2,910,40372,317,2019-12-02,
3,911,40373,317,2019-12-02,
4,912,46272,317,2019-12-02,
5,913,96212,317,2019-12-02,
6,914,40369,317,2019-12-09,
7,915,40370,317,2019-12-09,
8,916,40372,317,2019-12-09,
9,917,40373,317,2019-12-09,


In [12]:
# Column dtypes and non-null counts of the filtered test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1200 entries, 0 to 1397
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 1200 non-null   int64         
 1   product_rk         1200 non-null   int64         
 2   store_location_rk  1200 non-null   int64         
 3   period_start_dt    1200 non-null   datetime64[ns]
 4   demand             0 non-null      float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 56.2 KB
