In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor

# Data Load

In [56]:
train = pd.read_csv('/content/sample_data/train.csv')
test = pd.read_csv('/content/sample_data/test.csv')

In [5]:
train

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0
...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0


In [6]:
test

Unnamed: 0,ID,timestamp,item,corporation,location
0,TG_A_J_20230304,2023-03-04,TG,A,J
1,TG_A_J_20230305,2023-03-05,TG,A,J
2,TG_A_J_20230306,2023-03-06,TG,A,J
3,TG_A_J_20230307,2023-03-07,TG,A,J
4,TG_A_J_20230308,2023-03-08,TG,A,J
...,...,...,...,...,...
1087,RD_F_J_20230327,2023-03-27,RD,F,J
1088,RD_F_J_20230328,2023-03-28,RD,F,J
1089,RD_F_J_20230329,2023-03-29,RD,F,J
1090,RD_F_J_20230330,2023-03-30,RD,F,J


# Data Pre-Processing

### Date

In [57]:
train["timestamp"] = pd.to_datetime(train["timestamp"])
test["timestamp"] = pd.to_datetime(test["timestamp"])

In [58]:
def date(df):
    df['year']=df['timestamp'].dt.year
    df['month']=df['timestamp'].dt.month
    df['day']=df['timestamp'].dt.day
    df['weekday']=df['timestamp'].dt.weekday
    # holidays
    df = df.drop(columns='timestamp')
    return df

train = date(train)
test = date(test)

### Missing values

In [9]:
train.isnull().sum()

ID             0
item           0
corporation    0
location       0
supply(kg)     0
price(원/kg)    0
year           0
month          0
day            0
weekday        0
dtype: int64

In [10]:
test.isnull().sum()

ID             0
item           0
corporation    0
location       0
year           0
month          0
day            0
weekday        0
dtype: int64

### One-Hot Encoding

In [11]:
train.dtypes

ID              object
item            object
corporation     object
location        object
supply(kg)     float64
price(원/kg)    float64
year             int32
month            int32
day              int32
weekday          int32
dtype: object

In [59]:
categorical_features = ['item', 'corporation', 'location', 'year', 'month', 'day', 'weekday']

In [14]:
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

# Data Split

In [60]:
target = train['price(원/kg)']
train = train.drop(columns=['ID', 'supply(kg)', 'price(원/kg)'])

test = test.drop(columns=['ID'])

In [61]:
x_train, x_valid, y_train, y_valid = train_test_split(train, target, test_size=0.1, random_state=42)

# Modeling

### catboost

In [62]:
train_pool = Pool(data=x_train, label=y_train, cat_features=categorical_features)
valid_pool = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
test_pool = Pool(data=test, cat_features=categorical_features)

In [64]:
cat = CatBoostRegressor(iterations=10000, learning_rate=0.1, random_strength=0.5, depth=7, random_state=42, verbose=0)
cat.fit(train_pool, eval_set=(valid_pool), verbose=100)

0:	learn: 1902.8477274	test: 1908.2532582	best: 1908.2532582 (0)	total: 39.7ms	remaining: 6m 37s
100:	learn: 954.0786071	test: 995.3750713	best: 995.3750713 (100)	total: 3.08s	remaining: 5m 2s
200:	learn: 920.3638352	test: 973.6022895	best: 973.6022895 (200)	total: 5.85s	remaining: 4m 45s
300:	learn: 899.0989304	test: 964.0609578	best: 964.0609578 (300)	total: 8.99s	remaining: 4m 49s
400:	learn: 881.5754056	test: 957.9296241	best: 957.9296241 (400)	total: 11.8s	remaining: 4m 42s
500:	learn: 866.6793321	test: 953.8000397	best: 953.7992079 (495)	total: 14.4s	remaining: 4m 33s
600:	learn: 850.2421502	test: 945.3988725	best: 945.3974747 (599)	total: 16.9s	remaining: 4m 24s
700:	learn: 838.2654381	test: 942.2725537	best: 942.2648933 (699)	total: 20.3s	remaining: 4m 28s
800:	learn: 825.6204636	test: 940.1407662	best: 939.9063751 (754)	total: 22.9s	remaining: 4m 22s
900:	learn: 814.0643826	test: 937.1301014	best: 937.0321938 (893)	total: 25.5s	remaining: 4m 17s
1000:	learn: 801.9319689	test: 

<catboost.core.CatBoostRegressor at 0x7a5df2352560>

In [65]:
preds = cat.predict(test_pool)
preds = [0 if i < 0 else i for i in preds]

### Random Forest

In [18]:
rf_pipe = Pipeline([
    ('transformer', transformer),
    ('model', RandomForestRegressor())
])

In [20]:
rf_pipe.fit(x_train, y_train)

In [41]:
preds = rf_pipe.predict(test)
preds = [0 if i < 0 else i for i in preds]

### XGBoost

In [50]:
xgb_pipe = Pipeline([
    ('transformer', transformer),
    ('model', XGBRegressor())
])

In [51]:
xgb_pipe.fit(x_train, y_train)

In [52]:
preds = xgb_pipe.predict(test)
preds = [0 if i < 0 else i for i in preds]

### catboost 2

In [69]:
train_pool = Pool(data=x_train, label=y_train, cat_features=categorical_features)
valid_pool = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
test_pool = Pool(data=test, cat_features=categorical_features)

In [70]:
cat_pipe = Pipeline([
    ('transformer', transformer),
    ('model', CatBoostRegressor())
])

In [77]:
cat_pipe.fit(x_train, y_train)

Learning rate set to 0.076772
0:	learn: 1939.4119688	total: 12.5ms	remaining: 12.5s
1:	learn: 1858.9281801	total: 23.4ms	remaining: 11.7s
2:	learn: 1786.9758033	total: 36.3ms	remaining: 12.1s
3:	learn: 1719.6470155	total: 47ms	remaining: 11.7s
4:	learn: 1660.9693225	total: 58.4ms	remaining: 11.6s
5:	learn: 1611.3137957	total: 69.6ms	remaining: 11.5s
6:	learn: 1565.5617872	total: 82.8ms	remaining: 11.7s
7:	learn: 1522.1387729	total: 95ms	remaining: 11.8s
8:	learn: 1484.0873958	total: 108ms	remaining: 11.8s
9:	learn: 1449.0604675	total: 118ms	remaining: 11.7s
10:	learn: 1419.9502979	total: 132ms	remaining: 11.8s
11:	learn: 1391.5385158	total: 142ms	remaining: 11.7s
12:	learn: 1368.2330528	total: 155ms	remaining: 11.7s
13:	learn: 1346.0594320	total: 164ms	remaining: 11.5s
14:	learn: 1327.0744060	total: 181ms	remaining: 11.9s
15:	learn: 1310.4211361	total: 193ms	remaining: 11.9s
16:	learn: 1293.7316035	total: 206ms	remaining: 11.9s
17:	learn: 1277.2145403	total: 218ms	remaining: 11.9s
18:	

In [79]:
preds = cat_pipe.predict(test)
preds = [0 if i < 0 else i for i in preds]

# Sumbit

In [80]:
submission = pd.read_csv('/content/sample_data/sample_submission.csv')
submission['answer'] = preds

In [39]:
target[train['weekday']==6].value_counts()

price(원/kg)
0.0       8442
1597.0       1
1996.0       1
1744.0       1
2000.0       1
1917.0       1
425.0        1
507.0        1
420.0        1
371.0        1
555.0        1
1000.0       1
1875.0       1
2396.0       1
1604.0       1
1625.0       1
2769.0       1
2755.0       1
1965.0       1
2213.0       1
2341.0       1
481.0        1
Name: count, dtype: int64

In [81]:
submission.loc[test['weekday'] == 6, 'answer'] = 0
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3348.227573
1,TG_A_J_20230305,0.000000
2,TG_A_J_20230306,3356.058664
3,TG_A_J_20230307,3533.305402
4,TG_A_J_20230308,3593.348667
...,...,...
1087,RD_F_J_20230327,644.127956
1088,RD_F_J_20230328,655.337819
1089,RD_F_J_20230329,573.906822
1090,RD_F_J_20230330,633.179563


In [82]:
submission.to_csv("cat2.csv", index=False)