In [2]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

# Data Load

In [24]:
# Read the training and test tables from CSV files in the Colab sample_data folder.
train = pd.read_csv('/content/sample_data/train.csv')
test = pd.read_csv('/content/sample_data/test.csv')

In [25]:
# (rows, columns) of the training frame as loaded.
train.shape

(59397, 7)

In [26]:
# (rows, columns) of the test frame as loaded.
test.shape

(1092, 5)

# Data Pre-Processing

### Date

In [27]:
# Convert the timestamp column of both frames to pandas datetimes so the
# `.dt` accessor can be used on it afterwards.
for frame in (train, test):
    frame["timestamp"] = pd.to_datetime(frame["timestamp"])

In [28]:
def date(df):
    """Return a new frame with ``timestamp`` replaced by calendar columns.

    The ``timestamp`` column must already be datetime-typed. Four columns are
    derived from it through the ``.dt`` accessor and appended in this order:
    ``year``, ``month``, ``day`` and ``weekday`` (Monday is 0). The
    ``timestamp`` column itself is dropped from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a datetime ``timestamp`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    stamp = df['timestamp']
    # `assign` builds a new frame, so the caller's frame does not pick up the
    # extra columns as a side effect.
    with_calendar = df.assign(
        year=stamp.dt.year,
        month=stamp.dt.month,
        day=stamp.dt.day,
        weekday=stamp.dt.weekday,
    )
    return with_calendar.drop(columns='timestamp')

# Swap the timestamp column for calendar columns in both frames.
# Re-running this cell fails with KeyError because 'timestamp' is already gone.
train, test = date(train), date(test)

In [29]:
# Inspect the training frame after the calendar columns were added.
train

Unnamed: 0,ID,item,corporation,location,supply(kg),price(원/kg),year,month,day,weekday
0,TG_A_J_20190101,TG,A,J,0.0,0.0,2019,1,1,1
1,TG_A_J_20190102,TG,A,J,0.0,0.0,2019,1,2,2
2,TG_A_J_20190103,TG,A,J,60601.0,1728.0,2019,1,3,3
3,TG_A_J_20190104,TG,A,J,25000.0,1408.0,2019,1,4,4
4,TG_A_J_20190105,TG,A,J,32352.0,1250.0,2019,1,5,5
...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,RD,F,J,452440.0,468.0,2023,2,27,0
59393,RD_F_J_20230228,RD,F,J,421980.0,531.0,2023,2,28,1
59394,RD_F_J_20230301,RD,F,J,382980.0,574.0,2023,3,1,2
59395,RD_F_J_20230302,RD,F,J,477220.0,523.0,2023,3,2,3


In [30]:
# Inspect the test frame after the calendar columns were added.
test

Unnamed: 0,ID,item,corporation,location,year,month,day,weekday
0,TG_A_J_20230304,TG,A,J,2023,3,4,5
1,TG_A_J_20230305,TG,A,J,2023,3,5,6
2,TG_A_J_20230306,TG,A,J,2023,3,6,0
3,TG_A_J_20230307,TG,A,J,2023,3,7,1
4,TG_A_J_20230308,TG,A,J,2023,3,8,2
...,...,...,...,...,...,...,...,...
1087,RD_F_J_20230327,RD,F,J,2023,3,27,0
1088,RD_F_J_20230328,RD,F,J,2023,3,28,1
1089,RD_F_J_20230329,RD,F,J,2023,3,29,2
1090,RD_F_J_20230330,RD,F,J,2023,3,30,3


### Missing values

In [31]:
# Number of missing values in each training column.
train.isna().sum()

ID             0
item           0
corporation    0
location       0
supply(kg)     0
price(원/kg)    0
year           0
month          0
day            0
weekday        0
dtype: int64

In [33]:
# Number of missing values in each test column.
test.isna().sum()

ID             0
item           0
corporation    0
location       0
year           0
month          0
day            0
weekday        0
dtype: int64

# Data Split

In [34]:
# Keep the price column as the regression target, then remove it from the
# features together with 'ID' and 'supply(kg)' (the test frame has no
# 'supply(kg)' column, so it cannot be used as a feature).
# Re-running this cell raises KeyError because the columns are already gone.
target = train['price(원/kg)']
train = train.drop(['ID', 'supply(kg)', 'price(원/kg)'], axis=1)

test = test.drop('ID', axis=1)

In [35]:
# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
# NOTE(review): train_test_split shuffles rows by default, so validation dates are
# interleaved with training dates — confirm this is intended for a price forecast.
x_train, x_valid, y_train, y_valid = train_test_split(
    train,
    target,
    test_size=0.1,
    random_state=42,
)

# Modeling

### catboost

In [36]:
# Columns handed to CatBoost as categorical: the three string identifiers
# followed by the four calendar columns produced by date().
categorical_features = [
    'item',
    'corporation',
    'location',
    'year',
    'month',
    'day',
    'weekday',
]

In [37]:
# Wrap each split in a CatBoost Pool, declaring the same categorical columns every time.
train_pool = Pool(
    data=x_train,
    label=y_train,
    cat_features=categorical_features,
)
valid_pool = Pool(
    data=x_valid,
    label=y_valid,
    cat_features=categorical_features,
)
# The test pool carries no label; it is only used for prediction.
test_pool = Pool(
    data=test,
    cat_features=categorical_features,
)

In [38]:
# CatBoost regressor trained on the training pool; the validation pool is passed
# as eval_set so the log reports held-out error next to the training error.
cat = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.1,
    random_strength=0.5,
    depth=7,
    random_state=42,
    verbose=0,
)
# verbose=100 given to fit() is what takes effect: progress is logged every 100 iterations.
cat.fit(train_pool, eval_set=valid_pool, verbose=100)

0:	learn: 1902.8477274	test: 1908.2532582	best: 1908.2532582 (0)	total: 126ms	remaining: 4m 11s
100:	learn: 954.0786071	test: 995.3750713	best: 995.3750713 (100)	total: 5.74s	remaining: 1m 48s
200:	learn: 920.3638352	test: 973.6022895	best: 973.6022895 (200)	total: 9.57s	remaining: 1m 25s
300:	learn: 899.0989304	test: 964.0609578	best: 964.0609578 (300)	total: 13.3s	remaining: 1m 14s
400:	learn: 881.5754056	test: 957.9296241	best: 957.9296241 (400)	total: 18.6s	remaining: 1m 14s
500:	learn: 866.6793321	test: 953.8000397	best: 953.7992079 (495)	total: 22.5s	remaining: 1m 7s
600:	learn: 850.2421502	test: 945.3988725	best: 945.3974747 (599)	total: 26.1s	remaining: 1m
700:	learn: 838.2654381	test: 942.2725537	best: 942.2648933 (699)	total: 31.9s	remaining: 59.1s
800:	learn: 825.6204636	test: 940.1407662	best: 939.9063751 (754)	total: 35.7s	remaining: 53.4s
900:	learn: 814.0643826	test: 937.1301014	best: 937.0321938 (893)	total: 39.5s	remaining: 48.2s
1000:	learn: 801.9319689	test: 935.5236

<catboost.core.CatBoostRegressor at 0x7da73c2ee470>

In [42]:
# Predict prices for the test pool, then floor negative predictions at zero.
# np.clip does this in one vectorised call and keeps the result a NumPy array.
preds = cat.predict(test_pool)
preds = np.clip(preds, 0, None)

### XGBoost

In [41]:
le = LabelEncoder()  # retained in case another cell still refers to this name

# Text columns are detected once, on the training split.
object_cols = list(x_train.dtypes[x_train.dtypes == "object"].index)

# One encoder per column, fitted a single time on the labels of both splits.
# Re-fitting a shared encoder on each frame separately can give the same
# category different integer codes in train and valid when their label sets differ.
label_encoders = {
    col: LabelEncoder().fit(pd.concat([x_train[col], x_valid[col]]).values)
    for col in object_cols
}


def encoding(df):
    """Replace the text columns of ``df`` with integer codes, in place.

    Uses the already-fitted per-column encoders in ``label_encoders``, so every
    frame passed through this function shares the same label-to-code mapping.
    ``LabelEncoder.transform`` raises ``ValueError`` for a label that was not
    present when the encoder was fitted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``object_cols``.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    for col in object_cols:
        df[col] = label_encoders[col].transform(df[col].values)


encoding(x_train)
encoding(x_valid)

# Submit

In [43]:
# Load the sample submission and set its 'answer' column to the predictions.
submission = (
    pd.read_csv('/content/sample_data/sample_submission.csv')
    .assign(answer=preds)
)
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,2359.980513
1,TG_A_J_20230305,55.365989
2,TG_A_J_20230306,2985.701723
3,TG_A_J_20230307,3334.941998
4,TG_A_J_20230308,3217.404176
...,...,...
1087,RD_F_J_20230327,681.912804
1088,RD_F_J_20230328,585.765137
1089,RD_F_J_20230329,627.716585
1090,RD_F_J_20230330,636.736585


In [None]:
# Save the submission to the working directory, leaving out the pandas row index.
submission.to_csv("submission.csv", index=False)