In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the train, test, and sample-submission CSVs from the working directory.
train, test, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [35]:
# Read the international trade table and preview its first five rows.
trade = pd.read_csv('international_trade.csv')
trade.head(n=5)

Unnamed: 0,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
0,2019-01,토마토(신선한 것이나 냉장한 것으로 한정한다),356571,990,0,0,990
1,2019-01,양파,821330,222,4003206,1118,-896
2,2019-01,쪽파,60,1,93405,128,-127
3,2019-01,꽃양배추와 브로콜리(broccoli),160,1,638913,563,-562
4,2019-01,방울다다기 양배추,0,0,7580,38,-38


In [36]:
# Preview the first five rows of the training table.
train.head(n=5)

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0


```python
data['ID'] = data['ID'].str.replace(r'_\d{8}$', '', regex=True)
```

1. `data['ID']`: DataFrame인 `data`에서 'ID' 열을 선택합니다.

2. `.str.replace()`: 선택된 'ID' 열의 각 값에 대해 문자열을 다른 문자열로 대체하는 함수입니다.

3. `r'_\d{8}$'`: 이는 정규 표현식(Regular Expression)을 사용하여 문자열에서 특정 패턴을 찾는 역할을 합니다.
   - `_`: 언더스코어를 나타냅니다.
   - `\d{8}`: 8개의 숫자(digit)를 나타냅니다. 
   - `$`: 문자열의 끝을 나타냅니다. 이것은 열 값이 패턴으로 끝나야 함을 의미합니다.

   따라서 이 정규 표현식은 'ID' 열의 값 중에서 뒤에 언더스코어와 8자리 숫자가 나오는 패턴을 찾습니다.

4. `''`: 찾은 패턴을 빈 문자열로 대체합니다. 이는 해당 패턴을 삭제하라는 의미입니다.

5. `regex=True`: 첫 번째 인자를 일반 문자열이 아닌 정규 표현식으로 해석하도록 지정합니다. pandas 2.0부터 `str.replace()`의 기본값은 `regex=False`이므로, 정규 표현식을 사용하려면 이 옵션을 명시적으로 True로 설정해야 합니다.

결과적으로, 'ID' 열에서 패턴에 해당하는 부분(언더스코어와 8자리 숫자)을 빈 문자열로 대체하여 삭제하게 됩니다. 이를 통해 'ID' 열의 값이 변경되며, 특정 패턴을 가진 부분이 삭제된 문자열이 됩니다.

In [5]:
# Keep the series ID, the date, and the price, renaming the latter two to the
# 'ds' / 'y' column names that Prophet expects.
data = train[['ID', 'timestamp', 'price(원/kg)']].rename(
    columns={'timestamp': 'ds', 'price(원/kg)': 'y'}
)
# Strip the trailing underscore + 8 digits so all rows of a series share one ID.
data['ID'] = data['ID'].str.replace(r'_\d{8}$', '', regex=True)
data

Unnamed: 0,ID,ds,y
0,TG_A_J,2019-01-01,0.0
1,TG_A_J,2019-01-02,0.0
2,TG_A_J,2019-01-03,1728.0
3,TG_A_J,2019-01-04,1408.0
4,TG_A_J,2019-01-05,1250.0
...,...,...,...
59392,RD_F_J,2023-02-27,468.0
59393,RD_F_J,2023-02-28,531.0
59394,RD_F_J,2023-03-01,574.0
59395,RD_F_J,2023-03-02,523.0


## supply 0인 것 제거

In [67]:
# Reload the raw training table, then keep only the rows whose supply is non-zero.
train = pd.read_csv('train.csv')
train_notzero = train.loc[train['supply(kg)'].ne(0)]

In [68]:
# Reshape the non-zero-supply rows into ID / ds / y form for Prophet.
data_notzero = train_notzero[['ID', 'timestamp', 'price(원/kg)']].rename(
    columns={'timestamp': 'ds', 'price(원/kg)': 'y'}
)
# Strip the trailing underscore + 8 digits so all rows of a series share one ID.
data_notzero['ID'] = data_notzero['ID'].str.replace(r'_\d{8}$', '', regex=True)
# `df` is a second name bound to the same frame, not a copy.
df = data_notzero
df.shape

(23945, 3)

# Prophet

In [53]:
from prophet import Prophet

In [55]:
# Display the non-zero-supply frame (ID, ds, y) built above.
df

Unnamed: 0,ID,ds,y
2,TG_A_J,2019-01-03,1728.0
3,TG_A_J,2019-01-04,1408.0
4,TG_A_J,2019-01-05,1250.0
6,TG_A_J,2019-01-07,1474.0
7,TG_A_J,2019-01-08,1326.0
...,...,...,...
59392,RD_F_J,2023-02-27,468.0
59393,RD_F_J,2023-02-28,531.0
59394,RD_F_J,2023-03-01,574.0
59395,RD_F_J,2023-03-02,523.0


In [78]:
# Keep only the 'ds' and 'y' columns. This rebinds `df` to a new frame without
# 'ID'; `data_notzero` itself still carries the 'ID' column.
df = df.loc[:, ['ds', 'y']]

In [86]:
# Display `df` after the column selection to confirm only 'ds' and 'y' remain.
df

Unnamed: 0,ds,y
2,2019-01-03,1728.0
3,2019-01-04,1408.0
4,2019-01-05,1250.0
6,2019-01-07,1474.0
7,2019-01-08,1326.0
...,...,...
59392,2023-02-27,468.0
59393,2023-02-28,531.0
59394,2023-03-01,574.0
59395,2023-03-02,523.0


In [106]:
# Fixed seed for the notebook, applied to NumPy's global random generator.
RANDOM_SEED = 990_313
np.random.seed(seed=RANDOM_SEED)
def ph_train(df, start='2023-03-04', periods=28):
    """Fit one Prophet model per series ID and forecast a daily horizon.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format history with an 'ID' column naming the series, plus the
        'ds' (date) and 'y' (target) columns that Prophet is fitted on.
    start : str or datetime-like, default '2023-03-04'
        First date of the forecast window.
    periods : int, default 28
        Number of consecutive daily steps to forecast.

    Returns
    -------
    pandas.DataFrame
        Columns 'ID' and 'y'. For each series, in order of first appearance
        in ``df``, ``periods`` rows holding the series ID and Prophet's
        'yhat' point forecast. Empty input yields an empty frame that still
        has both columns.
    """
    # The forecast window is identical for every series, so build it once.
    future = pd.DataFrame(
        {'ds': pd.date_range(start=start, periods=periods, freq='D')}
    )

    forecasts = []
    for code in df['ID'].unique():
        # Hand Prophet only the columns it fits on; drop=True keeps the old
        # row index from leaking in as an extra 'index' column.
        history = (
            df.loc[df['ID'] == code, ['ds', 'y']]
            .sort_values('ds')
            .reset_index(drop=True)
        )
        model = Prophet()
        model.fit(history)
        forecast = model.predict(future)
        # NOTE(review): 'yhat' is used as-is, so negative forecasts are
        # possible -- confirm whether they should be clipped at zero.
        forecasts.append(
            pd.DataFrame({'ID': str(code), 'y': forecast['yhat'].to_numpy()})
        )

    if not forecasts:
        # Keep the output schema stable so callers can still read pred['y'].
        return pd.DataFrame(columns=['ID', 'y'])
    return pd.concat(forecasts, ignore_index=True)
# Fit every series in the non-zero-supply data and collect the forecasts.
pred = ph_train(data_notzero)

# `submit` stays the bare forecast Series because later cells read it.
submit = pred['y']

# The template is filled by position, so verify that the forecasts line up
# with its rows (same length, same series order) instead of misaligning silently.
template_series = submission_df['ID'].str.replace(r'_\d{8}$', '', regex=True)
if len(pred) != len(submission_df) or (
    template_series.to_numpy() != pred['ID'].to_numpy()
).any():
    raise ValueError('forecast rows do not line up with sample_submission IDs')

# Write ID and answer together so the file matches the sample submission layout.
submission_df['answer'] = submit.to_numpy()
submission_df.to_csv('prophet.csv', index=False)

16:37:58 - cmdstanpy - INFO - Chain [1] start processing
16:37:58 - cmdstanpy - INFO - Chain [1] done processing
16:37:58 - cmdstanpy - INFO - Chain [1] start processing
16:37:58 - cmdstanpy - INFO - Chain [1] done processing
16:37:59 - cmdstanpy - INFO - Chain [1] start processing
16:37:59 - cmdstanpy - INFO - Chain [1] done processing
16:37:59 - cmdstanpy - INFO - Chain [1] start processing
16:37:59 - cmdstanpy - INFO - Chain [1] done processing
16:38:00 - cmdstanpy - INFO - Chain [1] start processing
16:38:00 - cmdstanpy - INFO - Chain [1] done processing
16:38:00 - cmdstanpy - INFO - Chain [1] start processing
16:38:00 - cmdstanpy - INFO - Chain [1] done processing
16:38:00 - cmdstanpy - INFO - Chain [1] start processing
16:38:00 - cmdstanpy - INFO - Chain [1] done processing
16:38:01 - cmdstanpy - INFO - Chain [1] start processing
16:38:01 - cmdstanpy - INFO - Chain [1] done processing
16:38:01 - cmdstanpy - INFO - Chain [1] start processing
16:38:01 - cmdstanpy - INFO - Chain [1]

In [107]:
# Display the forecast values held in `submit`.
submit

0       3471.160396
1       3098.873864
2       3713.318519
3       3657.520042
4       3840.821449
           ...     
1087     400.722775
1088     407.157066
1089     395.884027
1090     386.354473
1091     383.290957
Name: y, Length: 1092, dtype: float64

In [130]:
# Reload a fresh copy of the submission template and print it as plain text.
f_submit = pd.read_csv('sample_submission.csv')
print(f_submit)

                   ID  answer
0     TG_A_J_20230304       0
1     TG_A_J_20230305       0
2     TG_A_J_20230306       0
3     TG_A_J_20230307       0
4     TG_A_J_20230308       0
...               ...     ...
1087  RD_F_J_20230327       0
1088  RD_F_J_20230328       0
1089  RD_F_J_20230329       0
1090  RD_F_J_20230330       0
1091  RD_F_J_20230331       0

[1092 rows x 2 columns]


In [131]:
# Fill the 'answer' column by name with the forecasts (aligned on the row
# index). Naming the column avoids depending on its position and replaces the
# integer placeholder column outright rather than writing floats into it.
f_submit['answer'] = submit
print(f_submit)

                   ID       answer
0     TG_A_J_20230304  3471.160396
1     TG_A_J_20230305  3098.873864
2     TG_A_J_20230306  3713.318519
3     TG_A_J_20230307  3657.520042
4     TG_A_J_20230308  3840.821449
...               ...          ...
1087  RD_F_J_20230327   400.722775
1088  RD_F_J_20230328   407.157066
1089  RD_F_J_20230329   395.884027
1090  RD_F_J_20230330   386.354473
1091  RD_F_J_20230331   383.290957

[1092 rows x 2 columns]


In [132]:
# Save the filled submission table as a UTF-8 CSV without the row index.
f_submit.to_csv('file_utf8.csv', index=False, encoding='utf-8')