In [582]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

In [583]:
train = pd.read_csv('./hackathon-online-data-science-mini-epidemiology/Train_set.csv')
test = pd.read_csv('./hackathon-online-data-science-mini-epidemiology/Test_set.csv')
sample_submission = pd.read_csv('./hackathon-online-data-science-mini-epidemiology/sample_submission.csv')

In [584]:
train.dtypes

_id           int64
Disease      object
Year          int64
PROVINCE     object
Week_no       int64
Count       float64
dtype: object

In [585]:
train

Unnamed: 0,_id,Disease,Year,PROVINCE,Week_no,Count
0,1,Hand Foot Mouth,2013,Bangkok,1,62.0
1,2,Hand Foot Mouth,2013,Bangkok,2,95.0
2,3,Hand Foot Mouth,2013,Bangkok,3,88.0
3,4,Hand Foot Mouth,2013,Bangkok,4,86.0
4,5,Hand Foot Mouth,2013,Bangkok,5,132.0
...,...,...,...,...,...,...
32181,32182,Hand Foot Mouth,2020,Narathiwat,48,2.0
32182,32183,Hand Foot Mouth,2020,Narathiwat,49,1.0
32183,32184,Hand Foot Mouth,2020,Narathiwat,50,2.0
32184,32185,Hand Foot Mouth,2020,Narathiwat,51,2.0


In [586]:
test

Unnamed: 0,_id,Disease,Year,PROVINCE,Week_no
0,32187,Hand Foot Mouth,2021,Bangkok,1
1,32188,Hand Foot Mouth,2021,Bangkok,2
2,32189,Hand Foot Mouth,2021,Bangkok,3
3,32190,Hand Foot Mouth,2021,Bangkok,4
4,32191,Hand Foot Mouth,2021,Bangkok,5
...,...,...,...,...,...
11198,43385,Hand Foot Mouth,2023,Narathiwat,37
11199,43386,Hand Foot Mouth,2023,Narathiwat,38
11200,43387,Hand Foot Mouth,2023,Narathiwat,39
11201,43388,Hand Foot Mouth,2023,Narathiwat,40


In [587]:
train = train.drop(columns=["_id","Disease"])
train['PROVINCE'] = train['PROVINCE'].astype('category')

In [588]:
test = test.drop(columns=["_id","Disease"])
test['PROVINCE'] = test['PROVINCE'].astype('category')

In [589]:
label_encoder = LabelEncoder()

In [590]:
def features_ex(df):
    df['Year_sin'] = np.sin(2 * np.pi * df['Year']%10)
    df['Year_cos'] = np.cos(2 * np.pi * df['Year']%10)
    df['Week_no_sin'] = np.sin(2 * np.pi * df['Week_no'] / 52)
    df['Week_no_cos'] = np.cos(2 * np.pi * df['Week_no'] / 52)
    df['some_categorical_column'] = label_encoder.fit_transform(df['PROVINCE'])
    df['interaction_feature'] = df['Year_sin'] * df['Week_no_sin']
    df = df.drop(columns=['Year', 'Week_no','PROVINCE'])
    return df

In [591]:
train = features_ex(train)
test = features_ex(test)

In [592]:
train

Unnamed: 0,Count,Year_sin,Year_cos,Week_no_sin,Week_no_cos,some_categorical_column,interaction_feature
0,62.0,0.980454,-0.196750,1.205367e-01,0.992709,2,1.181806e-01
1,95.0,0.980454,-0.196750,2.393157e-01,0.970942,2,2.346379e-01
2,88.0,0.980454,-0.196750,3.546049e-01,0.935016,2,3.476737e-01
3,86.0,0.980454,-0.196750,4.647232e-01,0.885456,2,4.556396e-01
4,132.0,0.980454,-0.196750,5.680647e-01,0.822984,2,5.569612e-01
...,...,...,...,...,...,...,...
32181,2.0,0.894482,-0.447103,-4.647232e-01,0.885456,32,-4.156867e-01
32182,1.0,0.894482,-0.447103,-3.546049e-01,0.935016,32,-3.171878e-01
32183,2.0,0.894482,-0.447103,-2.393157e-01,0.970942,32,-2.140636e-01
32184,2.0,0.894482,-0.447103,-1.205367e-01,0.992709,32,-1.078179e-01


In [593]:
test

Unnamed: 0,Year_sin,Year_cos,Week_no_sin,Week_no_cos,some_categorical_column,interaction_feature
0,0.894482,-0.447103,0.120537,9.927089e-01,2,0.107818
1,0.894482,-0.447103,0.239316,9.709418e-01,2,0.214064
2,0.894482,-0.447103,0.354605,9.350162e-01,2,0.317188
3,0.894482,-0.447103,0.464723,8.854560e-01,2,0.415687
4,0.894482,-0.447103,0.568065,8.229839e-01,2,0.508124
...,...,...,...,...,...,...
11198,0.773203,0.634159,-0.970942,-2.393157e-01,32,-0.750735
11199,0.773203,0.634159,-0.992709,-1.205367e-01,32,-0.767565
11200,0.773203,0.634159,-1.000000,-1.836970e-16,32,-0.773203
11201,0.773203,0.634159,-0.992709,1.205367e-01,32,-0.767565


# Data preparation

In [594]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(train.drop(columns=["Count"]), train["Count"], test_size=0.3, random_state=42)

In [595]:
import xgboost as xgb

In [596]:
xgboost = xgb.XGBRegressor(use_label_encoder=False,enable_categorical=True, eval_metric='logloss')
xgboost.fit(x_train, y_train)

In [597]:
pred = xgboost.predict(x_val)

In [598]:
pred = pd.DataFrame(pred)
pred = pd.DataFrame(pred[0])

In [599]:
pred

Unnamed: 0,0
0,2.164862
1,6.140656
2,6.486517
3,3.339989
4,4.985297
...,...
9651,102.055946
9652,70.001389
9653,8.205543
9654,29.070053


In [600]:
y_val

28942      1.0
12286      4.0
4819       9.0
17754      4.0
23656      8.0
         ...  
20221    120.0
4129      90.0
14946     11.0
2306      40.0
26190    142.0
Name: Count, Length: 9656, dtype: float64

In [601]:
from sklearn.metrics import mean_squared_error

In [602]:
rmse = mean_squared_error(y_val, pred, squared=False)
print(f'Root Mean Squared Error: {rmse:.2f}')

Root Mean Squared Error: 15.76


# Real pred

In [603]:
train

Unnamed: 0,Count,Year_sin,Year_cos,Week_no_sin,Week_no_cos,some_categorical_column,interaction_feature
0,62.0,0.980454,-0.196750,1.205367e-01,0.992709,2,1.181806e-01
1,95.0,0.980454,-0.196750,2.393157e-01,0.970942,2,2.346379e-01
2,88.0,0.980454,-0.196750,3.546049e-01,0.935016,2,3.476737e-01
3,86.0,0.980454,-0.196750,4.647232e-01,0.885456,2,4.556396e-01
4,132.0,0.980454,-0.196750,5.680647e-01,0.822984,2,5.569612e-01
...,...,...,...,...,...,...,...
32181,2.0,0.894482,-0.447103,-4.647232e-01,0.885456,32,-4.156867e-01
32182,1.0,0.894482,-0.447103,-3.546049e-01,0.935016,32,-3.171878e-01
32183,2.0,0.894482,-0.447103,-2.393157e-01,0.970942,32,-2.140636e-01
32184,2.0,0.894482,-0.447103,-1.205367e-01,0.992709,32,-1.078179e-01


In [604]:
test

Unnamed: 0,Year_sin,Year_cos,Week_no_sin,Week_no_cos,some_categorical_column,interaction_feature
0,0.894482,-0.447103,0.120537,9.927089e-01,2,0.107818
1,0.894482,-0.447103,0.239316,9.709418e-01,2,0.214064
2,0.894482,-0.447103,0.354605,9.350162e-01,2,0.317188
3,0.894482,-0.447103,0.464723,8.854560e-01,2,0.415687
4,0.894482,-0.447103,0.568065,8.229839e-01,2,0.508124
...,...,...,...,...,...,...
11198,0.773203,0.634159,-0.970942,-2.393157e-01,32,-0.750735
11199,0.773203,0.634159,-0.992709,-1.205367e-01,32,-0.767565
11200,0.773203,0.634159,-1.000000,-1.836970e-16,32,-0.773203
11201,0.773203,0.634159,-0.992709,1.205367e-01,32,-0.767565


In [605]:
xgboost = xgb.XGBRegressor(use_label_encoder=False,enable_categorical=True, eval_metric='logloss')
xgboost.fit(train.drop(columns=["Count"]), train["Count"])

In [606]:
len(test)

11203

In [607]:
pred = xgboost.predict(test)

In [608]:
len(pred)

11203

In [609]:
pred= pd.DataFrame(pred)
pred = pd.DataFrame(pred[0])

In [610]:
sample_submission

Unnamed: 0,_id,Pred
0,32187,32.0
1,32188,13.0
2,32189,17.0
3,32190,
4,32191,
...,...,...
11198,43385,
11199,43386,
11200,43387,
11201,43388,


In [611]:
sample_submission['Pred'] = pred


In [612]:
sample_submission

Unnamed: 0,_id,Pred
0,32187,46.315434
1,32188,67.366089
2,32189,75.186211
3,32190,78.993187
4,32191,77.431168
...,...,...
11198,43385,19.457445
11199,43386,19.912729
11200,43387,19.033653
11201,43388,17.545162


In [613]:
sample_submission.to_csv('sub2.csv',index=False)