In [None]:
import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from imblearn.pipeline import Pipeline
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [None]:
df_train = pd.read_csv('Train_set.csv')
df_test = pd.read_csv('Test_set.csv')
df = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
df

In [71]:
provinces_encode = {df['PROVINCE'].unique()[i]: i for i in range(len(df['PROVINCE'].unique()))}

def preprocess(df):
    df['Count'] = df['Count'].fillna(0)
    df['PROVINCE'] = df['PROVINCE'].astype('category')
    df['week'] = df['Week_no']

    min_year = df['Year'].min()
    df['all_week'] = (df['Year'] - min_year) * 52 + df['week']

    df['week_sin'] = np.sin(2 * np.pi * df['week'] / 52)
    df['week_cos'] = np.cos(2 * np.pi * df['week'] / 52)

    def assign_season(week):
        if week in range(1, 13):
            return 1 #'Winter'
        elif week in range(13, 26):
            return 2 #'Spring'
        elif week in range(26, 39):
            return 3 #'Summer'
        else:
            return 4 #'Fall'

    df['Season'] = df['Week_no'].apply(assign_season)
    df['Season'] = df['Season'].astype('category')
    
    # df['mean_year'] = df.groupby(['Year'])['Count'].transform('mean')
    # df['median_year'] = df.groupby(['Year'])['Count'].transform('median')
    # df['std_year'] = df.groupby(['Year'])['Count'].transform('std')
    # df['sum_year'] = df.groupby(['Year'])['Count'].transform('sum')
    
    df['mean_province'] = df.groupby(['PROVINCE'])['Count'].transform('mean')
    df['median_province'] = df.groupby(['PROVINCE'])['Count'].transform('median')
    df['std_province'] = df.groupby(['PROVINCE'])['Count'].transform('std')
    df['sum_province'] = df.groupby(['PROVINCE'])['Count'].transform('sum')

    df['mean_season'] = df.groupby(['Season'])['Count'].transform('mean')
    df['median_season'] = df.groupby(['Season'])['Count'].transform('median')
    df['std_season'] = df.groupby(['Season'])['Count'].transform('std')
    df['sum_season'] = df.groupby(['Season'])['Count'].transform('sum')

    df['mean_province_season'] = df.groupby(['PROVINCE', 'Season'])['Count'].transform('mean')
    df['median_province_season'] = df.groupby(['PROVINCE', 'Season'])['Count'].transform('median')
    df['std_province_season'] = df.groupby(['PROVINCE', 'Season'])['Count'].transform('std')
    df['sum_province_season'] = df.groupby(['PROVINCE', 'Season'])['Count'].transform('sum')

    # df['lag_1'] = df.groupby(['PROVINCE'])['Count'].shift(periods=1, fill_value=0)
    # df['lag_2'] = df.groupby(['PROVINCE'])['Count'].shift(periods=2, fill_value=0)
    # df['lag_3'] = df.groupby(['PROVINCE'])['Count'].shift(periods=3, fill_value=0)

    df = df.sort_values(by=['_id'])

    df = df.drop(['_id', 'Disease', 'Week_no'], axis=1)
    return df

In [70]:
processed_train = preprocess(df_train)
split_point = int(len(processed_train) * 0.8)
train = processed_train.iloc[:split_point]
val = processed_train.iloc[split_point:]
cat_features = ['PROVINCE', 'Season']

X_train = train.drop('Count', axis=1)
y_train = train['Count']
X_val = val.drop('Count', axis=1)
y_val = val['Count']

cat = CatBoostRegressor(learning_rate=0.01, eval_metric='RMSE')

cat.fit(X_train, y_train, 
        cat_features=cat_features,
        eval_set=(X_val, y_val),
        use_best_model=True)

y_pred = cat.predict(X_val)

rmse = root_mean_squared_error(y_val, y_pred)
print(f'RMSE: {rmse}')

  df['mean_province'] = df.groupby(['PROVINCE'])['Count'].transform('mean')
  df['median_province'] = df.groupby(['PROVINCE'])['Count'].transform('median')
  df['std_province'] = df.groupby(['PROVINCE'])['Count'].transform('std')
  df['sum_province'] = df.groupby(['PROVINCE'])['Count'].transform('sum')
  df['mean_season'] = df.groupby(['Season'])['Count'].transform('mean')
  df['median_season'] = df.groupby(['Season'])['Count'].transform('median')
  df['std_season'] = df.groupby(['Season'])['Count'].transform('std')
  df['sum_season'] = df.groupby(['Season'])['Count'].transform('sum')
  df['mean_province_season'] = df.groupby(['PROVINCE', 'Season'])['Count'].transform('mean')
  df['median_province_season'] = df.groupby(['PROVINCE', 'Season'])['Count'].transform('median')
  df['std_province_season'] = df.groupby(['PROVINCE', 'Season'])['Count'].transform('std')
  df['sum_province_season'] = df.groupby(['PROVINCE', 'Season'])['Count'].transform('sum')


0:	learn: 29.6693217	test: 44.3612038	best: 44.3612038 (0)	total: 34.2ms	remaining: 34.1s
1:	learn: 29.4939554	test: 44.3405365	best: 44.3405365 (1)	total: 64.9ms	remaining: 32.4s
2:	learn: 29.3194785	test: 44.3170725	best: 44.3170725 (2)	total: 97.2ms	remaining: 32.3s
3:	learn: 29.1488175	test: 44.2985783	best: 44.2985783 (3)	total: 127ms	remaining: 31.6s
4:	learn: 28.9835239	test: 44.2813762	best: 44.2813762 (4)	total: 165ms	remaining: 32.8s
5:	learn: 28.8175223	test: 44.2585941	best: 44.2585941 (5)	total: 200ms	remaining: 33.1s
6:	learn: 28.6616844	test: 44.2517040	best: 44.2517040 (6)	total: 231ms	remaining: 32.8s
7:	learn: 28.4965035	test: 44.2359570	best: 44.2359570 (7)	total: 265ms	remaining: 32.9s
8:	learn: 28.3397914	test: 44.2384928	best: 44.2359570 (7)	total: 298ms	remaining: 32.8s
9:	learn: 28.1862312	test: 44.2214343	best: 44.2214343 (9)	total: 329ms	remaining: 32.5s
10:	learn: 28.0252189	test: 44.2016442	best: 44.2016442 (10)	total: 360ms	remaining: 32.3s
11:	learn: 27.86

In [79]:
feature_importances = cat.get_feature_importance()
feature_names = X_train.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df.sort_values(by='Importance', ascending=False, inplace=True)
print(importance_df)

                   Feature  Importance
3                 all_week   32.932574
5                 week_cos   11.828212
15    mean_province_season   10.547422
17     std_province_season    9.307219
0                     Year    8.436407
18     sum_province_season    8.380628
16  median_province_season    7.292492
4                 week_sin    1.748659
2                     week    1.660806
8          median_province    1.493391
9             std_province    1.423664
10            sum_province    1.334780
7            mean_province    1.261273
13              std_season    0.539383
14              sum_season    0.475257
12           median_season    0.431993
11             mean_season    0.382038
6                   Season    0.333520
1                 PROVINCE    0.190283


In [75]:
processed_df = preprocess(df.copy())
train = processed_df[:len(df_train)]
X = train.drop('Count', axis=1)
y = train['Count']
cat_features = ['PROVINCE', 'Season']

cat = CatBoostRegressor(iterations=1000, learning_rate=0.01, eval_metric='RMSE')

cat.fit(X, y, cat_features=cat_features)

  df['mean_province'] = df.groupby(['PROVINCE'])['Count'].transform('mean')
  df['median_province'] = df.groupby(['PROVINCE'])['Count'].transform('median')
  df['std_province'] = df.groupby(['PROVINCE'])['Count'].transform('std')
  df['sum_province'] = df.groupby(['PROVINCE'])['Count'].transform('sum')
  df['mean_season'] = df.groupby(['Season'])['Count'].transform('mean')
  df['median_season'] = df.groupby(['Season'])['Count'].transform('median')
  df['std_season'] = df.groupby(['Season'])['Count'].transform('std')
  df['sum_season'] = df.groupby(['Season'])['Count'].transform('sum')
  df['mean_province_season'] = df.groupby(['PROVINCE', 'Season'])['Count'].transform('mean')
  df['median_province_season'] = df.groupby(['PROVINCE', 'Season'])['Count'].transform('median')
  df['std_province_season'] = df.groupby(['PROVINCE', 'Season'])['Count'].transform('std')
  df['sum_province_season'] = df.groupby(['PROVINCE', 'Season'])['Count'].transform('sum')


0:	learn: 33.0818596	total: 31.4ms	remaining: 31.4s
1:	learn: 32.9104246	total: 62.2ms	remaining: 31.1s
2:	learn: 32.7385017	total: 92.6ms	remaining: 30.8s
3:	learn: 32.5792023	total: 123ms	remaining: 30.5s
4:	learn: 32.4133719	total: 154ms	remaining: 30.6s
5:	learn: 32.2534193	total: 189ms	remaining: 31.2s
6:	learn: 32.0879439	total: 224ms	remaining: 31.8s
7:	learn: 31.9297364	total: 259ms	remaining: 32.1s
8:	learn: 31.7709411	total: 291ms	remaining: 32.1s
9:	learn: 31.6122892	total: 322ms	remaining: 31.9s
10:	learn: 31.4581782	total: 353ms	remaining: 31.7s
11:	learn: 31.3028107	total: 384ms	remaining: 31.6s
12:	learn: 31.1540575	total: 417ms	remaining: 31.7s
13:	learn: 31.0053348	total: 445ms	remaining: 31.4s
14:	learn: 30.8606595	total: 477ms	remaining: 31.3s
15:	learn: 30.7159266	total: 512ms	remaining: 31.5s
16:	learn: 30.5706673	total: 543ms	remaining: 31.4s
17:	learn: 30.4271260	total: 574ms	remaining: 31.3s
18:	learn: 30.2842592	total: 605ms	remaining: 31.2s
19:	learn: 30.14141

<catboost.core.CatBoostRegressor at 0x2bff385f8b0>

In [76]:
test = processed_df[len(df_train):]
test = test.drop('Count', axis=1)

y_pred = cat.predict(test).astype(int)

In [77]:
submission = pd.read_csv('sample_submission.csv')
submission = submission.drop(['Count'], axis=1)
submission['Pred'] = y_pred.flatten()
submission

Unnamed: 0,_id,Pred
0,40195,85
1,40196,88
2,40197,88
3,40198,88
4,40199,88
...,...,...
3190,43385,33
3191,43386,30
3192,43387,27
3193,43388,17


In [78]:
submission.to_csv('submission.csv', index=False)

In [80]:
!kaggle competitions submit -c epidemiology-2 -f submission.csv -m "Catboost"

Successfully submitted to Hackathon Online: Data Science and Big Data



  0%|          | 0.00/31.3k [00:00<?, ?B/s]
100%|██████████| 31.3k/31.3k [00:00<00:00, 35.4kB/s]
