In [None]:
import jinja2
from pycaret.regression import *
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation / SettingWithCopy warnings that
# may point at real problems — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset loaded later in this
# notebook is read from a path under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import pickle

import xgboost
import catboost

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import plotly.express as px

from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_squared_error

# 1. Data

In [None]:
# Load the pre-processed (KNN-imputed, per the file name) dataset from Drive.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# come from a trusted source.
data_path = "/content/drive/MyDrive/날씨/data_자외선/전처리_knn/knn_imp(0616).pkl"
df = pd.read_pickle(data_path)


# Time Encoding(month, hour)
def encode(data, col, max_val):
    """Add cyclical sine/cosine encodings of a periodic column.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written into ``data``
    in place, holding sin and cos of ``2 * pi * data[col] / max_val``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``col``; it is modified in place.
    col : str
        Name of the column to encode.
    max_val : int or float
        Period used to scale the column onto one full turn of the circle.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the two new columns attached.
    """
    phase = 2 * np.pi * data[col] / max_val
    for suffix, transform in (('_sin', np.sin), ('_cos', np.cos)):
        data[col + suffix] = transform(phase)
    return data

# Cyclical month feature: dt.month yields 1..12, so the period is 12.
df['month'] = df["date_time"].dt.month
df = encode(df, 'month', 12)

# Cyclical hour feature: dt.hour yields 0..23, i.e. 24 distinct values, so the
# period must be 24. With a period of 23, hour 23 would land on exactly the
# same (sin, cos) point as hour 0 and the two would be indistinguishable.
df['hour'] = df["date_time"].dt.hour
df = encode(df, 'hour', 24)

# Drop unused features; the raw month/hour columns are removed because the
# sin/cos pairs created above replace them.
df = df.drop(columns=["sateza", "height", "landtype", "month", "hour"])

# Reorder columns: identifiers, target (uv), time encodings, location,
# the sixteen band columns, then the remaining columns.
df = df[['date_time','stn', 'uv', 'month_sin', 'month_cos', 'hour_sin', 'hour_cos', 'lon', 'lat', 
        'band1', 'band2', 'band3', 'band4', 'band5',
       'band6', 'band7', 'band8', 'band9', 'band10', 'band11', 'band12',
       'band13', 'band14', 'band15', 'band16', 'solarza', 'esr']]

# Train data: July-September of 2020 and of 2021 (end bound is exclusive).
df_train_1 = df.loc[(df["date_time"] >= "2020-07-01") & (df["date_time"] < "2020-10-01")]
df_train_2 = df.loc[(df["date_time"] >= "2021-07-01") & (df["date_time"] < "2021-10-01")]
df_train_789 = pd.concat([df_train_1, df_train_2], axis=0).reset_index(drop=True)

# Test data: August 2019 (held out; not part of the training frame).
# reset_index returns a fresh frame, so nothing is mutated in place on a
# slice of df.
df_test = df.loc[(df["date_time"] >= "2019-08-01") & (df["date_time"] < "2019-09-01")].reset_index(drop=True)

df_train_789.head()

# 2. PyCaret

In [None]:
# Initialise the PyCaret regression experiment on the training frame.
# 'uv' is the target; date_time and stn are excluded from the feature set.
# session_id fixes the random seed and train_size is the share of rows kept
# for training (the rest forms PyCaret's internal hold-out split).
reg = setup(
    data=df_train_789,
    target='uv',
    session_id=198,
    train_size=0.8,
    ignore_features=["date_time", "stn"],
)

In [None]:
# Rank the candidate regressors by RMSE and keep the three best as a list.
# cross_validation=False: k-fold CV is skipped for this comparison.
best = compare_models(
    sort='RMSE',
    n_select=3,
    cross_validation=False,
)

## ET, Catboost, RF

In [None]:
# Fit the model registered under the 'et' id, with k-fold CV turned off.
et = create_model(
    'et',
    cross_validation=False,
)

In [None]:
# Fit the model registered under the 'catboost' id, with k-fold CV turned off.
cat = create_model(
    'catboost',
    cross_validation=False,
)

In [None]:
# Fit the model registered under the 'rf' id, with k-fold CV turned off.
rf = create_model(
    'rf',
    cross_validation=False,
)

## Tune models

In [None]:
# Hyper-parameter search for each base model, using the same settings for all
# three: optimise RMSE, 5 search iterations, 4 folds.
# NOTE(review): in the visible part of this notebook the tuned_* models are
# not referenced by any later cell (the blender is built from `best`) —
# confirm whether that is intended.
tune_settings = dict(optimize='RMSE', n_iter=5, fold=4)

tuned_et = tune_model(et, **tune_settings)
tuned_cat = tune_model(cat, **tune_settings)
tuned_rf = tune_model(rf, **tune_settings)

## Blender

In [None]:
# Combine the models selected by compare_models (`best`) into one blended
# ensemble, evaluated with 5 folds.
blender = blend_models(
    estimator_list=best,
    optimize='RMSE',
    fold=5,
    verbose=True,
)

## Finalize & Predict

In [None]:
# Finalise the blended ensemble before scoring unseen data.
# (Per the PyCaret docs, finalize_model refits on the full setup data,
# hold-out split included.)
final_model = finalize_model(
    blender,
)

In [None]:
# Score the held-out test frame; predictions are read from the "Label" column.
pred = predict_model(final_model, data=df_test)

# Clamp negative predictions to zero.
is_negative = pred["Label"].lt(0)
pred.loc[is_negative, "Label"] = 0

pred

## Prediction graph

In [None]:
# Overlay the observed target (blue) and the model prediction (red,
# semi-transparent) against the timestamp.
fig, ax = plt.subplots()
timestamps = pred["date_time"]
ax.plot(timestamps, pred["uv"], color="blue", label='actual')
ax.plot(timestamps, pred["Label"], color="red", label='pred', alpha=0.7)
ax.legend()
plt.show()