## Import

In [None]:
# Mount Google Drive inside the Colab runtime so the competition files
# under /content/drive are readable by the cells below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import random
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Install a global "ignore" filter so no warning is shown from this point on.
# NOTE(review): this also hides deprecation and data-quality warnings raised
# by the libraries used below; consider narrowing the filter to specific
# categories once the notebook is stable.
warnings.filterwarnings(action='ignore')

## Fix the Random Seed

In [None]:
SEED = 42


def seed_everything(seed):
    """Seed the random-number sources used by this notebook.

    Seeds Python's ``random`` module, stores ``str(seed)`` in the
    ``PYTHONHASHSEED`` environment variable, and seeds NumPy's global
    (legacy) random state.

    Args:
        seed: Seed value handed to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    # Stored as a string because environment variables only hold strings.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)


seed_everything(SEED)  # fix the seed for the whole notebook

## Load Data

In [None]:
# Read the three competition CSV files from the mounted Drive folder.
DATA_DIR = '/content/drive/MyDrive/competition/Jeju Prediction'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
inter_df = pd.read_csv(f'{DATA_DIR}/international_trade.csv')

In [None]:
# Show the training frame loaded from train.csv as this cell's output.
train_df

In [None]:
# Show the frame loaded from international_trade.csv as this cell's output.
inter_df

In [None]:
# Show the test frame loaded from test.csv as this cell's output.
test_df

## **Heatmap Visualization**

In [None]:
# Calculate the correlation matrix over the numeric columns only.
# At this point the frame still holds non-numeric columns (the categorical
# columns are label-encoded and the timestamp is parsed only in later cells).
# Calling DataFrame.corr() on such a frame raises in pandas >= 2.0, so the
# numeric columns are selected explicitly first.
numeric_train_df = train_df.select_dtypes(include='number')
corr_matrix = numeric_train_df.corr()

# Create a heatmap; the colour scale is pinned to [-1, 1] so that 0 sits at
# the centre of the diverging palette.
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1)
plt.title('Heatmap of Feature Correlations')
plt.show()

## Data Pre-Processing

In [None]:
# 1. Handling Missing Values
# Count how many missing (null) entries each column of the training data has.

missing_values = train_df.isna().sum()
print("Missing values:\n", missing_values)

In [None]:
# 2. Converting Data Types
# Parse the 'timestamp' column into pandas datetimes so that the `.dt`
# accessor can be used on it later.
train_df = train_df.assign(timestamp=pd.to_datetime(train_df['timestamp']))

In [None]:
# 3. Encoding Categorical Variables
# Each categorical column is encoded to integer codes. The encoder is fitted
# on the train and test values together so that both frames share the same
# mapping and every value seen in either frame is known to the encoder.
# The single encoder instance is re-fitted for every column, so after the
# loop it only holds the mapping of the last column.

label_encoder = LabelEncoder()
for column in ('item', 'corporation', 'location'):
    combined = pd.concat([train_df[column], test_df[column]], axis=0)
    label_encoder.fit(combined)
    for frame in (train_df, test_df):
        frame[column] = label_encoder.transform(frame[column])

In [None]:
# 4. Feature Engineering
# Derive calendar features from the parsed timestamp and remove the
# 'supply(kg)' column from the training frame.
# NOTE(review): presumably 'supply(kg)' is dropped because it is not
# available at prediction time; confirm against test.csv.

timestamp_parts = train_df['timestamp'].dt
train_df = train_df.assign(
    year=timestamp_parts.year,
    month=timestamp_parts.month,
    day=timestamp_parts.day,
).drop(columns=['supply(kg)'])

In [None]:
# 5. Splitting Data into Training and Validation Sets
# Hold out 20% of the rows for validation; random_state makes the split
# reproducible.
train_data, valid_data = train_test_split(train_df, random_state=42, test_size=0.2)
for split_name, split_frame in (("Training", train_data), ("Validation", valid_data)):
    print(f"{split_name} Data Shape:", split_frame.shape)

In [None]:
# Initialize the QuantileTransformer.
# It maps each feature onto a uniform output distribution; random_state is
# fixed for reproducibility.
quantile_transformer = QuantileTransformer(
    random_state=42,
    output_distribution='uniform',
)

In [None]:
# Columns to be transformed: the calendar features derived from the timestamp.
columns_to_transform = [
    'year',
    'month',
    'day',
]

In [None]:
# Apply the quantile transformation.
#
# `train_data` and `valid_data` were copied out of `train_df` by
# train_test_split in an earlier cell, so transforming `train_df` alone would
# leave the frames the model is trained on untouched while the test frame is
# transformed at inference time. That would be a train/test feature mismatch.
# The transformer is therefore fitted on the training split only (the
# validation rows do not influence the fitted quantiles) and then applied to
# every frame derived from the training data.
train_data = train_data.copy()
valid_data = valid_data.copy()

quantile_transformer.fit(train_data[columns_to_transform])
for frame in (train_df, train_data, valid_data):
    frame[columns_to_transform] = quantile_transformer.transform(frame[columns_to_transform])

In [None]:
# Show the first few rows of the preprocessed and transformed data.
# The frame is left as the cell's last expression so Jupyter renders it with
# its rich table display instead of the plain-text output of print().
train_df.head()

## **PyCaret**

In [None]:
!pip install --upgrade scipy
!pip install --upgrade pycaret

In [None]:
from pycaret.regression import *

In [None]:
# Initialize the PyCaret experiment with the validation set made earlier.
# `valid_data` (held out by train_test_split) is passed as `test_data`, so
# PyCaret uses it as the hold-out set and does not split `train_data` a second
# time. Without this, `train_size` would split `train_data` again and
# `valid_data` would never be used.
regression_setup = setup(
    data=train_data,
    test_data=valid_data,      # hold-out set; no further internal split
    target='price(원/kg)',
    session_id=42,             # fixed seed for reproducibility
    normalize=True,
    transformation=True,
    transform_target=True,
)

# Create and train the Extra Trees Regressor model
et_model = create_model('et')

# Compare models to find the best one
# best_model = compare_models(sort='RMSE')  # Sorting by RMSE

# Show the best model
# print(best_model)

## Inference

In [None]:
# Preprocessing test data
# Parse the test 'timestamp' column into pandas datetimes, the same
# conversion that was applied to the training frame.
test_df = test_df.assign(timestamp=pd.to_datetime(test_df['timestamp']))

In [None]:
# Derive the same calendar features as for the training data, then apply the
# already-fitted quantile transformer (transform only, no re-fitting).
test_timestamp_parts = test_df['timestamp'].dt
test_df = test_df.assign(
    year=test_timestamp_parts.year,
    month=test_timestamp_parts.month,
    day=test_timestamp_parts.day,
)
test_df[columns_to_transform] = quantile_transformer.transform(test_df[columns_to_transform])

In [None]:
# Use the test set for final testing.
# predict_model returns a frame; the model output is read from its
# 'prediction_label' column.
test_predictions = predict_model(et_model, data=test_df)
preds = test_predictions['prediction_label']

## Submission

In [None]:
# Load the sample submission file, which serves as the template that the
# predictions are written into.
sample_submission_path = '/content/drive/MyDrive/competition/Jeju Prediction/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission

In [None]:
# Write the predictions into the 'answer' column by position.
# Assigning a pandas Series aligns on index labels, so any mismatch between
# the prediction index and the submission index would silently produce NaN
# answers. Converting to a plain array assigns row by row instead, and pandas
# raises a ValueError if the number of predictions does not match the number
# of submission rows.
submission['answer'] = np.asarray(preds)
submission

In [None]:
# Save the filled-in submission to Drive; index=False keeps the DataFrame
# index out of the CSV.
baseline_submission_path = '/content/drive/MyDrive/competition/Jeju Prediction/baseline_submission.csv'
submission.to_csv(baseline_submission_path, index=False)