## Import

In [None]:
# Mount Google Drive into the Colab runtime; every data path used later in this
# notebook lives under /content/drive/MyDrive.
# NOTE(review): this import only exists on Google Colab, so the notebook will not
# run unmodified elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import random
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Silence every warning category for the rest of the notebook.
# NOTE(review): a blanket 'ignore' also hides deprecation/future warnings from
# pandas and scikit-learn (e.g. about arguments scheduled for removal) — consider
# narrowing the filter to specific categories or modules.
warnings.filterwarnings(action='ignore')

## Fixed Random-Seed

In [None]:
def seed_everything(seed):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Integer seed applied to every generator.

    Note:
        ``PYTHONHASHSEED`` is read at interpreter start-up, so assigning it here
        is inherited by child processes only; it does not change hash
        randomisation of the already-running kernel.
    """
    random.seed(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)

seed_everything(42)  # fix the seed for reproducibility

## Load Data

In [None]:
# Folder on the mounted Drive that holds the competition files. Keeping it in one
# place means a relocated dataset needs a single edit instead of one per path.
DATA_DIR = '/content/drive/MyDrive/competition/Jeju Prediction'

# Training split, test split, and the auxiliary international-trade table.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
inter_df = pd.read_csv(os.path.join(DATA_DIR, 'international_trade.csv'))

In [None]:
# Preview the raw training frame (a bare last expression is rendered by the notebook).
train_df

In [None]:
# Preview the international-trade table.
# NOTE(review): in the visible cells `inter_df` is only loaded and displayed; it is
# never joined into the features — confirm whether that is intentional.
inter_df

In [None]:
# Preview the raw test frame (a bare last expression is rendered by the notebook).
test_df

## **Heatmap Visualization**

In [None]:
# Calculate the correlation matrix.
# At this point train_df still holds its raw, un-encoded columns (ID, timestamp,
# item, corporation, location are only converted further down). Since pandas 2.0
# `DataFrame.corr` no longer drops non-numeric columns silently and raises on
# them, so restrict the computation to numeric columns explicitly.
corr_matrix = train_df.corr(numeric_only=True)

# Create a heatmap; the colour scale is pinned to [-1, 1] so 0 sits at the
# midpoint of the diverging palette.
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1)
plt.title('Heatmap of Feature Correlations')
plt.show()

## **Box Plot Visualization**

In [None]:
# Side-by-side boxplots of the two numeric columns to eyeball outliers
plt.figure(figsize=(15, 5))

# (column to plot, panel title) — one entry per subplot, left to right
boxplot_panels = [
    ('supply(kg)', 'Boxplot for supply(kg)'),
    ('price(원/kg)', 'Boxplot for price(원/kg)'),
]

for slot, (column_name, panel_title) in enumerate(boxplot_panels, start=1):
    plt.subplot(1, 2, slot)
    sns.boxplot(x=train_df[column_name])
    plt.title(panel_title)

plt.show()

## **Checking Outliers Using Z-Score**

In [None]:
from scipy import stats

# Absolute Z-score above which a row is counted as an outlier
threshold = 3

# Add the absolute Z-score of each numeric column to train_df as a helper column
# (these helper columns are consumed and dropped by the outlier-removal cell).
zscore_sources = [('supply_z', 'supply(kg)'), ('price_z', 'price(원/kg)')]
for z_column, source_column in zscore_sources:
    train_df[z_column] = np.abs(stats.zscore(train_df[source_column]))

# Rows exceeding the threshold, per column
outliers_supply = train_df[train_df['supply_z'] > threshold]
outliers_price = train_df[train_df['price_z'] > threshold]

# Report how many rows each check flags
print(f"Number of outliers in supply(kg): {outliers_supply.shape[0]}")
print(f"Number of outliers in price(원/kg): {outliers_price.shape[0]}")

# Histogram of each Z-score column with the cut-off drawn as a dashed red line
plt.figure(figsize=(15, 5))

zscore_panels = [
    ('supply_z', 'Z-score Distribution for supply(kg)'),
    ('price_z', 'Z-score Distribution for price(원/kg)'),
]
for slot, (z_column, panel_title) in enumerate(zscore_panels, start=1):
    plt.subplot(1, 2, slot)
    sns.histplot(train_df[z_column], kde=False, bins=50)
    plt.title(panel_title)
    plt.axvline(x=threshold, color='r', linestyle='--')

plt.show()


## Data Pre-Processing

In [None]:
# 1. Handling Missing Values
# Count, for every column, how many entries are missing (null).

missing_values = train_df.isna().sum()
print("Missing values:\n", missing_values)

In [None]:
# 2. Removing Outliers
# Same cut-off as in the Z-score inspection cell
threshold = 3

# Keep a row only when BOTH helper Z-score columns are within the threshold
# (rows whose Z-score is NaN compare False and are dropped as well).
within_threshold = train_df[['supply_z', 'price_z']].le(threshold).all(axis=1)
train_df = train_df[within_threshold]

# Shape is reported while the two helper columns are still present
print(f"Cleaned DataFrame shape: {train_df.shape}")

# The Z-score columns were only needed for outlier detection; remove them.
# NOTE(review): this makes the cell non-rerunnable — a second run fails because
# 'supply_z' / 'price_z' no longer exist.
train_df = train_df.drop(columns=['supply_z', 'price_z'])

In [None]:
# 3. Converting Data Types
# Parse the timestamp column into datetimes so the `.dt` accessor can be used later.
train_df = train_df.assign(timestamp=pd.to_datetime(train_df['timestamp']))

In [None]:
# 4. Encoding Categorical Variables
# One encoder object is re-fitted per column on the union of train and test values,
# so both frames share the same integer codes and test-only categories are known.
# After the loop the encoder only remembers the classes of the last column.

label_encoder = LabelEncoder()
categorical_columns = ('item', 'corporation', 'location')
for name in categorical_columns:
    label_encoder.fit(pd.concat([train_df[name], test_df[name]], axis=0))
    for frame in (train_df, test_df):
        frame[name] = label_encoder.transform(frame[name])

In [None]:
# 5. Feature Engineering
# Expand the timestamp into year / month / day columns (appended in that order),
# then discard the identifier, the raw timestamp and the supply column.
# NOTE(review): 'supply(kg)' is presumably dropped because the test frame has no
# such column — confirm.

stamp = train_df['timestamp'].dt
train_df = (
    train_df
    .assign(year=stamp.year, month=stamp.month, day=stamp.day)
    .drop(columns=['ID', 'timestamp', 'supply(kg)'])
)

In [None]:
# 6. Splitting Data into Training and Validation Sets
# Random 80/20 split with a fixed seed.
# NOTE(review): the rows carry dates; a random split may leak future information
# into training compared with a chronological split — confirm this is acceptable.
split_frames = train_test_split(train_df, test_size=0.2, random_state=42)
train_data, valid_data = split_frames

for label, frame in (("Training Data Shape:", train_data),
                     ("Validation Data Shape:", valid_data)):
    print(label, frame.shape)

In [None]:
# Show the first few rows of the preprocessed and transformed data
# (`.head()` keeps the output to what the comment promises instead of the whole frame).
train_df.head()

## **Pycaret**

In [None]:
# Install/upgrade the packages needed for the PyCaret section.
# NOTE(review): scipy was already imported by an earlier cell (`from scipy import stats`),
# so an upgraded build presumably only takes effect after a runtime restart — confirm.
# NOTE(review): versions are unpinned, so results may change between runs; consider
# pinning them and moving the installs to the top of the notebook (IPython's `%pip`
# magic targets the running kernel's environment).
!pip install --upgrade scipy
!pip install --upgrade pycaret

In [None]:
from pycaret.regression import *

In [None]:
# Initialize the setup with the validation set created in the split cell.
# `test_data=valid_data` makes PyCaret use that split as its hold-out set; without
# it PyCaret carves a second hold-out out of `train_data` and `valid_data` is
# never used, contradicting the intent stated in these comments.
regression_setup = setup(data=train_data,
                         test_data=valid_data,
                         target='price(원/kg)',
                         session_id=42,            # fixed seed for reproducible runs
                         normalize=True,
                         transformation=True,
                         transform_target=True,
                         data_split_shuffle=False) # No additional shuffling, as we've already split the data

# Compare the candidate models and keep the best one, ranked by RMSE
best_model = compare_models(sort='RMSE')  # Sorting by RMSE

# Show the best model
print(best_model)

## **Light Gradient Boosting Machine Model**

In [None]:
# Install/upgrade LightGBM for the manual model below.
# NOTE(review): the version is unpinned; the training cell relies on the
# callback-based early stopping API (`lgb.early_stopping`), so consider pinning a
# version known to provide it.
!pip install --upgrade lightgbm

In [None]:
import lightgbm as lgb

In [None]:
# train_data / valid_data come from the split cell and still contain the target;
# separate the feature matrix from the 'price(원/kg)' target for each split.
target_column = 'price(원/kg)'
X_train, y_train = train_data.drop(columns=[target_column]), train_data[target_column]
X_valid, y_valid = valid_data.drop(columns=[target_column]), valid_data[target_column]

In [None]:
# Create the LightGBM data containers
# Training container: feature matrix plus the price target as label.
train_data_lgb = lgb.Dataset(X_train, label=y_train)
# Validation container; `reference` ties it to the training Dataset (per the
# LightGBM docs, validation data should be constructed against the training data).
valid_data_lgb = lgb.Dataset(X_valid, label=y_valid, reference=train_data_lgb)

In [None]:
# LightGBM training parameters: RMSE-evaluated regression with standard
# gradient-boosted trees, silent logging, and a fixed seed.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    verbose=-1,
    seed=42,
)

In [None]:
# help(lgb.train)

In [None]:
# Training the model
# Up to 1000 boosting rounds, evaluated on the validation Dataset; the
# early-stopping callback is configured with a patience of 100 rounds.

early_stop = lgb.early_stopping(stopping_rounds=100)

lgb_model = lgb.train(
    params=params,
    train_set=train_data_lgb,
    num_boost_round=1000,
    valid_sets=[valid_data_lgb],
    callbacks=[early_stop],
)

In [None]:
# Predict on the validation features, using the booster only up to the iteration
# recorded as best during training.
best_round = lgb_model.best_iteration
y_pred = lgb_model.predict(X_valid, num_iteration=best_round)

In [None]:
# Evaluate the model with RMSE on the validation split.
# The `squared=False` argument of `mean_squared_error` was deprecated in
# scikit-learn 1.4 and removed in 1.6 (where it raises TypeError), so take the
# square root of the MSE explicitly — this works on every scikit-learn version.
rmse = float(np.sqrt(mean_squared_error(y_valid, y_pred)))
print(f'RMSE: {rmse}')

## Inference

In [None]:
# Preprocessing test data
# Parse the timestamp column into datetimes, mirroring the training frame.
test_df = test_df.assign(timestamp=pd.to_datetime(test_df['timestamp']))

In [None]:
# Derive the same year / month / day features as for the training frame (appended
# in the same order), then drop the identifier and the raw timestamp so the
# remaining columns line up with the training features.
stamp = test_df['timestamp'].dt
test_df = (
    test_df
    .assign(year=stamp.year, month=stamp.month, day=stamp.day)
    .drop(columns=['ID', 'timestamp'])
)

In [None]:
# Use the test set for final testing.
# Select the columns in exactly the order the model was trained on
# (`X_train.columns`): Booster.predict does not validate DataFrame feature names
# by default, so a reordered frame would be scored silently wrong, whereas a
# missing column now fails loudly with a KeyError.
test_predictions = lgb_model.predict(
    test_df[X_train.columns],
    num_iteration=lgb_model.best_iteration,
)

## Submission

In [None]:
# Load the sample submission file as the template for the final output; its
# `answer` column is overwritten with the model predictions in a later cell.
submission = pd.read_csv('/content/drive/MyDrive/competition/Jeju Prediction/sample_submission.csv')
# Display the template (bare last expression).
submission

In [None]:
# Put the test predictions into the `answer` column. Values are matched by
# position, so the rows are assumed to be in the same order as the test file.
# NOTE(review): confirm sample_submission.csv and test.csv share the same row order.
submission = submission.assign(answer=test_predictions)
submission

In [None]:
# Write the filled-in submission to Drive without the DataFrame index column.
submission_path = '/content/drive/MyDrive/competition/Jeju Prediction/baseline_submission.csv'
submission.to_csv(submission_path, index=False)