In [11]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [3]:
# 1. INPUT
# Load the raw hackathon dataset; the relative path is resolved against the
# notebook's working directory.
dataset_path = '2023_smartFarm_AI_hackathon_dataset.csv'
input_data = pd.read_csv(dataset_path)


In [4]:
# 2. PRE_PROCESSING
# Throughout this cell a value of 0 is treated as "missing" and is imputed.
# Every write goes through `input_data[...] = ...` / `input_data.loc[...] = ...`
# rather than `input_data[col].replace(..., inplace=True)`: the latter is chained
# assignment, which does not update the frame under pandas Copy-on-Write.

########################################################################################################
# 2-1. frmYear (year) and frmWeek (week number): impute from the "date" column.
# "date" is parsed as YYYYMMDD into a temporary Series, so no helper column has
# to be added to the frame and dropped again.
parsed_dates = pd.to_datetime(input_data["date"], format="%Y%m%d")

# A row needs imputation when EITHER field is 0; both fields are then overwritten.
period_missing = (input_data["frmYear"] == 0) | (input_data["frmWeek"] == 0)
dates_to_use = parsed_dates[period_missing]

# NOTE(review): the week is the ISO week number while the year is the calendar
# year, so dates around New Year can pair an ISO week with a different calendar
# year - confirm this is the intended definition of frmYear/frmWeek.
input_data.loc[period_missing, "frmYear"] = dates_to_use.dt.year.to_numpy()
input_data.loc[period_missing, "frmWeek"] = (
    dates_to_use.dt.isocalendar().week.to_numpy(dtype="int64")
)

# Convert "frmYear" and "frmWeek" to integer dtype.
input_data["frmYear"] = input_data["frmYear"].astype(int)
input_data["frmWeek"] = input_data["frmWeek"].astype(int)

########################################################################################################
# 2-2. inCo2 (indoor CO2), inTp (indoor temperature), inHd (indoor humidity),
# outTp (outdoor temperature): KNN-based imputation.
# Columns handed to the imputer (the non-sensor columns act as neighbour features).
selected_columns = ["frmAr", "frmDov", "date", "frmYear", "frmWeek", "inCo2", "inTp", "inHd", "outTp"]

# Convert zeros to NaN so the imputer sees them as missing.
input_data[selected_columns] = input_data[selected_columns].replace(0, np.nan)

# NOTE(review): the columns are passed unscaled, so large-magnitude columns
# (e.g. "date" as YYYYMMDD) likely dominate the neighbour distance - consider
# scaling before imputing.
imputer = KNNImputer(n_neighbors=5)  # tune n_neighbors as needed

# fit_transform returns a float array, so every selected column (including
# frmYear/frmWeek) is float after this assignment.
input_data[selected_columns] = imputer.fit_transform(input_data[selected_columns])

########################################################################################################
# 2-3. acSlrdQy (accumulated solar radiation): replace zeros with a fixed value.
# NOTE(review): 995 is described as the column's mode in the original notes;
# it is hard-coded here, presumably computed offline - confirm against the data.
acslrdqy_fill_value = 995
input_data["acSlrdQy"] = input_data["acSlrdQy"].replace(0, acslrdqy_fill_value)

########################################################################################################
# 2-4. Usage/cost columns (water, fertilizer, mist, CO2): replace zeros with the column mean.
# NOTE(review): the mean is taken over ALL rows, including the zeros that are
# being treated as missing, which pulls the fill value down. Kept as-is to
# preserve existing results; consider averaging only the non-zero values.
mean_imputed_columns = [
    "WaterUsage",
    "WaterCost",
    "FertilizerUsage",
    "FertilizerCost",
    "Mist Cost",
    "MistUsageTime",
    "CO2Cost",
    "CO2Usage",
]
for column in mean_imputed_columns:
    column_mean = input_data[column].mean()
    input_data[column] = input_data[column].replace(0, column_mean)

In [13]:
# 3. TRAIN/TEST SPLIT
# frmDist is removed from the modelling frame. errors='ignore' keeps this cell
# re-runnable: `input_data` is rebound without the column, so a second run
# would otherwise raise KeyError because frmDist is already gone.
input_data = input_data.drop(columns=['frmDist'], errors='ignore')

# The two cumulative columns are the regression targets; every remaining
# column is used as a feature.
target_columns = ['outtrn_cumsum', 'HeatingEnergyUsage_cumsum']
X = input_data.drop(columns=target_columns)
Y = input_data[target_columns]

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [6]:
# from sklearn.linear_model import Ridge
# import numpy as np
# from sklearn.linear_model import RidgeCV
# from sklearn.model_selection import cross_val_score
# alphas = [0.001,0.01,0.1,0.2,0.3,1,10]
# ridge_cv_model = RidgeCV(alphas=alphas, store_cv_values=True)  # store_cv_values=True keeps the cross-validation results for each alpha
# ridge_cv_model.fit(X_train, y_train)

# best_alpha = ridge_cv_model.alpha_

# # Refit a plain Ridge model using the alpha selected by RidgeCV
# model = Ridge(alpha=best_alpha)
# model.fit(X_train, y_train)

In [14]:
# 4. MODEL_TRAIN
# Ridge regression with a fixed regularisation strength. fit() returns the
# estimator itself, so construction and fitting are chained.
ridge_alpha = 0.01
model = Ridge(alpha=ridge_alpha).fit(X_train, y_train)

# Predictions for the held-out split; they are scored further down.
y_pred = model.predict(X_test)

In [15]:
# 5. MODEL_PREDICT
def calculate_rmse(targets, predictions):
    """
    Calculate the Root Mean Squared Error (RMSE) between target and predicted values.

    The result is the square root of ``mean_squared_error(targets, predictions)``.
    With multi-column targets, ``mean_squared_error`` by default averages the
    per-column errors uniformly, so the root is taken of that single averaged MSE.

    :param targets: Target (ground-truth) values.
    :type targets: array-like
    :param predictions: Predicted values.
    :type predictions: array-like
    :return: RMSE value.
    :rtype: float
    """
    # mean_squared_error is already imported at the top of the notebook, so
    # no function-scope import is needed here.
    return np.sqrt(mean_squared_error(targets, predictions))


def calculate_R2_score(y_test, y_pred):
    """
    Calculate the R^2 score between target and predicted values.

    :param y_test: Target values.
    :type y_test: array-like
    :param y_pred: Predicted values.
    :type y_pred: array-like
    :return: The value returned by ``r2_score(y_test, y_pred)``.
    """
    score = r2_score(y_test, y_pred)
    return score


# Score the held-out predictions; arguments are passed by keyword so the
# target/prediction roles are explicit at the call site.
rmse = calculate_rmse(targets=y_test, predictions=y_pred)
r2score = calculate_R2_score(y_test=y_test, y_pred=y_pred)

In [16]:
# 6.OUTPUT
# Print each metric on its own line as "<label> <value>".
for metric_label, metric_value in (("RMSE:", rmse), ("R2_score:", r2score)):
    print(metric_label, metric_value)

RMSE: 238949.51649122284
R2_score: 0.522913605503115
