<a href="https://colab.research.google.com/github/Roni81/smartfarm/blob/main/prdct_grw.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [2]:
import os
from glob import glob

import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import groupby
import random

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from catboost import CatBoostRegressor
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanAbsoluteError
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping

In [3]:

train_imgs = glob("/content/drive/MyDrive/growing2_temp/growing2_temp/images/*.jpg")
train_imgs = sorted(train_imgs)
test_imgs = glob("/content/drive/MyDrive/growing2_temp/growing2_temp/test/images/*.jpg")
test_imgs = sorted(test_imgs)
train_data = glob("/content/drive/MyDrive/growing2_temp/growing2_temp/metas/*.csv")
train_data = sorted(train_data)
train_label = pd.read_csv("/content/drive/MyDrive/growing2_temp/growing2_temp/combined_dataset.csv")
test_data = glob("/content/drive/MyDrive/growing2_temp/growing2_temp/test/metas/*.csv")
test_data = sorted(test_data)

In [5]:
main_path = "/content/drive/MyDrive/growing2_temp/growing2_temp"

preprocessing_train_images = main_path + "/preprocessing_train"
preprocessing_test_images = main_path + "/preprocessing_test"

if not os.path.exists(preprocessing_train_images):
    os.mkdir(preprocessing_train_images)
if not os.path.exists(preprocessing_test_images):
    os.mkdir(preprocessing_test_images)

In [8]:
def automatic_brightness_and_contrast(image, clip_hist_percent = 0.025):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

     # Calculate grayscale histogram
    hist = cv2.calcHist([gray],[0],None,[256],[0,256])
    hist_size = len(hist)

    # Calculate cumulative distribution from the histogram
    accumulator = []
    accumulator.append(float(hist[0]))
    for index in range(1, hist_size):
        accumulator.append(accumulator[index -1] + float(hist[index]))

    # Locate points to clip
    maximum = accumulator[-1]
    clip_hist_percent *= (maximum/100.0)
    clip_hist_percent /= 2.0

    # Locate left cut
    minimum_gray = 0
    while accumulator[minimum_gray] < clip_hist_percent:
        minimum_gray += 1

    # Locate right cut
    maximum_gray = hist_size -1
    while accumulator[maximum_gray] >= (maximum - clip_hist_percent):
        maximum_gray -= 1

    # Calculate alpha and beta values
    alpha = 255 / (maximum_gray - minimum_gray)
    beta = -minimum_gray * alpha

    auto_result = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)
    return (auto_result)

In [15]:
def get_image_data(dir_in, dir_out):

    ratio_lst = []

    for i in tqdm(dir_in):
        name = i.split("\\")[-1] #i.split("/")[-1]
        img = cv2.imread(i,cv2.IMREAD_COLOR)
        img = cv2.resize(img, (1000,750))
        brightscale = automatic_brightness_and_contrast(img)
        imgcopy = brightscale.copy()
        hsvimage = cv2.cvtColor(brightscale,cv2.COLOR_BGR2HSV)
        lower = np.array([22,40,0])
        upper = np.array([85,255,245])
        mask = cv2.inRange(hsvimage, lower, upper)
        number_of_white_pix = np.sum(mask == 255)
        number_of_black_pix = np.sum(mask == 0)
        ratio = number_of_white_pix / (number_of_white_pix + number_of_black_pix)
        ratio_lst.append(ratio)
        result = cv2.bitwise_and(imgcopy, imgcopy, mask = mask)
        cv2.imwrite(os.path.join(dir_out, name), result)

    return ratio_lst


In [16]:
ratio_train = get_image_data(train_imgs, preprocessing_train_images)
ratio_test = get_image_data(test_imgs, preprocessing_test_images)

processed_train_imgs = glob(main_path + "/preprocessing_train/*.jpg")
processed_train_imgs = sorted(processed_train_imgs)

processed_test_imgs = glob(main_path + "/preprocessing_test/*.jpg")
processed_test_imgs = sorted(processed_test_imgs)


100%|██████████| 1592/1592 [22:54<00:00,  1.16it/s]
100%|██████████| 246/246 [04:04<00:00,  1.01it/s]


In [27]:
train_df = []

for i in tqdm(train_data):
    name = i.split("\\")[-1].split(".")[0]
    df = pd.read_csv(i)
    df = df.drop('시간', axis=1)
    case = name.split("_")[0]
    label = pd.read_csv("/content/drive/MyDrive/growing2_temp/growing2_temp/combined_dataset.csv")

    # 이미지 이름이 일치하는 경우에만 무게 정보를 가져옴
    if name in label.img_name.values:
        leaf_weight = label[label.img_name == name].leaf_weight.values[0]
        df["무게"] = leaf_weight
        df["최근분무량"] = df["최근분무량"].fillna(method='bfill', limit=1)
        df["최근분무량"] = df["최근분무량"].fillna(method='ffill', limit=1)
        df = df.interpolate()
        water = df['최근분무량'].round(2).tolist()
        if np.mean(water) > 1000:
            nums = [list(v) for k, v in groupby(water, key=lambda x: x != 0) if k != 0]
            if len(nums) == 2:
                cumulative = nums[0][-1] - nums[0][0] + nums[1][-1]
            else:
                cumulative = nums[0][-1] - nums[0][0]
        elif 1000 > np.mean(water) > 0:
            nums = [key for key, _group in groupby(water)]
            cumulative = sum(nums[1:])
        else:
            cumulative = 0

        # df DataFrame이 비어 있지 않은 경우에만 추가
        if not df.empty:
            df = df.mean().to_frame().T
            df["이미지"] = name
            df['최근분무량'] = cumulative
            train_df.append(df)

# train_df에 정보가 있는 경우에만 pd.concat 호출
if train_df:
    train_df = pd.concat(train_df, ignore_index=True)
    train_df['비율'] = ratio_train
    train_df.head()
else:
    print("No objects to concatenate. train_df is empty.")

100%|██████████| 1592/1592 [00:17<00:00, 92.95it/s]

No objects to concatenate. train_df is empty.





In [29]:
test_df = []
for i in tqdm(test_data):
    name = i.split("\\")[-1].split(".")[0]  # i.split("/")[-1].split(".")[0]
    df = pd.read_csv(i)
    df = df.drop('시간', axis = 1)
    df["최근분무량"] = df["최근분무량"].fillna(method='bfill', limit=1)
    df["최근분무량"] = df["최근분무량"].fillna(method='ffill', limit=1)
    df = df.interpolate()
    water = df['최근분무량'].round(2).tolist()
    if np.mean(water) > 1000:
        nums = [list(v) for k,v in groupby(water, key = lambda x: x != 0) if k != 0]
        if len(nums) == 2:
            cumulative = nums[0][-1] - nums[0][0] + nums[1][-1]
        else:
            cumulative = nums[0][-1] - nums[0][0]

    elif 1000 > np.mean(water) > 0:
        nums = [key for key, _group in groupby(water)]
        cumulative = sum(nums[1:])
    else:
        cumulative = 0

    df = df.mean()
    df = df.to_frame().T
    df["이미지"] = name
    df['최근분무량'] = cumulative

    test_df.append(df)


if test_df:
    test_df = pd.concat(test_df, ignore_index=True)
    test_df['비율'] = ratio_train
    test_df.head()
else:
    print("No objects to concatenate. train_df is empty.")

0it [00:00, ?it/s]

No objects to concatenate. train_df is empty.





In [30]:
fig = px.scatter(train_df, x = '무게', y= '비율',
                 hover_name="이미지", trendline="ols")

fig.show()

ValueError: ignored

In [None]:
image_outliers = ['CASE05_21','CASE05_22','CASE05_23', 'CASE07_07', 'CASE07_08', 'CASE16_03', 'CASE23_01', 'CASE23_02',
'CASE23_03', 'CASE23_04', 'CASE23_05', 'CASE23_06', 'CASE23_07', 'CASE23_08', 'CASE23_09', 'CASE45_16', 'CASE45_17',
'CASE72_06',  'CASE73_10', 'CASE59_01','CASE59_02','CASE59_03','CASE59_04','CASE59_05','CASE59_06',
'CASE59_07','CASE59_08','CASE59_09','CASE59_10','CASE59_11','CASE59_12','CASE59_13','CASE59_14','CASE59_15','CASE59_16','CASE59_17','CASE59_18',
'CASE59_19','CASE59_20','CASE59_21','CASE59_22','CASE59_23','CASE59_24','CASE59_25','CASE59_26','CASE59_27','CASE59_28','CASE59_29','CASE59_30',
'CASE59_31','CASE59_32', 'CASE59_33']

train_df_image = train_df[~train_df['이미지'].isin(image_outliers)]
train_imgs_removed = [ x for x in processed_train_imgs if x.split(".")[1].split("\\")[1] not in image_outliers]  # x.split(".")[1].split("/")[3]

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

if torch.cuda.is_available():
    print("The code will run on GPU.")
else:
    print("The code will run on CPU. Go to Edit->Notebook Settings and choose GPU as the hardware accelerator")

In [None]:
CFG = {
    'IMG_SIZE':128,
    'EPOCHS':80,
    'LEARNING_RATE':1e-3,
    'BATCH_SIZE':32,
    'SEED':42
}

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

In [None]:
train_len = int(len(train_imgs_removed)*0.8)
weight = train_df_image['무게'].round(3).tolist()

train_img_path = train_imgs_removed[:train_len]
train_label = weight[:train_len]

vali_img_path = train_imgs_removed[train_len:]
vali_label = weight[train_len:]

In [None]:
class CustomDataset(Dataset):
    def __init__(self, img_path_list, label_list, train_mode=True, transforms=None):
        self.transforms = transforms
        self.train_mode = train_mode
        self.img_path_list = img_path_list
        self.label_list = label_list

    def __getitem__(self, index): # Use index when calling images
        img_path = self.img_path_list[index]
        # Get image data
        image = cv2.imread(img_path)
        if self.transforms is not None:
            image = self.transforms(image)

        if self.train_mode:
            label = self.label_list[index]
            return image, label
        else:
            return image

    def __len__(self): # Returns number of training data
        return len(self.img_path_list)

In [None]:
train_transform = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Resize((CFG['IMG_SIZE'], CFG['IMG_SIZE'])),
                    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
                    ])

test_transform = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Resize((CFG['IMG_SIZE'], CFG['IMG_SIZE'])),
                    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
                    ])

In [None]:
train_dataset = CustomDataset(train_img_path, train_label, train_mode=True, transforms=train_transform)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

vali_dataset = CustomDataset(vali_img_path, vali_label, train_mode=True, transforms=test_transform)
vali_loader = DataLoader(vali_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
class CNNRegressor(torch.nn.Module):
    def __init__(self):
        super(CNNRegressor, self).__init__()
        self.layer1 = torch.nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))

        self.layer2 = torch.nn.Sequential(
            nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))

        self.layer3 = torch.nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))

        self.layer4 = torch.nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=4, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))

        self.regressor = nn.Linear(3136,1)


    def forward(self, x):
        # Simple CNN Model (Batch, 3, 128, 128 -> Batch, 64, 7, 7)
        # (Batch, 3, 128, 128)
        x = self.layer1(x)
        # (Batch, 8, 64, 64)
        x = self.layer2(x)
        # (Batch, 16, 32, 32)
        x = self.layer3(x)
        # (Batch, 32, 16, 16)
        x = self.layer4(x)
        # (Batch, 64, 7, 7) -> Flatten (Batch, 64*7*7(=3136))
        x = torch.flatten(x, start_dim=1)
        # Regressor (Batch, 3136) -> (Batch, 1)
        out = self.regressor(x)
        return out

In [None]:
def train(model, optimizer, train_loader, vali_loader, scheduler, device):
    model.to(device)

    # Loss Function
    criterion = nn.L1Loss().to(device)
    best_mae = 9999

    for epoch in range(1,CFG["EPOCHS"]+1):
        model.train()
        train_loss = []
        for img, label in tqdm(iter(train_loader)):
            img, label = img.float().to(device), label.float().to(device)

            optimizer.zero_grad()

            # Data -> Model -> Output
            logit = model(img)
            # Calc loss
            loss = criterion(logit.squeeze(1), label)

            # backpropagation
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        if scheduler is not None:
            scheduler.step()

        # Evaluation Validation set
        vali_mae = validation(model, vali_loader, criterion, device)

        print(f'Epoch [{epoch}] Train MAE : [{np.mean(train_loss):.5f}] Validation MAE : [{vali_mae:.5f}]\n')

        # Model Saved
        if best_mae > vali_mae:
            best_mae = vali_mae
            torch.save(model.state_dict(), './best_model.pth')
            print('Model Saved.')

In [None]:
def validation(model, vali_loader, criterion, device):
    model.eval() # Evaluation
    vali_loss = []
    with torch.no_grad():
        for img, label in tqdm(iter(vali_loader)):
            img, label = img.float().to(device), label.float().to(device)

            logit = model(img)
            loss = criterion(logit.squeeze(1), label)

            vali_loss.append(loss.item())

    vali_mae_loss = np.mean(vali_loss)
    return vali_mae_loss

In [None]:
CNNmodel = CNNRegressor().to(device)

optimizer = torch.optim.SGD(params = CNNmodel.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = None

train(CNNmodel, optimizer, train_loader, vali_loader, scheduler, device)

In [None]:
def predict(model, test_loader, device):
    model.eval()
    model_pred = []
    with torch.no_grad():
        for img in tqdm(iter(test_loader)):
            img = img.float().to(device)

            pred_logit = model(img)
            pred_logit = pred_logit.squeeze(1).detach().cpu()

            model_pred.extend(pred_logit.tolist())
    return model_pred

In [None]:
test_dataset = CustomDataset(processed_test_imgs, None, train_mode=False, transforms=test_transform)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

# Validation Score가 가장 뛰어난 모델을 불러옵니다.
checkpoint = torch.load('./best_model.pth')
CNNmodel = CNNRegressor().to(device)
CNNmodel.load_state_dict(checkpoint)

# Inference
preds = predict(CNNmodel, test_loader, device)

In [None]:
submission = pd.read_csv('./open/sample_submission.csv')
submission['leaf_weight'] = preds
submission.to_csv('./CNNsubmit.csv', index=False)

In [None]:
# https://dacon.io/competitions/official/235897/codeshare/5017?page=1&dtype=recent

Metadata EDA
각 환경 변수 시각화하면서 이상값 판단하기
CASE01, CASE02 경우 EC 관측치, 외부온도 값이 다른 케이스에 비해 매우 다르므로 메타데이터에 제외하기
음수 값이 나오는 최근분무량 (일간누적분무량) 제외하기 (일부 CASE04)
CO2관측지가 0인 케이스는 누락 데이터로 판단하여 메타데이터에 제외하기

In [None]:
firstfeats = ['내부온도관측치', '외부온도관측치', '내부습도관측치', '외부습도관측치', 'CO2관측치', 'EC관측치','최근분무량']

secondfeats = ['냉방온도', '냉방부하','난방온도', '난방부하', '비율']

thirdfeats = ['화이트 LED동작강도', '레드 LED동작강도', '블루 LED동작강도', '총추정광량', '백색광추정광량', '적색광추정광량', '청색광추정광량']

for feat in firstfeats:
    fig = make_subplots(rows=1, cols=2)

    fig.add_trace(
        go.Scatter(x = train_df[feat].index, y =  train_df[feat], text=train_df["이미지"]),
        row=1, col=1
    )

    fig.add_trace(
        go.Scatter(x = test_df[feat].index, y =  test_df[feat], text=test_df["이미지"]),
        row=1, col=2
    )
    fig.update_layout(showlegend=False, title_text=feat)
    fig.show()

In [None]:
meta_outliers = ['CASE01_01','CASE01_02','CASE01_03','CASE01_04','CASE01_05','CASE01_06','CASE01_07',
'CASE01_08','CASE01_09','CASE02_01','CASE02_02','CASE02_03','CASE02_04','CASE02_05','CASE02_06','CASE02_07',
'CASE02_08','CASE02_09','CASE02_10','CASE02_11']

train_df_meta = train_df_image[~train_df_image['이미지'].isin(meta_outliers)]

train_df_meta = train_df_meta[train_df_meta['CO2관측치'] > 0]
train_df_meta = train_df_meta[train_df_meta['최근분무량'] >= 0]

In [None]:
corr = train_df_meta.corr()

corr.style.background_gradient(cmap='coolwarm')

In [None]:
fig = px.scatter(train_df_meta, x = train_df_meta['총추정광량'],
                 y= train_df_meta['백색광추정광량'] + train_df_meta['적색광추정광량']+ train_df_meta['청색광추정광량'],
                 trendline="ols")

fig.show()

In [None]:
features = ['내부온도관측치', '외부온도관측치', '내부습도관측치', '외부습도관측치', 'CO2관측치', 'EC관측치',
         '최근분무량', '냉방온도', '냉방부하',
         '난방온도', '난방부하', '백색광추정광량', '적색광추정광량', '청색광추정광량', '비율']

train_col = train_df_meta[features]

test_col = test_df[features]

train_target = train_df_meta["무게"]

train_x, val_x, train_y, val_y = train_test_split(train_col, train_target, test_size=0.2, random_state=32)

CatBoost Fit
50번 이상 validation loss 개선 없을 경우 조기종료

In [None]:
CATmodel = CatBoostRegressor(verbose=50,
                             n_estimators=10000,
                             eval_metric='MAE',
                             early_stopping_rounds=50)
CATmodel.fit(train_x, train_y, eval_set=[(val_x, val_y)],
                   use_best_model=True)

val_pred = CATmodel.predict(val_x)
plt.figure(figsize=(20,10))
plt.plot(np.array(val_pred),label = "pred")
plt.plot(np.array(val_y),label = "true")
plt.legend()
plt.show()

train_score = CATmodel.score(train_x, train_y) # train (learn) score

val_score = CATmodel.score(val_x, val_y) # val (test) score

In [None]:
CATresult = CATmodel.predict(test_col)

submission = pd.read_csv('./open/sample_submission.csv')
submission['leaf_weight'] = CATresult
submission.to_csv('./CATsubmit.csv', index=False)

ANN Fit
싸이킷런(스케일러)로 메타데이터 스케일 조정
50번 이상 validation loss 개선 없을 경우 조기종료

In [None]:
def scale_datasets(x_train, x_test):
  """
  Standard Scale test and train data
  Z - Score normalization
  """
  standard_scaler = StandardScaler()
  x_train_scaled = pd.DataFrame(
      standard_scaler.fit_transform(x_train),
      columns=x_train.columns
  )
  x_test_scaled = pd.DataFrame(
      standard_scaler.transform(x_test),
      columns = x_test.columns
  )
  return x_train_scaled, x_test_scaled

train_scaled, test_scaled = scale_datasets(train_col, test_col)

train_x_scale, val_x_scale, train_y_scale, val_y_scale = train_test_split(train_scaled,
                                                                          train_target,
                                                                          test_size=0.2,
                                                                          random_state=32)

In [None]:
# Creating model using the Sequential in tensorflow

tf.random.set_seed(42)

def build_model_using_sequential():
  model = Sequential([
    Dense(100, kernel_initializer='normal', activation='relu'),
    Dense(50, kernel_initializer='normal', activation='relu'),
    Dense(25, kernel_initializer='normal', activation='relu'),
    Dense(1, kernel_initializer='normal', activation='linear')
  ])
  return model
# build the model
ANNmodel = build_model_using_sequential()

# loss function
mae = MeanAbsoluteError()
ANNmodel.compile(
    loss=mae,
    optimizer=Adam(learning_rate=0.001),
    metrics=[mae]
)

early_stopping_monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=50,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=True
)

# train the model
history = ANNmodel.fit(
    train_x_scale,
    train_y_scale,
    epochs=1000,
    batch_size=32,
    validation_data=(val_x_scale, val_y_scale),
    callbacks=[early_stopping_monitor],
    verbose= 2
)

In [None]:
val_pred = ANNmodel.predict(val_x_scale)
plt.figure(figsize=(20,10))
plt.plot(np.array(val_pred),label = "pred")
plt.plot(np.array(val_y_scale),label = "true")
plt.legend()
plt.show()

In [None]:
ANNresult = ANNmodel.predict(test_scaled)

submission = pd.read_csv('./open/sample_submission.csv')
submission['leaf_weight'] = ANNresult
submission.to_csv('./ANNsubmit.csv', index=False)

Ensemble
각 모델 결과 비교하면서 public score가 더 좋을수록 가중치 높게 적용
CNN (0.152884733) * 0.65 + CatBoost (0.2221573479) * 0.25 + Ann (0.2557698871) * 0.1

In [None]:
CNN = pd.read_csv('./CNNsubmit.csv')
CAT = pd.read_csv('./CATsubmit.csv')
ANN = pd.read_csv('./ANNsubmit.csv')

submission_final = pd.read_csv('./open/sample_submission.csv')
submission_final['leaf_weight'] = (CNN['leaf_weight'] * 0.65 + CAT['leaf_weight'] * 0.25 + ANN['leaf_weight'] * 0.1)
submission_final.to_csv('ENSEMBLEsubmit.csv', index=False)