# Corn Dataset Creation

In [1]:
import pandas as pd
import numpy as np
from glob import glob
from PIL import Image

In [2]:
%load_ext lab_black

In [11]:
df = pd.concat(
    [
        pd.DataFrame(glob("data/train/broken/*.png"))
        .rename(columns={0: "image"})
        .assign(label="broken")
        .assign(group="train"),
        pd.DataFrame(glob("data/train/discolored/*.png"))
        .rename(columns={0: "image"})
        .assign(label="discolored")
        .assign(group="train"),
        pd.DataFrame(glob("data/train/pure/*.png"))
        .rename(columns={0: "image"})
        .assign(label="pure")
        .assign(group="train"),
        pd.DataFrame(glob("data/train/silkcut/*.png"))
        .rename(columns={0: "image"})
        .assign(label="silkcut")
        .assign(group="train"),
        pd.DataFrame(glob("data/validation/broken/*.png"))
        .rename(columns={0: "image"})
        .assign(label="broken")
        .assign(group="test"),
        pd.DataFrame(glob("data/validation/discolored/*.png"))
        .rename(columns={0: "image"})
        .assign(label="discolored")
        .assign(group="test"),
        pd.DataFrame(glob("data/validation/pure/*.png"))
        .rename(columns={0: "image"})
        .assign(label="pure")
        .assign(group="test"),
        pd.DataFrame(glob("data/validation/silkcut/*.png"))
        .rename(columns={0: "image"})
        .assign(label="silkcut")
        .assign(group="test"),
    ]
)

df = df.assign(fn=df["image"].str.split("/").str[-1])

df = df.assign(view=df["fn"].str.split("_").str[-2]).assign(
    number=df["fn"].str.split("_").str[-1].str.split(".").str[0].astype("int")
)

In [35]:
np.random.seed(529)
df["seed_id"] = np.random.choice(range(17801), replace=False, size=17801)
df["seed_id"] = df["seed_id"].astype("str").str.zfill(5)

In [36]:
df

Unnamed: 0,image,label,group,fn,view,number,seed_id
0,data/train/broken/new_dc354_discolored_002_top...,broken,train,new_dc354_discolored_002_top_116.png,top,116,03287
1,data/train/broken/new_dc354_discolored_001_bot...,broken,train,new_dc354_discolored_001_bottom_331.png,bottom,331,13484
2,data/train/broken/dc354_broken_003_bottom_018.png,broken,train,dc354_broken_003_bottom_018.png,bottom,18,10701
3,data/train/broken/dc354_discolored_001_bottom_...,broken,train,dc354_discolored_001_bottom_200.png,bottom,200,05661
4,data/train/broken/new_dc354_broken_002_bottom_...,broken,train,new_dc354_broken_002_bottom_290.png,bottom,290,13366
...,...,...,...,...,...,...,...
319,data/validation/silkcut/dc354_silkcut_004_top_...,silkcut,test,dc354_silkcut_004_top_089.png,top,89,01461
320,data/validation/silkcut/dc354_discolored_002_t...,silkcut,test,dc354_discolored_002_top_280.png,top,280,02566
321,data/validation/silkcut/dc354_silkcut_001_top_...,silkcut,test,dc354_silkcut_001_top_278.png,top,278,11504
322,data/validation/silkcut/dc354_broken_000_botto...,silkcut,test,dc354_broken_000_bottom_180.png,bottom,180,05140


# Move Files to new folder

In [37]:
from tqdm.notebook import tqdm
import os
import shutil

In [38]:
for i, d in tqdm(df.iterrows(), total=len(df)):
    out_img = "corn/" + d["group"] + "/" + d["seed_id"] + ".png"
    shutil.copy(d["image"], out_img)

  0%|          | 0/17801 [00:00<?, ?it/s]

In [49]:
df["image"] = df["group"] + "/" + df["seed_id"] + ".png"

In [123]:
df.query('group == "train"').sort_values("seed_id")[
    ["seed_id", "view", "image", "label"]
].to_csv("corn/train.csv", index=False)
df.query('group == "test"').sort_values("seed_id")[["seed_id", "view", "image"]].to_csv(
    "corn/test.csv", index=False
)
solution = df.query('group == "test"')[["seed_id", "label"]]

solution["Usage"] = "Private"
public_seeds = solution["seed_id"].sample(frac=0.2, random_state=529).unique()
solution.loc[solution["seed_id"].isin(public_seeds), "Usage"] = "Public"
solution.to_csv("corn/score.csv", index=False)

In [103]:
train = pd.read_csv("corn/train.csv")
test = pd.read_csv("corn/test.csv")

In [None]:
ss = pd.read_csv("corn/score.csv")

ss["label"] = ss.sample(frac=1, random_state=529)["label"].values

In [97]:
ss.drop("Usage", axis=1).to_csv("corn/sample_submission.csv", index=False)

In [114]:
ss["label"].value_counts()

silkcut       905
broken        903
pure          868
discolored    803
Name: label, dtype: int64

In [127]:
ss.sort_values("seed_id").head()

Unnamed: 0,seed_id,label,Usage
801,2,discolored,Private
2369,11,silkcut,Private
1723,13,discolored,Private
1397,19,silkcut,Public
867,27,pure,Private


In [99]:
ss["label"] = np.random.choice(ss["label"].unique(), size=len(ss))

In [100]:
ss.drop("Usage", axis=1).to_csv("corn/sample_submission.csv", index=False)