In [21]:
import pandas as pd

# Load the merged dataset table from CSV (path is relative to the working directory).
# Later cells split this frame into vegetable and fruit rows using its 'label' column.
df_full = pd.read_csv("merged_fruit_dataset.csv")

# Lower-case fragments that mark a label as a vegetable.
veg_keywords = [
    'capsicum',
    'tomato',
    'onion',
    'potato',
    'chilli',
    'brinjal',
    'bitter_gourd',
]


def is_vegetable(label):
    """Return True if any entry of veg_keywords occurs in the lower-cased label.

    The match is a plain substring test, so a keyword embedded in a longer
    word also counts.
    """
    lowered = label.lower()
    for keyword in veg_keywords:
        if keyword in lowered:
            return True
    return False

# Keep only the rows whose label names a vegetable, then rewrite the label to a
# freshness class and tag every row as a vegetable.
# NOTE(review): freshness is decided by a case-sensitive substring test for 'F';
# presumably the raw labels carry an "F_" prefix for fresh items — confirm that no
# stale label contains a capital F and that every fresh label does.
df_veg_from1 = (
    df_full[df_full['label'].apply(is_vegetable)]
    .assign(
        label=lambda frame: frame['label'].apply(
            lambda raw: 'fresh' if 'F' in raw else 'stale'
        ),
        type='vegetable',
    )
)


In [22]:
# Preview the first rows of the vegetable subset taken from the merged CSV.
df_veg_from1.head()

Unnamed: 0,image_path,label,type
7000,FRUIT-16K\F_Tomato\1.jpg,fresh,vegetable
7001,FRUIT-16K\F_Tomato\10.jpg,fresh,vegetable
7002,FRUIT-16K\F_Tomato\100.jpg,fresh,vegetable
7003,FRUIT-16K\F_Tomato\1000.jpg,fresh,vegetable
7004,FRUIT-16K\F_Tomato\101.jpg,fresh,vegetable


In [10]:
import os

# Sub-folders of the dataset root that are scanned for vegetable images. Each
# name is matched against the keys of label_map to decide its freshness label.
veg_folders = [
    'Potato__Healthy', 'Potato__Rotten',
    'Tomato__Healthy', 'Tomato__Rotten',
    'Bellpepper__healthy', 'Bellpepper__Rotten',
]

# Folder-name fragment -> freshness label. The lookup is a case-sensitive
# substring test in insertion order, which is why both 'Healthy' and 'healthy'
# are listed.
label_map = {
    'Healthy': 'fresh',
    'healthy': 'fresh',
    'Fresh': 'fresh',
    'Rotten': 'stale',
    'Bacterial_spot': 'stale'
}


def _label_for_folder(folder):
    """Return the label of the first label_map key contained in folder, or None."""
    return next((label for key, label in label_map.items() if key in folder), None)


def extract_from_dataset2(base_dir):
    """Collect image paths from the folders in veg_folders under base_dir.

    Parameters
    ----------
    base_dir : str
        Directory that contains the folders listed in veg_folders.

    Returns
    -------
    pandas.DataFrame
        One row per file ending in .jpg/.jpeg/.png (case-insensitive), with
        columns 'image_path', 'label' ('fresh' or 'stale') and 'type' (always
        'vegetable'). Rows follow veg_folders order, then sorted filename
        order. The three columns are present even when no image is found.

    Raises
    ------
    OSError
        Propagated from os.listdir when a listed folder cannot be read
        (e.g. FileNotFoundError if it does not exist).
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    columns = ['image_path', 'label', 'type']

    records = []
    for folder in veg_folders:
        folder_path = os.path.join(base_dir, folder)
        # List before resolving the label so an unreadable folder always raises.
        # Sorting makes the row order reproducible: os.listdir order is arbitrary.
        filenames = sorted(os.listdir(folder_path))

        # The label depends only on the folder name, so resolve it once per folder.
        label = _label_for_folder(folder)
        if label is None:
            # No label_map key matches this folder name: contribute no rows.
            continue

        records.extend(
            {
                'image_path': os.path.join(folder_path, fname),
                'label': label,
                'type': 'vegetable',
            }
            for fname in filenames
            if fname.lower().endswith(image_suffixes)
        )

    # Passing columns keeps the schema stable when records is empty.
    return pd.DataFrame(records, columns=columns)

# Build the second vegetable frame by scanning the image folders on disk
# (the dataset path is relative to the working directory).
df_veg_from2 = extract_from_dataset2("Fruit And Vegetable Diseases Dataset/")


In [11]:
# Preview the first rows collected from the on-disk vegetable folders.
df_veg_from2.head()

Unnamed: 0,image_path,label,type
0,Fruit And Vegetable Diseases Dataset/Potato__H...,fresh,vegetable
1,Fruit And Vegetable Diseases Dataset/Potato__H...,fresh,vegetable
2,Fruit And Vegetable Diseases Dataset/Potato__H...,fresh,vegetable
3,Fruit And Vegetable Diseases Dataset/Potato__H...,fresh,vegetable
4,Fruit And Vegetable Diseases Dataset/Potato__H...,fresh,vegetable


In [15]:

# Everything in the merged CSV that is_vegetable rejects is treated as fruit.
# The 'label' values are carried over unchanged; only the 'type' tag is added.
df_fruits = (
    df_full[~df_full['label'].apply(is_vegetable)]
    .assign(type='fruit')
)


In [17]:
# Inspect the last 20 rows of the fruit subset.
df_fruits.tail(20)

Unnamed: 0,image_path,label,type
52204,Dataset3TrainTest\valid\CSV_20180427_144528920...,Grape Unripe,fruit
52205,Dataset3TrainTest\valid\CSV_20180427_144528920...,Grape Unripe,fruit
52206,Dataset3TrainTest\valid\CSV_20180427_144528920...,Grape Unripe,fruit
52207,Dataset3TrainTest\valid\CSV_20180427_144528920...,Grape Unripe,fruit
52208,Dataset3TrainTest\valid\CSV_20180427_144528920...,Grape Unripe,fruit
52209,Dataset3TrainTest\valid\CSV_20180427_144528920...,Grape Unripe,fruit
52210,Dataset3TrainTest\valid\orange_2-229-_png.rf.c...,Orange Unripe,fruit
52211,Dataset3TrainTest\valid\rottenOrange-523-_jpg....,Stale Orange,fruit
52212,Dataset3TrainTest\valid\Screen-Shot-2018-06-08...,Apple Unripe,fruit
52213,Dataset3TrainTest\valid\images-4-_png.rf.2e4d8...,Orange Ripe,fruit


In [23]:

# Stack the two vegetable sources (merged-CSV rows first, then the on-disk folders)
# and renumber the index from 0.
veg_df = pd.concat([df_veg_from1, df_veg_from2], ignore_index=True)

In [26]:
# Preview the first 15 rows of the combined vegetable frame.
veg_df.head(15)

Unnamed: 0,image_path,label,type
0,FRUIT-16K\F_Tomato\1.jpg,fresh,vegetable
1,FRUIT-16K\F_Tomato\10.jpg,fresh,vegetable
2,FRUIT-16K\F_Tomato\100.jpg,fresh,vegetable
3,FRUIT-16K\F_Tomato\1000.jpg,fresh,vegetable
4,FRUIT-16K\F_Tomato\101.jpg,fresh,vegetable
5,FRUIT-16K\F_Tomato\102.jpg,fresh,vegetable
6,FRUIT-16K\F_Tomato\103.jpg,fresh,vegetable
7,FRUIT-16K\F_Tomato\104.jpg,fresh,vegetable
8,FRUIT-16K\F_Tomato\105.jpg,fresh,vegetable
9,FRUIT-16K\F_Tomato\106.jpg,fresh,vegetable


In [27]:

# Combine the vegetable rows with the fruit rows into one frame with a fresh 0..n-1 index.
# NOTE(review): df_fruits is built without remapping its 'label' values, so this frame
# mixes 'fresh'/'stale' with the merged CSV's original fruit labels — confirm that is
# intended given the output file name below.
full_df = pd.concat([veg_df, df_fruits], ignore_index=True)

# Save both frames as CSV in the working directory, without the index column.
veg_df.to_csv("vegetables_fresh_stale.csv", index=False)
full_df.to_csv("fruit_veg_fresh_stale.csv", index=False)

# Print the fresh/stale balance of the vegetables and the fruit/vegetable split overall.
print(veg_df['label'].value_counts())
print(full_df['type'].value_counts())


label
fresh    4800
stale    4653
Name: count, dtype: int64
type
fruit        46370
vegetable     9453
Name: count, dtype: int64
