In [None]:
import pandas as pd
from pathlib import Path

# Root directory that is searched recursively for image files in the next cell.
data = '/kaggle/input/plantvillage-dataset'

In [None]:
# Walk the dataset tree and keep, for every matched path, only its last two
# components: (parent folder name, file name).
# The '*.*' pattern is used so that files of any extension (any image format)
# are picked up.
paths = []
for path in Path(data).rglob('*.*'):
    paths.append(path.parts[-2:])

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
import dask.bag as db
from dask.diagnostics import ProgressBar


In [None]:
# Build the (Class, Images) table from the collected path pairs, order it by
# class name and renumber the rows 0..n-1 after sorting.
df = (
    pd.DataFrame(data=paths, columns=['Class', 'Images'])
    .sort_values('Class', ascending=True)
    .reset_index(drop=True)
)
df  # last expression of the cell: displays the dataframe

In [None]:
# Summarise the dataset: total number of images, number of distinct classes,
# and the number of images per class.
class_counts = df['Class'].value_counts()

print('Counting the number of image datasets')
print(f"Image Count : {len(df.Images)}")
print(f"Class Count : {len(class_counts)} \n")
print('Count the number of images in each class')
print(class_counts)

It can be seen that this dataset has 38 image classes with a total of 325832 images.

In [None]:
# Visualise how many images each class contains (one horizontal bar per class).

import seaborn as sns
from matplotlib import pyplot as plt

fig, ax = plt.subplots(figsize=(30, 30))
# Draw on the axes created above rather than relying on pyplot's current axes.
sns.countplot(data=df, y='Class', ax=ax)
ax.set_title('Graph of the count of each class on the PlantVillage image dataset')
ax.set_xlabel('Count Image')
ax.set_ylabel('\n Image Class')

The bar chart of image counts per class shows that the classes do not contain the same number of images: the counts differ noticeably from one class to another. A bar chart visualization like this therefore makes class imbalance quick to identify.

In [None]:
# Per-class image counts as a two-column frame: class label and image count.
Class_Id_Dist_Total = df['Class'].value_counts(sort=False).reset_index()
Class_Id_Dist_Total.columns = ['Class', 'Count']

import plotly.express as px

# Take the slice labels from the same frame (and therefore the same rows) as
# the slice values. Passing df['Class'].unique() instead pairs two
# independently ordered sequences, which can attach a label to the wrong count.
fig = px.pie(Class_Id_Dist_Total, values='Count', names='Class', hole=0.500)
fig.update_layout(title='Data Distribution of PlantVillage Dataset',font_size=15,title_x=0.45,annotations=[dict(text='PlantVillage Dataset',font_size=12, showarrow=False,height=2000,width=4000)])
# Show only the percentage on each slice.
fig.update_traces(textfont_size=15,textinfo='percent')
fig.show()

The pie chart shows that the Orange_Huanglongbing class accounts for the largest share of the images in the dataset, while the Potato_healthy class accounts for the smallest share.

In [None]:
import os

# Print the names of the entries directly under the Kaggle input directory.
print(os.listdir("/kaggle/input"))


In [None]:
# Print the names of the entries directly under the dataset directory.
print(os.listdir("/kaggle/input/plantvillage-dataset"))


In [None]:
from pathlib import Path

# Directory whose immediate sub-directories are listed as class folders below.
# NOTE(review): presumably the colour-image variant of the dataset — confirm.
DATA_ROOT = Path("/kaggle/input/plantvillage-dataset/color")


In [None]:
# Every immediate sub-directory of DATA_ROOT is treated as one class folder.
# Sorted because iterdir() yields entries in arbitrary order; sorting keeps
# the listing identical from run to run.
class_dirs = sorted(d for d in DATA_ROOT.iterdir() if d.is_dir())

# Alias used by the image-size scan. It must be assigned AFTER class_dirs is
# built: assigning it first raises NameError on a fresh kernel.
directories = class_dirs

print(len(class_dirs))
print(class_dirs)


In [None]:
from PIL import Image
import numpy as np

def get_dims(file):
    """Return the ``(height, width)`` of the image stored at ``file``.

    The image is opened with PIL and converted to a NumPy array; the sizes of
    the array's first two axes are returned.

    Returns ``None`` if anything goes wrong while opening or converting the
    file (for example a corrupted image), so callers can skip it.
    """
    try:
        with Image.open(file) as picture:
            # The first two axes are (rows, columns) for grayscale (2-D) and
            # colour (3-D) arrays alike, so one slice covers both cases.
            height, width = np.array(picture).shape[:2]
        return height, width
    except Exception:
        # Skip corrupted / unreadable images
        return None


In [None]:
# Measure every image in every class directory and collect (height, width)
# pairs for the whole dataset.
all_dims = []

for filepath in directories:
    # Case-insensitive extension filter; '.jpeg' is accepted alongside '.jpg'
    # so that JPEG files using the long extension are not silently ignored.
    filelist = [
        os.path.join(filepath, f)
        for f in os.listdir(filepath)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    ]
    if not filelist:
        # Nothing to measure in this directory; avoid building an empty bag.
        continue

    dims = db.from_sequence(filelist).map(get_dims).compute()
    # get_dims returns None for images it could not read. Drop those here so
    # the DataFrame below only receives (height, width) pairs; a None entry
    # would otherwise break construction or create null rows.
    all_dims.extend(d for d in dims if d is not None)

df_all = pd.DataFrame(all_dims, columns=['height', 'width'])
df_all.plot.scatter(x='width', y='height')


In [None]:
# Count how many images share each distinct (height, width) combination.
size_counts = df_all.groupby(['height', 'width']).size()
sizes = size_counts.reset_index(name='count')

# Show the ten most frequent image sizes.
sizes.sort_values('count', ascending=False).head(10)
