In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import pylab as py
from scipy import stats
from scipy.stats import pearsonr
import sklearn
import sklearn.preprocessing as preprocessing
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from typing import List
import cv2
import os

# Input/output locations. Built with os.path.join so the separator matches the
# host OS (the resulting strings are unchanged on Windows).
DATA = os.path.join("data", "raw-img")     # root folder scanned for images
METAS = os.path.join("data", "metas.csv")  # cached per-image metadata table
# True: rescan DATA and rewrite METAS; False: reload the cached METAS file.
IS_FULL_EXECUTE = True
# Presumably the random seed for later stochastic steps (not used in the
# cells visible here) -- TODO confirm.
R_SEED = 42

In [100]:
# Italian folder name -> English label. This is the single source of truth;
# the reverse direction is derived from it so the two can never drift apart
# (the hand-written version repeated "cavallo" and lacked "horse" / "sheep").
_ITALIAN_TO_ENGLISH = {
    "cane": "dog",
    "ragno": "spider",
    "cavallo": "horse",
    "elefante": "elephant",
    "farfalla": "butterfly",
    "gallina": "chicken",
    "gatto": "cat",
    "mucca": "cow",
    "pecora": "sheep",
    "scoiattolo": "squirrel",
}

# Bidirectional lookup: Italian -> English followed by English -> Italian.
translations = {
    **_ITALIAN_TO_ENGLISH,
    **{english: italian for italian, english in _ITALIAN_TO_ENGLISH.items()},
}

The translation dictionary was missing the mapping 'ragno' -> 'spider'; the English -> Italian direction ('spider' -> 'ragno') was already present.

In [None]:
def scan_images(root_folder: str) -> pd.DataFrame:
    """Walk ``root_folder`` recursively and collect metadata for every file.

    Each file is decoded with ``cv2.imread`` using its default flag, which
    produces a 3-channel BGR array; ``num_channels`` therefore describes the
    decoded array, not necessarily the channel count stored on disk.

    Parameters
    ----------
    root_folder : str
        Directory whose files (at any depth) are scanned.

    Returns
    -------
    pd.DataFrame
        One row per file with the columns ``folder`` (name of the directory
        containing the file), ``file_path``, ``extension`` (lower-cased,
        including the dot), ``width``, ``height`` and ``num_channels``.
        The three size columns are ``None`` for files OpenCV cannot decode.
        The columns are present even when no file is found.
    """
    columns = ["folder", "file_path", "extension", "width", "height", "num_channels"]
    records = []

    for dirpath, _, filenames in os.walk(root_folder):
        folder = os.path.basename(dirpath)
        for fname in filenames:
            file_path = os.path.join(dirpath, fname)
            ext = os.path.splitext(fname)[1].lower()

            # cv2.imread signals an unreadable/non-image file by returning
            # None; decoder errors are treated the same way so one bad file
            # does not abort the whole scan.
            try:
                img = cv2.imread(file_path)
            except cv2.error:
                img = None

            if img is None:
                width = height = num_channels = None
            else:
                height, width = img.shape[:2]
                # A 2-D array would be a single-channel image.
                num_channels = img.shape[2] if img.ndim == 3 else 1

            records.append({
                "folder": folder,
                "file_path": file_path,
                "extension": ext,
                "width": width,
                "height": height,
                "num_channels": num_channels
            })

    # Passing the columns explicitly keeps the schema stable for an empty scan.
    return pd.DataFrame(records, columns=columns)

In [102]:
# Build the per-image metadata table from disk, or reload the cached copy.
if IS_FULL_EXECUTE:
    df = scan_images(DATA)
    # Every file must appear exactly once in the table.
    assert df["file_path"].is_unique
    # Map each folder name to its translated label.
    df["label"] = df["folder"].map(translations)
    df.to_csv(METAS)
else:
    # to_csv above writes the row index as the first (unnamed) CSV column.
    # Reading it back as the index keeps the cached frame's columns identical
    # to a fresh scan instead of adding a spurious "Unnamed: 0" column.
    df = pd.read_csv(METAS, index_col=0)

Scan images called


In [103]:
df.isna().sum(axis=0)

folder          0
file_path       0
extension       0
width           0
height          0
num_channels    0
label           0
dtype: int64

No missing values after translation dictionary correction

In [104]:
df.head()

Unnamed: 0,folder,file_path,extension,width,height,num_channels,label
0,cane,data\raw-img\cane\OIF-e2bexWrojgtQnAPPcUfOWQ.jpeg,.jpeg,300,225,3,dog
1,cane,data\raw-img\cane\OIP---A27bIBcUgX1qkbpZOPswHa...,.jpeg,300,214,3,dog
2,cane,data\raw-img\cane\OIP---cByAiEbIxIAleGo9AqOQAA...,.jpeg,153,300,3,dog
3,cane,data\raw-img\cane\OIP---ZIdwfUcJeVxnh47zppcQHa...,.jpeg,300,225,3,dog
4,cane,data\raw-img\cane\OIP---ZRsOF7zsMqhW30WeF8-AHa...,.jpeg,300,225,3,dog


In [105]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26179 entries, 0 to 26178
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   folder        26179 non-null  object
 1   file_path     26179 non-null  object
 2   extension     26179 non-null  object
 3   width         26179 non-null  int64 
 4   height        26179 non-null  int64 
 5   num_channels  26179 non-null  int64 
 6   label         26179 non-null  object
dtypes: int64(3), object(4)
memory usage: 1.4+ MB


In [106]:
df.describe()

Unnamed: 0,width,height,num_channels
count,26179.0,26179.0,26179.0
mean,320.03881,252.630162,3.0
std,196.935326,148.403298,0.0
min,60.0,57.0,3.0
25%,300.0,200.0,3.0
50%,300.0,225.0,3.0
75%,300.0,300.0,3.0
max,6720.0,6000.0,3.0


In [107]:
df["extension"].value_counts()

extension
.jpeg    24209
.jpg      1919
.png        51
Name: count, dtype: int64

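.jpeg is the most common extension. Every image is reported with 3 channels, and cv2 can read all three formats, so no format conversion is needed. (Caveat: `cv2.imread` with its default flag always decodes to a 3-channel BGR image, so this count describes the decoded arrays rather than the files on disk.)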

In [108]:
df["folder"].value_counts()

folder
cane          4863
ragno         4821
gallina       3098
cavallo       2623
farfalla      2112
mucca         1866
scoiattolo    1862
pecora        1820
gatto         1668
elefante      1446
Name: count, dtype: int64

In [109]:
df["label"].value_counts()

label
dog          4863
spider       4821
chicken      3098
horse        2623
butterfly    2112
cow          1866
squirrel     1862
sheep        1820
cat          1668
elephant     1446
Name: count, dtype: int64

In [110]:
df["num_channels"].value_counts()

num_channels
3    26179
Name: count, dtype: int64

In [111]:
# Width-to-height ratio of every image, stored in the "ration" column.
df["ration"] = df["width"].div(df["height"])
# Show how often each ratio occurs, together with the smallest and largest one.
df["ration"].pipe(lambda ratios: (ratios.value_counts(), ratios.min(), ratios.max()))

(ration
 1.333333    4591
 1.500000    3315
 1.000000    1279
 1.775148    1048
 1.502347     694
             ... 
 1.602210       1
 1.299435       1
 1.443299       1
 1.461039       1
 0.860656       1
 Name: count, Length: 907, dtype: int64,
 np.float64(0.29333333333333333),
 np.float64(4.225352112676056))

Images have very different aspect ratios; even the most common one (about 1.33, i.e. 4:3) accounts for only 4591 of the 26179 images.

In [112]:
def image_to_8bit_array(path: str) -> np.ndarray:
    """Load the image at ``path`` as an 8-bit RGB array.

    The file is decoded with OpenCV (already imported by this notebook and
    used by ``cv2.imread`` elsewhere in it); the previous body referenced
    ``Image``, which is not imported in the notebook's import cell.

    Parameters
    ----------
    path : str
        Location of the image file.

    Returns
    -------
    np.ndarray
        Array of shape ``(height, width, 3)`` and dtype ``uint8`` with the
        channels in RGB order.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not point to an existing file.
    OSError
        If the file exists but OpenCV cannot decode it.
    """
    # IMREAD_COLOR always yields a 3-channel, 8-bit BGR image (alpha dropped,
    # grayscale expanded); cv2.imread returns None instead of raising.
    bgr = cv2.imread(path, cv2.IMREAD_COLOR)
    if bgr is None:
        if not os.path.isfile(path):
            raise FileNotFoundError(f"No such image file: {path!r}")
        raise OSError(f"Could not decode image file: {path!r}")

    # OpenCV stores channels as BGR; reorder them to RGB.
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    return np.asarray(rgb, dtype=np.uint8)