# 统计图像尺寸、比例分布

同济子豪兄 https://space.bilibili.com/1900783

2022-8-1

## 导入工具包

In [1]:
import os
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm

# import matplotlib.pyplot as plt
# %matplotlib inline


In [7]:

import matplotlib
import matplotlib.pyplot as plt

matplotlib.use('TkAgg')

## 指定数据集路径

In [3]:
# 指定数据集路径
dataset_path = r'F:\SewerData'
os.chdir(dataset_path)
os.listdir()

['AJ',
 'BX',
 'CJ',
 'CK',
 'CQ',
 'CR',
 'FS',
 'FZ',
 'JG',
 'PL',
 'QF',
 'SG',
 'SL',
 'TJ',
 'TL',
 'ZW']

In [4]:
df = pd.DataFrame()
for fruit in tqdm(os.listdir()): # 遍历每个类别    
    os.chdir(fruit)
    for file in os.listdir(): # 遍历每张图像
        try:
            img = cv2.imread(file)
            df = df.append({'类别':fruit, '文件名':file, '图像宽':img.shape[1], '图像高':img.shape[0]}, ignore_index=True)
        except:
            print(os.path.join(fruit, file), '读取错误')
    os.chdir('../')
os.chdir('../')

100%|██████████| 16/16 [00:15<00:00,  1.02it/s]


In [5]:
df

Unnamed: 0,类别,文件名,图像宽,图像高
0,AJ,11.jpg,1280.0,720.0
1,AJ,12.jpg,1280.0,720.0
2,AJ,13.jpg,1280.0,720.0
3,AJ,14.jpg,1280.0,720.0
4,AJ,15.jpg,1280.0,720.0
...,...,...,...,...
1137,ZW,d6102.mp4_20230504_221442.127.jpg,1514.0,852.0
1138,ZW,d6120.mp4_20230504_221447.562.jpg,1514.0,852.0
1139,ZW,d6142.mp4_20230504_221457.257.jpg,1514.0,852.0
1140,ZW,d6142.mp4_20230504_221502.241.jpg,1514.0,852.0


## 可视化图像尺寸分布

In [12]:
from scipy.stats import gaussian_kde
from matplotlib.colors import LogNorm

x = df['图像宽']
y = df['图像高']

xy = np.vstack([x,y])
z = gaussian_kde(xy)(xy)

# Sort the points by density, so that the densest points are plotted last
idx = z.argsort()
x, y, z = x[idx], y[idx], z[idx]

plt.figure(figsize=(10,10))
# plt.figure(figsize=(12,12))
plt.scatter(x, y, c=z,  s=5, cmap='Spectral_r')
# plt.colorbar()
# plt.xticks([])
# plt.yticks([])

plt.tick_params(labelsize=15)

xy_max = max(max(df['图像宽']), max(df['图像高']))
plt.xlim(xmin=0, xmax=xy_max)
plt.ylim(ymin=0, ymax=xy_max)

plt.ylabel('height', fontsize=25)
plt.xlabel('width', fontsize=25)

plt.savefig('图像尺寸分布.pdf', dpi=120, bbox_inches='tight')

plt.show()

[1280. 2154.  314. 1188. 1514. 1166. 1272.  704.]
[1280. 2154.  314. 1188. 1514. 1166. 1272.  704.]


In [14]:
import pandas as pd

# 将x和y组合成一个新列
df['xy组合'] = list(zip(df['图像宽'], df['图像高']))

# 获取不同类别的具体类型
combination_counts = df['xy组合'].value_counts()

print("组合类型的计数：\n", combination_counts)

组合类型的计数：
 (1514.0, 852.0)     525
(1280.0, 720.0)     509
(2154.0, 1212.0)     50
(314.0, 177.0)       27
(1272.0, 954.0)      15
(1166.0, 954.0)       8
(704.0, 576.0)        7
(1188.0, 664.0)       1
Name: xy组合, dtype: int64


In [22]:
import os
from PIL import Image

root_dir = "F:\\SewerData_split\\train"
subfolders = ["AJ", "BX", "CJ", "CK", "CQ", "CR", "FS", "FZ", "JG", "PL", "QF", "SG", "SL", "TJ", "TL", "ZW"]

unwanted_sizes = [(314.0, 177.0), (704.0, 576.0)]

for subfolder in subfolders:
    subfolder_path = os.path.join(root_dir, subfolder)
    for filename in os.listdir(subfolder_path):
        if filename.lower().endswith(".jpg"):
            img_path = os.path.join(subfolder_path, filename)
            try:
                with Image.open(img_path) as img:
                    width, height = img.size
                img.close()  # 关闭图像文件
                if (width, height) in unwanted_sizes:
                    print(f"Deleting {img_path} with size ({width}, {height})")
                    os.remove(img_path)
            except Exception as e:
                print(f"Error processing {img_path}: {e}")


Deleting F:\SewerData_split\train\AJ\30.jpg with size (314, 177)
Deleting F:\SewerData_split\train\AJ\36.jpg with size (314, 177)
Deleting F:\SewerData_split\train\AJ\38.jpg with size (314, 177)
Deleting F:\SewerData_split\train\AJ\46.jpg with size (314, 177)
Deleting F:\SewerData_split\train\AJ\48.jpg with size (314, 177)
Deleting F:\SewerData_split\train\AJ\51.jpg with size (314, 177)
Deleting F:\SewerData_split\train\BX\21.jpg with size (314, 177)
Deleting F:\SewerData_split\train\BX\23.jpg with size (314, 177)
Deleting F:\SewerData_split\train\BX\24.jpg with size (314, 177)
Deleting F:\SewerData_split\train\BX\27.jpg with size (314, 177)
Deleting F:\SewerData_split\train\BX\29.jpg with size (314, 177)
Deleting F:\SewerData_split\train\BX\30.jpg with size (314, 177)
Deleting F:\SewerData_split\train\BX\31.jpg with size (314, 177)
Deleting F:\SewerData_split\train\CR\10.jpg with size (314, 177)
Deleting F:\SewerData_split\train\CR\11.jpg with size (314, 177)
Deleting F:\SewerData_spl

In [23]:
# 指定数据集路径
dataset_path = r'F:\SewerData_split\train'
os.chdir(dataset_path)
os.listdir()

['AJ',
 'BX',
 'CJ',
 'CK',
 'CQ',
 'CR',
 'FS',
 'FZ',
 'JG',
 'PL',
 'QF',
 'SG',
 'SL',
 'TJ',
 'TL',
 'ZW']

In [24]:
df = pd.DataFrame()
for fruit in tqdm(os.listdir()): # 遍历每个类别
    os.chdir(fruit)
    for file in os.listdir(): # 遍历每张图像
        try:
            img = cv2.imread(file)
            df = df.append({'类别':fruit, '文件名':file, '图像宽':img.shape[1], '图像高':img.shape[0]}, ignore_index=True)
        except:
            print(os.path.join(fruit, file), '读取错误')
    os.chdir('../')
os.chdir('../')

100%|██████████| 16/16 [00:12<00:00,  1.26it/s]


In [25]:
df['xy组合'] = list(zip(df['图像宽'], df['图像高']))

# 获取不同类别的具体类型
combination_counts = df['xy组合'].value_counts()

print("组合类型的计数：\n", combination_counts)

组合类型的计数：
 (1514.0, 852.0)     419
(1280.0, 720.0)     412
(2154.0, 1212.0)     39
(1272.0, 954.0)      12
(1166.0, 954.0)       6
(1188.0, 664.0)       1
Name: xy组合, dtype: int64
