In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


# Load the CSV containing the labels.
# The other cells of this notebook read the labels from the dataset directory,
# so this cell uses the same location; a bare 'label.csv' only resolves when
# the file happens to sit in the current working directory.
label_file = '/kaggle/input/data222/label.csv'  # Adjust the path as needed
df = pd.read_csv(label_file)

# Print the first few rows of the label file
print(df.head())

# 1. Count number of images per class
class_distribution = df['label'].value_counts()
print("\nNumber of images per class:")
print(class_distribution)

# 2. Visualize the distribution of images per class
plt.figure(figsize=(10,6))
sns.barplot(x=class_distribution.index, y=class_distribution.values, palette='viridis')
plt.title("Distribution of Images per Class")
plt.xlabel("Class")
plt.ylabel("Number of Images")
plt.xticks(rotation=45)
plt.show()

# 3. Measure every image listed in the label file
image_folder = '/kaggle/input/data222/data'  # The folder where images are stored
image_sizes = []

# 'filename' is assumed to be a column of the label CSV
for img_name in df['filename']:
    img_path = os.path.join(image_folder, img_name)
    try:
        # PIL reports size as (width, height); the handle is closed on exit
        with Image.open(img_path) as img:
            image_sizes.append(img.size)
    except Exception as e:
        # Report the failure and keep scanning the remaining files
        print(f"Error loading image {img_name}: {e}")

# Tabulate the collected sizes and summarize them
size_df = pd.DataFrame(image_sizes, columns=['Width', 'Height'])
print("\nImage Size Statistics:")
print(size_df.describe())

# 4. Side-by-side histograms of the widths and heights
size_fig, size_axes = plt.subplots(1, 2, figsize=(12, 6))
size_panels = (
    ('Width', 'blue', "Distribution of Image Widths"),
    ('Height', 'green', "Distribution of Image Heights"),
)
for panel_ax, (dimension, colour, heading) in zip(size_axes, size_panels):
    sns.histplot(size_df[dimension], bins=20, kde=True, color=colour, ax=panel_ax)
    panel_ax.set_title(heading)
    panel_ax.set_xlabel(dimension)

plt.tight_layout()
plt.show()

In [None]:
import random

# Dataset locations: image directory and the label table next to it
total_root = "/kaggle/input/data222/data"
label_file = '/kaggle/input/data222/label.csv'  # Adjust the filename as needed

# Read the label table and preview its first rows
df = pd.read_csv(label_file)
print(df.head())

# 1. Tally how many images carry each label
class_distribution = df['label'].value_counts()
print("\nNumber of images per class:")
print(class_distribution)

# 2. Bar chart of the per-class counts
class_fig, class_ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=class_distribution.index,
    y=class_distribution.values,
    palette='viridis',
    ax=class_ax,
)
class_ax.set_title("Distribution of Images per Class")
class_ax.set_xlabel("Class")
class_ax.set_ylabel("Number of Images")
plt.setp(class_ax.get_xticklabels(), rotation=45)  # tilt class names so they do not overlap
plt.show()

# 3. Display a few sample images from each class (one grid row per class)
num_samples = 7  # Number of samples to display per class

# Dedicated, seeded generator: the same images are drawn on every run and the
# global `random` state used elsewhere is left untouched.
sample_rng = random.Random(42)

plt.figure(figsize=(20, len(class_distribution) * 3))

for i, cls in enumerate(class_distribution.index):
    class_images = df.loc[df['label'] == cls, 'filename'].tolist()
    # A class with fewer than num_samples images shows all of them
    sample_images = sample_rng.sample(class_images, min(num_samples, len(class_images)))

    for j, img_name in enumerate(sample_images):
        img_path = os.path.join(total_root, img_name)

        try:
            # The context manager releases the file handle even if plotting fails;
            # imshow converts the image to an array before the handle is closed.
            with Image.open(img_path) as img:
                # Row i, column j of a (classes x num_samples) grid; subplot indices are 1-based
                plt.subplot(len(class_distribution), num_samples, i * num_samples + j + 1)
                plt.imshow(img)
                plt.title(cls)
                plt.axis('off')
        except Exception as e:
            # Leave the grid slot empty and continue with the next image
            print(f"Error loading image {img_name}: {e}")

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Load the CSV containing the labels.
# Same path as the other cells; the stray leading '//' has been removed.
label_file = '/kaggle/input/data222/label.csv'  # Adjust the filename as needed
df = pd.read_csv(label_file)

# 1. Count number of images per class
class_distribution = df['label'].value_counts()
print("\nNumber of images per class:")
print(class_distribution)

# 2. Visualize the distribution of images per class
plt.figure(figsize=(10, 6))
sns.barplot(x=class_distribution.index, y=class_distribution.values, palette='viridis')
plt.title("Distribution of Images per Class")
plt.xlabel("Class")
plt.ylabel("Number of Images")
plt.xticks(rotation=45)
plt.show()

# 3. Express each class count as a share of all labelled images
total_images = class_distribution.sum()
class_percentages = class_distribution.div(total_images).mul(100)
print("\nClass Percentages:")
print(class_percentages)

# Report the classes whose share falls below the cut-off
threshold = 10  # Define a threshold percentage for imbalance
below_threshold = class_percentages.lt(threshold)
imbalanced_classes = class_percentages[below_threshold]
if imbalanced_classes.empty:
    print("\nNo significant class imbalances detected.")
else:
    print("\nImbalanced Classes (less than {}% of total images):".format(threshold))
    print(imbalanced_classes)


In [None]:
%pip install opencv
%pip instasll mahotas opencv
%pip install seaborn

In [None]:
import pandas as pd
import os
import cv2
import numpy as np
import mahotas
from PIL import Image
import matplotlib.pyplot as plt


# Dataset locations: image directory and the label table next to it
total_root = "/kaggle/input/data222/data"
label_file = '/kaggle/input/data222/label.csv'  # Adjust the filename as needed

# Label table; its 'filename' column drives the extraction loop below
df = pd.read_csv(label_file)

# One accumulator per feature family, filled in the order images are processed
hog_features, color_histograms, glcm_features = [], [], []

# Define a function to extract HOG features using OpenCV
def extract_hog_features(image):
    """Compute a fixed-length HOG descriptor for one image.

    The image is resized to the 64x64 detection window before the descriptor
    is computed. ``HOGDescriptor.compute`` evaluates every window position it
    can fit into the input, so without the resize the length of the returned
    vector would depend on the image size and the per-image vectors could not
    be stacked into one table.

    Args:
        image: Image array in the form returned by ``cv2.imread``
            (height x width, optionally with a channel axis).

    Returns:
        1-D array holding the descriptor of a single window
        (7 x 7 blocks x 4 cells x 9 bins = 1764 values with these settings).
    """
    winSize = (64, 64)
    blockSize = (16, 16)
    blockStride = (8, 8)
    cellSize = (8, 8)
    nbins = 9
    hog = cv2.HOGDescriptor(winSize, blockSize, blockStride, cellSize, nbins)

    # cv2.resize expects (width, height) while image.shape is (height, width, ...)
    window = image
    if (image.shape[1], image.shape[0]) != winSize:
        window = cv2.resize(image, winSize)

    h = hog.compute(window)
    return h.flatten()

# Define a function to extract color histogram
def extract_color_histogram(image, bins=(8, 8, 8)):
    """Return a flattened, normalized joint histogram of the image's three channels.

    Channels are taken in the order in which they are stored in ``image``
    (arrays produced by ``cv2.imread`` are in BGR order).

    Args:
        image: Three-channel image array.
        bins: Number of histogram bins for channel 0, 1 and 2 respectively.

    Returns:
        1-D array with one entry per bin combination.
    """
    channel_ids = [0, 1, 2]
    value_ranges = [0, 256, 0, 256, 0, 256]  # [0, 256) for each of the three channels

    # Joint 3D histogram over all three channels, no mask
    hist = cv2.calcHist([image], channel_ids, None, bins, value_ranges)

    # cv2.normalize with its default settings, writing the result back into hist
    normalized = cv2.normalize(hist, hist)
    return normalized.flatten()

# Define a function to extract GLCM features using Mahotas
def extract_glcm_features(image):
    """Return Haralick texture features of an image, averaged along the first axis.

    Args:
        image: BGR image array (it is converted to grayscale first).

    Returns:
        1-D array: the mean over axis 0 of what ``mahotas.features.haralick``
        returns for the grayscale image.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    per_direction = mahotas.features.haralick(grayscale)
    return per_direction.mean(axis=0)

# Iterate through the images and extract features
processed_filenames = []  # filenames that produced features, in the same order as the feature rows

for img_name in df['filename']:
    img_path = os.path.join(total_root, img_name)
    image = cv2.imread(img_path)

    # cv2.imread does not raise for a missing or unreadable file; it returns None.
    # Skip such files with a message instead of failing inside an extractor.
    if image is None:
        print(f"Error loading image {img_name}: could not read {img_path}")
        continue

    # Compute all three feature sets before storing any of them, so the three
    # lists never end up with different lengths if one extractor raises.
    hog_feature = extract_hog_features(image)
    color_histogram = extract_color_histogram(image)
    glcm_feature = extract_glcm_features(image)

    hog_features.append(hog_feature)
    color_histograms.append(color_histogram)
    glcm_features.append(glcm_feature)
    processed_filenames.append(img_name)

# Convert lists to DataFrames
hog_df = pd.DataFrame(hog_features)
color_hist_df = pd.DataFrame(color_histograms)

# Haralick feature names in the order mahotas reports them. mahotas returns
# only the first 13 unless the 14th feature is requested explicitly, so the
# names are trimmed to the number of columns actually present; passing all 14
# names for 13-column data makes pandas raise a ValueError.
haralick_names = [
    'Angular Second Moment',
    'Contrast',
    'Correlation',
    'Variance',
    'Inverse Difference Moment',
    'Sum Average',
    'Sum Variance',
    'Sum Entropy',
    'Entropy',
    'Difference Variance',
    'Difference Entropy',
    'Information Measure of Correlation 1',
    'Information Measure of Correlation 2',
    'Maximal Correlation Coefficient',
]
glcm_df = pd.DataFrame(glcm_features)
glcm_df.columns = haralick_names[:glcm_df.shape[1]]

# Print feature DataFrames
print("HOG Features:")
print(hog_df.head())

print("\nColor Histogram Features:")
print(color_hist_df.head())

print("\nGLCM Features:")
print(glcm_df.head())