# Boilerplate code whilst we're getting gcloud set up with data
### Subject to change

In [None]:
from google.cloud import storage
import pandas as pd
from io import BytesIO
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
import sys

sys.path.append('..')

import config


In [None]:
''' Initialising G-Cloud '''
# Create the Cloud Storage client; no explicit project or credentials are passed here.
client = storage.Client()
# Look up the bucket named by config.BUCKET_NAME. The resulting `bucket` object is
# handed to every data-loading function in the cells below.
bucket = client.get_bucket(config.BUCKET_NAME)

In [None]:
''' Collect statistics of the dataset '''

def collect_dataset_statistics(bucket, prefix=None):
    """Build a table with one row per image blob found in the bucket.

    A blob is kept when its name ends with ``.jpg``, ``.jpeg`` or ``.png``
    (case-insensitive) and has at least three ``/``-separated parts. The last
    three parts are read as ``<category>/<type>/<image_name>``.

    Args:
        bucket: Object exposing ``list_blobs(prefix=...)`` whose items have a
            ``name`` attribute.
        prefix: Blob-name prefix to list. Defaults to ``config.DATA_FOLDER``
            when omitted.

    Returns:
        A DataFrame with the columns ``blob_name``, ``category``, ``type`` and
        ``image_name``. The columns are present even when no blob matched, so
        downstream ``groupby`` calls do not fail on an empty dataset.
    """
    columns = ['blob_name', 'category', 'type', 'image_name']
    image_suffixes = ('.jpg', '.jpeg', '.png')

    if prefix is None:
        prefix = config.DATA_FOLDER

    rows = []
    for blob in bucket.list_blobs(prefix=prefix):
        path_parts = blob.name.split('/')

        # Skip anything that is not an image or is too shallow to carry both
        # a category and a type directory above the file name.
        if len(path_parts) < 3 or not blob.name.lower().endswith(image_suffixes):
            continue

        category, type_name, image_name = path_parts[-3:]
        rows.append({
            'blob_name': blob.name,
            'category': category,
            'type': type_name,
            'image_name': image_name,
        })

    # Passing the column list keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# Build the image index once; the cells below read this module-level `df`.
df = collect_dataset_statistics(bucket)


In [None]:
# Preview the first rows of the image index.
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter built-in — confirm if this is ever run outside a notebook.
print("Dataset Overview:")
display(df.head())

# Number of distinct 'type' values found under each category.
type_counts = df.groupby('category')['type'].nunique().reset_index(name='type_count')
print("\nNumber of types per category:")
display(type_counts)

# Row count for every (category, type) pair, i.e. images per type.
image_counts = df.groupby(['category', 'type']).size().reset_index(name='image_count')
print("\nNumber of images per type:")
display(image_counts)

# Row count per category; reused by the bar chart and pie chart cells below.
category_image_counts = df.groupby('category').size().reset_index(name='total_images')
print("\nTotal number of images per category:")
display(category_image_counts)

In [None]:
''' Display sample images to view what the dataset looks like '''
def display_sample_images(df, bucket, num_samples=5):
    """Download and show a random sample of images from the index.

    Args:
        df: Image index with ``blob_name``, ``category`` and ``type`` columns.
        bucket: Object exposing ``blob(name)``; the returned blob must provide
            ``download_as_bytes()``.
        num_samples: Maximum number of images to show. When the index holds
            fewer rows, every row is shown instead of raising.

    Returns:
        None. One matplotlib figure is shown per sampled image.
    """
    if df.empty:
        print("No images available to display.")
        return

    # DataFrame.sample raises ValueError when n exceeds the number of rows,
    # so cap the request at what the index actually contains.
    sample_size = min(num_samples, len(df))
    sampled_images = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

    for _, row in sampled_images.iterrows():
        blob = bucket.blob(row['blob_name'])
        image_data = blob.download_as_bytes()

        # Close each image once its figure has been shown.
        with Image.open(BytesIO(image_data)) as image:
            plt.figure(figsize=(5, 5))
            plt.imshow(image)
            plt.axis('off')
            plt.title(f"Category: {row['category']}\nType: {row['type']}")
            plt.show()

# Print a heading, then show five sampled images from the index.
print("\nSample Images:")
display_sample_images(df, bucket, num_samples=5)

In [None]:
# IPython magic (not plain Python syntax): selects matplotlib's inline backend
# so figures render in the notebook output.
%matplotlib inline

# Apply seaborn's "whitegrid" style to the plots drawn after this point.
sns.set(style="whitegrid")

''' Plotting the total number of images per category '''
def plot_total_images_per_category(category_image_counts):
    """Draw a bar chart of image totals per category, largest first.

    Args:
        category_image_counts: DataFrame with ``category`` and
            ``total_images`` columns.
    """
    plt.figure(figsize=(8, 6))

    # Order the bars from the most to the least populated category.
    ordered_counts = category_image_counts.sort_values('total_images', ascending=False)
    sns.barplot(data=ordered_counts, x='category', y='total_images', palette='viridis')

    plt.title('Total Number of Images per Category')
    plt.xlabel('Category')
    plt.ylabel('Number of Images')
    plt.show()

# Plot the per-category totals computed in the overview cell.
plot_total_images_per_category(category_image_counts)

In [None]:
''' Class imbalance analysis '''
# Work on a copy so the extra column is not added to category_image_counts.
category_percentages = category_image_counts.copy()
total_images = category_percentages['total_images'].sum()
# Share of all images held by each category, in percent. This column is not
# passed to the pie chart below; the wedge labels come from `autopct`.
category_percentages['percentage'] = (category_percentages['total_images'] / total_images) * 100

plt.figure(figsize=(8, 8))
plt.pie(
    category_percentages['total_images'],
    labels=category_percentages['category'],
    autopct='%1.1f%%',  # label each wedge with its share, one decimal place
    startangle=140,
    colors=sns.color_palette('pastel')
)
plt.title('Proportion of Images per Category')
plt.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.show()


In [None]:
''' Plotting the distribution of images per species in each category '''

def plot_species_distribution(df):
    """Plot, per category, how many images each species (``type``) has.

    For every distinct value in ``df['category']`` this shows a histogram with
    a KDE overlay and a box plot of the per-species image counts, then prints
    and displays their ``describe()`` summary.

    Args:
        df: Image index with ``category`` and ``type`` columns.
    """
    for category in df['category'].unique():
        label = category.capitalize()

        # Images per species, restricted to the current category.
        in_category = df['category'] == category
        species_counts = df[in_category].groupby('type').size().reset_index(name='image_count')
        counts = species_counts['image_count']

        # Histogram
        plt.figure(figsize=(12, 6))
        sns.histplot(counts, bins=20, kde=True, color='skyblue')
        plt.title(f'Distribution of Images per Species in "{label}" Category')
        plt.xlabel('Number of Images per Species')
        plt.ylabel('Number of Species')
        plt.show()

        # Box plot
        plt.figure(figsize=(6, 8))
        sns.boxplot(y=counts, color='lightgreen')
        plt.title(f'Box Plot of Images per Species in "{label}" Category')
        plt.ylabel('Number of Images per Species')
        plt.show()

        # Statistical summary
        print(f"Statistical Summary for {label} Category:")
        display(counts.describe())

# Produce the per-category species plots and summaries for the full image index.
plot_species_distribution(df)


In [None]:
def analyze_image_dimensions(df, bucket):
    """Download each listed image and summarise its width, height and aspect ratio.

    Shows a width-vs-height scatter plot and an aspect-ratio histogram, then
    prints and displays ``describe()`` for the collected dimensions.

    Args:
        df: Image index with a ``blob_name`` column. Every row is downloaded,
            so callers should pass a sample for large datasets.
        bucket: Object exposing ``blob(name)``; the returned blob must provide
            ``download_as_bytes()``.

    Returns:
        None.
    """
    if df.empty:
        # Nothing to download; skip the plots instead of drawing empty figures.
        print("No images to analyze.")
        return

    records = []
    for _, row in df.iterrows():
        image_data = bucket.blob(row['blob_name']).download_as_bytes()

        # Only the size is needed, so close the image as soon as it is read.
        with Image.open(BytesIO(image_data)) as img:
            width, height = img.size

        records.append({
            'width': width,
            'height': height,
            'aspect_ratio': width / height,
        })

    df_dimensions = pd.DataFrame(records, columns=['width', 'height', 'aspect_ratio'])

    plt.figure(figsize=(8, 6))
    sns.scatterplot(x='width', y='height', data=df_dimensions, alpha=0.5)
    plt.title('Image Width vs. Height')
    plt.xlabel('Width (pixels)')
    plt.ylabel('Height (pixels)')
    plt.show()

    plt.figure(figsize=(8, 6))
    sns.histplot(df_dimensions['aspect_ratio'], bins=30, kde=True, color='coral')
    plt.title('Distribution of Image Aspect Ratios')
    plt.xlabel('Aspect Ratio (Width/Height)')
    plt.ylabel('Number of Images')
    plt.show()

    print("Statistical Summary of Image Dimensions:")
    display(df_dimensions.describe())

# Analyse at most 500 images. The sample size is capped at the dataset size
# because DataFrame.sample raises ValueError when n exceeds the row count.
analyze_image_dimensions(df.sample(n=min(500, len(df)), random_state=42), bucket)


In [None]:
def analyze_color_distribution(df, bucket, num_samples=500):
    """Compare average R, G and B values of sampled images across categories.

    Each sampled image is converted to RGB, resized to 50x50 and reduced to
    its mean colour. One box plot per channel is then drawn, grouped by
    category.

    Args:
        df: Image index with ``blob_name`` and ``category`` columns.
        bucket: Object exposing ``blob(name)``; the returned blob must provide
            ``download_as_bytes()``.
        num_samples: Maximum number of images to sample (seeded with
            ``random_state=42``). Capped at ``len(df)``; values below 1 result
            in no analysis.

    Returns:
        None.
    """
    # DataFrame.sample raises ValueError when n exceeds the number of rows.
    sample_size = min(num_samples, len(df))
    if sample_size < 1:
        print("No images to analyze.")
        return

    avg_colors = []
    labels = []
    for _, row in df.sample(n=sample_size, random_state=42).iterrows():
        image_data = bucket.blob(row['blob_name']).download_as_bytes()

        with Image.open(BytesIO(image_data)) as img:
            # Grayscale, palette and RGBA images would give means with the
            # wrong number of channels; force three channels up front.
            thumbnail = img.convert('RGB').resize((50, 50))

        # Mean over both pixel axes leaves one value per colour channel.
        avg_colors.append(np.array(thumbnail).mean(axis=(0, 1)))
        labels.append(row['category'])

    avg_colors = np.array(avg_colors)

    color_df = pd.DataFrame({
        'R': avg_colors[:, 0],
        'G': avg_colors[:, 1],
        'B': avg_colors[:, 2],
        'category': labels,
    })

    for channel, channel_name in (('R', 'Red'), ('G', 'Green'), ('B', 'Blue')):
        plt.figure(figsize=(12, 6))
        sns.boxplot(x='category', y=channel, data=color_df)
        plt.title(f'Distribution of Average {channel_name} Channel by Category')
        plt.show()

# Run the colour analysis on the full index; sampling happens inside the function.
analyze_color_distribution(df, bucket)
