In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Obtain the Spark session used by every cell below (getOrCreate reuses an
# already-running session instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("Read HDFS Image Data")
    .getOrCreate()
)

In [1]:
def _is_valid_rgb(colour):
    """Return True when `colour` is an 'R,G,B' string with every channel in 0-255.

    The range check matters because the pie chart divides each channel by 255
    to build matplotlib colours, which must lie in the 0-1 range.
    """
    if not isinstance(colour, str):
        return False
    if re.fullmatch(r"\d{1,3},\d{1,3},\d{1,3}", colour) is None:
        return False
    return all(int(channel) <= 255 for channel in colour.split(','))


def _plot_colour_bars(colour_counts, title, output_path):
    """Draw, save and show a horizontal bar chart of TotalCount per Colour."""
    plt.figure(figsize=(15, 10))
    sns.barplot(x='TotalCount', y='Colour', data=colour_counts, hue='Colour', palette="viridis")
    plt.title(title)
    plt.xlabel('Count of Occurrences')
    plt.ylabel('Colour (RGB)')
    # Lay the figure out before saving so the PNG matches what is displayed.
    plt.tight_layout()
    plt.savefig(output_path, bbox_inches='tight')
    plt.show()


# Function to process each HDFS directory and generate the plots
def process_hdfs_and_plot(hdfs_path, title_prefix, output_dir='/mnt/Results'):
    """Aggregate colour counts read from HDFS and plot their distribution.

    Reads tab-separated (Colour, Count) rows from `hdfs_path`, keeps rows whose
    colour looks like 'R,G,B', sums the counts per colour, and then produces
    three figures that are both saved as PNG files and shown inline:

    * ``<title_prefix>_top.png``  - 20 most frequent colours, excluding the
      '250,250,250' colour
    * ``<title_prefix>_btm.png``  - 20 least frequent colours
    * ``<title_prefix>_pie.png``  - pie chart of the 15 most frequent colours
      after dropping the background and a fixed list of greys, with each wedge
      drawn in the colour it represents

    Parameters
    ----------
    hdfs_path : str
        Path (glob patterns allowed) passed to ``spark.read.csv``.
    title_prefix : str
        Label used in the figure titles and in the output file names.
    output_dir : str, optional
        Directory the PNG files are written to. Defaults to '/mnt/Results'.

    Returns
    -------
    None

    Notes
    -----
    Relies on the notebook-level ``spark`` session. The aggregated result is
    collected to the driver with ``toPandas()``.
    """
    # Define schema
    schema = StructType([StructField("Colour", StringType(), True),
                         StructField("Count", IntegerType(), True)])

    # Load data from HDFS
    df = spark.read.csv(hdfs_path, sep='\t', header=False, schema=schema)

    # Filter out irrelevant rows (image height/width records)
    df = df.filter(~df["Colour"].contains("Height:"))
    df = df.filter(~df["Colour"].contains("Width:"))
    # Keep up to 100 characters starting at position 8 (1-based), i.e. drop a
    # 7-character prefix - presumably a label written by the upstream job;
    # TODO confirm against the job's output format.
    df = df.withColumn("Colour", df["Colour"].substr(8, 100))
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    df = df.filter(F.col("Colour").isNotNull() & (F.col("Colour") != "") &
                   (F.col("Colour").rlike(r"^\d{1,3},\d{1,3},\d{1,3}$")))

    # "Count" is already an integer column via the schema, so no cast is needed.
    # Aggregate counts by Colour
    clean_df = df.groupBy("Colour").agg(F.sum("Count").alias("TotalCount"))

    # Convert to pandas DataFrame
    pandas_df = clean_df.toPandas()

    # Exclude white ('250,250,250') colour and get top and bottom colours
    top_20_colours = pandas_df[pandas_df['Colour'] != '250,250,250'].nlargest(20, 'TotalCount')
    bottom_20_colours = pandas_df.nsmallest(20, 'TotalCount')

    # Bar plots for the top and bottom 20 colours
    _plot_colour_bars(top_20_colours,
                      f'{title_prefix}: Top 20 Colour Distribution',
                      f'{output_dir}/{title_prefix}_top.png')
    _plot_colour_bars(bottom_20_colours,
                      f'{title_prefix}: Bottom 20 Colour Distribution',
                      f'{output_dir}/{title_prefix}_btm.png')

    # Plot a pie chart for top 15 colours (excluding '250,250,250')
    excluded_colours = {
        (250, 250, 250),  # drop background
        # drop common colours across seasons
        (0, 0, 0), (200, 200, 200), (50, 50, 50), (150, 150, 150), (100, 100, 100),
    }
    # astype(bool) keeps the masks boolean even when the frame is empty.
    valid_mask = pandas_df['Colour'].apply(_is_valid_rgb).astype(bool)
    # Explicit copy so the column assignment below does not write to a slice.
    valid_data = pandas_df[valid_mask].copy()
    valid_data['Colour'] = valid_data['Colour'].apply(lambda x: tuple(map(int, x.split(','))))
    keep_mask = valid_data['Colour'].apply(lambda rgb: rgb not in excluded_colours).astype(bool)
    valid_data = valid_data[keep_mask]
    top_colors = valid_data.nlargest(15, 'TotalCount')

    plt.figure(figsize=(15, 10))
    plt.pie(
        top_colors['TotalCount'],
        labels=None,
        colors=[(r/255, g/255, b/255) for r, g, b in top_colors['Colour']],  # Normalize RGB to 0-1 for matplotlib
        autopct='%1.1f%%',
        startangle=140,
        pctdistance=0.85,  # Place percentage labels inside the wedges, near the outer edge
        wedgeprops={'linewidth': 1},
        textprops={'color': 'white'}
    )
    plt.title(f'{title_prefix}: Top 15 Colour Distribution (RGB Matching Pie Chart)')
    # Lay the figure out before saving so the PNG matches what is displayed.
    plt.tight_layout()
    plt.savefig(f'{output_dir}/{title_prefix}_pie.png', bbox_inches='tight')
    plt.show()


In [2]:
# Colour-distribution charts for the Summer output directory
process_hdfs_and_plot(
    hdfs_path="/user/thokozile/output/Summer/*",
    title_prefix="Summer",
)

In [3]:
# Colour-distribution charts for the Winter output directory
process_hdfs_and_plot(
    hdfs_path="/user/thokozile/output/Winter/*",
    title_prefix="Winter",
)

In [4]:
# Colour-distribution charts for the Fall output directory
process_hdfs_and_plot(
    hdfs_path="/user/thokozile/output/Fall/*",
    title_prefix="Fall",
)

In [5]:
# Colour-distribution charts for the Spring output directory.
# The title prefix is capitalised like the other seasons so that the chart
# titles and the output file names (Spring_top.png, Spring_btm.png,
# Spring_pie.png) follow the same convention.
process_hdfs_and_plot("/user/thokozile/output/Spring/*", "Spring")

In [6]:
# Colour-distribution charts for the Other output directory
process_hdfs_and_plot(
    hdfs_path="/user/thokozile/output/Other/*",
    title_prefix="Other",
)