 ## Histogram of each cluster by file

In [None]:
import pandas as pd
import plotly.express as px
import numpy as np

# Load the per-file cluster assignments.
df = pd.read_csv('clusters_with_time_filename_mini.csv')

# Derive the time-of-day token from the filename:
#   short_filename = everything before the first '.'
#   time_of_day    = the SECOND '_'-separated field of short_filename (index 1)
# NOTE(review): this presumably expects names shaped like <prefix>_<HHMMSS>[...];
# confirm against the real filenames, since index 1 is not "the last field"
# when a name contains more than one underscore.
df['short_filename'] = df['filename'].str.split('.').str[0]
df['time_of_day'] = df['short_filename'].str.split('_').str[1]

# Select the rows of the cluster to plot.
# NOTE: `id` shadows the builtin of the same name; the name is kept because the
# plotting code further down reads it to build the figure title.
id = 1
# .copy() yields an independent frame, so adding columns to it later does not
# write into a slice of `df` (which triggers pandas' SettingWithCopyWarning).
cluster_1_data = df[df['Cluster'] == id].copy()

# Define custom buckets (000000-010000, 010000-020000, ...)
def assign_bucket(time):
    """Map an HHMMSS time string to its one-hour bucket label.

    Args:
        time: Time of day as a string whose first two characters are the
            hour (e.g. ``"134501"``). Missing values (``None``/``NaN``) and
            other non-string values are accepted and treated as missing.

    Returns:
        A label of the form ``"HH0000-HH0000"`` covering the hour, e.g.
        ``"130000-140000"`` (hour 23 maps to ``"230000-240000"``), or
        ``None`` when the value is missing or malformed (too short,
        non-numeric hour, or hour outside 0-23).
    """
    # Non-strings cover None and float NaN, which is how pandas represents
    # a missing value in an object column.
    if not isinstance(time, str):
        return None

    hour_text = time[:2]
    # Reject empty/short or non-numeric prefixes instead of letting int()
    # raise and abort the whole .apply().
    if len(hour_text) < 2 or not hour_text.isdecimal():
        return None

    hour = int(hour_text)
    # An hour of 24+ is not a valid time of day; returning None keeps such
    # rows out of the plot rather than inventing a label that matches none
    # of the 24 expected hourly buckets.
    if hour > 23:
        return None

    return f"{hour:02}0000-{hour + 1:02}0000"

# Label every row of the selected cluster with its hourly bucket.
cluster_1_data['time_bucket'] = cluster_1_data['time_of_day'].apply(assign_bucket)

# Chronological x-axis order: one label per hour of the day.
hourly_labels = [f"{i:02}0000-{i + 1:02}0000" for i in range(24)]

# Histogram of rows per bucket, with one color per bucket.
fig = px.histogram(
    cluster_1_data,
    x='time_bucket',
    color='time_bucket',
    color_discrete_sequence=px.colors.qualitative.Set2,
    category_orders={'time_bucket': hourly_labels},
    labels={'time_bucket': 'Time Bucket', 'count': 'Frequency'},
    title=f'Time of Day Distribution in Cluster_{id}',
)

# Axis titles and slanted tick labels so the long bucket names stay legible.
fig.update_layout(
    xaxis_title='Time Bucket',
    yaxis_title='Frequency',
    xaxis_tickangle=45,
)

# Render the figure.
fig.show()

## Time of Day Distribution

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load the per-file cluster assignments.
df = pd.read_csv('clusters_with_time_filename_mini.csv')

# short_filename: the part of the filename before the first '.'.
dot_parts = df['filename'].str.split('.')
df['short_filename'] = dot_parts.str[0]

# time_of_day: the second '_'-separated field (index 1) of short_filename.
underscore_parts = df['short_filename'].str.split('_')
df['time_of_day'] = underscore_parts.str[1]

# Define custom buckets (000000-010000, 010000-020000, ...)
def assign_bucket(time):
    """Map an HHMMSS time string to its one-hour bucket label.

    Args:
        time: Time of day as a string whose first two characters are the
            hour (e.g. ``"134501"``). Missing values (``None``/``NaN``) and
            other non-string values are accepted and treated as missing.

    Returns:
        A label of the form ``"HH0000-HH0000"`` covering the hour, e.g.
        ``"130000-140000"`` (hour 23 maps to ``"230000-240000"``), or
        ``None`` when the value is missing or malformed (too short,
        non-numeric hour, or hour outside 0-23).
    """
    # Non-strings cover None and float NaN, which is how pandas represents
    # a missing value in an object column.
    if not isinstance(time, str):
        return None

    hour_text = time[:2]
    # Reject empty/short or non-numeric prefixes instead of letting int()
    # raise and abort the whole .apply().
    if len(hour_text) < 2 or not hour_text.isdecimal():
        return None

    hour = int(hour_text)
    # An hour of 24+ is not a valid time of day; returning None keeps such
    # rows out of the counts rather than inventing a label that matches none
    # of the 24 expected hourly buckets.
    if hour > 23:
        return None

    return f"{hour:02}0000-{hour + 1:02}0000"

# Label every row with its hourly bucket.
df['time_bucket'] = df['time_of_day'].apply(assign_bucket)

# Clusters to include; each gets its own subplot.
clusters_to_plot = [1, 2, 3, 4, 5, 6]  # Adjust as needed
cluster_titles = [f"Cluster {cluster}" for cluster in clusters_to_plot]

# Size the grid from the number of clusters (3 per row) so that changing
# clusters_to_plot never places a trace in a cell that does not exist.
n_cols = 3
n_rows = max(1, -(-len(clusters_to_plot) // n_cols))  # ceiling division
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=cluster_titles)

# One color per cluster, cycling if there are more clusters than colors.
color_sequence = px.colors.qualitative.Set2

# The 24 hourly labels in chronological order; built once, outside the loop.
bucket_labels = [f"{hour:02}0000-{hour + 1:02}0000" for hour in range(24)]

# Add one bar chart per cluster.
for i, cluster in enumerate(clusters_to_plot):
    cluster_data = df[df['Cluster'] == cluster]
    # reindex forces all 24 buckets to be present, in order, with 0 for
    # hours that have no rows in this cluster.
    bucket_counts = cluster_data['time_bucket'].value_counts().reindex(
        bucket_labels, fill_value=0
    )

    # Fill the grid left-to-right, top-to-bottom (plotly cells are 1-based).
    row = i // n_cols + 1
    col = i % n_cols + 1
    fig.add_trace(
        go.Bar(
            x=bucket_counts.index,
            y=bucket_counts.values,
            name=f"Cluster {cluster}",
            marker_color=color_sequence[i % len(color_sequence)],
        ),
        row=row, col=col
    )

# Figure-level layout. Axis titles are set below for every subplot, so they
# are not repeated here (a layout-level xaxis_title only reaches the first
# subplot's axis).
fig.update_layout(
    title="Time of Day Distribution for Selected Clusters",
    showlegend=False,
    height=400 * n_rows,  # 800 for the default two rows
    width=1000   # Adjust width as needed
)

# Axis titles and tick angle for all subplots.
fig.update_xaxes(title_text="Time Bucket", tickangle=45)
fig.update_yaxes(title_text="Frequency")

# Show the plot
fig.show()