In [None]:
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import pandas as pd

# Load the penguins dataset, keeping only rows that have a 'flipper_length_mm' value
penguins = sns.load_dataset('penguins').dropna(subset=['flipper_length_mm'])

# Overlaid histograms of flipper_length_mm, one colour per species
fig = px.histogram(
    penguins,
    x="flipper_length_mm",
    color="species",
    barmode="overlay",
    histnorm='density',
    nbins=30,
)

# Define a function to add statistical annotations (lines and rectangles) to the figure
def add_stat_annotations(species_name, species_data, fig):
    """Annotate ``fig`` with summary statistics of one species' flipper lengths.

    Adds dashed vertical lines at the mean and median, and shaded vertical
    bands for the min-max range, the interquartile range, and mean ± 2
    standard deviations. Every annotation label is prefixed with
    ``species_name``.

    Parameters
    ----------
    species_name
        Label used as the prefix of each annotation text.
    species_data
        Table whose ``'flipper_length_mm'`` column holds the measurements.
    fig
        Figure object providing ``add_vline`` and ``add_vrect``.

    Returns
    -------
    None
        Annotations are added through calls on ``fig``.
    """
    flipper_length = species_data['flipper_length_mm']

    # Number of non-missing measurements; with none, every statistic would be
    # NaN and the lines/bands would be meaningless, so add nothing.
    n_obs = flipper_length.count()
    if n_obs == 0:
        return

    # Calculate statistics (each quartile is computed once and reused)
    mean = flipper_length.mean()
    median = flipper_length.median()
    min_val = flipper_length.min()
    max_val = flipper_length.max()
    q1, q3 = flipper_length.quantile([0.25, 0.75])

    # Add lines for mean and median
    fig.add_vline(x=mean, line=dict(color="blue", dash="dash"), annotation_text=f"{species_name} Mean", annotation_position="top left")
    fig.add_vline(x=median, line=dict(color="green", dash="dash"), annotation_text=f"{species_name} Median", annotation_position="top right")

    # Add rectangles for range and IQR
    fig.add_vrect(x0=min_val, x1=max_val, line_width=0, fillcolor="red", opacity=0.1, annotation_text=f"{species_name} Range", annotation_position="top left")
    fig.add_vrect(x0=q1, x1=q3, line_width=0, fillcolor="purple", opacity=0.1, annotation_text=f"{species_name} IQR", annotation_position="top left")

    # The sample standard deviation is NaN for a single observation, so the
    # ±2 std band is only drawn when there are at least two measurements.
    if n_obs > 1:
        std_dev = flipper_length.std()
        fig.add_vrect(x0=mean - 2 * std_dev, x1=mean + 2 * std_dev, line_width=0, fillcolor="orange", opacity=0.1, annotation_text=f"{species_name} ±2 Std Dev", annotation_position="top left")

# Annotate the histogram with the statistics of every species group
for name, group in penguins.groupby('species'):
    add_stat_annotations(species_name=name, species_data=group, fig=fig)

# Render the figure
fig.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the penguins dataset
penguins = sns.load_dataset('penguins')

# Drop rows with missing 'flipper_length_mm'
penguins = penguins.dropna(subset=['flipper_length_mm'])

# Create a list of species in the dataset
species_list = penguins['species'].unique()

# Create a figure with one subplot per species in a single row.
# The panel count follows the data instead of being hard-coded; at least one
# panel is always requested so an empty species list still yields a figure.
n_panels = max(len(species_list), 1)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 6), squeeze=False)

# squeeze=False always returns a 2-D array; take its single row so `axes`
# is a 1-D sequence of Axes regardless of how many panels there are.
axes = axes[0]

# Define a function to add statistical annotations to the plots
def add_stat_annotations(ax, species_data, color):
    """Mark summary statistics of the flipper lengths on a Matplotlib-style axis.

    Dashed vertical lines are drawn at the mean and the median, followed by
    translucent vertical spans for the min-max range, the interquartile
    range, and mean ± 2 standard deviations. Each artist gets a legend label.

    Parameters
    ----------
    ax
        Axis object providing ``axvline`` and ``axvspan``.
    species_data
        Table whose ``'flipper_length_mm'`` column holds the measurements.
    color
        Accepted for the caller's convenience; not used by this function.
    """
    lengths = species_data['flipper_length_mm']
    centre = lengths.mean()
    two_sd = 2 * lengths.std()

    # (x position, line colour, legend label) for each dashed marker line
    marker_lines = (
        (centre, "blue", "Mean"),
        (lengths.median(), "green", "Median"),
    )
    for position, line_color, legend_label in marker_lines:
        ax.axvline(position, color=line_color, linestyle="--", label=legend_label)

    # (left edge, right edge, fill colour, legend label) for each shaded band
    shaded_bands = (
        (lengths.min(), lengths.max(), "red", "Range"),
        (lengths.quantile(0.25), lengths.quantile(0.75), "purple", "IQR"),
        (centre - two_sd, centre + two_sd, "orange", "±2 Std Dev"),
    )
    for left, right, band_color, legend_label in shaded_bands:
        ax.axvspan(left, right, color=band_color, alpha=0.1, label=legend_label)

# Draw one KDE panel per species, pairing each subplot with its species
for panel, name in zip(axes, species_list):
    subset = penguins[penguins['species'] == name]

    # Filled KDE curve of flipper_length_mm for this species
    sns.kdeplot(subset['flipper_length_mm'], ax=panel, fill=True, color="skyblue", label=f'{name} KDE')

    # Overlay the mean/median lines and the range/IQR/±2 std bands
    add_stat_annotations(panel, subset, color="skyblue")

    # Title and axis labels in a single call
    panel.set(
        title=f'{name} Flipper Length KDE',
        xlabel='Flipper Length (mm)',
        ylabel='Density',
    )

# Give every subplot its own legend
for panel in axes:
    panel.legend()

# Adjust layout to prevent overlap, then display the figure
plt.tight_layout()
plt.show()


I prefer the seaborn kernel density estimation (KDE) plots, because they make the comparison between species the clearest and there is no extraneous information cluttering the visualization.

https://chatgpt.com/share/66f35e81-c5ec-8011-b5b4-87423c29f126

In [None]:
from scipy import stats
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Seeded generator so the simulated samples, and therefore the figure, are
# the same on every run of this cell.
rng = np.random.default_rng(130)

n = 1500
data1 = stats.uniform.rvs(0, 10, size=n, random_state=rng)
data2 = stats.norm.rvs(5, 1.5, size=n, random_state=rng)
# Two equally sized normal samples concatenated into one bimodal sample
data3 = np.r_[stats.norm.rvs(2, 0.25, size=n // 2, random_state=rng),
              stats.norm.rvs(8, 0.5, size=n // 2, random_state=rng)]
data4 = stats.norm.rvs(6, 0.5, size=n, random_state=rng)

# (panel label, sample, requested number of bins) for each histogram
panels = [('A', data1, 30), ('B', data2, 15), ('C', data3, 45), ('D', data4, 15)]

fig = make_subplots(rows=1, cols=len(panels))

for col, (label, sample, nbins) in enumerate(panels, start=1):
    fig.add_trace(
        go.Histogram(
            x=sample,
            name=label,
            nbinsx=nbins,
            xbins=dict(start=0, end=10),  # same binning window for every panel
            marker=dict(line=dict(color='black', width=1)),
        ),
        row=1, col=col,
    )
    fig.update_xaxes(title_text=label, row=1, col=col)

fig.update_layout(height=300, width=750, title_text="Row of Histograms")
# Shared x-range so the four distributions can be compared directly
fig.update_xaxes(range=[-0.5, 10.5])

# This code was produced by just making requests to Microsoft Copilot
# https://github.com/pointOfive/stat130chat130/blob/main/CHATLOG/wk3/COP/SLS/0001_concise_makeAplotV1.md

fig.show() # USE `fig.show(renderer="png")` FOR ALL GitHub and MarkUs SUBMISSIONS

Answer: 1. B and D 2. A and C 3. C and D 4. A and D

In a right-skewed distribution the mean is generally greater than the median: the extreme values in the right tail pull the mean upward, but because there are only a few of them the median is hardly affected. In a left-skewed distribution the median is generally greater than the mean: the extreme values in the left tail are small, which pulls the mean downward, while the median again changes very little.

https://chatgpt.com/share/66f430d4-a5a0-8011-81c8-4bbdc2749550