import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Build a reproducible synthetic penguin sample: 50 flipper-length
# measurements (mm) per species, each drawn from its own normal distribution.
np.random.seed(0)

# Species names, in the order their samples are drawn and stacked
species_list = ['Adelie', 'Chinstrap', 'Gentoo']

# One array of 50 normal draws per species. The draw order (Adelie, Chinstrap,
# Gentoo) matters: all three calls consume the same seeded global RNG stream.
flipper_lengths = dict(
    Adelie=np.random.normal(190, 5, 50),     # mean 190 mm, sd 5 mm
    Chinstrap=np.random.normal(200, 6, 50),  # mean 200 mm, sd 6 mm
    Gentoo=np.random.normal(210, 7, 50),     # mean 210 mm, sd 7 mm
)

# Long-format table: one row per penguin, species label beside its measurement
species_column = np.repeat(species_list, 50)
length_column = np.concatenate([flipper_lengths[name] for name in species_list])
penguins_df = pd.DataFrame({
    'species': species_column,
    'flipper_length_mm': length_column,
})

# Per-species summary of flipper_length_mm: centre (mean, median), extremes
# (min, max), quartiles (q25, q75) and sample standard deviation (std).
# Keys become the output column names; values are the aggregations to apply.
summary_spec = {
    'mean': 'mean',
    'median': 'median',
    'min': 'min',
    'max': 'max',
    'q25': lambda lengths: lengths.quantile(0.25),
    'q75': lambda lengths: lengths.quantile(0.75),
    'std': 'std',
}

lengths_by_species = penguins_df.groupby('species')['flipper_length_mm']

# reset_index turns the species group key back into an ordinary column
penguin_stats = lengths_by_species.agg(**summary_spec).reset_index()

# Draw one KDE panel per species, overlaid with that species' summary statistics.

# Index the summary table by species so each panel reads its statistics from a
# single row instead of re-filtering penguin_stats for every individual value.
stats_by_species = penguin_stats.set_index('species')
panel_count = len(stats_by_species)

# One 5x5-inch panel per species (15x5 when there are three species)
plt.figure(figsize=(5 * max(panel_count, 1), 5))

for i, (species, row) in enumerate(stats_by_species.iterrows(), start=1):
    # Measurements belonging to the current species only
    species_data = penguins_df.loc[penguins_df['species'] == species, 'flipper_length_mm']

    # Panel i of a single row whose width follows the number of species present
    plt.subplot(1, panel_count, i)
    sns.kdeplot(species_data, color='blue', label='Density')

    # Central tendency: vertical lines at the mean and at the median
    plt.axvline(row['mean'], color='blue', linestyle='--', label='Mean')
    plt.axvline(row['median'], color='green', linestyle=':', label='Median')

    # Spread: shaded bands for the full range, the interquartile range,
    # and the interval mean +/- 2 standard deviations
    plt.axvspan(row['min'], row['max'], color='gray', alpha=0.2, label='Range')
    plt.axvspan(row['q25'], row['q75'], color='purple', alpha=0.2, label='IQR')
    two_std = 2 * row['std']
    plt.axvspan(row['mean'] - two_std, row['mean'] + two_std, color='red', alpha=0.2, label='2 Std Dev')

    # Set title and labels
    plt.title(f'{species} - Flipper Length Distribution')
    plt.xlabel('Flipper Length (mm)')
    plt.ylabel('Density')
    plt.legend()

# Adjust layout and show the plots
plt.tight_layout()
plt.show()


ChatGPT link: https://chatgpt.com/c/66f4d4fb-b260-800a-889c-39ffd9b42191

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Recreate the seeded synthetic penguin sample: 50 flipper lengths (mm) for
# each species, every species drawn from its own normal distribution.
np.random.seed(0)

# Species names, in drawing and stacking order
species_list = ['Adelie', 'Chinstrap', 'Gentoo']

# The three draws share the seeded global RNG, so they must stay in this order
flipper_lengths = dict(
    Adelie=np.random.normal(190, 5, 50),     # mean 190 mm, sd 5 mm
    Chinstrap=np.random.normal(200, 6, 50),  # mean 200 mm, sd 6 mm
    Gentoo=np.random.normal(210, 7, 50),     # mean 210 mm, sd 7 mm
)

# One row per penguin: the species label repeated 50 times beside its 50 draws
species_column = np.repeat(species_list, 50)
length_column = np.concatenate([flipper_lengths[name] for name in species_list])
penguins_df = pd.DataFrame({
    'species': species_column,
    'flipper_length_mm': length_column,
})

# Summarise flipper_length_mm within each species. The mapping keys become the
# output column names; string values name built-in reductions, and the two
# lambdas compute the lower and upper quartiles.
summary_spec = {
    'mean': 'mean',
    'median': 'median',
    'min': 'min',
    'max': 'max',
    'q25': lambda lengths: lengths.quantile(0.25),
    'q75': lambda lengths: lengths.quantile(0.75),
    'std': 'std',
}

lengths_by_species = penguins_df.groupby('species')['flipper_length_mm']

# reset_index moves the species group key from the index into a regular column
penguin_stats = lengths_by_species.agg(**summary_spec).reset_index()

# Draw one filled KDE panel per species, overlaid with its summary statistics.

# Index the summary table by species so each panel reads its statistics from a
# single row instead of re-filtering penguin_stats for every individual value.
stats_by_species = penguin_stats.set_index('species')
panel_count = len(stats_by_species)

# One 5x5-inch panel per species (15x5 when there are three species)
plt.figure(figsize=(5 * max(panel_count, 1), 5))

for i, (species, row) in enumerate(stats_by_species.iterrows(), start=1):
    # Measurements belonging to the current species only
    species_data = penguins_df.loc[penguins_df['species'] == species, 'flipper_length_mm']

    # Panel i of a single row whose width follows the number of species present
    plt.subplot(1, panel_count, i)
    sns.kdeplot(species_data, fill=True, color='skyblue', label='Density')

    # Central tendency: vertical lines at the mean and at the median
    plt.axvline(row['mean'], color='blue', linestyle='--', label='Mean')
    plt.axvline(row['median'], color='green', linestyle=':', label='Median')

    # Spread: shaded bands for the full range, the interquartile range,
    # and the interval mean +/- 2 standard deviations
    plt.axvspan(row['min'], row['max'], color='gray', alpha=0.2, label='Range')
    plt.axvspan(row['q25'], row['q75'], color='purple', alpha=0.2, label='IQR')
    two_std = 2 * row['std']
    plt.axvspan(row['mean'] - two_std, row['mean'] + two_std, color='red', alpha=0.2, label='2 Std Dev')

    # Set title and labels
    plt.title(f'{species} - Flipper Length Distribution')
    plt.xlabel('Flipper Length (mm)')
    plt.ylabel('Density')
    plt.legend()

# Adjust layout and show the plots
plt.tight_layout()
plt.show()


from scipy import stats
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Four random samples of n points each, used as the histogram inputs A-D
n = 1500
half = int(n / 2)

# A: uniform on [0, 10]
data1 = stats.uniform.rvs(loc=0, scale=10, size=n)
# B: normal centred at 5 with standard deviation 1.5
data2 = stats.norm.rvs(loc=5, scale=1.5, size=n)
# C: two normal clusters (around 2 and around 8), half of the points in each
low_cluster = stats.norm.rvs(loc=2, scale=0.25, size=half)
high_cluster = stats.norm.rvs(loc=8, scale=0.5, size=half)
data3 = np.concatenate([low_cluster, high_cluster])
# D: normal centred at 6 with standard deviation 0.5
data4 = stats.norm.rvs(loc=6, scale=0.5, size=n)

# Lay the four samples out as one row of histograms
fig = make_subplots(rows=1, cols=4)

# (sample, trace name, requested bin count) for each column, left to right
panels = [
    (data1, 'A', 30),
    (data2, 'B', 15),
    (data3, 'C', 45),
    (data4, 'D', 15),
]

for column, (sample, label, bin_count) in enumerate(panels, start=1):
    # Black 1px outline so neighbouring bars stay distinguishable
    outline = dict(line=dict(color='black', width=1))
    fig.add_trace(go.Histogram(x=sample, name=label, nbinsx=bin_count, marker=outline), row=1, col=column)

fig.update_layout(height=300, width=750, title_text="Row of Histograms")

# Title each x-axis with its trace name
for column, (_, label, _) in enumerate(panels, start=1):
    fig.update_xaxes(title_text=label, row=1, col=column)

# Same visible x-range on every panel
fig.update_xaxes(range=[-0.5, 10.5])

# Bin every histogram over the same 0-10 interval
for histogram in fig.data:
    histogram.xbins = dict(start=0, end=10)

# This code was produced by just making requests to Microsoft Copilot
# https://github.com/pointOfive/stat130chat130/blob/main/CHATLOG/wk3/COP/SLS/0001_concise_makeAplotV1.md

fig.show() # USE `fig.show(renderer="png")` FOR ALL GitHub and MarkUs SUBMISSIONS

Question 4:
    None of the data sets have both similar means and similar variances
    A and B have similar means but different variances
    D has a different mean compared to A, B and C

ChatGPT link: https://chatgpt.com/c/66f4d4fb-b260-800a-889c-39ffd9b42191

Post-lecture homework

Question 1


from scipy import stats
import pandas as pd
import numpy as np
  
# Local import: px is used below, but nothing above this point imports plotly.express
import plotly.express as px

# 1000 draws from a Gamma distribution with shape a=2 and scale=2
sample1 = stats.gamma(a=2, scale=2).rvs(size=1000)
fig1 = px.histogram(pd.DataFrame({'data': sample1}), x="data")
# USE fig1.show(renderer="png") FOR ALL GitHub and MarkUs SUBMISSIONS

# Report the mean and median so they can be compared; as bare expressions
# they were computed and then discarded without being displayed.
sample1_mean = sample1.mean()
sample1_median = np.quantile(sample1, 0.5)  # median
print(f"sample1 mean: {sample1_mean:.3f}, median: {sample1_median:.3f}")

# Negated draws from the same Gamma distribution (mirror image about zero)
sample2 = -stats.gamma(a=2, scale=2).rvs(size=1000)

from scipy import stats
import pandas as pd
import numpy as np


# Local import: px is used below, but nothing above this point imports plotly.express
import plotly.express as px

# Histogram of sample1, the Gamma(a=2, scale=2) sample created earlier in the file
fig1 = px.histogram(pd.DataFrame({'data': sample1}), x="data")
# USE fig1.show(renderer="png") FOR ALL GitHub and MarkUs SUBMISSIONS

# Report the mean and median so they can be compared; as bare expressions
# they were computed and then discarded without being displayed.
sample1_mean = sample1.mean()
sample1_median = np.quantile(sample1, 0.5)  # median
print(f"sample1 mean: {sample1_mean:.3f}, median: {sample1_median:.3f}")

# Negated draws from the same Gamma distribution (mirror image about zero)
sample2 = -stats.gamma(a=2, scale=2).rvs(size=1000)


pip install plotly


# Creating an alternative scatter plot with modified parameters
# Local import: px is used here, but the file only imports plotly.express further down
import plotly.express as px

# NOTE(review): df is not defined anywhere in this file. The placeholder column
# names below match plotly's Gapminder example data (px.data.gapminder()) —
# confirm which DataFrame is intended before running.
# NOTE(review): range_x=[-0.005, 0.005] looks tuned for a "percent change" axis
# rather than for gdpPercap — confirm once the real columns are substituted.
fig2 = px.scatter(
    df, 
    x="gdpPercap",  # Placeholder, will change in your provided dataset
    y="lifeExp",  # Placeholder, will change in your provided dataset
    animation_frame="year", 
    animation_group="country",  # Placeholder, will be "name" in your dataset
    size="pop",  # Placeholder, will change to "percent" in your dataset
    color="continent",  # Placeholder, will change to "sex" in your dataset
    hover_name="country",  # Placeholder, will be "name" in your dataset
    size_max=50, 
    range_x=[-0.005, 0.005]
)

# Displaying the modified figure
fig2.show()


# Animated scatter of rank against percent change, one frame per year.
# Local import: px is used here, but the file only imports plotly.express further down
import plotly.express as px

# NOTE(review): df is not defined in this file; it presumably has the columns
# named below ("percent change", "rank", "year", "name", "percent", "sex") — confirm.
fig2 = px.scatter(
    df, 
    x="percent change",  # Change to your column name for percent change
    y="rank",  # Change to your column name for rank
    animation_frame="year", 
    animation_group="name",  # Animation grouped by name
    size="percent",  # Change to your column name for percent
    color="sex",  # Change to your column name for sex
    hover_name="name",  # Hover over each point to show the name
    size_max=50, 
    range_x=[-0.005, 0.005]
)


import plotly.express as px

# Animated scatter of rank against percent change: one frame per year, markers
# tracked across frames by name, sized by percent and coloured by sex.
# df is assumed to already hold a dataset with the columns named below.
scatter_options = dict(
    x="percent change",
    y="rank",
    animation_frame="year",
    animation_group="name",
    size="percent",
    color="sex",
    hover_name="name",
    size_max=50,
    range_x=[-0.005, 0.005],
)

fig = px.scatter(df, **scatter_options)
fig.show()


ChatGPT link: https://chatgpt.com/c/66f591cb-daf0-800a-8cdc-e02d2764bd38

Summary of the chatbot session:

In this session, we covered several main topics:

Exploring Skewness: We discussed how right and left skewness affect the relationship between the mean and median, with examples using a Gamma distribution. You explored generating right- and left-skewed samples, visualizing them, and analyzing their summary statistics to understand how skewness influences central tendency.

Dataset Exploration: You attempted to load and analyze a nutrition dataset from an online source, aiming to explore summary statistics and visualize its aspects. However, due to limitations in accessing external URLs, I suggested uploading the dataset locally for further analysis.

Interactive Plotly Visualization: You aimed to recreate and modify an interactive animated scatter plot using Plotly. The first plot was based on the Gapminder dataset, illustrating the relationship between GDP per capita and life expectancy over time. The second version of the plot required customizing the parameters (e.g., percent change, rank, percent, and sex) to visualize a different dataset.

Customizing Scatter Plot Parameters: You specified new parameters for the scatter plot to reflect a different dataset structure, emphasizing features like "percent change," "rank," "sex," and "name," and adjusting the animation to show changes over "year."

If you have any data files or additional details for the modified scatter plot, feel free to share them, and I can assist further!