# Data Science Packages – Descriptive Statistics & Visualization

Dataset: Palmer Penguins

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Apply seaborn's "whitegrid" theme to every plot drawn below.
# set_theme() is the preferred interface; sns.set() is only a legacy alias
# for it that seaborn's documentation says may be removed in the future.
sns.set_theme(style="whitegrid")
# Default size (width, height) in inches for figures created after this point.
plt.rcParams["figure.figsize"] = (8, 5)


In [None]:

# Location of the penguins CSV in the seaborn-data GitHub repository.
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"

# Read the CSV over HTTP into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)


In [None]:

# Print a structural overview of the raw data: column names, dtypes and
# non-null counts (useful for spotting columns with missing values).
df.info()


In [None]:

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the raw, uncleaned data.
df.describe()


In [None]:

# Number of missing values in each column of the raw data.
df.isna().sum()


In [None]:

# Keep only rows with no missing value in any column ...
df_clean = df.dropna(axis="index", how="any")
# ... and renumber them 0..n-1, discarding the old index labels.
df_clean = df_clean.reset_index(drop=True)

# (rows, columns) remaining after cleaning.
df_clean.shape


In [None]:

# Names of the columns with a numeric dtype; the plotting cells below
# iterate over these.
numerical_cols = df_clean.select_dtypes(include="number").columns

# Descriptive statistics laid out with one row per numeric column and one
# column per statistic.
stats_df = (
    df_clean[numerical_cols]
    .agg(["mean", "median", "std", "min", "max"])
    .transpose()
)
stats_df


In [None]:

# Number of cleaned rows per species, most frequent first.
df_clean["species"].value_counts()


In [None]:

# Draw one histogram per numeric column, each with a kernel density
# estimate overlaid, and show every figure separately.
for column_name in numerical_cols:
    ax = sns.histplot(df_clean[column_name], kde=True)
    ax.set_title(f"Distribution of {column_name}")
    plt.show()


In [None]:

# For each numeric column, compare its distribution across species with a
# box plot (one box per species), one figure per column.
for column_name in numerical_cols:
    ax = sns.boxplot(x="species", y=column_name, data=df_clean)
    ax.set_title(f"{column_name} by Species")
    plt.show()


In [None]:

# Pairwise correlation coefficients between all numeric columns.
corr = df_clean[numerical_cols].corr()

# Render the matrix as an annotated heatmap; each cell shows its coefficient
# to two decimal places on a diverging blue-red colour scale.
ax = sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
ax.set_title("Correlation Matrix")
plt.show()


In [None]:

# Pearson correlation (coefficient and p-value) between bill length and
# flipper length, computed on the cleaned data.
bill_length = df_clean["bill_length_mm"]
flipper_length = df_clean["flipper_length_mm"]
stats.pearsonr(bill_length, flipper_length)



## Summary

- Clear species differences across measurements
- Strong correlation between body mass and flipper length
- Suitable for hypothesis testing and ML in later assignments
