# Python workshop - 2025

<div>
    <img src="../images/qcbs_logo_v2.svg" style="background-color: #f0f0f0; padding: 20px;"/>
</div>

<div>
    <img src="../images/python_logo_generic.svg" style="background-color: #f0f0f0; padding: 20px;"/>
</div>

**Last update**: 2025-05-19  
**Author**: El-Amine Mimouni  
**Affiliation**: Québec Centre for Biodiversity Science

**Overview**: In this notebook, we will see how to use Matplotlib.

---

# Matplotlib

Matplotlib is the main plotting library for Python.

Read more at [https://matplotlib.org/](https://matplotlib.org/)

In [None]:
# Today's star
import matplotlib.pyplot as plt

# Other actors
import numpy as np
import pandas as pd

# Simple scatter plot

In [None]:
# Generate two random variables from a normal distribution
# The means are both 0.0, but the standard deviations differ (3.0 and 1.0)
# Note: no seed is set in this cell, so the values change on every run
var1 = np.random.normal(loc=0.0, scale=3.0, size=1000)
var2 = np.random.normal(loc=0.0, scale=1.0, size=1000)

# Draw the scatterplot (var1 on the x axis, var2 on the y axis)
plt.scatter(x=var1, y=var2)

# Show the plot
plt.show()

# Note: The last line is optional in Jupyter Notebooks but MANDATORY if you work with scripts

In [None]:
# plt.scatter returns an object: catch it in a variable
ploto = plt.scatter(x=var1, y=var2)

# What is it, and what type of object is it?
print(ploto)
print(type(ploto))

# Not very helpful on its own, but keep in mind that plotting calls return objects

# Plots are imperative

In [None]:
# You can fine tune the plot to your liking
# (here: red upward triangles of size 20)
plt.scatter(x=var1, y=var2, color="red", marker="^", s=20)

# Add title and axes labels
plt.title(label="My title!")
plt.xlabel(xlabel="Name of my x variable (in x units)")
plt.ylabel(ylabel="Name of my y variable (in y units)")

# Add abscissa and ordinate: dashed reference lines at x = 0 and y = 0
plt.axvline(x=0, color="black", linestyle="dashed", linewidth=1)
plt.axhline(y=0, color="black", linestyle="dashed", linewidth=1)

# Maximize space
plt.tight_layout()

# Optionally save the plot
# Note: Plotting in this manner means you have to save it at this point,
# i.e. before plt.show() is called
# (If you want to do so that is...)
plt.savefig(fname="../data/my_first_fig.png", dpi=300, bbox_inches="tight")

# Show the plot
plt.show()

In [None]:
# Matplotlib identifies marker shapes with short character codes
# (rather than with numbers, as R does).

# Here are the most common ones
# 'o'	Circle
# '+'	Cross
# ','	Pixel
# '.'	Point
# 'x'	X
# 'D'	Diamond
# 's'	Square
# '^'	Upwards triangle
# 'v'	Downwards triangle
# '>'	Rightwards triangle
# '<'	Leftwards triangle
# '*'	Star
# 'p'	Pentagon
# 'h'	Hexagon 1
# 'H'   Hexagon 2

In [None]:
# Preferred approach: create the figure and its axes explicitly and call
# methods on them, instead of going through plt for every step
fig, ax = plt.subplots(nrows=1, ncols=1)
ax.set_aspect(aspect="equal", adjustable="datalim")

# Scatterplot of the two random variables (small red upward triangles)
ax.scatter(x=var1, y=var2, color="red", marker="^", s=8)

# Title and axes labels, all set in a single call
ax.set(
    title="My title!",
    xlabel="Name of my x variable (in x units)",
    ylabel="Name of my y variable (in y units)",
)

# Abscissa and ordinate: dashed reference lines at x = 0 and y = 0
ax.axvline(x=0, color="black", linestyle="--", linewidth=1)
ax.axhline(y=0, color="black", linestyle="--", linewidth=1)

# Maximize space, then show the plot
fig.tight_layout()
plt.show()

In [None]:
# Optionally save the plot
# Because the figure lives in its own variable (fig), it can be saved
# whenever you want, even from a later cell.
# (That is, unless you overwrite fig in the meantime...)
fig.savefig(fname="../data/my_second_fig.png", dpi=300, bbox_inches="tight")

# How Matplotlib considers plots

In [None]:
# You can ask for several subplots in one figure (here: 1 row, 2 columns)
fig, ax = plt.subplots(nrows=1, ncols=2)

In [None]:
# What is in fig? Print the object and its type
print(fig)
print(type(fig))

In [None]:
# What is in ax? Print the object, its type and its shape
# (with several subplots, ax is a container that is indexed as ax[0], ax[1], ...)
print(ax)
print(type(ax))
print(ax.shape)

In [None]:
# Data: two random walks of 100 steps each, built as the cumulative sum
# of standard-normal increments
x = range(0, 100)
y1 = np.cumsum(np.random.normal(loc=0.0, scale=1.0, size=100))
y2 = np.cumsum(np.random.normal(loc=0.0, scale=1.0, size=100))

# Create the figure and two subplots (1 row, 2 columns)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 10))

# Plot on the first subplot (ax[0])
ax[0].plot(x, y1, color="cornflowerblue", linestyle="dashed")
ax[0].set_title(label="The first random walk")
ax[0].set_xlabel(xlabel="Time")
ax[0].set_ylabel(ylabel="Position")
ax[0].axhline(y=0, color="black", linestyle="dashed", linewidth=1)

# Plot on the second subplot (ax[1])
ax[1].plot(x, y2, color="darkgoldenrod", linestyle="dashed")
ax[1].set_title(label="The second random walk")
ax[1].set_xlabel(xlabel="Time")
ax[1].set_ylabel(ylabel="Position")
ax[1].axhline(y=0, color="black", linestyle="dashed", linewidth=1)

# Adjust spacing between subplots
fig.tight_layout()

# Show the plot
plt.show()

# List of named colors:
# https://matplotlib.org/stable/gallery/color/named_colors.html

# Scatterplots and labels

In [None]:
# Read the Palmer penguins dataset from a CSV file into a DataFrame
# (the path is relative to the location of this notebook)
palmer = pd.read_csv(filepath_or_buffer="../data/penguins.csv")

In [None]:
# One figure with a single set of axes
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 10))

# Set aspect ratio
ax.set_aspect(aspect="equal", adjustable="datalim")

# Scatterplot of bill depth against bill length for all penguins
# (square markers, slightly transparent)
ax.scatter(x=palmer["bill_length_mm"],
           y=palmer["bill_depth_mm"],
           marker="s",
           color="rosybrown",
           alpha=0.9,
)

# Set title and axes labels
ax.set_title(label="Scatterplot of penguin bill length versus bill depth")
ax.set_xlabel(xlabel="Bill Length (in mm)")
ax.set_ylabel(ylabel="Bill Depth (in mm)")

# Add grid (if you so desire)
ax.grid(visible=True, linestyle="dashed")

# Maximize plot space
fig.tight_layout()

# Show the figure
plt.show()

In [None]:
# One figure with a single set of axes
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 10))

# Set aspect ratio
ax.set_aspect(aspect="equal", adjustable="datalim")

# One color per species; the order of this dict is also the legend order
species_colors = {
    "Adelie": "rosybrown",
    "Chinstrap": "orange",
    "Gentoo": "purple",
}

# Draw one scatterplot per species instead of copy-pasting the call
# Note the addition of the label parameter!!!!
for species_name, species_color in species_colors.items():
    # Boolean mask selecting the rows of the current species
    is_species = palmer["species"] == species_name

    ax.scatter(x=palmer.loc[is_species, "bill_length_mm"],
               y=palmer.loc[is_species, "bill_depth_mm"],
               marker="s",
               color=species_color,
               alpha=0.9,
               label=species_name
    )

# Add grid (if you so desire)
ax.grid(visible=True, linestyle="dashed")

# Set title and axes labels
ax.set_title(label="Scatterplot of penguin bill length versus bill depth")
ax.set_xlabel(xlabel="Bill Length (in mm)")
ax.set_ylabel(ylabel="Bill Depth (in mm)")

# Add legend
# Note: It will use the label tags you set as terms
ax.legend()

# Maximize plot space
fig.tight_layout()

# Show the figure
plt.show()

# Barplots

In [None]:
# You can get the count of penguins by species
# (size() counts the rows in each group)
penguin_count = palmer.groupby(by=["species"]).size()
penguin_count

In [None]:
# The index holds the group labels, the values hold the counts
print(penguin_count.index)
print(penguin_count.values)

In [None]:
# Create the figure with a single set of axes (1 row, 1 column)
fig, ax = plt.subplots(figsize=(12, 6), nrows=1, ncols=1)

# Barplot: one bar per species, bar height = number of penguins
ax.bar(
    x=penguin_count.index,
    height=penguin_count.values,
    color="rebeccapurple",
)

# Title and axes labels, all set in a single call
ax.set(
    title="Penguin counts in the Palmer penguins dataset by species",
    xlabel="Penguin species",
    ylabel="Number of individuals",
)

# Maximize space, then show the plot
fig.tight_layout()
plt.show()

In [None]:
# Drop the rows lacking a species or a body mass, group by species, then
# compute the mean and the standard error of the mean (sem) of body mass
grouped_penguins = (
    palmer
    .dropna(subset=["species", "body_mass_g"])
    .groupby(by=["species"])["body_mass_g"]
)
mean_vals, sem_vals = grouped_penguins.mean(), grouped_penguins.sem()

# See some values (header line first, then the Series)
print("Mean body mass:", mean_vals, sep="\n")
print("\nStandard error of body mass:", sem_vals, sep="\n")

In [None]:
# Create the figure with a single set of axes (1 row, 1 column)
fig, ax = plt.subplots(figsize=(12, 6), nrows=1, ncols=1)

# Barplot of the mean body mass per species; the error bars (yerr) are
# the standard errors of the mean, drawn with caps
ax.bar(
    x=mean_vals.index,
    height=mean_vals.values,
    yerr=sem_vals.values,
    capsize=5,
    color="lightcoral",
)

# Title and axes labels, all set in a single call
ax.set(
    title="Average body mass (in g) of penguins by species (with sem)",
    xlabel="Penguin species",
    ylabel="Body mass (in g)",
)

# Horizontal grid lines only (axis="y")
# Note: It accepts both "grey" and "gray" (whoopee??...)
ax.grid(visible=True, axis="y", color="grey", linestyle="--", alpha=0.9)

# Maximize space, then show the plot
fig.tight_layout()
plt.show()

# More complex barplot

In [None]:
# Drop incomplete rows, then group body mass by species AND sex
penguins_grouped = (
    palmer
    .dropna(subset=["species", "sex", "body_mass_g"])
    .groupby(["species", "sex"])["body_mass_g"]
)
mean_vals, sem_vals = penguins_grouped.mean(), penguins_grouped.sem()

# Stacked form: one value per (species, sex) pair
print("Stacked mean values:", mean_vals, sep="\n")

# Unstacked form: species as rows, one column per sex
mean_vals, sem_vals = mean_vals.unstack(), sem_vals.unstack()

print("\nUnstacked mean values:", mean_vals, sep="\n")

In [None]:
# Get info for the plot

# Species: the row labels of the unstacked table
species = mean_vals.index
print(species)

# A sequence of integer positions 0, 1, ..., one per species
# (used as the tick positions around which the bars are placed)
seq = range(len(species))
print(seq)

# Some bar width (each bar is shifted by half of it around its tick)
bar_width = 0.35

In [None]:
# Create the figure with a single set of axes (1 row, 1 column)
fig, ax = plt.subplots(figsize=(12, 6), nrows=1, ncols=1)

# Grouped barplot: for each species, the male bar is centred half a bar
# width to the left of the tick and the female bar half a bar width to
# the right of it

# Males
ax.bar(
    x=[pos - bar_width / 2 for pos in seq],
    height=mean_vals["male"],
    yerr=sem_vals["male"],
    width=bar_width,
    label="male",
    capsize=5,
    color="sandybrown",
)

# Females
ax.bar(
    x=[pos + bar_width / 2 for pos in seq],
    height=mean_vals["female"],
    yerr=sem_vals["female"],
    width=bar_width,
    label="female",
    capsize=5,
    color="mediumseagreen",
)

# Title and axes labels, all set in a single call
ax.set(
    title="Average body mass (in g) of penguins by species and sex (with sem)",
    xlabel="Penguin species",
    ylabel="Body mass (in g)",
)

# Horizontal grid lines only (axis="y")
ax.grid(visible=True, axis="y", linestyle="--", alpha=0.9)

# Put one tick per species and label it with the species name
ax.set_xticks(ticks=seq, labels=species)

# Legend built from the label given to each ax.bar call
ax.legend()

# Maximize space, then show the plot
fig.tight_layout()
plt.show()

# PCA MAN

In [None]:
def pca(X):
    """Principal component analysis by eigendecomposition of the covariance matrix.

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        Data table with observations in rows and variables in columns.

    Returns
    -------
    list
        ``[lam, U, F]`` where ``lam`` holds the eigenvalues sorted from
        largest to smallest, ``U`` holds the matching eigenvectors as
        columns, and ``F`` holds the scores (the centered data multiplied
        by ``U``). ``F`` has the same container type as ``X``; for a
        DataFrame its columns are labelled 0, 1, ...

    Raises
    ------
    ZeroDivisionError
        If ``X`` has a single row (division by ``n - 1``).
    """
    n = X.shape[0]

    # Center each column on its mean
    Xc = X - X.mean(axis=0)

    # Unbiased variance-covariance matrix (division by n - 1), as a plain
    # float array so the decomposition does not depend on the input type
    S = 1.0 / (n - 1.0) * np.asarray(Xc.T @ Xc, dtype=float)

    # S is symmetric: eigh is the dedicated routine and always returns
    # real eigenvalues and eigenvectors (eig gives no such guarantee)
    lam, U = np.linalg.eigh(S)

    # eigh returns eigenvalues in ascending order; PCA axes are reported
    # from the largest variance to the smallest, so reverse the order
    order = np.argsort(lam)[::-1]
    lam = lam[order]
    U = U[:, order]

    # Scores: coordinates of the observations on the principal axes
    F = Xc @ U
    return [lam, U, F]

In [None]:
# Create a small matrix with 3 rows and 2 columns
X1 = np.array([
    [2, 4],
    [1, 6],
    [5, 3],
])

# See what it contains, then a separator line, then its shape
print(X1, "-" * 50, X1.shape, sep="\n")

In [None]:
# Variance-covariance matrix of the columns of X1 (rowvar=False).
# The bias parameter chooses between the biased version (division by N)
# and the unbiased estimate (division by N - 1); here the unbiased one.
cov_mat = np.cov(X1, rowvar=False, bias=False)

print("Variance-covariance matrix of X1:", cov_mat, sep="\n")

# Total variance in the dataset = sum of the diagonal entries (the trace)
total_variance_header = "\nTrace of the variance-covariance matrix (total variance):"
print(total_variance_header, sum(cov_mat.diagonal()), sep="\n")

In [None]:
# Run the PCA on X1
# pca returns a list of three items; unpacking assigns each of them
# to its own variable in one statement
eigenval, U_mat, F_scores = pca(X=X1)

# Show the eigenvalues under a header line
print("The eigenvalues:", eigenval, sep="\n")

In [None]:
# If you did PCA correctly (and you did), the eigenvalues sum to the
# total variance, i.e. the trace of the variance-covariance matrix
print("Total variance:", sum(cov_mat.diagonal()))
print("Sum of eigenvalues:", sum(eigenval))

In [None]:
# Center the data (subtract the mean of each column)
# It will be necessary for the plot, where the centered data are drawn
# next to the PCA scores
X1c = X1 - X1.mean(axis=0)

In [None]:
# Create the figure with a single set of axes and an equal aspect ratio
fig, ax = plt.subplots(figsize=(12, 10), nrows=1, ncols=1)
ax.set_aspect(aspect="equal", adjustable="box")

# Points: centered original data in blue, PCA scores (rotated data) in red
ax.scatter(x=X1c[:, 0], y=X1c[:, 1], color="blue", marker="^", s=8, label="Original Data")
ax.scatter(x=F_scores[:, 0], y=F_scores[:, 1], color="red", marker="^", s=8, label="Rotated Data")

# For every observation, draw a dashed segment from the origin (0, 0) to
# its centered position (blue) and to its position after rotation (red)
for (orig_x, orig_y), (score_x, score_y) in zip(X1c, F_scores):
    ax.plot([0, orig_x], [0, orig_y], color="blue", linestyle="--", linewidth=1)
    ax.plot([0, score_x], [0, score_y], color="red", linestyle="--", linewidth=1)

# One arrow per row of U_mat, from the origin to (U_mat[i, 0], U_mat[i, 1])
for var_idx, (load_x, load_y) in enumerate(U_mat):
    ax.quiver(0,
              0,
              load_x,
              load_y,
              angles="xy",
              scale_units="xy",
              scale=1,
              width=0.0025,
              color="black")

    # Name the arrow at its tip; the variables are unnamed here, so an
    # f-string builds the label from the row number
    ax.text(x=load_x,
            y=load_y,
            s=f"Variable {var_idx}",
            fontsize=12,
            ha="center",
            color="black")

# Title, plus axis labels that use f-strings to insert the share of the
# total variance (relative eigenvalue, in %) carried by each axis
ax.set(
    title="My PCA!",
    xlabel=f"PCA axis #1 ({100 * eigenval[0] / sum(eigenval):.2f}%)",
    ylabel=f"PCA axis #2 ({100 * eigenval[1] / sum(eigenval):.2f}%)",
)

# Abscissa and ordinate: dashed reference lines at x = 0 and y = 0
ax.axvline(x=0, color="black", linestyle="--", linewidth=1)
ax.axhline(y=0, color="black", linestyle="--", linewidth=1)

# Legend built from the labels of the two scatter calls
ax.legend()

# Adjust layout and show the plot
fig.tight_layout()
plt.show()


# While we're at it, let's do it for the Palmer penguins

In [None]:
# Keep only the two bill measurements and the species, then drop every
# penguin with a missing value in any of these three columns
palmer_clean = palmer[["bill_length_mm", "bill_depth_mm", "species"]].dropna()

In [None]:
# Run the PCA on the two bill measurements only (species is left out)
# The input is a DataFrame this time, not a NumPy array
eigen_palmer, U_palmer, F_palmer = pca(X=palmer_clean.loc[:, ["bill_length_mm", "bill_depth_mm"]])

In [None]:
# Create a figure and axis
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 10))
ax.set_aspect(aspect="equal", adjustable="datalim")

# One color per species; the order of this dict is also the legend order
pca_species_colors = {
    "Adelie": "red",
    "Chinstrap": "green",
    "Gentoo": "blue",
}

# Scatter plot of the PCA scores, one call per species
# Note the change in notation: the PCA was run on a Pandas DataFrame, so the
# scores are selected with .loc (a boolean row mask plus the column label,
# 0 for the first axis and 1 for the second)
for species_name, species_color in pca_species_colors.items():
    # Boolean mask selecting the penguins of the current species
    is_species = palmer_clean["species"] == species_name

    ax.scatter(x=F_palmer.loc[is_species, 0],
               y=F_palmer.loc[is_species, 1],
               color=species_color,
               marker="o",
               s=8,
               label=species_name)

# Set a value k: the arrows below are stretched by this factor
# (presumably chosen by eye so they are readable next to the scores)
k = 8.5

for i in range(U_palmer.shape[0]):
    # Draw an arrow from the origin to row i of U, stretched by k
    ax.quiver(0,
              0,
              k * U_palmer[i, 0],
              k * U_palmer[i, 1],
              angles="xy",
              scale_units="xy",
              scale=1,
              width=0.0025,
              color="black")

    # Label the tip of the arrow with the name of the matching column
    # (the first two columns of palmer_clean are the two bill variables)
    ax.text(x=k * U_palmer[i, 0],
            y=k * U_palmer[i, 1],
            s=palmer_clean.columns[i],
            fontsize=12,
            ha="center",
            color="black")


# Add axis labels and title
ax.set_title(label="My PCA!")
# Create labels
# Use f-strings to insert relative eigenvalues
ax.set_xlabel(xlabel=f"PCA axis #1 ({100 * eigen_palmer[0] / sum(eigen_palmer):.2f}%)")
ax.set_ylabel(ylabel=f"PCA axis #2 ({100 * eigen_palmer[1] / sum(eigen_palmer):.2f}%)")

# Draw abscissa and ordinate (dashed reference lines at x = 0 and y = 0)
ax.axvline(x=0, color="black", linestyle="dashed", linewidth=1)
ax.axhline(y=0, color="black", linestyle="dashed", linewidth=1)

# Add legend
ax.legend()

# Adjust layout
fig.tight_layout()

# Show the plot
plt.show()