First, we import all the packages needed for our analysis.

In [42]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.linear_model import LinearRegression

This cell reads in our dataset and does some simple data cleaning: it shortens each species name to its first word, keeps only the columns we need, and removes rows with missing values or with a sex recorded as ".".

In [47]:
# Load the raw data set from the working directory.
penguins = pd.read_csv('palmer_penguins.csv')

# Shorten each species label to its first whitespace-separated word.
# The vectorised .str accessor does not rely on the index being 0..n-1
# and leaves missing labels as NaN (removed by dropna() below) instead
# of raising on them.
penguins["Species"] = penguins["Species"].str.split().str[0]

# Keep only the columns used in the analyses that follow.
penguins = penguins[
    [
        "Species",
        "Island",
        "Culmen Length (mm)",
        "Culmen Depth (mm)",
        "Flipper Length (mm)",
        "Body Mass (g)",
        "Sex",
    ]
]

# Drop rows with any missing value, then rows whose sex is the
# placeholder ".".
penguins = penguins.dropna()
penguins = penguins[penguins["Sex"] != "."]

Now we begin our first analysis. We will examine the relationship between culmen length and culmen depth, grouped by species. In addition to plotting all the points in a scatterplot, we will add a line of best fit for each species to show the relationship more clearly.

In [60]:
# Base figure: culmen length against culmen depth, one color per species
fig = px.scatter(
    penguins,
    x="Culmen Length (mm)",
    y="Culmen Depth (mm)",
    color="Species",
    title="Culmen Length vs Culmen Depth",
)

# Define function that takes in a dataframe and species, and returns the coefficients of the line of best fit
def regression_by_species(data, species):
    """
    Fit a line of best fit of culmen depth on culmen length for one species.

    data: pandas DataFrame with "Species", "Culmen Length (mm)" and
        "Culmen Depth (mm)" columns
    species: str, the value of the "Species" column to keep

    returns: tuple (coef, intercept) taken from the fitted
        LinearRegression model. Both are arrays because the target is
        passed as a 2-D column; the slope is read as coef[0][0] and the
        intercept as intercept[0].
    """
    # Restrict the data to the rows of the requested species
    is_species = data["Species"] == species
    subset = data[is_species]

    # Double-bracket selection yields single-column 2-D arrays, the shape
    # the regression is fitted on
    lengths = subset[["Culmen Length (mm)"]].to_numpy()
    depths = subset[["Culmen Depth (mm)"]].to_numpy()

    model = LinearRegression().fit(lengths, depths)
    return model.coef_, model.intercept_

# x grid covering the full observed culmen-length range (100 evenly
# spaced points), and the palette the fit lines are colored from
x_range = np.linspace(
    start=penguins["Culmen Length (mm)"].min(),
    stop=penguins["Culmen Length (mm)"].max(),
    num=100,
)
colors = px.colors.qualitative.Plotly

# For each species (in order of first appearance), fit the regression and
# overlay the resulting line on the scatterplot
for i, species in enumerate(penguins["Species"].unique()):
    coef, intercept = regression_by_species(penguins, species)
    # coef is 2-D and intercept is 1-D, so pull out the scalar values
    y_range = coef[0, 0] * x_range + intercept[0]
    fig.add_trace(
        px.line(x=x_range, y=y_range).data[0].update(line={"color": colors[i]}),
        row=None,
        col=None,
    )

# Render the finished figure
fig.show()




For our second analysis, we will examine the two remaining quantitative variables, body mass and flipper length, in another scatterplot. We will add marginal plots to show the univariate distributions of each variable as well.

In [65]:
# Flipper length against body mass, colored by species; violin plots on
# both margins show each variable's univariate distribution
fig3 = px.scatter(
    penguins,
    x="Flipper Length (mm)",
    y="Body Mass (g)",
    color="Species",
    marginal_x="violin",
    marginal_y="violin",
    title="Flipper Length vs Body Mass",
)
fig3.show()