# Learning statistics by playing around with plots

I'm especially interested in understanding more about the use of linear algebra for statistics

In [1]:
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import sys
sys.path.append('..')
import utils.utils as utils
from pathlib import Path

# Directories used for exported figures and for input data
OUTPUTPATH = Path('./output/')
DATAPATH = Path('./data/')


# Hex codes for the blue and red used throughout the notebook,
# plus short lowercase aliases for convenience
PLOTLY_BLUE = '#636EFA'
PLOTLY_RED = '#EF553B'
blue = PLOTLY_BLUE
red = PLOTLY_RED

# Set pandas' plotting backend option to plotly for the whole notebook
pd.options.plotting.backend = "plotly"

def darkmode() -> bool:
    """Return True when the leading timestamp field is greater than 18.

    NOTE(review): presumably ``utils.get_timestamp(date=False)`` returns a
    dash-separated time string whose first field is the hour, which would
    switch dark mode on from 19:00 onwards -- confirm against ``utils``.
    """
    leading_field = utils.get_timestamp(date=False).split('-')[0]
    return int(leading_field) > 18
def dark_or_light() -> str:
    """Return the plotly template name that matches the current mode."""
    if darkmode():
        return 'plotly_dark'
    return 'plotly'
def black_or_white() -> str:
    """Return 'white' in dark mode and 'black' otherwise."""
    if darkmode():
        return 'white'
    return 'black'

# Module-level flag that controls whether figures are saved
save = False


def _report_save_state() -> bool:
    """Print the current state of the ``save`` flag and return it."""
    print(f'Saving figures is now {"ON" if save else "OFF"}')
    return save


def toggle_save() -> bool:
    """Flip the ``save`` flag, announce the new state and return it."""
    global save
    save = not save
    return _report_save_state()


def set_save(setting: bool) -> bool:
    """Assign ``setting`` to the ``save`` flag, announce the state and return it."""
    global save
    save = setting
    return _report_save_state()


def save_is_on():
    """Return the current value of the ``save`` flag."""
    return save

# Shared config dict handed to ``fig.show(config=...)`` by the cells below
plotly_config = dict(
    displaylogo=False,
    scrollZoom=False,
    toImageButtonOptions=dict(
        format='png',  # one of png, svg, jpeg, webp
        filename='custom_image',
        # height=500,
        # width=700,
        scale=2,  # Multiply title/legend/axis/canvas sizes by this factor
    ),
)

In [2]:
# Visualizing the covariance between two variables:
# every sample of a random dataset gets a shaded rectangle with one corner
# at the sample itself and the opposite corner at (mean of x, mean of y).
# The rectangle is red when the product of deviations is negative, blue otherwise.

n = 100
np.random.seed(0)
x = np.random.normal(loc=5, scale=1, size=n)
y = 1 + 0.5 * x + np.random.normal(scale=.5, size=n)
points = np.array([x, y])

xy_mean = points.mean(axis=1)
xy_prod_dev = (x - x.mean()) * (y - y.mean())

# Continuous color variant; not used by the figure below.
# The '-' stripping removes negative rgb values from the sampled colors.
color_s = [
    rgb.replace('-', '')
    for rgb in px.colors.sample_colorscale('viridis', xy_prod_dev/xy_prod_dev.max())
]
# Two-tone variant actually used for the rectangles
color = [red if dev < 0 else blue for dev in xy_prod_dev]

fig = go.Figure(
    data=[
        go.Scatter(
            x=points[0],
            y=points[1],
            mode="markers",
            marker=dict(color=black_or_white()),
        )
    ]
)

# One rectangle per sample, spanning from the sample to the mean point
for point, fill in zip(points.T, color):
    fig.add_shape(
        type="rect",
        x0=point[0],
        y0=point[1],
        x1=xy_mean[0],
        y1=xy_mean[1],
        line_width=0,
        fillcolor=fill,
        opacity=0.1,
        layer='between',
        #yref='paper',
    )

# Anchor the x axis to the y axis at ratio 1 so the rectangle areas are comparable
fig.update_layout(
    xaxis_scaleanchor="y",
    xaxis_scaleratio=1,
    width=500,
    height=500,
    template=dark_or_light(),
)

''

''

In [3]:
# Render the current `fig` with the shared toolbar/export settings from `plotly_config`
fig.show(config=plotly_config)

In [4]:
# New sample. `n` is not defined in this cell and the RNG is not re-seeded,
# so both carry over from whatever ran before.
x = np.random.normal(loc=7, scale=2, size=n)
y = -3 + 0.5 * x + np.random.normal(scale=.5, size=n)

# Centered copies (mean removed)
x_c = x - x.mean()
y_c = y - y.mean()

# NOTE(review): each centered axis is divided by its own std and then
# multiplied by the *other* axis' std, so the result is not unit-variance
# despite the 'standardized' label -- confirm this is intended.
x_s = x_c/x_c.std()*y_c.std()
y_s = y_c/y_c.std()*x_c.std()

colors = [black_or_white(), blue, red]
names = ['original', 'centered', 'standardized']

# One marker trace per dataset, colored and named in parallel
fig = go.Figure(
    data=[
        go.Scatter(x=xs, y=ys, mode="markers", marker=dict(color=col), name=label)
        for xs, ys, col, label in zip((x, x_c, x_s), (y, y_c, y_s), colors, names)
    ],
    layout=go.Layout(
        width=500,
        height=500,
        template=dark_or_light(),
    ),
)

def draw_mean_line(fig, x, y, c):
    """Add two half-transparent line shapes marking the means of ``x`` and ``y``.

    The first shape is horizontal at ``y.mean()`` and spans ``x.min()`` to
    ``x.max()``; the second is vertical at ``x.mean()`` and spans ``y.min()``
    to ``y.max()``. Both use data coordinates and the line color ``c``.

    Args:
        fig: object exposing ``add_shape(**kwargs)``.
        x, y: objects exposing ``min()``, ``max()`` and ``mean()``.
        c: line color passed through as ``line_color``.
    """
    x_mean, y_mean = x.mean(), y.mean()
    segments = (
        (x.min(), y_mean, x.max(), y_mean),  # horizontal line through the y mean
        (x_mean, y.min(), x_mean, y.max()),  # vertical line through the x mean
    )
    for start_x, start_y, end_x, end_y in segments:
        fig.add_shape(
            type='line',
            xref='x',
            yref='y',
            x0=start_x,
            y0=start_y,
            x1=end_x,
            y1=end_y,
            line_color=c,
            opacity=0.5,
        )

def draw_std_circle(fig, x, y, c):
    """Add a half-transparent 'circle' shape one standard deviation around the mean.

    The shape's bounding box runs from ``mean - std`` to ``mean + std`` on
    each axis, in data coordinates, outlined in color ``c``.

    Args:
        fig: object exposing ``add_shape(**kwargs)``.
        x, y: objects exposing ``mean()`` and ``std()``.
        c: line color passed through as ``line_color``.
    """
    center_x, center_y = x.mean(), y.mean()
    radius_x, radius_y = x.std(), y.std()
    fig.add_shape(
        type='circle',
        xref='x',
        yref='y',
        x0=center_x - radius_x,
        y0=center_y - radius_y,
        x1=center_x + radius_x,
        y1=center_y + radius_y,
        line_color=c,
        opacity=0.5,
    )

# Overlay the mean lines and the one-standard-deviation outline for each dataset.
# The loop variables deliberately do NOT reuse the names `x` / `y`: a module-level
# `for x, y, ... in ...` rebinds the notebook-wide arrays, leaving `x` and `y`
# pointing at the last ("standardized") dataset after this cell has run.
for xs, ys, col in zip([x, x_c, x_s], [y, y_c, y_s], colors):
    draw_mean_line(fig, xs, ys, col)
    draw_std_circle(fig, xs, ys, col)

# Equal axis scaling so the std outlines are not distorted; this also
# enlarges the figure relative to the size set at construction time.
fig.update_layout(
    xaxis=dict(
        scaleanchor="y",
        scaleratio=1,
    ),
    width=1000,
    height=1000,
    template=dark_or_light(),
)

fig.show(config=plotly_config)

The exploration above leads me to believe there is a bug in plotly, which the next figure is supposed to test / demonstrate.

In [5]:
import plotly
import plotly.graph_objects as go

# Minimal reproduction: three boxes that differ only in `layer` (and position/label),
# with a thick line trace drawn across all of them.
fig = go.Figure()
for layer, left, right, label in (
    ('above',   0, 0.8, 'above'),
    ('between', 1, 1.8, '"between"'),
    ('below',   2, 2.8, 'below'),
):
    fig.add_shape(
        layer=layer,
        fillcolor='#EF553B',
        opacity=0.7,
        x0=left,
        x1=right,
        y0=0,
        y1=0.7,
        label_text=label,
    )
fig.add_trace(go.Scatter(x=[-0.5,3.5], y=[0.5,0.5], line=dict(width=10)))
#print(plotly.__version__) # 5.22.0
fig.show()
#print(fig)


In [6]:
# Seeded sample: y is a linear function of x plus Gaussian noise
n = 100
np.random.seed(0)
x = np.random.normal(loc=5, scale=1, size=n)
y = 1 + 0.5 * x + np.random.normal(scale=.5, size=n)

def plot(x, y):
    """Plot a 2-D sample together with the principal axes of its covariance matrix.

    The figure contains, in this order:
      * the original points,
      * the two covariance eigenvectors drawn from the sample mean
        ("Component 1" is the axis with the largest eigenvalue),
      * the points projected onto the eigenvector basis,
      * the same eigenvectors drawn (dotted) from the mean of the projected points,
      * the unit vectors before and after multiplication by the transposed
        eigenvector matrix.

    The covariance matrix is printed as a side effect.

    Args:
        x, y: 1-D numpy arrays of equal length (``.mean()`` is called on each).

    Returns:
        The assembled ``go.Figure``.
    """
    points = np.array([x, y])

    cov = np.cov(points)
    print(cov)

    # the covariance matrix can be seen as a transformation matrix
    # that stretches the data along its principal axes

    # the eigenvectors of the covariance matrix are the principal axes
    # the eigenvalues are the amount of variance along each axis

    # A covariance matrix is symmetric, so use `eigh`: it returns real values
    # and orthonormal vectors. `eig` gives no ordering guarantee, so sort
    # explicitly in descending order to make column 0 ("Component 1") the
    # direction of largest variance.
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    order = np.argsort(eigenvalues)[::-1]
    eigenvectors = eigenvectors[:, order]

    # transformed points
    points_transformed = eigenvectors.T @ points

    def segment(start, direction, **style):
        """Build a line trace from ``start`` to ``start + direction``."""
        return go.Scatter(
            x=[start[0], start[0] + direction[0]],
            y=[start[1], start[1] + direction[1]],
            mode="lines",
            **style,
        )

    mean = (x.mean(), y.mean())
    mean_transformed = (points_transformed[0].mean(), points_transformed[1].mean())
    component_1 = eigenvectors[:, 0]
    component_2 = eigenvectors[:, 1]

    data = [
        go.Scatter(
            x=points[0],
            y=points[1],
            mode="markers",
            marker_color=black_or_white(),
        ),
        segment(
            mean,
            component_1,
            line_color=blue,
            line_width=6,
            legendgroup="principal axes",
            legendgrouptitle_text="Principal axes",
            name="Component 1",
        ),
        segment(
            mean,
            component_2,
            line_color=red,
            line_width=6,
            legendgroup="principal axes",
            name="Component 2",
        ),
        go.Scatter(
            x=points_transformed[0],
            y=points_transformed[1],
            mode="markers",
            marker=dict(
                color=black_or_white(),
                symbol='circle-open',
            ),
            legendgroup="transformed",
            legendgrouptitle_text="",
            name="Transformed points",
        ),
        # Same eigenvectors, drawn dotted from the mean of the transformed points
        segment(
            mean_transformed,
            component_2,
            showlegend=False,
            line=dict(color=red, width=6, dash='dot'),
        ),
        segment(
            mean_transformed,
            component_1,
            showlegend=False,
            line=dict(color=blue, width=6, dash='dot'),
        ),
    ]

    layout = go.Layout(
        width=700,
        height=700,
        template=dark_or_light(),
        xaxis=dict(
            scaleanchor="y",
            scaleratio=1,
        ),
    )

    fig = go.Figure(data=data, layout=layout)

    # add unit vector before ...
    fig.add_trace(go.Scatter(
        x=[1, 0],
        y=[0, 0],
        mode="lines",
        line_color=black_or_white(),
        line_width=6,
        legendgroup="unit vectors",
        legendgrouptitle_text="Unit vectors",
        name="Unit vector 1",
    ))
    fig.add_trace(go.Scatter(
        x=[0, 0],
        y=[1, 0],
        mode="lines",
        line_color=black_or_white(),
        line_width=6,
        legendgroup="unit vectors",
        name="Unit vector 2",
    ))
    # ... and after
    transformed_unit_vectors = eigenvectors.T @ np.eye(2)
    for column, line_color in ((0, blue), (1, red)):
        fig.add_trace(go.Scatter(
            x=[0, transformed_unit_vectors[0, column]],
            y=[0, transformed_unit_vectors[1, column]],
            mode="lines",
            line_color=line_color,
            line_width=6,
            legendgroup="unit vectors",
            showlegend=False,
        ))

    return fig

# Build the principal-axes figure for the sample above (this also prints its
# covariance matrix) and render it with the shared plotly config
fig = plot(x, y)
fig.show(config=plotly_config)

[[1.02608749 0.57188906]
 [0.57188906 0.58570451]]


In [7]:
# Same plot, but x and y are drawn independently of each other
# (two consecutive draws from the same seeded generator, x first, then y)

n = 100
np.random.seed(0)
x, y = (np.random.normal(loc=5, scale=1, size=n) for _ in range(2))

fig = plot(x, y)
fig.show(config=plotly_config)

[[1.02608749 0.11769063]
 [0.11769063 1.08134929]]
