In [1]:
import sys 
import warnings
import itertools 
import pandas as pd 
import altair as alt 
import numpy as np 
from pandas import DataFrame
from typing import List, Union, Optional, Dict, Callable 
from collections import defaultdict 
from altair import datum 
from IPython.display import display, HTML
from scipy.signal import argrelextrema
from scipy.spatial import ConvexHull
from sklearn.ensemble import HistGradientBoostingRegressor 
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import Ridge, ElasticNet, LinearRegression
from sklearn.model_selection import cross_validate, ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit, cross_val_score, GridSearchCV 
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline

alt.data_transformers.disable_max_rows()

warnings.filterwarnings("ignore")
# warnings.filterwarnings("always")

In [2]:
def layer_charts(*plots): 
    """ Overlay all charts in the input into a single output chart 
    """
    assert len(plots) >= 1
    c = plots[0]
    for p in plots[1:]: 
        c = c + p 
    return c 

def create_scatter(x, y):
    xmax = np.max(x)
    df = pd.DataFrame(data=dict(place_in_line=np.ravel(x), price_per_pod=np.ravel(y)))
    return alt.Chart(df, width=800).mark_point().encode(
        x=alt.X(
            "place_in_line:Q"
            , scale=alt.Scale(domain=(0, xmax))
        )
        , y=alt.Y(
            "price_per_pod:Q"
            , scale=alt.Scale(domain=(0, 1), clamp=True)
        )
#         , color=alt.Color(
#             'amount:Q'
#             , scale=alt.Scale(
#                 domain=(0, 150000)
#                 , range=('#71f0df', '#0023bd')
#                 , clamp=True
#             )
#         )
    )

def create_lines(df): 
    xmax = np.max(df.place_in_line.values)
    selection = alt.selection_multi(fields=['hyp_params'], bind='legend')
    return alt.Chart(df, width=800).mark_line().encode(
        x=alt.X(
            "place_in_line:Q"
            , scale=alt.Scale(domain=(0, xmax))
        )
        , y=alt.Y(
            "price_per_pod:Q"
            , scale=alt.Scale(domain=(0, 1), clamp=True)
        )
        , color=alt.Color(
            "hyp_params:N"
            , legend=alt.Legend(title="Hyper Parameters", orient="bottom", direction="vertical")
        )
        , opacity=alt.condition(selection, alt.value(1), alt.value(0.025))
    ).add_selection(
        selection
    )

def get_chart_scores(df_scores): 
    dmin = np.min(
        np.union1d(df_scores.train_nmse.values, df_scores.test_nmse)
    )
    dmax = np.max(
        np.union1d(df_scores.train_nmse, df_scores.test_nmse)
    )
    yscale = alt.Scale(domain=(dmin, dmax))
    scores = alt.Chart(df_scores, title="Training vs Testing Negative-MSE (higher is better)").mark_line().transform_fold(
        fold=['test_nmse', 'train_nmse'], 
        as_=['type', 'nmse']
    ).encode(
        x="alpha:Q",
        y=alt.Y("nmse:Q", scale=yscale), 
        color='type:N'
    )
    return scores 

def get_chart_weights_boxplot(df_weights): 
    wlr_df = pd.melt(
        df_weights
        , value_vars=df_weights.columns
        , var_name="feature"
        , value_name="weight"
    )
    return alt.Chart(wlr_df).mark_boxplot(extent='min-max').encode(
        x='weight:Q'
        , y='feature:O'
    )

def ddf(df): 
    display(HTML(df.to_html(float_format=lambda v: f'{v:15f}')))

In [3]:
def normalize(x, feature_range=[0,1]):
    scaler = MinMaxScaler(feature_range=feature_range)
    if len(x.shape) == 1: 
        data = x.reshape((len(x), 1))
    else: 
        data = x 
    scaler.fit(data)
    return scaler.transform(data) 

def get_df(nbins=4, amount_binsize=None): 
    """ Get the underlying dataframe for the regression problem 
    
    Args: 
        nbins: The number of bins to be used for stratification 
    """
    file = './pod-market-history.csv'
    df = pd.read_csv(file)
    df = df[['evt_block_time', 'place_in_line', 'amount', 'price_per_pod' ]]
    # impute a synthetic observation at place in line 1 with price per pod of 1, as this is logical 
    first_row = df.sort_values('evt_block_time', ascending=False).iloc[[0],:].copy()
    first_row.place_in_line = 1 
    first_row.price_per_pod = 1
    first_row.amount = 2000000
    df: DataFrame = pd.concat([df, first_row], axis=0)
    # if amount binsize selected, convert weighted points to unweighted points 
    if amount_binsize is not None: 
        new_records = []
        records = df.to_dict(orient='records')
        amount_dropped = 0 
        for r in records: 
            remaining = r['amount'] 
            i = 0 
            while remaining > amount_binsize: 
                new_records.append({
                    'evt_block_time': r['evt_block_time']
                    , 'place_in_line': r['place_in_line'] + i * amount_binsize
                    , 'amount': amount_binsize
                    , 'price_per_pod': r['price_per_pod']
                }) 
                remaining -= amount_binsize
                i += 1
            amount_dropped += remaining
        print(f"True Volume: {df.amount.sum()} Dropped Volume: {amount_dropped} Dropped Percent: {(amount_dropped / df.amount.sum()) * 100}%")
        df = pd.DataFrame(data=new_records)
    # computed columns 
    df['stratify'] = pd.cut(df['place_in_line'], nbins, labels=False)
    df['amount_log'] = np.log(df.amount.values)
    df['amount_log10'] = np.log10(df.amount.values)
    df['epoch'] = pd.to_datetime(df['evt_block_time']).astype(int)
    df['age_days'] = pd.Timestamp.now(tz='UTC') - pd.to_datetime(df['evt_block_time'])
    time_interval = np.max(df.epoch.values) - np.min(df.epoch.values) 
    time_elapsed = df.epoch.values - np.min(df.epoch.values)
    df[f'tw_decay'] = np.exp(-time_elapsed / time_interval) 
    return df 

def get_twd(df, k): 
    # Change value of k for tw_decay
    return np.exp(np.log(df.tw_decay.values) * k) 

In [4]:
df = get_df(amount_binsize=2500)

True Volume: 14159606.769839 Dropped Volume: 444606.76983900013 Dropped Percent: 3.139965516458022%


In [5]:
alt.Chart(
    df[['place_in_line', 'price_per_pod', 'evt_block_time', 'amount']]
    , width=800
    , title='Place in Line vs Price Per Pod (colored by age, sized by amount)'
).mark_point().encode(
    x='place_in_line:Q'
    , y='price_per_pod:Q'
    , color=alt.Color(
        "evt_block_time:O", legend=None
    )
).interactive()

In [6]:
class CustomStratifiedKFold(): 
    
    def __init__(self, n_splits, random_state, stratify): 
        self.stratify = stratify
        self.n_splits = n_splits 
        self.cv = StratifiedKFold(
            n_splits=self.n_splits, 
            random_state=random_state, 
            shuffle=True 
        )
        
    def get_folds(self, X): 
        assert X.shape[0] == self.stratify.shape[0] 
        for train_idx, test_idx in self.cv.split(X, self.stratify):
            yield train_idx, test_idx 
        
def run_experiment(
    X
    , y
    , stratify
    , sw 
    , feature_names: List[str] 
    , degrees: List[int]
    , alphas: List[float]
    , model_params 
    , train_p: float
    , row_height: int = 300 
    , half_width: int = 600 
    , n_fit_samples: int = 50
    , n_folds: int = 4 
): 
    """
    args: 
        degrees: values of degree to test for hyperparameter optimality 
        alphas: values of alpha to test for hyperparameter optimality 
        train_percent: values of train_percent to test for hyperparameter optimality 
    returns: 
        alt.Chart object containg all plots to be used for model validation 
    """
    # TODO: Add weighted exponential decay to sample weights for regression 
    rs = 32
    scoring = make_scorer(mean_squared_error)
    rows = []
    dfs = defaultdict(list) 
    model = Pipeline([
        ('polyfeatures', PolynomialFeatures()), 
        ('scaler', StandardScaler()), # since polynomial features have wildly different ranges, scaling is important 
        ('regressor', Ridge(**model_params)) 
    ])
    folds = list(CustomStratifiedKFold(n_folds, rs, stratify).get_folds(X))
    cols = ['degree', 'alpha', 'train_nmse', 'test_nmse']
    df_scores = None 
    df_weights = None 
    for d, a in itertools.product(degrees, alphas): 
        model = Pipeline([
            # TODO: Add in component to downscale input feature 
            ('polyfeatures', PolynomialFeatures(degree=d)), 
            ('scaler', StandardScaler()), # since polynomial features have wildly different ranges, scaling is important 
            ('regressor', Ridge(**model_params, alpha=a)) 
            # TODO: Add in component to re-scale downscaled input feature 
        ])
        scores_train = []
        scores_test = []
        weights = []
        for i, (train_idx, test_idx) in enumerate(folds): 
            X_train, y_train, sw_train = X[train_idx], y[train_idx], sw[train_idx]
            X_test, y_test, sw_test = X[test_idx], y[test_idx], sw[test_idx]
            model.fit(X_train, y_train, regressor__sample_weight=sw_train) 
            y_pred_train = model.predict(X_train) 
            y_pred_test = model.predict(X_test) 
            score_train = mean_squared_error(y_train, y_pred_train, sample_weight=sw_train)
            score_test = mean_squared_error(y_test, y_pred_test, sample_weight=sw_test)
            scores_train.append(score_train) 
            scores_test.append(score_test) 
            feature_names = model[0].get_feature_names_out(input_features=['place_in_line'])
            weights.append(model[-1].coef_)
            
        if df_scores is None: 
            df_scores = pd.DataFrame(columns=cols)
        df_scores = pd.concat([
            df_scores
            , pd.DataFrame(data=dict(
                degree=[d]
                , alpha=[a]
                , train_nmse=[np.array(scores_train).mean()]
                , test_nmse=[np.array(scores_test).mean()]
            )) 
        ]) 
        
        if df_weights is None: 
            df_weights = pd.DataFrame(columns=feature_names)
        weights = np.ravel(np.array(weights).mean(axis=0))
        df_weights = pd.concat([
            df_weights
            , pd.DataFrame(data={f: [weights[i]] for i, f in enumerate(feature_names)})
        ])
    
    return df_scores, df_weights 

In [7]:
# visualize number of samples in each stratification cut 
# alt.Chart(df, width=600).mark_bar().encode(
#     x=alt.X('stratify:O', bin=True), 
#     y="count()"
# ).interactive()

# degrees = list(range(8, 12))
# alphas = np.logspace(-14, -3, num=30) # for ridge 
# alphas = np.logspace(-8, -3, num=30) # for ridge 
# train_percent = .98

# df_scores, _ = run_experiment(X, y, stratify, sw, feature_names, degrees, alphas, dict(tol=1e-14), train_percent)

In [8]:
def compute_hull(sdf): 
    points = list(zip(sdf.place_in_line.values, sdf.price_per_pod.values))
    ch = ConvexHull(points) 
    sdf['hull_lo'] = [i in ch.vertices for i in range(0, len(points))]

    # for each data point, we find the preceding and following points that were part of the convex hull 
    # we construct a line between these two points, then reflect our original point over this line. 
    # we will then compute the convex hull of this reflected set of points 
    # https://stackoverflow.com/questions/3306838/algorithm-for-reflecting-a-point-across-a-line
    sdf['lo_m'] = None 
    verts = list(sorted(ch.vertices))
    for i, j in zip(verts[:-1], verts[1:]):
        p0 = points[i]
        p1 = points[j] 
        slope = (p1[1] - p0[1]) / (p1[0] - p0[0])
        row_inds = list(range(i, j+1))
        sdf.loc[row_inds, 'lo_m'] = slope
        sdf.loc[row_inds, 'lo_b'] = (p1[0] * p0[1] - p0[0] * p1[1]) / (p1[0] - p0[0])
    sdf['lo_d'] = (sdf.place_in_line + (sdf.price_per_pod - sdf.lo_b) * sdf.lo_m) / (1 + sdf.lo_m ** 2)
    sdf['x_reflect'] = 2 * sdf.lo_d - sdf.place_in_line
    sdf['y_reflect'] = 2 * sdf.lo_d * sdf.lo_m - sdf.price_per_pod + 2 * sdf.lo_b

    # We use these reflected points to compute a second convex hull, which in reality is a concave hull 
    points_reflect = list(zip(sdf.x_reflect.values, sdf.y_reflect.values))
    ch = ConvexHull(points_reflect) 
    sdf['hull_hi'] = [i in ch.vertices for i in range(0, len(points_reflect))]

    # compute the line equations for the concave hull 
    sdf['hi_m'] = None 
    verts = list(sorted(ch.vertices))
    for i, j in zip(verts[:-1], verts[1:]):
        p0 = points[i]
        p1 = points[j] 
        slope = (p1[1] - p0[1]) / (p1[0] - p0[0])
        row_inds = list(range(i, j+1))
        sdf.loc[row_inds, 'hi_m'] = slope
        sdf.loc[row_inds, 'hi_b'] = (p1[0] * p0[1] - p0[0] * p1[1]) / (p1[0] - p0[0])

    # having the line equations for both the convex and concave hulls, we compute the midpoint between these 
    # two hulls as a smoothed approximation of our initial regression line 
    sdf['y_lo'] = sdf.lo_m * sdf.place_in_line + sdf.lo_b
    sdf['y_hi'] = sdf.hi_m * sdf.place_in_line + sdf.hi_b
    sdf['y_mid'] = (sdf.y_hi - sdf.y_lo) / 2 + sdf.y_lo

In [132]:
nsamples = 200

from sklearn.tree import plot_tree

def tree_regression_amount_age_weighted(): 
    df = get_df()
    # df = df.loc[df.age_days < pd.Timedelta(31, 'd')]
    X = df[['place_in_line']].to_numpy()
    y = df.price_per_pod.values 
    dfs = []
    configs = [
        (df.amount.values, '01. amount')
        , (df.amount_log.values, '02. amount log')
        , (df.tw_decay.values, '03. time')
    ]
    for a, aname in [
        (df.amount.values, ''), 
        (df.amount_log.values, 'log')
    ]: 
        for i, k in enumerate(np.linspace(1, 20, num=10)): 
            configs.append((
                np.ravel(a * get_twd(df, k)),
                f'{"0" + str(i+4) if i+4 < 10 else i+4} amount {aname} time decay {k}'
            ))

    for sw, weight_name in configs: 
        model = Pipeline([
            ('regressor', HistGradientBoostingRegressor(
                monotonic_cst=[-1], loss="poisson"
            )) 
        ])
        model.fit(X, y, regressor__sample_weight=sw)
        Xfit = np.linspace(0, np.max(X[:,0]), num=nsamples).reshape((nsamples, 1))
        yfit = model.predict(Xfit)
        df_fit = pd.DataFrame(
            data=dict(
                place_in_line=np.ravel(Xfit)
                , price_per_pod=np.ravel(yfit)
                , weight_name=weight_name
            )
        )
        compute_hull(df_fit)
        dfs.append(df_fit)
    return df, dfs 

def tree_regression_age_weighted(): 
    df = get_df(amount_binsize=2500)
    # df = df.loc[df.age_days < pd.Timedelta(31, 'd')]
    X = df[['place_in_line']].to_numpy()
    y = df.price_per_pod.values 
    dfs = []
    configs = [
        (get_twd(df, k), f'{"0" + str(i+1) if i+1 < 10 else i+1} time decay {k}')
        for i, k in enumerate(np.linspace(.1, 20, num=10))
    ]
    for sw, weight_name in configs: 
        model = Pipeline([
            ('regressor', HistGradientBoostingRegressor(
                monotonic_cst=[-1], loss="poisson"
            )) 
        ])
        model.fit(X, y, regressor__sample_weight=sw)
        Xfit = np.linspace(0, np.max(X[:,0]), num=nsamples).reshape((nsamples, 1))
        yfit = model.predict(Xfit)
        df_fit = pd.DataFrame(
            data=dict(
                place_in_line=np.ravel(Xfit)
                , price_per_pod=np.ravel(yfit)
                , weight_name=weight_name
            )
        )
        compute_hull(df_fit)
        dfs.append(df_fit)
    return df, dfs 

def tree_regression_viz(df, dfs, encode_size=False): 
    selection = alt.selection_multi(fields=['weight_name'], bind='legend')
    size_params = {} if not encode_size else dict(size=alt.Size('amount:Q', legend=None))
    return (
        alt.Chart(
            df[['place_in_line', 'price_per_pod', 'evt_block_time', 'amount']]
            , width=800
            , height=500
            , title='Place in Line vs Price Per Pod (colored by age, sized by amount)'
        ).mark_point().encode(
            x='place_in_line:Q'
            , y='price_per_pod:Q'
            , color=alt.Color(
                "evt_block_time:O", legend=None
            )
            , **size_params
        ) + 
        alt.Chart(pd.concat(dfs), width=800, height=500).mark_line().encode(
            x='place_in_line:Q'
            , y='y_mid:Q'
            , color=alt.Color(
                'weight_name:O'
                , scale=alt.Scale(scheme='category20')
                , legend=alt.Legend(title="Weight Strategy", direction="vertical", symbolLimit=100, columns=2)
            )
            , opacity=alt.condition(selection, alt.value(1), alt.value(0.025))
        ).transform_filter(
            (datum.hull != -5)
            # (datum.hull == True)
        ).add_selection(
            selection
        )
    ).resolve_scale(
        color='independent'
    ).interactive()

In [133]:
df, dfs = tree_regression_amount_age_weighted()

In [134]:
tree_regression_viz(df, dfs, encode_size=True)

In [135]:
df, dfs = tree_regression_age_weighted()

True Volume: 14159606.769839 Dropped Volume: 444606.76983900013 Dropped Percent: 3.139965516458022%


In [136]:
tree_regression_viz(df, dfs, encode_size=False)

In [120]:
# from scipy.spatial import ConvexHull

# compute a convex hull on the regression line 
sdf = dfs[0].copy()
points = list(zip(sdf.place_in_line.values, sdf.price_per_pod.values))
ch = ConvexHull(points) 
sdf['hull_lo'] = [i in ch.vertices for i in range(0, len(points))]

# for each data point, we find the preceding and following points that were part of the convex hull 
# we construct a line between these two points, then reflect our original point over this line. 
# we will then compute the convex hull of this reflected set of points 
# https://stackoverflow.com/questions/3306838/algorithm-for-reflecting-a-point-across-a-line
sdf['lo_m'] = None 
verts = list(sorted(ch.vertices))
for i, j in zip(verts[:-1], verts[1:]):
    p0 = points[i]
    p1 = points[j] 
    slope = (p1[1] - p0[1]) / (p1[0] - p0[0])
    row_inds = list(range(i, j+1))
    sdf.loc[row_inds, 'lo_m'] = slope
    sdf.loc[row_inds, 'lo_b'] = (p1[0] * p0[1] - p0[0] * p1[1]) / (p1[0] - p0[0])
sdf['lo_d'] = (sdf.place_in_line + (sdf.price_per_pod - sdf.lo_b) * sdf.lo_m) / (1 + sdf.lo_m ** 2)
sdf['x_reflect'] = 2 * sdf.lo_d - sdf.place_in_line
sdf['y_reflect'] = 2 * sdf.lo_d * sdf.lo_m - sdf.price_per_pod + 2 * sdf.lo_b

# We use these reflected points to compute a second convex hull, which in reality is a concave hull 
points_reflect = list(zip(sdf.x_reflect.values, sdf.y_reflect.values))
ch = ConvexHull(points_reflect) 
sdf['hull_hi'] = [i in ch.vertices for i in range(0, len(points_reflect))]

# compute the line equations for the concave hull 
sdf['hi_m'] = None 
verts = list(sorted(ch.vertices))
for i, j in zip(verts[:-1], verts[1:]):
    p0 = points[i]
    p1 = points[j] 
    slope = (p1[1] - p0[1]) / (p1[0] - p0[0])
    row_inds = list(range(i, j+1))
    sdf.loc[row_inds, 'hi_m'] = slope
    sdf.loc[row_inds, 'hi_b'] = (p1[0] * p0[1] - p0[0] * p1[1]) / (p1[0] - p0[0])

# having the line equations for both the convex and concave hulls, we compute the midpoint between these 
# two hulls as a smoothed approximation of our initial regression line 
sdf['y_lo'] = sdf.lo_m * sdf.place_in_line + sdf.lo_b
sdf['y_hi'] = sdf.hi_m * sdf.place_in_line + sdf.hi_b
sdf['y_mid'] = (sdf.y_hi - sdf.y_lo) / 2 + sdf.y_lo
    
(
    alt.Chart(
        df[['place_in_line', 'price_per_pod', 'evt_block_time', 'amount']]
        , width=800
        , height=500
        , title='Place in Line vs Price Per Pod (colored by age, sized by amount)'
    ).mark_point().encode(
        x='place_in_line:Q'
        , y='price_per_pod:Q'
        , color=alt.Color(
            "evt_block_time:O", legend=None
        )
        # , size=alt.Size('amount:Q', legend=None)
    ) + 
    alt.Chart(sdf, width=800, height=500).mark_line().encode(
        x='place_in_line:Q'
        , y='price_per_pod:Q'
    ) + 
    alt.Chart(sdf, width=800, height=500).mark_point(color='red').encode(
        x='place_in_line:Q'
        , y='price_per_pod:Q'
    ).transform_filter(
        (datum.hull_lo == True)
    ) + 
    alt.Chart(sdf, width=800, height=500).mark_point(color='green').encode(
        x='place_in_line:Q'
        , y='price_per_pod:Q'
    ).transform_filter(
        (datum.hull_hi == True)
    ) + 
    alt.Chart(sdf, width=800, height=500).mark_line(color='purple').encode(
        x='place_in_line:Q'
        , y='y_mid:Q'
    ).transform_filter(
        (datum.hull_hi == True)
    )
).interactive()

In [82]:
from statsmodels.stats.diagnostic import het_white

df = get_df(amount_binsize=2500)
df['ones'] = 1
df['age_seconds'] = df.age_days.dt.total_seconds()

for f in [None, 'age_seconds']: 
    xvars = ['place_in_line', 'ones']
    if f: 
        xvars.append(f) 
        monotonic_cst = [-1, 0, 0] 
    else: 
        monotonic_cst = [-1, 0] 
    X = df[xvars].to_numpy()
    y = df.price_per_pod.values 
    model = Pipeline([
        ('regressor', HistGradientBoostingRegressor(
            monotonic_cst=monotonic_cst, loss="poisson"
        )) 
    ])
    model.fit(X, y)
    y_pred = model.predict(X)
    resid = y - y_pred
    resid = StandardScaler().fit_transform(resid.reshape((len(resid), 1)))
    exog_het = df[xvars].to_numpy()
    labels = ['Test Statistic', 'Test Statistic p-value', 'F-Statistic', 'F-Test p-value']
    print(f'\nWhite Test')
    for n, stat in zip(labels, het_white(resid, exog_het)): 
        print(f"{n}: {stat}")
        if n == 'Test Statistic p-value': 
            prefix = 'include age: ' if f is not None else 'no age: '
            if stat < .01: 
                msg = f"{prefix}Reject the null hypothesis, model is heteroskedastic"
            else: 
                msg = f"{prefix}Fail to reject null hypothesis, insufficient evidence to conclude heteroskedastic"
    print(msg)

True Volume: 14159606.769839 Dropped Volume: 444606.76983900013 Dropped Percent: 3.139965516458022%

White Test
Test Statistic: -236.8755246457238
Test Statistic p-value: 1.0
F-Statistic: -226.9882284810953
F-Test p-value: 1.0
no age: Fail to reject null hypothesis, insufficient evidence to conclude heteroskedastic

White Test
Test Statistic: 200.38341083112664
Test Statistic p-value: 3.1077073920898096e-42
F-Statistic: 51.947651529246976
F-Test p-value: 5.222417508401953e-43
include age: Reject the null hypothesis, model is heteroskedastic


In [160]:
df = get_df(amount_binsize=2500)
df['age_seconds'] = df.age_days.dt.total_seconds()

xvars = ['place_in_line', 'age_seconds']
monotonic_cst = [-1, 0] 
X = df[xvars].to_numpy()
y = df.price_per_pod.values 
model = Pipeline([
    ('regressor', HistGradientBoostingRegressor(
        monotonic_cst=monotonic_cst, loss="poisson"
    )) 
])
model.fit(X, y)
y_pred = model.predict(X)
resid = (y - y_pred) ** 2
rdf = pd.DataFrame(
    data=dict(
        place_in_line=X[:,0]
        , age_seconds=X[:,1]
        , residual=resid 
    )
) 

Xfit = pd.DataFrame(data=dict(
    x=np.linspace(0, np.max(X[:,0]), num=nsamples), 
    y=[0 for i in range(nsamples)] 
)).to_numpy()
yfit = model.predict(Xfit)
df_fit = pd.DataFrame(
    data=dict(
        place_in_line=np.ravel(X[:,0])
        , price_per_pod=np.ravel(y_pred)
        , evt_block_time=np.ravel(df.evt_block_time.values)
    )
)

alt.vconcat(
    (
        alt.vconcat(
            alt.Chart(
                df[['place_in_line', 'price_per_pod', 'evt_block_time', 'amount']]
                , width=800
                , height=250
                , title='Place in Line vs Price Per Pod (colored by age)'
            ).mark_point().encode(
                x='place_in_line:Q'
                , y='price_per_pod:Q'
                , color=alt.Color(
                    "evt_block_time:O"
                    , legend=None
                )
            ), 
            alt.Chart(df_fit, width=800, height=250).mark_point(color='red').encode(
                x='place_in_line:Q'
                , y='price_per_pod:Q'
                , color=alt.Color(
                    "evt_block_time:O"
                    , legend=None
                    , scale=alt.Scale(range=['#ffd1d1', 'red'])
                )
            )
        ).resolve_scale(color='independent')
    ),
    alt.hconcat(
        alt.Chart(rdf).mark_bar().encode(
            x=alt.X('place_in_line:Q', bin=alt.Bin(step=25000000)), 
            y='mean(residual)'
        ),
        alt.Chart(rdf).mark_bar().encode(
            x=alt.X('age_seconds:Q', bin=alt.Bin(step=200000)), 
            y='mean(residual)'
        )
    ).resolve_scale(
        y='shared'
    ), 
    alt.hconcat(
        alt.Chart(rdf).mark_bar(color='orange').encode(
            x=alt.X('place_in_line:Q', bin=alt.Bin(step=25000000)), 
            y='count(residual)'
        ),
        alt.Chart(rdf).mark_bar(color='orange').encode(
            x=alt.X('age_seconds:Q', bin=alt.Bin(step=200000)), 
            y='count(residual)'
        )
    ).resolve_scale(
        y='shared'
    )
)

True Volume: 14159606.769839 Dropped Volume: 444606.76983900013 Dropped Percent: 3.139965516458022%


In [None]:
# dfs_all = pd.concat(dfs)
# sdf = dfs_all.loc[dfs_all.weight_name == '06 time decay 11.155555555555553'].copy()

# sdf['place_in_line'] = normalize(sdf['place_in_line'].values)
# weights = np.exp((sdf.price_per_pod.shift(1) - sdf.price_per_pod).fillna(.3).values * 10)

# model = Pipeline([
#     ('polyfeatures', PolynomialFeatures(degree=20)), 
#     ('scaler', StandardScaler()), # since polynomial features have wildly different ranges, scaling is important 
#     ('regressor', Ridge(alpha=1e-14)) 
# ])
# model.fit(sdf.place_in_line.values.reshape(
#     (len(sdf), 1)), sdf.price_per_pod.values
#     , regressor__sample_weight=weights
# )
# sdf['y_pred'] = model.predict(
#     sdf.place_in_line.values.reshape((len(sdf), 1))
# )

# alt.Chart(
#     sdf
#     , width=800
#     , height=500
# ).mark_point().encode(
#     x='place_in_line:Q'
#     , y='y_mid:Q'
# ) + alt.Chart(
#     sdf
#     , width=800
#     , height=500
# ).mark_line(color='red').encode(
#     x='place_in_line:Q'
#     , y='y_pred:Q'
# ) 

In [78]:
from sklearn.linear_model import LinearRegression
from statsmodels.stats.diagnostic import het_white
import statsmodels.api as sm
import pandas as pd

url = "https://raw.githubusercontent.com/Statology/Python-Guides/main/mtcars.csv"
data = pd.read_csv(url)
y = data['mpg']
x = data[['disp', 'hp']]
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
white_test = het_white(model.resid,  model.model.exog)

#define labels to use for output of White's test
labels = ['Test Statistic', 'Test Statistic p-value', 'F-Statistic', 'F-Test p-value']

#print results of White's test
print(dict(zip(labels, white_test)))

{'Test Statistic': 7.076620330416624, 'Test Statistic p-value': 0.21500404394263936, 'F-Statistic': 1.4764621093131864, 'F-Test p-value': 0.23147065943879694}


In [80]:
model.model.exog == x

Unnamed: 0,const,disp,hp
0,True,True,True
1,True,True,True
2,True,True,True
3,True,True,True
4,True,True,True
5,True,True,True
6,True,True,True
7,True,True,True
8,True,True,True
9,True,True,True
