In [None]:
import numpy as np
import sys
import pandas as pd
sys.path.append('../')
from src.Mondrian_matrix_utils import simulate_y, simulate, simulate_best
import pickle
import matplotlib.pyplot as plt

# Seed the global NumPy generator so that Restart & Run All regenerates the
# same synthetic data (np.random.rand draws from this global generator).
seed = 0
np.random.seed(seed)

n = 5000        # number of training samples
n_test = 5000   # number of test samples
dim_in = 25     # number of input features

# Features are uniform on [-1, 1); targets are produced by simulate_y.
x_train = np.random.rand(n, dim_in)*2 - 1
y_train = simulate_y(x_train)

x_test = np.random.rand(n_test, dim_in)*2 - 1
y_test = simulate_y(x_test)

In [4]:
# Per-column summary statistics of the training features.
df_describe = pd.DataFrame(data=x_train)
df_describe.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,0.008929,-0.004596,0.016232,-0.003324,-0.00307,-0.000665,-0.00819,-0.001232,-0.001989,-0.007755,...,0.014114,-0.001872,-0.007918,-0.019689,0.003551,0.005946,0.000681,-0.003998,0.009124,-0.001417
std,0.575901,0.579615,0.57261,0.574914,0.574554,0.572662,0.58321,0.578194,0.576729,0.575359,...,0.583042,0.579596,0.578482,0.572102,0.580448,0.57009,0.576646,0.579707,0.580698,0.578148
min,-0.99988,-0.999631,-0.999685,-0.999806,-0.999774,-0.999709,-0.999768,-0.999047,-0.999836,-0.999878,...,-0.998974,-0.99918,-0.999089,-0.999989,-0.999966,-0.999992,-0.998833,-0.999999,-0.999631,-0.999971
25%,-0.484947,-0.518917,-0.480059,-0.503365,-0.495631,-0.490949,-0.524932,-0.494993,-0.513561,-0.509841,...,-0.48931,-0.516563,-0.511715,-0.51272,-0.500121,-0.476687,-0.498778,-0.499953,-0.495555,-0.498611
50%,0.019112,0.000686,0.034651,-0.001252,-0.012876,-0.006942,-0.020318,-0.010394,0.005955,-0.010404,...,0.023816,0.007605,-0.014649,-0.034322,0.006143,0.016642,0.015679,-0.007518,0.010997,-0.000499
75%,0.507857,0.504107,0.512942,0.49408,0.502735,0.489478,0.499369,0.502575,0.495565,0.489919,...,0.52334,0.509467,0.480249,0.468535,0.504126,0.486252,0.485666,0.506503,0.521989,0.492233
max,0.999993,0.999442,0.999359,0.999744,0.998116,0.999625,0.998774,0.999587,0.999911,0.999682,...,0.999984,0.999468,0.998331,0.999835,0.999251,0.999171,0.99991,0.998988,0.999958,0.99963


In [5]:
# Summary statistics of the training targets.
df_describe = pd.DataFrame(data=y_train)
df_describe.describe()

Unnamed: 0,0
count,5000.0
mean,0.990689
std,0.586737
min,-0.006466
25%,0.535527
50%,0.94341
75%,1.355344
max,3.849691


In [None]:
# Hyper-parameters shared by every simulation cell below.
M = 100                   # number of Mondrian trees to use
lifetime_max = 0.1        # terminal lifetime
weights_lifetime = 2e-6   # lifetime for which weights should be plotted
delta = 0.1               # ridge regression delta

In [None]:
# Run `simulate` five times for every training-set size in `sample_range` and
# collect the two MSE values it returns ("before" / "after") per run.
# NOTE(review): `sample_range` is not defined in any cell visible in this
# notebook; it must be defined before this cell for Restart & Run All to work.
stats = []

for n_sim in sample_range:
    print(f"number of training samples: {n_sim}")
    mse = {'n': n_sim, 'before': [], "after": []}
    # Named `sim_round` so the builtin `round` is not shadowed for later cells.
    for sim_round in range(5):
        print(f"Simulation: {sim_round}")
        # Only the first n_sim training samples are used; the test set is fixed.
        mse_before, mse_after = simulate(x_train[:n_sim], y_train[:n_sim], x_test, y_test, M, lifetime_max, delta, weights_lifetime)
        mse['before'].append(mse_before)
        mse['after'].append(mse_after)
    stats.append(mse)

In [None]:
# Reduce the per-run MSE lists in `stats` to mean / min / max per training-set
# size, plot both curves with their min-max bands, and persist `stats`.
x = sample_range

before = [np.mean(stat['before']) for stat in stats]
before_min = [np.min(stat['before']) for stat in stats]
before_max = [np.max(stat['before']) for stat in stats]
after = [np.mean(stat['after']) for stat in stats]
after_min = [np.min(stat['after']) for stat in stats]
after_max = [np.max(stat['after']) for stat in stats]

fig, ax = plt.subplots()
ax.plot(x, before, color='b', label="mse_before")
ax.plot(x, after, color='r', label="mse_after")
# Shaded bands span the min and max over the repeated runs.
ax.fill_between(x, before_min, before_max, color='b', alpha=.1)
ax.fill_between(x, after_min, after_max, color='r', alpha=.1)
ax.set_xlabel("number of training samples")
ax.set_ylabel("MSE")
ax.legend(loc="upper right")

# Context manager guarantees the file is flushed and closed after writing.
with open("stats.pk", "wb") as stats_file:
    pickle.dump(stats, stats_file)

In [None]:
# For each lifetime multiplier, run `simulate_best` five times per training-set
# size. `stats_best` keeps the raw per-run values; `best` keeps the
# mean / min / max per training-set size for plotting.
stats_best = {}
best = {}

for multiplier in range(1,3):
    print(f"lifetime multiplier: {multiplier}")
    stats_best[multiplier] = []
    best[multiplier] = {"mean": [], "min": [], "max": []}
    for n_sim in sample_range:
        print(f"number of training samples: {n_sim}")
        mse = {'n': n_sim, 'best': []}
        # Named `sim_round` so the builtin `round` is not shadowed for later cells.
        for sim_round in range(5):
            print(f"Simulation: {sim_round}")
            # The terminal lifetime is scaled by the current multiplier.
            mse_best = simulate_best(x_train[:n_sim], y_train[:n_sim], x_test, y_test, M, lifetime_max * multiplier, delta, weights_lifetime)
            mse['best'].append(mse_best)
            print(mse_best)
        stats_best[multiplier].append(mse)
        best[multiplier]["mean"].append(np.mean(mse['best']))
        best[multiplier]["min"].append(np.min(mse['best']))
        best[multiplier]["max"].append(np.max(mse['best']))

# Context manager guarantees the file is flushed and closed after writing.
with open("stats_best.pk", "wb") as stats_best_file:
    pickle.dump(stats_best, stats_best_file)

In [None]:
# Compare the before/after curves with the per-multiplier curves held in `best`.
colors = plt.cm.rainbow(np.linspace(0, 1, 7))

fig, ax = plt.subplots(figsize=(15,8))

ax.plot(x, before, color=colors[0], label="mse_before")
ax.plot(x, after, color=colors[6], label="mse_after")
ax.fill_between(x, before_min, before_max, color=colors[0], alpha=.1)
ax.fill_between(x, after_min, after_max, color=colors[6], alpha=.1)

# Draw one line and one min-max band per multiplier actually present in
# `best`, labelling each line directly so the legend cannot list curves that
# were never plotted.
for multiplier in best:
    ax.plot(x, best[multiplier]["mean"], color=colors[multiplier], label=f"mse_best_{multiplier}")
    ax.fill_between(x, best[multiplier]["min"], best[multiplier]["max"], color=colors[multiplier], alpha=.1)

ax.set_xlabel("number of training samples")
ax.set_ylabel("MSE")
ax.legend(loc="upper right")

In [None]:
# Reload the before/after results written to "stats.pk" and rebuild the
# mean / min / max series used by the plotting cells.
# NOTE: pickle can execute arbitrary code while loading; only load files that
# this notebook produced itself.
with open("stats.pk", "rb") as stats_file:
    stats = pickle.load(stats_file)

x = sample_range

before = [np.mean(stat['before']) for stat in stats]
before_min = [np.min(stat['before']) for stat in stats]
before_max = [np.max(stat['before']) for stat in stats]
after = [np.mean(stat['after']) for stat in stats]
after_min = [np.min(stat['after']) for stat in stats]
after_max = [np.max(stat['after']) for stat in stats]

In [None]:
# Import through the `src` package, as the first cell does: only '../' is
# appended to sys.path, so the bare module name is not importable from there.
from src.Mondrian_matrix_utils import simulate_proj

# Same experiment layout as the simulate_best cell, but calling
# `simulate_proj`. The results overwrite `stats_best` / `best` and are saved
# to "stats_proj.pk".
stats_best = {}
best = {}

for multiplier in range(1,3):
    print(f"lifetime multiplier: {multiplier}")
    stats_best[multiplier] = []
    best[multiplier] = {"mean": [], "min": [], "max": []}
    for n_sim in sample_range:
        print(f"number of training samples: {n_sim}")
        mse = {'n': n_sim, 'best': []}
        # Named `sim_round` so the builtin `round` is not shadowed for later cells.
        for sim_round in range(5):
            print(f"Simulation: {sim_round}")
            # The terminal lifetime is scaled by the current multiplier.
            mse_best = simulate_proj(x_train[:n_sim], y_train[:n_sim], x_test, y_test, M, lifetime_max * multiplier, delta, weights_lifetime)
            mse['best'].append(mse_best)
            print(mse_best)
        stats_best[multiplier].append(mse)
        best[multiplier]["mean"].append(np.mean(mse['best']))
        best[multiplier]["min"].append(np.min(mse['best']))
        best[multiplier]["max"].append(np.max(mse['best']))

# Context manager guarantees the file is flushed and closed after writing.
with open("stats_proj.pk", "wb") as stats_proj_file:
    pickle.dump(stats_best, stats_proj_file)

In [None]:
# Compare the before/after curves with the per-multiplier curves currently
# held in `best`.
colors = plt.cm.rainbow(np.linspace(0, 1, 7))

fig, ax = plt.subplots(figsize=(15,8))

ax.plot(x, before, color=colors[0], label="mse_before")
ax.plot(x, after, color=colors[6], label="mse_after")
ax.fill_between(x, before_min, before_max, color=colors[0], alpha=.1)
ax.fill_between(x, after_min, after_max, color=colors[6], alpha=.1)

# Draw one line and one min-max band per multiplier actually present in
# `best`, labelling each line directly so the legend cannot list curves that
# were never plotted.
for multiplier in best:
    ax.plot(x, best[multiplier]["mean"], color=colors[multiplier], label=f"mse_best_{multiplier}")
    ax.fill_between(x, best[multiplier]["min"], best[multiplier]["max"], color=colors[multiplier], alpha=.1)

ax.set_xlabel("number of training samples")
ax.set_ylabel("MSE")
ax.legend(loc="upper right")