In [41]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG once so the noisy / random datasets below
# come out the same on every fresh Restart & Run All.
SEED = 42
np.random.seed(SEED)

# How many (x, y) rows each generated dataset contains.
N = 1000

In [64]:
# Dataset A - linear, noise-free (y = 3x + 5)

# N evenly spaced inputs covering [-10, 10]
xs = np.linspace(-10, 10, N)

# targets lie exactly on the line
slope, intercept = 3, 5
ys = slope * xs + intercept

# two-column table: input "x", target "y"
dataA = pd.DataFrame(dict(x=xs, y=ys))

# write to disk without the index column
out_name = "dataset_A_linear.csv"
dataA.to_csv(out_name, index=False)
print("saved " + out_name)

saved dataset_A_linear.csv


In [69]:
# Dataset B - quadratic with Gaussian noise (y = x^2 + noise)

# N evenly spaced inputs covering [-10, 10]
xs = np.linspace(-10, 10, N)

# Gaussian noise: mean 0, standard deviation 2, one draw per row
# (drawn from NumPy's global RNG)
noise_mean, noise_std = 0, 2
noise = np.random.normal(noise_mean, noise_std, size=N)

# parabola plus the noise term
ys = np.square(xs) + noise

# two-column table: input "x", target "y"
dataB = pd.DataFrame(dict(x=xs, y=ys))

# write to disk without the index column
out_name = "dataset_B_poly_noise.csv"
dataB.to_csv(out_name, index=False)
print("saved " + out_name)

saved dataset_B_poly_noise.csv


In [70]:
# Dataset C - pure noise: x and y are drawn independently of each other

# both columns are uniform on [-10, 10); x is drawn first, then y
low, high = -10, 10
xs = np.random.uniform(low, high, size=N)
ys = np.random.uniform(low, high, size=N)

# two-column table: input "x", target "y"
dataC = pd.DataFrame(dict(x=xs, y=ys))

# write to disk without the index column
out_name = "dataset_C_random.csv"
dataC.to_csv(out_name, index=False)
print("saved " + out_name)

saved dataset_C_random.csv


In [77]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor

# read / split csv
def get_data(file_name, test_size=0.2, random_state=42):
    """Load an x/y CSV and split it into train and test arrays.

    Parameters
    ----------
    file_name : str or path-like
        CSV file containing an "x" column (input) and a "y" column (label).
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; the default holds out 20% of rows.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    The result of ``train_test_split(X, y, ...)``, unpacked by callers as
    ``X_train, X_test, y_train, y_test``.
    """
    df = pd.read_csv(file_name)
    # Double brackets select a one-column frame, so X stays two-dimensional.
    X = df[["x"]].to_numpy()   # inputs
    y = df["y"].to_numpy()     # labels
    return train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

# display label -> CSV file for each of the three datasets
datasets = dict([
    ("A (linear)", "dataset_A_linear.csv"),
    ("B (poly+noise)", "dataset_B_poly_noise.csv"),
    ("C (random)", "dataset_C_random.csv"),
])

# display label -> estimator instance for the three models being compared;
# the tree and the MLP get a fixed random_state so runs are repeatable
model_list = {}
model_list["Linear Regression"] = LinearRegression()
model_list["Decision Tree"] = DecisionTreeRegressor(random_state=42)
model_list["Neural Net (MLP)"] = MLPRegressor(
    hidden_layer_sizes=(16,),  # a single hidden layer of 16 units
    activation="relu",
    max_iter=3000,
    random_state=42,
)

# For every dataset: split it, fit each model on the training part, and
# report train MSE, test MSE and their difference (test - train).
for ds_label, csv_name in datasets.items():
    print("Training on:", ds_label)
    print("----------------------")

    X_train, X_test, y_train, y_test = get_data(csv_name)

    for model_label, model in model_list.items():
        # fit() is called again for each dataset on the same estimator object
        model.fit(X_train, y_train)

        # error on the rows the model was fitted on vs. the held-out rows
        train_mse = mean_squared_error(y_train, model.predict(X_train))
        test_mse = mean_squared_error(y_test, model.predict(X_test))

        print(model_label)
        report = (
            ("train MSE:", train_mse),
            ("test MSE: ", test_mse),
            ("gap:", test_mse - train_mse),
        )
        for caption, value in report:
            print(caption, round(value, 4))
        print("")



Training on: A (linear)
----------------------
Linear Regression
train MSE: 0.0
test MSE:  0.0
gap: -0.0

Decision Tree
train MSE: 0.0
test MSE:  0.0039
gap: 0.0039

Neural Net (MLP)
train MSE: 0.0131
test MSE:  0.0123
gap: -0.0008

Training on: B (poly+noise)
----------------------
Linear Regression
train MSE: 905.9513
test MSE:  887.7702
gap: -18.1811

Decision Tree
train MSE: 0.0
test MSE:  6.9507
gap: 6.9507





Neural Net (MLP)
train MSE: 4.0565
test MSE:  3.9332
gap: -0.1232

Training on: C (random)
----------------------
Linear Regression
train MSE: 33.6366
test MSE:  34.769
gap: 1.1324

Decision Tree
train MSE: 0.0
test MSE:  75.3971
gap: 75.3971

Neural Net (MLP)
train MSE: 33.0694
test MSE:  34.9067
gap: 1.8373



In [72]:
import os
import gzip

# short dataset label -> CSV file whose size is measured
data_files = dict(
    A="dataset_A_linear.csv",
    B="dataset_B_poly_noise.csv",
    C="dataset_C_random.csv",
)

def get_gzip_size(fname):
    """Return the number of bytes `fname` occupies after gzip compression.

    The whole file is read into memory and compressed with gzip.compress
    at its default settings; nothing is written to disk.
    """
    with open(fname, "rb") as src:
        compressed = gzip.compress(src.read())
    return len(compressed)

# Compare each dataset's on-disk size with its gzip-compressed size.
results = []  # one (label, original_bytes, compressed_bytes) tuple per dataset

for name, fname in data_files.items():
    orig = os.path.getsize(fname)
    zipped = get_gzip_size(fname)

    results.append((name, orig, zipped))

    print(name)
    print("original:", orig)
    print("ziped:", zipped)
    # compressed size as a fraction of the original, to 4 decimal places
    print("ratio ->", round(zipped / orig, 4))
    print("")

# compact one-line-per-dataset recap
print("results:")
for name, o, z in results:
    print(name, ": ", z, " (was ", o, ")", sep="")


A
original: 38615
ziped: 7782
ratio -> (0.20152790366437912, 4)

B
original: 38108
ziped: 14422
ratio -> (0.37845071900913196, 4)

C
original: 38564
ziped: 19453
ratio -> (0.5044341873249663, 4)

results:
A: 7782 (was 38615)
B: 14422 (was 38108)
C: 19453 (was 38564)
