# Subsampling continuous and discrete responses

In [None]:
!pip uninstall nnetsauce --yes

In [None]:
!pip install nnetsauce==0.16.7

# 1 - using a percentage of the data

In [None]:
import os 
import matplotlib.pyplot as plt
import nnetsauce as ns
import numpy as np
from sklearn.datasets import load_digits, load_diabetes
from time import time


# ---------- dataset no. 1: digits (classification target) ----------

print("\n\n dataset no. 1 (classification) ---------- \n\n")

# Only the target vector is kept; it is the sole data passed to SubSampler.
digits = load_digits()
u = digits.target

print(" \n sequential ----- \n")

# row_sample=0.6 with n_jobs=None: the run labelled "sequential".
sampler_seq = ns.SubSampler(y=u, row_sample=0.6, seed=123, n_jobs=None)
t0 = time()
x1 = sampler_seq.subsample()
elapsed = time() - t0
print(f"elapsed time: {elapsed}")

print(x1)

# Histogram of the values returned by subsample().
_ = plt.hist(x1, bins="auto")
plt.show()

In [None]:
# Same settings as the sequential run, except n_jobs=2.
# Author's note: needs more data to be efficient.
print(" \n parallel ----- \n")

sampler_par = ns.SubSampler(y=u, row_sample=0.6, seed=123, n_jobs=2)
t0 = time()
x2 = sampler_par.subsample()
elapsed = time() - t0
print(f"elapsed time: {elapsed}")

print(x2)

# Compare the parallel result with the sequential one (x1).
same_result = np.allclose(x1, x2)
print(f" \n check: {same_result} \n")

# Histogram of the values returned by subsample().
_ = plt.hist(x2, bins="auto")
plt.show()


In [None]:
# ---------- dataset no. 2: diabetes (regression target) ----------

print("\n\n dataset no. 2 (regression) ---------- \n\n")

# Only the target vector is kept; it is the sole data passed to SubSampler.
diabetes = load_diabetes()
v = diabetes.target

print(" \n sequential ----- \n")

# row_sample=0.6 with n_jobs=None: the run labelled "sequential".
sampler_seq = ns.SubSampler(y=v, row_sample=0.6, seed=123, n_jobs=None)
t0 = time()
y1 = sampler_seq.subsample()
elapsed = time() - t0
print(f"elapsed time: {elapsed}")

print(y1)

# Histogram of the values returned by subsample().
_ = plt.hist(y1, bins="auto")
plt.show()


In [None]:
# Same settings as the sequential run on v, except n_jobs=2.
# Author's note: needs more data to be efficient.
print(" \n parallel ----- \n")

sampler_par = ns.SubSampler(y=v, row_sample=0.6, seed=123, n_jobs=2)
t0 = time()
y2 = sampler_par.subsample()
elapsed = time() - t0
print(f"elapsed time: {elapsed}")

print(y2)

# Histogram of the values returned by subsample().
_ = plt.hist(y2, bins="auto")
plt.show()

In [None]:
# Compare the sequential (y1) and parallel (y2) results for the regression target.
same_result = np.allclose(y1, y2)
print(f" \n check: {same_result} \n")

In [None]:
# Size of each subsample, followed by the size of the full target it came
# from: first the classification target (u), then the regression one (v).
for arr in (x1, x2, u, y1, y2, v):
    print(len(arr))

In [None]:
# Compare the distribution of each subsample with the full target vector.
# Top row: classification target u; bottom row: regression target v.
# Columns: sequential subsample, parallel subsample, all observations.
fig, axs = plt.subplots(2, 3)
panels = (
    (u[x1], "u[x1] (sequential)"),
    (u[x2], "u[x2] (parallel)"),
    (u, "u (all)"),
    (v[y1], "v[y1] (sequential)"),
    (v[y2], "v[y2] (parallel)"),
    (v, "v (all)"),
)
# axs.flat walks the 2x3 grid row by row, matching the order of `panels`.
for ax, (values, title) in zip(axs.flat, panels):
    ax.hist(values, bins='auto', density=False)
    ax.set_title(title)
# Keep the titles from overlapping the neighbouring axes.
fig.tight_layout()
# Ending on plt.show() (as the other plotting cells do) stops the notebook
# from echoing the return value of the last hist() call.
plt.show()

# 2 - using a fixed number of samples

In [None]:
print(" \n sequential ----- \n")

# First request: a fixed number of observations (n_samples=100).
sampler_small = ns.SubSampler(y=u, n_samples=100, seed=123, n_jobs=None)
t0 = time()
x1 = sampler_small.subsample()
elapsed = time() - t0
print(f"elapsed time: {elapsed}")
print(x1)
print(len(x1))
print(len(u))

# Second request: n_samples=250, same seed and target.
sampler_large = ns.SubSampler(y=u, n_samples=250, seed=123, n_jobs=None)
t0 = time()
x2 = sampler_large.subsample()
elapsed = time() - t0
print(f"elapsed time: {elapsed}")
# Print the sampler's `indices` attribute next to the returned value.
print(sampler_large.indices)
print(x2)
print(len(x2))
print(len(u))

# Compare the two fixed-size subsamples of u with the full target vector,
# one panel each; the titles report the number of observations shown.
fig, axs = plt.subplots(1, 3)
panels = (
    (u[x1], f"u[x1] (n={len(x1)})"),
    (u[x2], f"u[x2] (n={len(x2)})"),
    (u, f"u (n={len(u)})"),
)
for ax, (values, title) in zip(axs, panels):
    ax.hist(values, bins='auto', density=False)
    ax.set_title(title)
# Keep the titles from overlapping the neighbouring axes.
fig.tight_layout()
# Ending on plt.show() (as the other plotting cells do) stops the notebook
# from echoing the return value of the last hist() call.
plt.show()