# Exercise 01

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

## PART A - NumPy

* **(7 points)** Create a random array with 1000 rows and 5 columns by generating the columns one at a time and then concatenating them to create the dataset.

In [2]:
# Draw the five 1000-row random columns one at a time, then join them side by
# side with a single hstack call.
columns = [np.random.rand(1000, 1) for _ in range(5)]
rand_array = np.hstack(columns)
print(rand_array)
print(rand_array.shape)

[[0.75163551 0.21517617 0.97137068 0.29110584 0.88552509]
 [0.58144467 0.9275708  0.21388838 0.80979618 0.34296785]
 [0.4715218  0.98621432 0.62509873 0.80685944 0.66919438]
 ...
 [0.76603243 0.50730121 0.98169809 0.77786192 0.31158031]
 [0.00168853 0.96432945 0.4210681  0.78215753 0.59215648]
 [0.1848045  0.31009975 0.63434073 0.83466619 0.04476396]]
(1000, 5)


* **(4 points)** Shuffle the dataset.

In [3]:
# Shuffle the dataset in place. np.random.shuffle only reorders along the first
# axis, so whole rows move and each row keeps its own five values.
np.random.shuffle(rand_array)
print(rand_array)

[[0.49761724 0.52775188 0.79832255 0.64008884 0.64773076]
 [0.53516761 0.8508546  0.4340267  0.0434548  0.94175421]
 [0.96158402 0.85893433 0.53725214 0.67354171 0.73713656]
 ...
 [0.48931001 0.10775135 0.04362865 0.63282679 0.8103825 ]
 [0.87237714 0.45394154 0.33883746 0.14610274 0.24224143]
 [0.34235562 0.78537505 0.32143264 0.02888991 0.93546883]]


* **(5 points)** Split it using a 60/25/15 split into three sets: training, validation and test.

In [4]:
rand_array.shape[0]  # number of rows in the dataset

1000

In [5]:
# Row counts for the 60/25/15 split, truncated to whole rows.
array_len = rand_array.shape[0]
train_split, validation_split, test_split = (
    int(frac * array_len) for frac in (.6, .25, .15)
)

# Cut at the two boundaries; the last piece takes every leftover row.
rand_array_train, rand_array_valid, rand_array_test = np.split(
    rand_array, [train_split, train_split + validation_split]
)

for part in (rand_array_train, rand_array_valid, rand_array_test):
    print(part.shape)

(600, 5)
(250, 5)
(150, 5)


* **(4 points)** Standardize every column in the training set by subtracting the column's mean from it and dividing the result by the column's standard deviation.

In [6]:
# Per-column statistics of the training split (axis=0 reduces over rows).
# They are kept in their own variables so validation and test can reuse them.
train_mean = np.mean(rand_array_train, axis=0)
train_std = np.std(rand_array_train, axis=0)

# Broadcasting applies each column's mean/std to every row of that column.
rand_array_train = (rand_array_train - train_mean) / train_std
rand_array_train

array([[ 0.04206556,  0.06481263,  1.04605507,  0.52035035,  0.56201644],
       [ 0.17022976,  1.17797931, -0.19971905, -1.55713738,  1.56422233],
       [ 1.62564327,  1.20581592,  0.15327858,  0.63683368,  0.86676431],
       ...,
       [ 0.77146973, -1.45257558,  0.73172278, -1.47960532, -1.60612171],
       [ 0.94382788,  0.97219173,  0.31426742,  0.46573314,  0.45704933],
       [ 1.73108852,  0.24465355,  0.02743186,  0.63906646,  0.9572767 ]])

* **(2 points)** Do the same with validation and test sets but using the mean and standard deviation from the training set.

In [7]:
# Rescale the held-out splits with the training statistics, not their own.
rand_array_valid, rand_array_test = (
    (held_out - train_mean) / train_std
    for held_out in (rand_array_valid, rand_array_test)
)

# First five rows of each split, separated by one blank line.
print(rand_array_valid[:5], rand_array_test[:5], sep="\n\n")

[[ 1.64228441  0.90084778  1.44224602  0.79153311  0.30810123]
 [-1.55037338  0.48138995  1.38535553 -1.11576702  0.95825463]
 [-1.22722177  0.027836   -1.49171089  1.16357783 -0.55510121]
 [ 1.70161713  0.81498912  0.47194229  0.57696734 -0.10579517]
 [-0.17895298 -0.64346033  0.72452068  0.18723061  0.47592382]]

[[-1.50089139 -1.3162285  -1.17801249  0.80106971 -1.29437128]
 [ 1.30653272  0.92119211 -1.37425296 -0.43313461 -1.07125162]
 [ 0.09632107  1.63060457 -0.89502149  1.17864368 -0.30801838]
 [ 0.90906226 -1.01208598  1.6378238  -0.69481314  1.37256024]
 [ 0.88503406  1.13112395  1.21906298 -0.62606017 -0.09074497]]


## PART B - PyTorch
This is a repetition of PART A but using PyTorch instead.

* **(5 points)** Create a tensor  with 1000 rows and 5 columns by generating the columns one at a time and then concatenating them to create the dataset.

In [8]:
# Draw the five 1000-row random columns one at a time, then concatenate them
# along dim 1 (the column dimension) in a single call.
torch_columns = [torch.rand(1000, 1) for _ in range(5)]
rand_torch = torch.cat(torch_columns, dim=1)
print(rand_torch)
print(rand_torch.shape)

tensor([[0.8003, 0.8601, 0.3913, 0.7795, 0.7726],
        [0.3377, 0.3450, 0.4437, 0.7451, 0.6461],
        [0.3286, 0.5863, 0.2979, 0.7579, 0.6151],
        ...,
        [0.5793, 0.7584, 0.8764, 0.5330, 0.9151],
        [0.4425, 0.7987, 0.5851, 0.8206, 0.2822],
        [0.7041, 0.1093, 0.5218, 0.5278, 0.8563]])
torch.Size([1000, 5])


* **(4 points)** Shuffle the dataset.

In [9]:
# randperm yields a random ordering of the row indices 0..n-1; selecting rows
# in that order shuffles the tensor.
shuffled_index = torch.randperm(rand_torch.size(0))
rand_torch = rand_torch.index_select(0, shuffled_index)
rand_torch

tensor([[0.6273, 0.0331, 0.1148, 0.8788, 0.3910],
        [0.8188, 0.5117, 0.5546, 0.0967, 0.3220],
        [0.3013, 0.2081, 0.4553, 0.0159, 0.9733],
        ...,
        [0.5086, 0.8803, 0.9926, 0.2728, 0.8364],
        [0.9982, 0.1591, 0.8063, 0.5979, 0.3852],
        [0.3650, 0.9943, 0.3072, 0.9042, 0.6633]])

* **(5 points)** Split it using a 60/25/15 split into three sets: training, validation and test.

In [10]:
# Row counts for the 60/25/15 split, truncated to whole rows.
torch_len = rand_torch.size(0)
train_split, validation_split, test_split = (
    int(frac * torch_len) for frac in (.6, .25, .15)
)

# Split by section sizes; the last section is sized to take every leftover row.
rand_torch_train, rand_torch_valid, rand_torch_test = torch.split(
    rand_torch,
    [train_split, validation_split, torch_len - train_split - validation_split],
)

for part in (rand_torch_train, rand_torch_valid, rand_torch_test):
    print(part.shape)

torch.Size([600, 5])
torch.Size([250, 5])
torch.Size([150, 5])


* **(4 points)** Standardize every column in the training set by subtracting the column's mean from it and dividing the result by the column's standard deviation.

In [11]:
# Per-column statistics of the training split (dim=0 reduces over rows).
# They are kept in their own variables so validation and test can reuse them.
train_mean = torch.mean(rand_torch_train, dim=0)
train_std = torch.std(rand_torch_train, dim=0)

# Broadcasting applies each column's mean/std to every row of that column.
rand_torch_train = (rand_torch_train - train_mean) / train_std
rand_torch_train

tensor([[ 0.4127, -1.6237, -1.3435,  1.2356, -0.4561],
        [ 1.0786,  0.0522,  0.2081, -1.4005, -0.6937],
        [-0.7210, -1.0110, -0.1423, -1.6729,  1.5480],
        ...,
        [ 0.7477,  0.4489,  1.7786,  0.6790,  1.4353],
        [-0.4717, -1.3077, -0.8961, -0.4929,  1.3076],
        [-1.2894, -0.9482,  1.5418, -0.3850,  0.3521]])

* **(2 points)** Do the same with validation and test sets but using the mean and standard deviation from the training set.

In [12]:
# Rescale the held-out splits with the training statistics, not their own.
rand_torch_valid, rand_torch_test = (
    (held_out - train_mean) / train_std
    for held_out in (rand_torch_valid, rand_torch_test)
)

# First five rows of each split, separated by one blank line.
print(rand_torch_valid[:5], rand_torch_test[:5], sep="\n\n")

tensor([[-0.5452, -1.0768, -1.3115, -1.2541, -0.8374],
        [ 0.2368, -1.5803,  0.0916,  0.4575, -1.1198],
        [-0.5174,  1.0873,  0.6191,  0.0557,  0.1042],
        [ 1.4058, -0.1170, -0.9441,  0.7785, -0.1161],
        [-1.0550, -0.3819,  0.0763,  1.1148,  0.5924]])

tensor([[-0.5584,  0.1022,  1.2540,  0.2453, -1.5912],
        [-0.5541, -1.4701, -0.8711, -0.6473,  1.0272],
        [-0.6221,  0.5215,  1.0674, -0.3654,  0.6930],
        [ 0.7686,  0.7450, -0.8908, -1.6625,  0.6549],
        [-1.0806, -0.2992, -1.5704,  0.1151,  0.1925]])


## PART C - Pandas
* **(6 points)** Take the 1000-row dataset you created in the Numpy part and create a DataFrame with it. Give the columns the names `x1` to `x5`.

In [13]:
# Column labels x1..x5, one per column of the NumPy dataset.
cols = [f"x{k}" for k in range(1, 6)]

df = pd.DataFrame(data=rand_array, columns=cols)
df

Unnamed: 0,x1,x2,x3,x4,x5
0,0.497617,0.527752,0.798323,0.640089,0.647731
1,0.535168,0.850855,0.434027,0.043455,0.941754
2,0.961584,0.858934,0.537252,0.673542,0.737137
3,0.372617,0.081365,0.926203,0.853181,0.240759
4,0.508953,0.706298,0.066575,0.465725,0.265353
...,...,...,...,...,...
995,0.982321,0.107887,0.367223,0.103049,0.685719
996,0.711992,0.306617,0.421300,0.709578,0.532664
997,0.489310,0.107751,0.043629,0.632827,0.810383
998,0.872377,0.453942,0.338837,0.146103,0.242241


* **(5 points)** Shuffle the DataFrame.

In [14]:
# Shuffle by drawing every row in random order (frac=1 keeps 100% of the rows).
# The original index labels travel with their rows, so the index is no longer
# in ascending order afterwards.
df = df.sample(frac=1) #shuffle = randomly sample 100% of rows
df

Unnamed: 0,x1,x2,x3,x4,x5
667,0.641439,0.136382,0.448545,0.542787,0.618749
760,0.600568,0.444317,0.599178,0.386357,0.078151
625,0.262460,0.625121,0.605669,0.343264,0.972562
386,0.871563,0.504950,0.083884,0.714540,0.339843
997,0.489310,0.107751,0.043629,0.632827,0.810383
...,...,...,...,...,...
477,0.328478,0.133295,0.010622,0.194639,0.042911
4,0.508953,0.706298,0.066575,0.465725,0.265353
362,0.776734,0.153550,0.237214,0.726196,0.517286
854,0.744596,0.837255,0.848914,0.310851,0.456226


* **(7 points)** Break it using a 60/25/15 split into three dataframes: training, validation, and test.

In [15]:
# Derive the split boundaries from this DataFrame's own length instead of
# reusing train_split/validation_split left over from an earlier section, so
# the cell does not depend on state set elsewhere in the notebook.
df_len = len(df)
df_train_end = int(.6 * df_len)
df_valid_end = df_train_end + int(.25 * df_len)

# Slice by position with .iloc: after shuffling, the index labels are out of
# order, so label-based slicing would not give a 60/25/15 split.
df_train = df.iloc[:df_train_end]
df_valid = df.iloc[df_train_end:df_valid_end]
df_test = df.iloc[df_valid_end:]  # take all leftover rows

print(df_train.shape)
print(df_valid.shape)
print(df_test.shape)

(600, 5)
(250, 5)
(150, 5)


* **(6 points)** Standardize every column in the training dataframe by subtracting the column's mean from it and dividing the result by the column's standard deviation. Do the same with the validation and test dataframes, but using the mean and standard deviation from the training dataframe.

In [16]:
# Per-column statistics of the training frame; each is a Series indexed by
# column name, so the arithmetic below lines up column by column.
train_mean = df_train.mean()
train_std = df_train.std()

# Same transformation for all three frames, always with the training statistics.
df_train, df_valid, df_test = (
    frame.sub(train_mean).div(train_std)
    for frame in (df_train, df_valid, df_test)
)

# Print the three frames separated by one blank line each.
print(df_train, df_valid, df_test, sep="\n\n")

           x1        x2        x3        x4        x5
667  0.535629 -1.253206 -0.142902  0.147636  0.438735
760  0.393882 -0.195935  0.376689 -0.403669 -1.382411
625 -0.778740  0.424843  0.399078 -0.555541  1.630649
386  1.333741  0.012245 -1.400752  0.752944 -0.500833
997  0.008019 -1.351508 -1.539610  0.464963  1.084303
..        ...       ...       ...       ...       ...
170  1.527630 -0.041279  0.966325 -1.215357 -0.352668
292 -1.005852  0.302936  0.347231  1.075111 -1.321508
181 -0.682129 -1.202935 -1.309097  0.095450  1.207896
817 -0.529049 -0.359988  1.117836  0.609706 -1.101763
439  0.377032 -0.995760 -0.067774  0.877296  1.241674

[600 rows x 5 columns]

           x1        x2        x3        x4        x5
982 -0.599287  1.412647 -1.526151  1.552813 -0.658049
542  0.431182 -0.773982  1.663097 -0.818627  0.102635
141 -1.623644  1.654492  1.426498 -1.764206 -1.630122
831  1.422164  0.353817  0.305176  0.272197 -0.435401
402 -1.325200 -0.808654  1.364460  0.104066 -0.850888
.. 

* **(4 points)** Save all three dataframes into their own `.csv` files.

In [17]:
# Write each split to its own CSV file under ./output.
# The paths are built with pathlib because a backslash is only a directory
# separator on Windows; elsewhere 'output\\name.csv' is a single file name.
# The directory is created first so the cell also works on a fresh checkout.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

df_train.to_csv(output_dir / "df_train.csv")
df_valid.to_csv(output_dir / "df_valid.csv")
df_test.to_csv(output_dir / "df_test.csv")

## PART D - Matplotlib

* **(27 points)** Plot the following functions using $3 \times 2$ subplots. Each plot must have its own title and its own x and y labels. Also make sure that each plot shows enough of the function to reveal its behavior.
    * $f(x) = 3 x^2 + 10$
    * $f(x) = \frac{1}{1 + e^{-x}}$
    * $f(z) = \max(\alpha z, z)$ where $\alpha = 0.01$
    * $f(x) = \sin(x^2)$
    * $f(x) = \cos(x) + \ln(x)$
    * $f(x) = \tanh(x)$

In [None]:
# One title, one x-range and one callable per subplot, all in the same order.
titles = [
    "Function A", "Function B", "Function C",
    "Function D", "Function E", "Function F"
]

# Sampling grid for each function. Function E contains ln(x), so its grid
# starts at 1 and stays on positive x.
x_s = [
    np.arange(-12, 12, .1), np.arange(-5, 5, .1),
    np.arange(-6, 6, .1), np.arange(-7, 7, .1),
    np.arange(1, 7, .1), np.arange(-7, 7, .1)
]

# Slope applied to negative inputs in f(z) = max(alpha * z, z); the exercise
# specifies alpha = 0.01.
ALPHA = 0.01

f_x = [
    lambda x: 3 * x**2 + 10,
    lambda x: 1 / (1 + np.exp(-x)),
    lambda z: np.maximum(ALPHA * z, z),
    lambda x: np.sin(x**2),
    lambda x: np.cos(x) + np.log(x),
    lambda x: np.tanh(x)
]

# Build the 3x2 grid up front and fill one axes per function, row by row.
fig, axes = plt.subplots(3, 2, figsize=(16, 15))
for ax, title, x, fx in zip(axes.flat, titles, x_s, f_x):
    ax.plot(x, fx(x))
    ax.set_xlabel("$x$")
    ax.set_ylabel("$f(x)$")
    ax.set_title(title)
    ax.grid()

plt.show()

* **(3 points)** Save the plot into a file named `e01_3by2-fcns.png`.

In [None]:
# Save the figure under ./output. The path is built with pathlib because a
# backslash is only a directory separator on Windows, and the directory is
# created first in case it does not exist yet.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(output_dir / "e01_3by2-fcns.png")