1. Write a Python function to compute the predictions according to the mean Euclidean distance to the sample points of each class.
The function should have the following interface: prediction = meanPrediction(dataClass1, dataClass2, dataUnknownClass), where dataClass1 is an N1×d array, dataClass2 is an N2×d array, dataUnknownClass is an Nt×d array, and prediction is an Nt×1 array. Here d is the dimension of the features.

a) Determine the training error on your samples using only the x1 feature value. Make use of the function meanPrediction you wrote.

b) Repeat but now use two feature values, x1 and x2.

c) Repeat but use all three feature values.

d) Discuss your results. Is it ever possible, for a finite set of data, for the training error to be larger when more data dimensions are used?

In [3]:
import numpy as np
import pandas as pd
from skimage import io
from sklearn import metrics, preprocessing
from matplotlib import pyplot as pl

#%%

# Implement the euclidean_distance function here:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between vectors 'a' and 'b'.

    The element-wise difference ``a - b`` is squared, summed over all of
    its elements, and the square root of that sum is returned.
    """
    offset = a - b
    return np.sqrt(np.square(offset).sum())

def mac_queen_initialisation(X, k):
    """Pick 'k' random data points of 'X' to serve as initial centroids.

    X is a N-by-M numpy array of N data points, each with M features.
    k is the number of centroids to select.

    Returns a new k-by-M float array; it is a copy, so modifying the
    centroids afterwards never alters 'X'.

    The rows are drawn without repetition whenever 'X' has at least 'k'
    rows. Drawing the same row twice would give two identical centroids,
    and one of the two clusters would then never receive any point.
    Only when k exceeds the number of rows are repeated rows allowed.
    """
    n_points = X.shape[0]
    allow_repeats = k > n_points
    chosen = np.random.choice(n_points, size=k, replace=allow_repeats)
    # Fancy indexing copies the selected rows; force float like np.zeros did.
    return np.array(X[chosen], dtype=float)


#%%

# The Euclidean distance function can be used for k-Means clustering:
def k_means(X, k=3, n_iterations=10):
    """Cluster the rows of 'X' into 'k' groups with the k-Means algorithm.

    X is a N-by-M numpy array of N data points, each with M dimensions/features
    k is the number of clusters to compute (3 is the default value)
    n_iterations is the number of iterations to run (10 is the default value)

    Returns a tuple (membership, centroids, centroids_history):
      membership         length-N float array, the cluster index of each point
      centroids          k-by-M array with the final centroids
      centroids_history  (n_iterations+1)-by-k-by-M array; entry 0 holds the
                         initial centroids, entry i the centroids after
                         iteration i
    """
    # First, we need to initialise the clusters.
    # To simplify, we will use the MacQueen method, selecting 'k' random points of 'X' as initial centroids:
    centroids = mac_queen_initialisation(X, k)

    # Keep a history of the centroids' movements:
    centroids_history = np.zeros((n_iterations+1, k, X.shape[1]))
    centroids_history[0, :, :] = centroids

    # This will store the cluster membership of each data point in 'X':
    membership = np.zeros((X.shape[0]))

    # The k_means algorithm is iterative; always run exactly 'n_iterations' rounds
    # so that 'centroids_history' is completely filled:
    for ii in range(n_iterations):
        # Assignment step: attach each data point to the cluster with the nearest centroid.
        for index, point in enumerate(X):
            distance = [euclidean_distance(point, cc) for cc in centroids]
            membership[index] = np.argmin(distance)

        # Update step: move each centroid to the mean of the points assigned to it.
        for cc in range(k):
            members = X[membership == cc]
            # A cluster that received no point has no mean: np.mean of an empty
            # slice is NaN, which would corrupt this centroid and every later
            # distance comparison. Leave such a centroid where it is.
            if members.shape[0] > 0:
                centroids[cc] = np.mean(members, axis=0)

        centroids_history[ii+1, :, :] = centroids

    # Finally, return the clustering result:
    return membership, centroids, centroids_history

In [9]:
# Samples of the first class: 10 rows (samples) by 3 columns (feature values).
dataClass1 = np.array([
    [-5.01, -8.12, -3.68],
    [-5.43, -3.48, -3.54],
    [1.08, -5.52, -1.66],
    [0.86, -3.78, -4.11],
    [-2.67, -0.63, 7.39],
    [4.94, 3.29, 2.08],
    [-2.51, 2.09, -2.59],
    [-2.25, -2.13, -6.94],
    [5.56, 2.86, -2.26],
    [1.03, -3.33, 4.33],
])

# Samples of the second class, laid out the same way.
dataClass2 = np.array([
    [-0.91, -0.18, -0.05],
    [1.30, -2.06, -3.53],
    [-7.75, -4.54, -0.95],
    [-5.47, 0.50, 3.92],
    [6.14, 5.72, -4.85],
    [3.60, 1.26, 4.36],
    [5.37, -4.63, -3.65],
    [7.18, 1.46, -6.66],
    [-7.39, 1.17, 6.30],
    [-7.50, -6.32, -0.31],
])

# Show both datasets so they can be checked against the assignment table.
print("dataClass1 is : \n", dataClass1, '\n')
print("dataClass2 is : \n", dataClass2, '\n')


dataClass1 is : 
 [[-5.01 -8.12 -3.68]
 [-5.43 -3.48 -3.54]
 [ 1.08 -5.52 -1.66]
 [ 0.86 -3.78 -4.11]
 [-2.67 -0.63  7.39]
 [ 4.94  3.29  2.08]
 [-2.51  2.09 -2.59]
 [-2.25 -2.13 -6.94]
 [ 5.56  2.86 -2.26]
 [ 1.03 -3.33  4.33]] 

dataClass2 is : 
 [[-0.91 -0.18 -0.05]
 [ 1.3  -2.06 -3.53]
 [-7.75 -4.54 -0.95]
 [-5.47  0.5   3.92]
 [ 6.14  5.72 -4.85]
 [ 3.6   1.26  4.36]
 [ 5.37 -4.63 -3.65]
 [ 7.18  1.46 -6.66]
 [-7.39  1.17  6.3 ]
 [-7.5  -6.32 -0.31]] 



2. Peter is a very predictable man. When he uses his tablet, all he does is watch movies. He always watches until his battery dies. He is also a very meticulous man. He has kept logs of every time he has charged his tablet, which includes how long he charged his tablet for and how long he was able to watch movies for afterwards. Now, Peter wants to use this log to predict how long he will be able to watch movies for when he starts so that he can plan his activities after watching his movies accordingly.
You will be able to access Peter’s tablet charging log by reading from the file “TabletTrainingdata.txt”. The training data file consists of 100 lines, each with 2 comma-separated numbers. The first number denotes the amount of time the tablet was charged and the second denotes the amount of time the battery lasted.
Read an input (test case) from the console (stdin) representing the amount of time the tablet was charged and output to the console the amount of time you predict his battery will last.

    #example to read test case
    timeCharged = float(input().strip())
    
    #example to output
    print(prediction)

In [None]:
#Regression Model
import numpy as np
import matplotlib.pyplot as plt

#USING OUR OWN IMPLEMENTATION OF LINEAR REGRESSION
def polyRegression(data1D, yy, testData, degree):
    """Fit a polynomial by ordinary least squares and evaluate it on new inputs.

    data1D is a N-by-1 array with the training inputs
    yy holds the N training targets (N-by-1, or a length-N vector)
    testData is a Nt-by-1 array with the inputs to predict for
    degree is the polynomial degree (0 fits a constant, 1 a straight line, ...)

    Returns a tuple (pred, ww): 'pred' holds the predictions for 'testData'
    and 'ww' the fitted coefficients, ordered from the constant term up to
    the 'degree'-th power.

    Raises numpy.linalg.LinAlgError when the normal equations are singular.
    """
    def design_matrix(column):
        # One column per power: column**0, column**1, ..., column**degree.
        return np.concatenate([column**dd for dd in range(degree+1)], axis=1)

    xdata = design_matrix(data1D)

    # Solve the normal equations (X^T X) w = X^T y directly; this avoids
    # forming the explicit inverse of X^T X.
    gram = np.dot(xdata.transpose(), xdata)
    ww = np.linalg.solve(gram, np.dot(xdata.transpose(), yy))

    # The test inputs must be expanded into the same polynomial features
    # before the weights are applied to them.
    pred = np.dot(design_matrix(testData), ww)
    return pred, ww

# Load the charging log: comma-separated rows, read into a numpy array.
data = np.genfromtxt('TabletTrainingdata.txt', delimiter=',')
print(data.shape)
print(type(data))
print(data)

# Read the test case (time charged) from stdin as a 1-by-1 array.
testData = np.array([[float(input().strip())]])

# First column as input, last column as target; a degree-0 polynomial is requested.
prediction, model = polyRegression(data[:, [0]], data[:, [-1]], testData, degree=0)

In [None]:
# Report the shape, type and value of the prediction.
print(prediction.shape)
print(type(prediction))
print(prediction)

# Draw the predicted point, then the training samples as circles.
plt.plot(testData, prediction)
plt.plot(data[:, [0]], data[:, [-1]], 'o')

# Show whatever 'model' currently holds (the fitted weights if the cell above ran last).
print(model)


In [None]:
# Same regression task, this time using scikit-learn.
print("USING sklearn")
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Reload the charging log: comma-separated rows, read into a numpy array.
data = np.genfromtxt('TabletTrainingdata.txt', delimiter=',')
print(data.shape)
print(type(data))
print(data)

In [None]:
# Read the test case (time charged) from stdin as a 1-by-1 array.
testData = np.array([[float(input().strip())]])

# Degree-9 polynomial features feeding a linear regression, fitted on
# the first column (input) against the last column (target).
model = make_pipeline(PolynomialFeatures(9), LinearRegression())
model.fit(data[:, [0]], data[:, -1])
pred = model.predict(testData)
print(pred.shape)
print(type(pred))

# Draw the predicted point, then the training samples as circles.
plt.plot(testData, pred)
plt.plot(data[:, [0]], data[:, [-1]], 'o')
print(model)
print(pred)