# Classification

In [3]:
import io
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import operator
import math

In [4]:
data=pd.read_csv('Iris.csv')
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


# Removing the unnecessary column

In [5]:
data.drop('Id',axis=1,inplace=True)
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [16]:
train, test = train_test_split(data, test_size=0.33, random_state=42) 
test.index = np.arange(0, len(test))
print(test)

    SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm          Species
0             6.1           2.8            4.7           1.2  Iris-versicolor
1             5.7           3.8            1.7           0.3      Iris-setosa
2             7.7           2.6            6.9           2.3   Iris-virginica
3             6.0           2.9            4.5           1.5  Iris-versicolor
4             6.8           2.8            4.8           1.4  Iris-versicolor
5             5.4           3.4            1.5           0.4      Iris-setosa
6             5.6           2.9            3.6           1.3  Iris-versicolor
7             6.9           3.1            5.1           2.3   Iris-virginica
8             6.2           2.2            4.5           1.5  Iris-versicolor
9             5.8           2.7            3.9           1.2  Iris-versicolor
10            6.5           3.2            5.1           2.0   Iris-virginica
11            4.8           3.0            1.4           0.1    

In [126]:
def euclideanDistance(data1, data2, length):
    distance = 0
    for x in range(length):
        #print("Operand:",data1[x],", ",data2[x])
        distance += pow((data1[x] - data2[x]), 2)
    return math.sqrt(distance)

In [127]:
def ManhattanDistance(data1, data2, length):
    data1 = np.asarray(data1[:4])
    data2 = np.asarray(data2[:4])
    return sum(abs(data1 - data2))

In [128]:
def CosineDistance(data1, data2, length):
    data1 = np.asarray(data1[:4])
    data2 = np.asarray(data2[:4])
    dot = np.dot(data1, data2)
    norma = np.linalg.norm(data1)
    normb = np.linalg.norm(data2)
    cos = dot / (norma * normb)
    return cos#CD = np.vectorize(cosineDistance)


In [129]:
def getNeighbors(trainingSet, testInstance, k, distance_metric):
    distances = []
    #Make this length a list of the feature indexes which we want to use
    length = len(testInstance)-1
    for x in range(len(trainingSet)):
        if(distance_metric=='Euclidean'):
            train=(trainingSet.iloc[x].tolist())
            dist = euclideanDistance(testInstance, train, length)
            #print(dist)
        elif(distance_metric=='Manhattan'):
            train=(trainingSet.iloc[x].tolist())
            dist = ManhattanDistance(testInstance, train, length)
            #print(dist)
        elif(distance_metric=='Cosine'):
            train=(trainingSet.iloc[x].tolist())
            dist = CosineDistance(testInstance, train, length)
            #print(dist)
        distances.append((train, dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

In [130]:
def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]

In [131]:
#Calculate accuracy of predictions
def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        test=(testSet.iloc[x].tolist())
        if test[-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [132]:
def KNN_predict(trainingSet, testSet, k, distance_metric):
    predictions=[]
    #print(len(testSet))
    for x in range(len(testSet)):
        test=(testSet.iloc[x].tolist())
        neighbors = getNeighbors(trainingSet, test, k, distance_metric)
        result = getResponse(neighbors)
        predictions.append(result)
        if(distance_metric=='Euclidean'):
            print('> predicted=' + repr(result) + ', actual=' + repr(test[-1]))
        elif(distance_metric=='Manhattan'):
            print('> predicted=' + repr(result) + ', actual=' + repr(test[-1]))
        elif(distance_metric=='Cosine'):
            print('> predicted=' + repr(result) + ', actual=' + repr(test[-1]))

    accuracy = getAccuracy(testSet, predictions)
    if(distance_metric=='Euclidean'):
        print('Accuracy: ' + repr(accuracy) + '%')
    elif(distance_metric=='Manhattan'):
        print('Accuracy: ' + repr(accuracy) + '%')
    elif(distance_metric=='Cosine'):
        print('Accuracy: ' + repr(accuracy) + '%')

In [133]:
K=3
print('****************************Euclidean***************************************')
KNN_predict(train, test, K, distance_metric='Euclidean')
print('****************************Manhattan***************************************')
KNN_predict(train, test, K, distance_metric='Manhattan')
print('****************************Cosine***************************************')
KNN_predict(train, test, K, distance_metric='Cosine')

****************************Euclidean***************************************
> predicted='Iris-versicolor', actual='Iris-versicolor'
> predicted='Iris-setosa', actual='Iris-setosa'
> predicted='Iris-virginica', actual='Iris-virginica'
> predicted='Iris-versicolor', actual='Iris-versicolor'
> predicted='Iris-versicolor', actual='Iris-versicolor'
> predicted='Iris-setosa', actual='Iris-setosa'
> predicted='Iris-versicolor', actual='Iris-versicolor'
> predicted='Iris-virginica', actual='Iris-virginica'
> predicted='Iris-versicolor', actual='Iris-versicolor'
> predicted='Iris-versicolor', actual='Iris-versicolor'
> predicted='Iris-virginica', actual='Iris-virginica'
> predicted='Iris-setosa', actual='Iris-setosa'
> predicted='Iris-setosa', actual='Iris-setosa'
> predicted='Iris-setosa', actual='Iris-setosa'
> predicted='Iris-setosa', actual='Iris-setosa'
> predicted='Iris-versicolor', actual='Iris-versicolor'
> predicted='Iris-virginica', actual='Iris-virginica'
> predicted='Iris-versicolo

# Regression

In [134]:
data=pd.read_csv('Real_estate_valuation.csv')
data.head()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


# Removing the unnecessary column

In [136]:
data.drop('No',axis=1,inplace=True)
data.head()

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [137]:
train, test = train_test_split(data, test_size=0.33, random_state=42) 
test.index = np.arange(0, len(test))
print(test)

     X1 transaction date  X2 house age  \
0               2013.167           1.1   
1               2013.000          13.2   
2               2013.083           0.0   
3               2012.917          12.7   
4               2012.667          20.2   
5               2013.583          32.5   
6               2012.917          15.9   
7               2013.250          16.2   
8               2012.917          31.9   
9               2013.583           6.6   
10              2013.500          25.3   
11              2013.500           4.0   
12              2012.833           5.1   
13              2012.833          31.7   
14              2013.083          38.6   
15              2013.417          14.7   
16              2013.417          33.6   
17              2012.833           3.4   
18              2012.917          17.3   
19              2013.083          15.1   
20              2013.000          30.9   
21              2013.417          17.9   
22              2013.250          

In [142]:
def euclideanDistance(data1, data2, length):
    distance = 0
    for x in range(length):
        #print("Operand:",data1[x],", ",data2[x])
        distance += pow((data1[x] - data2[x]), 2)
    return math.sqrt(distance)

In [143]:
def ManhattanDistanceReg(data1, data2, length):
    data1 = np.asarray(data1[:6])
    data2 = np.asarray(data2[:6])
    return sum(abs(data1 - data2))


In [144]:
def CosineDistanceReg(data1, data2, length):
    data1 = np.asarray(data1[:6])
    data2 = np.asarray(data2[:6])
    dot = np.dot(data1, data2)
    norma = np.linalg.norm(data1)
    normb = np.linalg.norm(data2)
    cos = dot / (norma * normb)
    return cos#CD = np.vectorize(cosineDistance)

In [145]:
def getNeighbors(trainingSet, testInstance, k, distance_metric):
    distances = []
    #Make this length a list of the feature indexes which we want to use
    length = len(testInstance)-1
    for x in range(len(trainingSet)):
        if(distance_metric=='Euclidean'):
            train=(trainingSet.iloc[x].tolist())
            dist = euclideanDistance(testInstance, train, length)
            #print(dist)
        elif(distance_metric=='Manhattan'):
            train=(trainingSet.iloc[x].tolist())
            dist = ManhattanDistanceReg(testInstance, train, length)
            #print(dist)
        elif(distance_metric=='Cosine'):
            train=(trainingSet.iloc[x].tolist())
            dist = CosineDistanceReg(testInstance, train, length)
            #print(dist)
        distances.append((train, dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

In [146]:
def getResponse_Reg(neighbors, k):
    sum1=0
    for x in range(len(neighbors)):
        sum1 += neighbors[x][-1]
    result=sum1/k
    return result

In [152]:
def RMSE_Reg(testSet, predictions):
    sqerr=0
    for x in range(len(testSet)):
        test=(testSet.iloc[x].tolist())
        err=test[-1]-predictions[x]
        sqerr+=pow(err, 2)
    mse=sqerr/len(testSet)
    rmse=pow(mse, 0.5)
        
    return rmse

In [153]:
def KNN_predict_Reg(trainingSet, testSet, k, distance_metric):
    predictions=[]
    #print(len(testSet))
    for x in range(len(testSet)):
        test=(testSet.iloc[x].tolist())
        neighbors = getNeighbors(trainingSet, test, k, distance_metric)
        result = getResponse_Reg(neighbors, k)
        predictions.append(result)
        if(distance_metric=='Euclidean'):
            print('> predicted=' + repr(result) + ', actual=' + repr(test[-1]))
        elif(distance_metric=='Manhattan'):
            print('> predicted=' + repr(result) + ', actual=' + repr(test[-1]))
        elif(distance_metric=='Cosine'):
            print('> predicted=' + repr(result) + ', actual=' + repr(test[-1]))

    rmse = RMSE_Reg(testSet, predictions)
    if(distance_metric=='Euclidean'):
        print('Root Mean Square Error: ' + repr(rmse) + '%')
    elif(distance_metric=='Manhattan'):
        print('Root Mean Square Error: ' + repr(rmse) + '%')
    elif(distance_metric=='Cosine'):
        print('Root Mean Square Error: ' + repr(rmse) + '%')

In [154]:
K=3
print('****************************Euclidean***************************************')
KNN_predict_Reg(train, test, K, distance_metric='Euclidean')
print('****************************Manhattan***************************************')
KNN_predict_Reg(train, test, K, distance_metric='Manhattan')
print('****************************Cosine***************************************')
KNN_predict_Reg(train, test, K, distance_metric='Cosine')

****************************Euclidean***************************************
> predicted=49.46666666666667, actual=45.1
> predicted=36.5, actual=42.3
> predicted=41.666666666666664, actual=52.2
> predicted=37.53333333333333, actual=37.3
> predicted=25.066666666666666, actual=22.8
> predicted=37.43333333333334, actual=36.3
> predicted=45.0, actual=53.0
> predicted=47.96666666666667, actual=51.4
> predicted=15.200000000000001, actual=16.1
> predicted=60.5, actual=59.0
> predicted=25.833333333333332, actual=30.6
> predicted=32.13333333333333, actual=30.7
> predicted=26.26666666666667, actual=35.6
> predicted=15.200000000000001, actual=13.7
> predicted=31.966666666666665, actual=62.9
> predicted=27.7, actual=30.5
> predicted=40.53333333333333, actual=41.9
> predicted=53.73333333333333, actual=54.4
> predicted=26.599999999999998, actual=29.5
> predicted=41.666666666666664, actual=43.7
> predicted=17.066666666666666, actual=12.2
> predicted=25.600000000000005, actual=22.1
> predicted=46.6666

> predicted=23.900000000000002, actual=30.6
> predicted=25.066666666666666, actual=25.6
> predicted=47.96666666666667, actual=46.2
> predicted=26.03333333333333, actual=27.7
> predicted=25.266666666666666, actual=27.0
> predicted=15.133333333333335, actual=15.6
> predicted=17.733333333333334, actual=15.5
> predicted=17.066666666666666, actual=11.2
> predicted=25.599999999999998, actual=23.7
> predicted=26.76666666666667, actual=24.6
> predicted=34.36666666666667, actual=42.0
> predicted=43.699999999999996, actual=29.8
> predicted=27.333333333333332, actual=28.9
> predicted=37.833333333333336, actual=48.1
> predicted=38.833333333333336, actual=36.8
> predicted=56.03333333333333, actual=53.5
> predicted=49.03333333333333, actual=51.6
> predicted=37.800000000000004, actual=38.8
> predicted=37.333333333333336, actual=37.4
> predicted=46.5, actual=40.6
> predicted=47.199999999999996, actual=37.2
> predicted=40.300000000000004, actual=25.0
> predicted=26.599999999999998, actual=23.5
> predic

> predicted=17.066666666666666, actual=57.1
> predicted=53.73333333333333, actual=24.7
> predicted=17.066666666666666, actual=55.3
> predicted=17.066666666666666, actual=55.0
> predicted=17.066666666666666, actual=53.3
> predicted=17.066666666666666, actual=23.1
> predicted=17.066666666666666, actual=29.5
> predicted=17.066666666666666, actual=40.1
> predicted=53.73333333333333, actual=45.1
> predicted=17.066666666666666, actual=70.1
> predicted=17.066666666666666, actual=50.7
> predicted=17.066666666666666, actual=37.0
> predicted=17.066666666666666, actual=46.7
> predicted=53.73333333333333, actual=29.3
> predicted=17.066666666666666, actual=47.7
> predicted=53.73333333333333, actual=19.2
> predicted=53.73333333333333, actual=20.9
> predicted=53.73333333333333, actual=29.4
> predicted=17.066666666666666, actual=39.1
> predicted=17.066666666666666, actual=45.5
> predicted=17.066666666666666, actual=38.1
> predicted=53.73333333333333, actual=28.4
> predicted=17.066666666666666, actual=