a) Use the Nearest Neighbors method to determine missing values. Choose k = 3. Normalize the attributes
to [0, 1]. Use the Manhattan metric for distance or the 0/1 distance for nominal attributes

In [84]:
import pandas as pd

# Load the exercise data; the path is relative to the notebook's working directory.
df = pd.read_csv("../../original_files/exercises/Sheet 6/data.csv", delimiter=",")
# Strip the literal "D" from the Day labels (e.g. the exercise's "D17") and store integers.
df["Day"] = df["Day"].str.replace("D", "", regex=False).astype(int)
# Keep this as the last expression so the per-column missing-value counts are displayed.
df.isna().sum()


In total, there are 5 missing values to be filled.
Using k-nearest neighbors, we need to calculate
$$5 \times 10 = 50$$
distances in total.

In [94]:
class knn_weather():
    """k-nearest-neighbour model for rows with numeric and nominal attributes.

    Numeric attributes contribute a min-max-scaled absolute difference
    (Manhattan distance on values normalized to [0, 1]); string attributes
    contribute a 0/1 mismatch distance.  ``predict`` returns, for every
    attribute that is missing in the query, the mode of that attribute among
    the ``k`` closest training rows.
    """

    def __init__(self, k):
        """Create an unfitted model that will consult ``k`` neighbours."""
        self.training_X = None
        self.k = k
        # column name -> [scale, offset] of the linear map y = scale * x + offset
        self.MinMax = {}

    def fit(self, training_X):
        """Store a copy of ``training_X`` and learn the numeric scaling factors.

        Parameters
        ----------
        training_X : pandas.DataFrame
            Training rows.  The caller's frame is not modified.
        """
        self.training_X = training_X.copy()
        # Forget factors from an earlier fit so stale columns cannot leak in.
        self.MinMax = {}
        self.__normalize(self.training_X)

    # Min-Max normalization is integrated in the class
    def __normalize(self, X):
        """Record min-max scaling factors for every numeric column of ``X``.

        For each numeric column ``[scale, offset]`` is stored such that
        ``scale * x + offset == (max - x) / (max - min)``.  Only ``scale`` is
        needed for distances, because the offset cancels in a difference.
        """
        for column in X.columns:
            dtype = X[column].dtype
            # Accept every numeric width (int32, float32, ...), not only the
            # 64-bit defaults; booleans are left to the 0/1 distance.
            if not pd.api.types.is_numeric_dtype(dtype) or pd.api.types.is_bool_dtype(dtype):
                continue
            col_max = X[column].max()
            col_min = X[column].min()
            span = col_max - col_min
            if span > 0:
                self.MinMax[column] = [-1 / span, col_max / span]
            else:
                # Constant column: dividing by zero would turn every distance
                # into NaN.  Such a column carries no information, so it
                # contributes nothing to the distance.
                self.MinMax[column] = [0.0, 0.0]

    def predict(self, X):
        """Predict the missing attributes of one sample.

        Parameters
        ----------
        X : pandas.Series
            One sample indexed by attribute name.  NaN entries are the
            attributes to predict; all other entries are used for the distance.

        Returns
        -------
        pandas.Series
            Indexed by the attributes that are NaN in ``X``; each value is the
            mode of that attribute over the ``k`` nearest training rows.  When
            several values are equally frequent, the first entry of pandas'
            mode result is used.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.

        Notes
        -----
        As a side effect ``self.training_X`` is replaced by the training rows
        sorted by ascending distance, with the distances in a ``"Distance"``
        column.
        """
        if self.training_X is None:
            raise RuntimeError("fit() must be called before predict()")

        known_columns = X.index[~X.isna()]
        distances = []
        for _, train_sample in self.training_X.iterrows():
            total = 0
            for column in known_columns:
                value = X[column]
                if isinstance(value, str) or column not in self.MinMax:
                    # 0/1 distance
                    if value != train_sample[column]:
                        total += 1
                else:
                    # L1 distance + normalization
                    scale = self.MinMax[column][0]
                    total += abs(scale * (value - train_sample[column]))
            distances.append(total)

        ranked = self.training_X.copy()
        ranked["Distance"] = distances
        # Sort once, after all distances are known; a stable sort keeps tied
        # rows in their existing order.
        self.training_X = ranked.sort_values("Distance", kind="mergesort")
        # display(self.training_X)
        return self.training_X.iloc[:self.k].mode(axis=0).loc[0, X[X.isna()].index]


In [86]:
# Fill every incomplete row from its 3 nearest neighbours among the rows that
# are complete at that moment (rows imputed earlier are therefore reused).
feature_columns = df.columns[:5]
for row_label, row in df.loc[:, feature_columns].iterrows():
    missing_mask = row.isna()
    if not missing_mask.any():
        continue
    knn_model = knn_weather(k=3)
    knn_model.fit(df.dropna())
    predict = knn_model.predict(row)
    # Write the predicted attributes back into the shared frame.
    df.loc[row_label, predict.index.values] = predict[predict.index.values]
    print(row[~missing_mask].to_string())
    print(predict.to_string())


Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,PlayTennis,Distance
1,2,Sunny,28.0,High,Strong,No,0.189076
7,8,Sunny,25.0,High,Weak,No,0.558824
2,3,Overcast,29.0,High,Weak,Yes,1.319328
3,4,Rain,23.0,High,Weak,Yes,1.390756
8,9,Sunny,18.0,Normal,Weak,Yes,2.042017
13,14,Rain,24.0,High,Strong,No,2.046218
11,12,Overcast,21.0,High,Strong,Yes,2.079832
14,15,Sunny,23.0,Normal,Weak,No,2.176471
15,16,Sunny,21.0,Normal,Weak,Yes,2.365546
9,10,Rain,20.0,Normal,Weak,Yes,2.995798


Day                1
Outlook        Sunny
Temperature     26.0
Humidity        High
Wind    Weak


Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,PlayTennis,Distance
9,10,Rain,20.0,Normal,Weak,Yes,0.333333
3,4,Rain,23.0,High,Weak,Yes,1.066667
5,6,Rain,12.0,Normal,Strong,No,1.066667
8,9,Sunny,18.0,Normal,Weak,Yes,1.266667
14,15,Sunny,23.0,Normal,Weak,No,1.666667
15,16,Sunny,21.0,Normal,Weak,Yes,1.733333
2,3,Overcast,29.0,High,Weak,Yes,2.133333
7,8,Sunny,25.0,High,Weak,No,2.2
0,1,Sunny,26.0,High,Weak,No,2.266667
13,14,Rain,24.0,High,Strong,No,2.6


Day              5
Outlook       Rain
Humidity    Normal
Wind          Weak
Temperature    12.0


Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,PlayTennis,Distance
11,12,Overcast,21.0,High,Strong,Yes,1.098039
5,6,Rain,12.0,Normal,Strong,No,1.301961
4,5,Rain,12.0,Normal,Weak,Yes,2.368627
13,14,Rain,24.0,High,Strong,No,2.407843
2,3,Overcast,29.0,High,Weak,Yes,2.501961
1,2,Sunny,28.0,High,Strong,No,2.509804
8,9,Sunny,18.0,Normal,Weak,Yes,2.721569
9,10,Rain,20.0,Normal,Weak,Yes,2.905882
7,8,Sunny,25.0,High,Weak,No,3.066667
3,4,Rain,23.0,High,Weak,Yes,3.082353


Day                   7
Outlook        Overcast
Temperature         8.0
Wind             Strong
Humidity    Normal


Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,PlayTennis,Distance
14,15,Sunny,23.0,Normal,Weak,No,0.27619
15,16,Sunny,21.0,Normal,Weak,Yes,0.438095
9,10,Rain,20.0,Normal,Weak,Yes,0.485714
8,9,Sunny,18.0,Normal,Weak,Yes,0.647619
4,5,Rain,12.0,Normal,Weak,Yes,1.2
7,8,Sunny,25.0,High,Weak,No,1.380952
3,4,Rain,23.0,High,Weak,Yes,1.742857
0,1,Sunny,26.0,High,Weak,No,1.8
2,3,Overcast,29.0,High,Weak,Yes,1.809524
5,6,Rain,12.0,Normal,Strong,No,2.133333


Day                13
Temperature      26.0
Humidity       Normal
Wind             Weak
Outlook    Sunny


In [68]:
# Display the current contents of the data frame.
df


Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,PlayTennis
0,1,Sunny,26.0,High,Weak,No
1,2,Sunny,28.0,High,Strong,No
2,3,Overcast,29.0,High,Weak,Yes
3,4,Rain,23.0,High,Weak,Yes
4,5,Rain,12.0,Normal,Weak,Yes
5,6,Rain,12.0,Normal,Strong,No
6,7,Overcast,8.0,Normal,Strong,Yes
7,8,Sunny,25.0,High,Weak,No
8,9,Sunny,18.0,Normal,Weak,Yes
9,10,Rain,20.0,Normal,Weak,Yes


b) Do the classification labels (PlayTennis) have to be included? Why or why not? (1 P.)

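No. Including the classification label would reduce the significance of the features when classifying new data; moreover, a new sample has no label yet, so the label cannot take part in the distance computation.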

c) Classify the new sample D17 for k=1. (1 P.)
D17: Outlook=Sunny, Temperature=23, Humidity=High, Wind=Strong 

In [91]:
import numpy as np
# Sample D17 from the exercise. PlayTennis is NaN because it is the value to predict;
# Day is kept as the raw string label.
new_sample = pd.Series(
    {
        "Day": "D17",
        "Outlook": "Sunny",
        "Temperature": 23,
        "Humidity": "High",
        "Wind": "Strong",
        "PlayTennis": np.nan,
    }
)


In [93]:
# Classify D17 using only its single nearest neighbour among the complete rows.
complete_rows = df.dropna()
knn_model = knn_weather(k=1)
knn_model.fit(complete_rows)
predict = knn_model.predict(new_sample)
print(predict.to_string())


Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,PlayTennis,Distance
1,2,Sunny,28.0,High,Strong,No,1.238095
13,14,Rain,24.0,High,Strong,No,2.047619
7,8,Sunny,25.0,High,Weak,No,2.095238
11,12,Overcast,21.0,High,Strong,Yes,2.095238
0,1,Sunny,26.0,High,Weak,No,2.142857
3,4,Rain,23.0,High,Weak,Yes,3.0
14,15,Sunny,23.0,Normal,Weak,No,3.0
15,16,Sunny,21.0,Normal,Weak,Yes,3.095238
12,13,Sunny,26.0,Normal,Weak,Yes,3.142857
8,9,Sunny,18.0,Normal,Weak,Yes,3.238095


PlayTennis    No


d) Test different values of k. At what value of k does the assignment change compared to k=1? (1 P.)

In [100]:
# Re-classify D17 for every neighbourhood size from 1 to 14.
complete_rows = df.dropna()
for k in range(1, 15):
    print("k =", k)
    knn_model = knn_weather(k=k)
    # fit() works on its own copy, so the same frame can be reused each round.
    knn_model.fit(complete_rows)
    predict = knn_model.predict(new_sample)
    print(predict.to_string())


k = 1
PlayTennis    No
k = 2
PlayTennis    No
k = 3
PlayTennis    No
k = 4
PlayTennis    No
k = 5
PlayTennis    No
k = 6
PlayTennis    No
k = 7
PlayTennis    No
k = 8
PlayTennis    No
k = 9
PlayTennis    No
k = 10
PlayTennis    No
k = 11
PlayTennis    Yes
k = 12
PlayTennis    No
k = 13
PlayTennis    Yes
k = 14
PlayTennis    Yes


The assignment first changes from **No** to **Yes** at $k = 11$ (it flips back to **No** at $k = 12$ and is **Yes** again for $k = 13$ and $k = 14$).