In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

print("Loading dataset...")

Loading dataset...


In [56]:
# Läs in data white wine från csv-fil
# sep=";" eftersom filen använder semikolon som separator
df = pd.read_csv("wine.csv", sep=";")

# behåll endast rader med quality 
tillåtna_klasser = [4, 5, 6, 7, 8 ]
df = df[df["quality"].isin(tillåtna_klasser)].copy()

#gör om quality till sträng
df["quality"] = df["quality"].astype(str)

# x = features, y = target
X = df.drop("quality", axis=1)
Y = df["quality"]

print("from på data:", X.shape, "features", ",", Y.shape[0], "rader")
print ("klasser i quality:")
print(y.value_counts().sort_index())

from på data: (4873, 11) features , 4873 rader
klasser i quality:
quality
4     163
5    1457
6    2198
7     880
8     175
Name: count, dtype: int64


In [61]:
def kör_knn(k, normalisera, train_andel, slump=42):
    """
    k = antal grannar
    normalisera = True/False
    train_andel = andel träning (0.9 = 90%)
    """

    # data
    if normalisera:
        scaler = MinMaxScaler()
        X_data = scaler.fit_transform(X)
        norm_text = "Normaliserat"
    else:
        X_data = X.values
        norm_text = "Originaldata"

    test_andel = 1 - train_andel

    X_train, X_test, y_train, y_test = train_test_split(
        X_data,
        Y,
        train_size=train_andel,
        test_size=test_andel,
        stratify=Y,
        random_state=slump
    )

    modell = KNeighborsClassifier(n_neighbors=k)
    modell.fit(X_train, y_train)

    y_pred = modell.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    print("-" * 60)
    print(f"k = {k}")
    print(f"Normalisering: {norm_text}")
    print(f"Train/Test: {train_andel:.2f} / {test_andel:.2f}")
    print("Confusion matrix")
    print(cm)
    print("Accuracy:", acc)

    return {
        "k": k,
        "Normalisering": norm_text,
        "Train_andel": train_andel,
        "Test_andel": test_andel,
        "Accuracy": acc
    }


In [62]:
resultat = []

k_values = [3, 5, 7]
train_splits = [0.90, 2/3, 0.50]

for k in k_values:
    for normalisera in [False, True]:
        for train_andel in train_splits:
            res = kör_knn(k, normalisera, train_andel)
            resultat.append(res)

resultat_df = pd.DataFrame(resultat)
resultat_df


------------------------------------------------------------
k = 3
Normalisering: Originaldata
Train/Test: 0.90 / 0.10
Confusion matrix
[[  4   9   3   0   0]
 [  5  80  50  10   1]
 [  5  65 124  24   2]
 [  2  19  36  30   1]
 [  0   2   7   6   3]]
Accuracy: 0.49385245901639346
------------------------------------------------------------
k = 3
Normalisering: Originaldata
Train/Test: 0.67 / 0.33
Confusion matrix
[[  9  23  21   1   0]
 [ 23 239 186  35   3]
 [ 23 220 402  82   6]
 [  9  61 127  92   5]
 [  1  12  21  15   9]]
Accuracy: 0.46215384615384614
------------------------------------------------------------
k = 3
Normalisering: Originaldata
Train/Test: 0.50 / 0.50
Confusion matrix
[[ 18  36  25   2   0]
 [ 46 346 287  47   3]
 [ 51 309 603 130   6]
 [ 17 102 188 126   7]
 [  1  19  39  18  11]]
Accuracy: 0.4530160032827247
------------------------------------------------------------
k = 3
Normalisering: Normaliserat
Train/Test: 0.90 / 0.10
Confusion matrix
[[  3   7   4   2  

Unnamed: 0,k,Normalisering,Train_andel,Test_andel,Accuracy
0,3,Originaldata,0.9,0.1,0.493852
1,3,Originaldata,0.666667,0.333333,0.462154
2,3,Originaldata,0.5,0.5,0.453016
3,3,Normaliserat,0.9,0.1,0.567623
4,3,Normaliserat,0.666667,0.333333,0.546462
5,3,Normaliserat,0.5,0.5,0.534674
6,5,Originaldata,0.9,0.1,0.45082
7,5,Originaldata,0.666667,0.333333,0.471385
8,5,Originaldata,0.5,0.5,0.463685
9,5,Normaliserat,0.9,0.1,0.561475
