In [101]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

print("Loading dataset...")

Loading dataset...


In [102]:
# läser in white wine quality
# filen ligger i samma mapp som denna notebook
vin = pd.read_csv("wine.csv", sep=";")

# jag behåller bara quality 4–8, där finns många rader per klass
tillåtna_klasser = [4, 5, 6, 7, 8]
vin = vin[vin["quality"].isin(tillåtna_klasser)].copy()

# gör om quality till sträng så den blir en klass, inte siffra
vin["quality"] = vin["quality"].astype(str)

# features och målvariabel
features = vin.drop("quality", axis=1)
mål = vin["quality"]

print("Form på data:", features.shape)
print("Klasser i quality:")
print(mål.value_counts().sort_index())



Form på data: (4873, 11)
Klasser i quality:
quality
4     163
5    1457
6    2198
7     880
8     175
Name: count, dtype: int64


In [103]:
# egen liten funktion så jag slipper kopiera kod 18 gånger
def kör_knn_en_gång(k, använd_normalisering, train_andel, slump=42):
    """
    k = antal grannar
    använd_normalisering = True eller False
    train_andel = andel träningsdata, t.ex. 0.90
    """

    # här väljer jag om jag ska skala datat eller inte
    if använd_normalisering:
        scaler = MinMaxScaler()
        X_data = scaler.fit_transform(features)
        norm_text = "Normalized"
    else:
        X_data = features.values
        norm_text = "Original"

    test_andel = 1 - train_andel

    X_train, X_test, y_train, y_test = train_test_split(
        X_data,
        mål,
        train_size=train_andel,
        test_size=test_andel,
        stratify=mål,
        random_state=slump
    )

    modell = KNeighborsClassifier(n_neighbors=k)
    modell.fit(X_train, y_train)

    prognos = modell.predict(X_test)

    cm = confusion_matrix(y_test, prognos)
    acc = accuracy_score(y_test, prognos)

    # skriver ut lite lagom info för varje körning
    print("-" * 50)
    print("k:", k)
    print("Normalisering:", norm_text)
    print(f"Train/Test: {train_andel:.2f} / {test_andel:.2f}")
    print("Confusion matrix:")
    print(cm)
    print("Accuracy:", acc)

    # sparar resultatet i en dict som jag kan lägga i en lista sen
    return {
        "k": k,
        "normalisering": norm_text,
        "train_andel": train_andel,
        "test_andel": test_andel,
        "accuracy": acc
    }




In [104]:
# här gör jag själva “experimentet” med alla kombinationer
alla_resultat = []

k_lista = [3, 5, 7]
split_lista = [0.90, 2/3, 0.50]
normaliserings_val = [False, True]

for k in k_lista:
    for norm in normaliserings_val:
        for train_andel in split_lista:
            rad = kör_knn_en_gång(k, norm, train_andel)
            alla_resultat.append(rad)

resultat_df = pd.DataFrame(alla_resultat)
resultat_df



--------------------------------------------------
k: 3
Normalisering: Original
Train/Test: 0.90 / 0.10
Confusion matrix:
[[  4   9   3   0   0]
 [  5  80  50  10   1]
 [  5  65 124  24   2]
 [  2  19  36  30   1]
 [  0   2   7   6   3]]
Accuracy: 0.49385245901639346
--------------------------------------------------
k: 3
Normalisering: Original
Train/Test: 0.67 / 0.33
Confusion matrix:
[[  9  23  21   1   0]
 [ 23 239 186  35   3]
 [ 23 220 402  82   6]
 [  9  61 127  92   5]
 [  1  12  21  15   9]]
Accuracy: 0.46215384615384614
--------------------------------------------------
k: 3
Normalisering: Original
Train/Test: 0.50 / 0.50
Confusion matrix:
[[ 18  36  25   2   0]
 [ 46 346 287  47   3]
 [ 51 309 603 130   6]
 [ 17 102 188 126   7]
 [  1  19  39  18  11]]
Accuracy: 0.4530160032827247
--------------------------------------------------
k: 3
Normalisering: Normalized
Train/Test: 0.90 / 0.10
Confusion matrix:
[[  3   7   4   2   0]
 [  7  88  46   5   0]
 [  5  55 133  18   9]
 [  

Unnamed: 0,k,normalisering,train_andel,test_andel,accuracy
0,3,Original,0.9,0.1,0.493852
1,3,Original,0.666667,0.333333,0.462154
2,3,Original,0.5,0.5,0.453016
3,3,Normalized,0.9,0.1,0.567623
4,3,Normalized,0.666667,0.333333,0.546462
5,3,Normalized,0.5,0.5,0.534674
6,5,Original,0.9,0.1,0.45082
7,5,Original,0.666667,0.333333,0.471385
8,5,Original,0.5,0.5,0.463685
9,5,Normalized,0.9,0.1,0.561475


In [105]:
# gör om train/test till procent först
train_pct = (resultat_df["train_andel"] * 100).round(0).astype(int)
test_pct  = (resultat_df["test_andel"]  * 100).round(0).astype(int)

# bygger en ny tabell som liknar exemplet i instruktionen
tabell = pd.DataFrame({
    "k (Neighbors)": resultat_df["k"],
    "Normalization": resultat_df["normalisering"],
    "Train/Test Split": train_pct.astype(str) + "%/" + test_pct.astype(str) + "%",
    "Accuracy (%)": (resultat_df["accuracy"] * 100).round(2)
})

# jag vill ha Original före Normalized när jag tittar
tabell


Unnamed: 0,k (Neighbors),Normalization,Train/Test Split,Accuracy (%)
0,3,Original,90%/10%,49.39
1,3,Original,67%/33%,46.22
2,3,Original,50%/50%,45.3
3,3,Normalized,90%/10%,56.76
4,3,Normalized,67%/33%,54.65
5,3,Normalized,50%/50%,53.47
6,5,Original,90%/10%,45.08
7,5,Original,67%/33%,47.14
8,5,Original,50%/50%,46.37
9,5,Normalized,90%/10%,56.15


In [106]:
tabell.sort_values(["k (Neighbors)", "Normalization", "Train/Test Split"])

Unnamed: 0,k (Neighbors),Normalization,Train/Test Split,Accuracy (%)
5,3,Normalized,50%/50%,53.47
4,3,Normalized,67%/33%,54.65
3,3,Normalized,90%/10%,56.76
2,3,Original,50%/50%,45.3
1,3,Original,67%/33%,46.22
0,3,Original,90%/10%,49.39
11,5,Normalized,50%/50%,53.8
10,5,Normalized,67%/33%,55.2
9,5,Normalized,90%/10%,56.15
8,5,Original,50%/50%,46.37


In [107]:
bästa_tre = tabell.sort_values("Accuracy (%)", ascending=False).head(3)
bästa_tre


Unnamed: 0,k (Neighbors),Normalization,Train/Test Split,Accuracy (%)
15,7,Normalized,90%/10%,56.97
3,3,Normalized,90%/10%,56.76
9,5,Normalized,90%/10%,56.15
