In [57]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix

In [58]:
df = pd.read_csv("wine.csv", sep=";")

df.head()



Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [59]:
df.columns


Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

y ska ha ett litet antal kategorier

annars är det inte klassificering

confusion matrix fungerar bara med få klasser


In [60]:
# För att hitta din y
cols = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
    "quality"
]

for col in cols:
    print(col, "→", df[col].nunique())


fixed acidity → 68
volatile acidity → 125
citric acid → 87
residual sugar → 310
chlorides → 160
free sulfur dioxide → 132
total sulfur dioxide → 251
density → 890
pH → 103
sulphates → 79
alcohol → 103
quality → 7


Valde "quality" för minsta värdet

In [61]:
# Define values 
TARGET_COL = "quality"

FEATURE_COLS = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol"
]

X = df[FEATURE_COLS]
y = df[TARGET_COL]

# Inte nödvändig 
print("Form X", X.shape)
print("Form y", y.shape)
X.head()


Form X (4898, 11)
Form y (4898,)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [62]:
# "test_size = 0.2" = testing 20 % of the data 
def decision_tree(X, y, test_size=0.2, normalize=False):
    X_data = X.copy()

    if normalize:
        scaler = MinMaxScaler()
        X_data = scaler.fit_transform(X_data)
    else:
        X_data = X_data.values

    X_train, X_test, y_train, y_test = train_test_split(
        X_data,
        y,
        test_size=test_size,
        shuffle=True
    )

    clf = DecisionTreeClassifier(random_state=None)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    return acc, cm


In [63]:
acc_raw, cm_raw = decision_tree(X, y, test_size=0.2, normalize=False)

print("Utan normalisering")
print("Confusion matrix")
print(cm_raw)
print("Accuracy", acc_raw)


Utan normalisering
Confusion matrix
[[  0   0   2   1   3   0   0]
 [  0   8  10  18   1   0   0]
 [  1  10 178  73  14   0   0]
 [  0   7  73 269  66   7   0]
 [  1   1  17  52 110   6   0]
 [  0   0   3  10  14  24   1]
 [  0   0   0   0   0   0   0]]
Accuracy 0.6010204081632653


In [64]:
acc_scaled, cm_scaled = decision_tree(X, y, test_size=0.2, normalize=True)

print("Med normalisering")
print("Confusion matrix")
print(cm_scaled)
print("Accuracy", acc_scaled)


Med normalisering
Confusion matrix
[[  0   0   1   0   1   0]
 [  0  11   6   9   3   0]
 [  0  10 173  88  18   4]
 [  0   4  87 280  61  12]
 [  0   1  15  45 110   8]
 [  0   0   1   1  13  18]]
Accuracy 0.6040816326530613


In [65]:
def run_many_times(X, y, test_size, normalize, n_runs=100):
    results = []

    for run in range(n_runs):
        acc, cm = decision_tree(
            X,
            y,
            test_size=test_size,
            normalize=normalize
        )
        results.append({
            "run": run + 1,
            "test_size": test_size,
            "train_size": 1 - test_size,
            "normalize": normalize,
            "accuracy": acc,
            "confusion_matrix": cm
        })

    return results


In [None]:
TEST_SIZE_1 = 0.1   # 90 train 10 test
TEST_SIZE_2 = 0.3   # 70 train 30 test

all_results = []

for test_sizes in [TEST_SIZE_1, TEST_SIZE_2]:
    for normalize in [False, True]:
        runs = run_many_times(
            X,
            y,
            test_size=test_sizes,
            normalize=normalize,
            n_runs=100
        )
        all_results.extend(runs)

len(all_results)


400

In [67]:
results_df = pd.DataFrame(all_results)
print("Antal körningar", len(results_df))
results_df.head()


Antal körningar 400


Unnamed: 0,run,test_size,train_size,normalize,accuracy,confusion_matrix
0,1,0.1,0.9,False,0.626531,"[[0, 0, 0, 1, 1, 0, 0], [1, 5, 6, 3, 1, 1, 0],..."
1,2,0.1,0.9,False,0.657143,"[[0, 0, 1, 1, 0, 0, 0], [1, 7, 3, 1, 2, 0, 1],..."
2,3,0.1,0.9,False,0.632653,"[[0, 0, 0, 1, 0, 0, 0], [0, 6, 6, 2, 1, 0, 0],..."
3,4,0.1,0.9,False,0.628571,"[[0, 0, 0, 0, 0, 0, 0], [0, 4, 9, 7, 0, 1, 0],..."
4,5,0.1,0.9,False,0.636735,"[[0, 0, 1, 0, 0, 0, 0], [0, 5, 3, 4, 0, 0, 0],..."


In [68]:
group_means = (
    results_df
    .groupby(["test_size", "normalize"])["accuracy"]
    .mean()
    .reset_index()
)

group_means


Unnamed: 0,test_size,normalize,accuracy
0,0.1,False,0.625
1,0.1,True,0.626408
2,0.3,False,0.587714
3,0.3,True,0.58849


In [69]:
top3 = results_df.sort_values(by="accuracy", ascending=False).head(3)

top3[["run", "test_size", "train_size", "normalize", "accuracy"]]


Unnamed: 0,run,test_size,train_size,normalize,accuracy
191,92,0.1,0.9,True,0.689796
57,58,0.1,0.9,False,0.685714
68,69,0.1,0.9,False,0.677551


In [70]:
for idx, row in top3.iterrows():
    print("\nKörning nummer", row["run"])
    print("test_size", row["test_size"], "train_size", row["train_size"])
    print("normalize", row["normalize"])
    print("accuracy", row["accuracy"])
    print("Confusion matrix")
    print(row["confusion_matrix"])



Körning nummer 92
test_size 0.1 train_size 0.9
normalize True
accuracy 0.689795918367347
Confusion matrix
[[  0   0   0   2   0   0]
 [  0   4   6   3   0   1]
 [  2   4 104  32   9   1]
 [  0   6  31 167  23   3]
 [  0   1   5  15  57   2]
 [  0   1   0   3   2   6]]

Körning nummer 58
test_size 0.1 train_size 0.9
normalize False
accuracy 0.6857142857142857
Confusion matrix
[[  0   1   2   1   0   0   0]
 [  0   2   4   0   0   0   0]
 [  1   3  98  36   4   0   0]
 [  0   3  26 164  30   4   0]
 [  0   0   5  25  67   4   0]
 [  0   0   0   2   2   5   0]
 [  0   0   0   0   1   0   0]]

Körning nummer 69
test_size 0.1 train_size 0.9
normalize False
accuracy 0.6775510204081633
Confusion matrix
[[  0   1   0   0   0   0   0]
 [  0   9  10   4   1   0   0]
 [  0   4  86  31   7   3   0]
 [  0   1  23 165  22   5   0]
 [  0   0   5  26  63   4   0]
 [  0   0   1   7   2   9   0]
 [  0   0   0   1   0   0   0]]


In [71]:
results_df.to_csv("wine_decisiontree_results.csv", index=False)
print("Sparat i wine_decisiontree_results.csv")


Sparat i wine_decisiontree_results.csv
