In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix

In [17]:
df = pd.read_csv("wine.csv", sep=";")

df.head()



Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [18]:
df.columns


Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

y ska ha ett litet antal kategorier

annars är det inte klassificering

confusion matrix fungerar bara med få klasser


In [19]:
# För att hitta din y
cols = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
    "quality"
]

for col in cols:
    print(col, "→", df[col].nunique())


fixed acidity → 68
volatile acidity → 125
citric acid → 87
residual sugar → 310
chlorides → 160
free sulfur dioxide → 132
total sulfur dioxide → 251
density → 890
pH → 103
sulphates → 79
alcohol → 103
quality → 7


Valde "quality" för minsta värdet

In [20]:
# Define values 
TARGET_COL = "quality"

FEATURE_COLS = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol"
]

X = df[FEATURE_COLS]
y = df[TARGET_COL]

# Inte nödvändig 
print("Form X", X.shape)
print("Form y", y.shape)
X.head()


Form X (4898, 11)
Form y (4898,)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [21]:
# "test_size = 0.2" = testing 20 % of the data 
def decision_tree(X, y, test_size=0.2, normalize=False):
    X_data = X.copy()

    if normalize:
        scaler = MinMaxScaler()
        X_data = scaler.fit_transform(X_data)
    else:
        X_data = X_data.values

    X_train, X_test, y_train, y_test = train_test_split(
        X_data,
        y,
        test_size=test_size,
        shuffle=True
    )

    clf = DecisionTreeClassifier(random_state=None)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    return acc, cm


In [22]:
acc_raw, cm_raw = decision_tree(X, y, test_size=0.2, normalize=False)

print("Utan normalisering")
print("Confusion matrix")
print(cm_raw)
print("Accuracy", acc_raw)


Utan normalisering
Confusion matrix
[[  0   0   0   1   0   1   0]
 [  0  13  10   6   3   1   1]
 [  0   9 186  82   9   4   0]
 [  1  13  88 290  52   7   1]
 [  1   4  10  49  98  12   0]
 [  0   0   3   8   4  13   0]
 [  0   0   0   0   0   0   0]]
Accuracy 0.6122448979591837


In [23]:
acc_scaled, cm_scaled = decision_tree(X, y, test_size=0.2, normalize=True)

print("Med normalisering")
print("Confusion matrix")
print(cm_scaled)
print("Accuracy", acc_scaled)


Med normalisering
Confusion matrix
[[  0   2   0   2   1   0]
 [  2  11  12   8   3   2]
 [  1   9 180  93  16   5]
 [  0   4  78 283  50  11]
 [  0   1  15  52  91  11]
 [  0   0   3   9  10  15]]
Accuracy 0.5918367346938775


In [24]:
def run_many_times(X, y, test_size, normalize, n_runs=100):
    results = []

    for run in range(n_runs):
        acc, cm = decision_tree(
            X,
            y,
            test_size=test_size,
            normalize=normalize
        )
        results.append({
            "run": run + 1,
            "test_size": test_size,
            "train_size": 1 - test_size,
            "normalize": normalize,
            "accuracy": acc,
            "confusion_matrix": cm
        })

    return results


In [25]:
TEST_SIZE_1 = 0.1   # 90 train 10 test
TEST_SIZE_2 = 0.3   # 70 train 30 test

all_results = []

for test_sizes in [TEST_SIZE_1, TEST_SIZE_2]:
    for normalize in [False, True]:
        runs = run_many_times(
            X,
            y,
            test_size=test_sizes,
            normalize=normalize,
            n_runs=100
        )
        all_results.extend(runs)

len(all_results)


400

In [26]:
results_df = pd.DataFrame(all_results)
print("Antal körningar", len(results_df))
results_df.head()


Antal körningar 400


Unnamed: 0,run,test_size,train_size,normalize,accuracy,confusion_matrix
0,1,0.1,0.9,False,0.616327,"[[0, 1, 0, 3, 0, 0, 0], [0, 8, 5, 1, 0, 0, 0],..."
1,2,0.1,0.9,False,0.610204,"[[0, 0, 0, 2, 0, 0], [0, 3, 8, 2, 0, 1], [1, 1..."
2,3,0.1,0.9,False,0.565306,"[[0, 1, 1, 1, 0, 0], [0, 5, 1, 3, 3, 0], [0, 6..."
3,4,0.1,0.9,False,0.640816,"[[0, 0, 0, 1, 0, 0, 0], [0, 3, 4, 3, 0, 1, 0],..."
4,5,0.1,0.9,False,0.62449,"[[0, 1, 0, 0, 0, 0, 0], [0, 4, 6, 3, 0, 0, 0],..."


In [27]:
group_means = (
    results_df
    .groupby(["test_size", "normalize"])["accuracy"]
    .mean()
    .reset_index()
)

group_means


Unnamed: 0,test_size,normalize,accuracy
0,0.1,False,0.624755
1,0.1,True,0.617327
2,0.3,False,0.587619
3,0.3,True,0.589252


In [28]:
top3 = results_df.sort_values(by="accuracy", ascending=False).head(3)

top3[["run", "test_size", "train_size", "normalize", "accuracy"]]


Unnamed: 0,run,test_size,train_size,normalize,accuracy
85,86,0.1,0.9,False,0.683673
39,40,0.1,0.9,False,0.673469
94,95,0.1,0.9,False,0.669388


In [29]:
for idx, row in top3.iterrows():
    print("\nKörning nummer", row["run"])
    print("test_size", row["test_size"], "train_size", row["train_size"])
    print("normalize", row["normalize"])
    print("accuracy", row["accuracy"])
    print("Confusion matrix")
    print(row["confusion_matrix"])



Körning nummer 86
test_size 0.1 train_size 0.9
normalize False
accuracy 0.6836734693877551
Confusion matrix
[[  0   1   0   0   0   0]
 [  0   6   2   2   0   0]
 [  0   3  95  38   4   0]
 [  0   5  34 182  15   2]
 [  0   0   6  29  44   6]
 [  0   0   1   5   2   8]]

Körning nummer 40
test_size 0.1 train_size 0.9
normalize False
accuracy 0.673469387755102
Confusion matrix
[[  0   0   3   0   0   0   0]
 [  0   9   5   6   0   0   0]
 [  2   1 103  27   8   0   0]
 [  0   2  35 152  30   1   0]
 [  0   1   6  24  57   5   0]
 [  1   0   0   1   1   9   0]
 [  0   0   1   0   0   0   0]]

Körning nummer 95
test_size 0.1 train_size 0.9
normalize False
accuracy 0.6693877551020408
Confusion matrix
[[  0   0   0   0   0   1   0]
 [  0   4   4   5   1   1   0]
 [  0   3  95  37   5   0   0]
 [  0   2  27 158  27   2   0]
 [  0   1   4  23  61   5   0]
 [  0   0   2   4   7  10   0]
 [  0   0   0   0   1   0   0]]


In [30]:
results_df.to_csv("wine_decisiontree_results.csv", index=False)
print("Sparat i wine_decisiontree_results.csv")


Sparat i wine_decisiontree_results.csv
