In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the two semicolon-separated wine-quality files and discard exact
# duplicate rows straight away.
white_wine = pd.read_csv("winequality-white.csv", sep=';').drop_duplicates()
red_wine = pd.read_csv("winequality-red.csv", sep=';').drop_duplicates()

In [None]:
# Bucket the numeric quality score into three labels:
#   score <= 5 -> 'low', 5 < score <= 7 -> 'medium', anything else -> 'high'.
def _quality_to_label(score):
    """Return the quality bucket name for a single quality score."""
    if score <= 5:
        return 'low'
    if score <= 7:
        return 'medium'
    return 'high'


# Fixed category order shared by both frames.
_QUALITY_LEVELS = ['low', 'medium', 'high']

# Add the label column to each frame as a categorical with that category order.
red_wine['quality_label'] = pd.Categorical(
    red_wine['quality'].apply(_quality_to_label),
    categories=_QUALITY_LEVELS,
)
white_wine['quality_label'] = pd.Categorical(
    white_wine['quality'].apply(_quality_to_label),
    categories=_QUALITY_LEVELS,
)


In [None]:
# Tag every row with its wine colour so the origin survives the merge.
white_wine["wine_type"] = "white"
red_wine["wine_type"] = "red"

# Stack red on top of white and renumber the rows from 0.
all_wine = pd.concat(
    [red_wine, white_wine],
    ignore_index=True,
    axis=0,
)

Outlier Removal from the Dataset

In [None]:
import pandas as pd
import numpy as np

# Function to remove outlier rows using the IQR rule
def remove_unique_outliers(df, exclude=None, factor=1.5):
    """Drop every row that is an IQR outlier in at least one numeric column.

    For each numeric column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` (quartiles computed on ``df`` itself). A row is
    removed once, no matter how many of its columns fall outside the fences.
    Missing values never count as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    exclude : str or iterable of str, optional
        Numeric column name(s) to leave out of the outlier test, e.g. a
        target column whose rare values should not be discarded. By default
        every numeric column is tested.
    factor : float, default 1.5
        Multiplier applied to the IQR to obtain the fence distance.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the outlier rows; the surviving rows keep
        their original index labels.
    """
    if exclude is None:
        skipped = set()
    elif isinstance(exclude, str):
        skipped = {exclude}
    else:
        skipped = set(exclude)

    numeric_cols = [
        col
        for col in df.select_dtypes(include=[np.number]).columns
        if col not in skipped
    ]
    if not numeric_cols:
        # Nothing to test: hand back an untouched copy.
        return df.copy()

    values = df[numeric_cols]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    margin = factor * (q3 - q1)

    # Row-wise boolean mask instead of dropping by index label: with a
    # non-unique index, dropping by label would also remove non-outlier rows
    # that merely share a label with an outlier.
    is_outlier = (values.lt(q1 - margin) | values.gt(q3 + margin)).any(axis=1)
    return df.loc[~is_outlier].copy()

# Filter IQR outliers out of each wine frame separately.
# NOTE(review): 'quality' is itself a numeric column, so rows with rare
# quality scores can be dropped by this filter as well - confirm that is intended.
red_wine_clean = remove_unique_outliers(red_wine)
white_wine_clean = remove_unique_outliers(white_wine)

# Report how many rows each frame had before and after the filter.
red_rows_before, red_rows_after = len(red_wine), len(red_wine_clean)
white_rows_before, white_rows_after = len(white_wine), len(white_wine_clean)
print(f"Red Wine - Rows before: {red_rows_before}, Rows after: {red_rows_after}")
print(f"White Wine - Rows before: {white_rows_before}, Rows after: {white_rows_after}")


In [None]:
# Distribution of the quality labels once duplicates and outliers are gone.

# Number of samples per label in the cleaned red and white frames.
red_quality_counts = red_wine_clean["quality_label"].value_counts()
white_quality_counts = white_wine_clean["quality_label"].value_counts()

# Show the red counts first, then the white counts.
print(red_quality_counts, white_quality_counts, sep="\n")


In [None]:
# One-hot encode the quality label (one 0/1 column per label).
# NOTE(review): all_wine was concatenated from red_wine / white_wine, not from
# the outlier-filtered *_clean frames - confirm the models are meant to train
# on the unfiltered data.
quality_label_encod = pd.get_dummies(all_wine['quality_label'], dtype=int)

# Original columns plus the indicator columns side by side.
all_wine_encod = pd.concat([all_wine, quality_label_encod], axis=1)

# Target vector: name of the active indicator column per row, mapped to an
# integer class id (low -> 0, medium -> 1, high -> 2).
y = (
    quality_label_encod
    .idxmax(axis=1)
    .map({'low': 0, 'medium': 1, 'high': 2})
)



In [None]:
# Build the feature matrix X and split X / y into train and test parts.
from sklearn.model_selection import train_test_split

# Columns kept out of the features: the wine type, everything derived from the
# target (raw score, label, indicator columns) and four physico-chemical columns.
non_feature_columns = [
    'wine_type', 'quality_label', 'quality',
    'medium', 'high', 'low',
    'citric acid', 'residual sugar', 'density', 'pH',
]
X = all_wine_encod.drop(columns=non_feature_columns)

# 80 % train / 20 % test with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=420,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale the features with a MinMaxScaler that is fitted on the training split
# only; the test split is transformed with the same fitted scaler.
# (A StandardScaler variant was tried here earlier and is disabled.)
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)




In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Build the classifier (iteration cap raised to 1000) and train it in one step.
LR_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicted classes for the held-out test split.
LR_preds = LR_model.predict(X_test)



In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# An unconstrained RandomForestClassifier(random_state=420) was noted to reach
# 100% train accuracy, so tree growth is limited explicitly here.
RF_clf = RandomForestClassifier(
    n_estimators=100,      # number of trees in the forest
    max_depth=10,          # maximum depth of each tree
    min_samples_split=5,   # minimum samples required to create a new split
    min_samples_leaf=3,    # minimum samples required in a leaf
    random_state=420,      # seed for reproducibility
).fit(X_train, y_train)

# Predicted classes for the held-out test split.
RF_preds = RF_clf.predict(X_test)


In [None]:
# Support vector classifier
from sklearn.svm import SVC

# RBF-kernel SVC, trained on the training split right after construction.
svc_model = SVC(kernel='rbf', C=1.0, gamma='scale').fit(X_train, y_train)

# Predicted classes for the held-out test split.
svc_preds = svc_model.predict(X_test)


In [None]:

from sklearn.metrics import accuracy_score

# Overall accuracy per model: on the training split (fresh predictions) and on
# the test split (predictions computed in the model cells).

# Logistic regression
train_LR_acc = accuracy_score(y_train, LR_model.predict(X_train))
LR_acc = accuracy_score(y_test, LR_preds)

# Random forest
train_RF_acc = accuracy_score(y_train, RF_clf.predict(X_train))
RF_acc = accuracy_score(y_test, RF_preds)

# Support vector classifier
train_svc_acc = accuracy_score(y_train, svc_model.predict(X_train))
svc_acc = accuracy_score(y_test, svc_preds)

# Train vs. test accuracy side by side; a large gap hints at overfitting.
print("\n Train Logistic Regression: ", train_LR_acc,"\n Test Logistic Regression: ", LR_acc)
print("\n Train Random Forest : ", train_RF_acc, "\n Test Random Forest: ", RF_acc)
print("\n Train SVC : ", train_svc_acc, "\n Test SVC: ", svc_acc)





Hyperparameter Tuning for the Random Forest

In [None]:
# Candidate values for the random-forest hyperparameter search.
n_estimators = np.arange(50, 110, 10)      # 50, 60, ..., 100 trees
max_depth = np.arange(5, 20, 5)            # 5, 10, 15
# An integer min_samples_split must be at least 2 (a node needs two samples to
# be split), so the range starts at 2; a value of 1 makes the fit fail.
min_samples_split = np.arange(2, 10, 2)    # 2, 4, 6, 8
min_samples_leaf = np.arange(1, 9, 2)      # 1, 3, 5, 7
# max_features is currently not searched (candidates were "sqrt", "log2", None).

# Mapping of estimator parameter name -> candidate values.
param_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

param_grid

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Fresh, unfitted forest that serves as the template for the search.
# NOTE(review): this rebinds RF_clf, so the forest fitted in the earlier model
# cell is no longer reachable under that name after this cell runs.
RF_clf = RandomForestClassifier(random_state=42)

# Randomized search over param_grid with 10 sampled candidates. The search is
# seeded as well, so the same candidates are drawn on every run and the
# reported best score / parameters are reproducible.
RS_grid = RandomizedSearchCV(
    estimator=RF_clf,
    param_distributions=param_grid,
    n_iter=10,
    random_state=42,
)
RS_grid

In [None]:
# Run the randomized hyperparameter search on the scaled training split.
RS_grid.fit(X_train, y_train)

In [None]:
# Best mean cross-validation score found by the search and the parameter
# combination that produced it.
best_cv_score = RS_grid.best_score_
best_cv_params = RS_grid.best_params_
print('best score: ', best_cv_score, '\nparams: ', best_cv_params)

In [None]:
import pandas as pd

# One row per sampled candidate: its parameter values plus the mean
# cross-validation test score in an "Accuracy" column.
search_results = RS_grid.cv_results_
grid_results = pd.DataFrame(search_results["params"]).assign(
    Accuracy=search_results["mean_test_score"]
)

grid_results