In [480]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [481]:
# Load the wine test-set CSV. `pd.read_csv` already returns a DataFrame, so no
# extra `pd.DataFrame(...)` wrapping is needed.
df = pd.read_csv("wine_classification/wine-test-set.csv")

In [482]:
# Drop rows with a missing label *before* binarising. The comparison
# `NaN >= 6` evaluates to False, so binarising first would silently turn every
# missing label into class 0 and leave nothing for `dropna` to remove.
df = df.dropna(subset=['quality'])

# Binary target: 1 when quality >= 6, otherwise 0.
df['quality'] = (df['quality'] >= 6).astype('int64')

In [483]:
# Interaction features: each new column is the product of two raw columns.
# The new columns are appended in this order, which fixes the column order of
# the frame from here on.
df = df.assign(
    fixed_density=df['fixed acidity'] * df['density'],
    free_total_sulfur=df['free sulfur dioxide'] * df['total sulfur dioxide'],
    fixed_pH=df['fixed acidity'] * df['pH'],
    citric_acidity=df['citric acid'] * df['fixed acidity'],
)

# Remove the raw columns that were folded into the products above.
df = df.drop(
    columns=[
        'citric acid',
        'density',
        'fixed acidity',
        'pH',
        'total sulfur dioxide',
        'free sulfur dioxide',
    ]
)


In [484]:
from scipy.stats import boxcox

# Columns that receive sqrt(log2(x)).
# NOTE(review): log2 is negative for values below 1 and -inf for 0, and sqrt of
# either is NaN, so such rows end up as NaN in these columns — confirm that this
# is intended.
changes = ['residual sugar', 'alcohol', 'free_total_sulfur', 'fixed_density', 'fixed_pH', 'citric_acidity']
df[changes] = np.sqrt(np.log2(df[changes]))

# Columns that receive a Box-Cox transform.
# NOTE(review): boxcox() is called without a fixed lambda, so the lambda is
# estimated from the rows of this frame — confirm this is the intended behaviour
# for a test set.
changes1 = ['sulphates', 'chlorides']
for col in changes1:
    transformed, _ = boxcox(df[col])
    df[col] = transformed

In [485]:
def calculate_z_scores(dfa, column):
    """Return the absolute z-score of every value in ``dfa[column]``.

    Each value is centred on the column mean and divided by the column's
    standard deviation (pandas ``Series.std``), then the absolute value is
    taken.
    """
    values = dfa[column]
    centred = values - values.mean()
    return (centred / values.std()).abs()

# Columns screened for extreme values, and the |z| cut-off applied to each.
columns_to_check = ['volatile acidity', 'residual sugar', 'chlorides', 'fixed_density', 'citric_acidity']
threshold = 3

# The filter is applied one column at a time, so each pass computes its
# z-scores on the rows that survived the previous passes. Rows whose z-score is
# NaN fail the comparison and are dropped as well.
for column in columns_to_check:
    z_scores = calculate_z_scores(df, column)
    df = df.loc[z_scores.lt(threshold)]
    

In [486]:
def impute_outliers(df, column):
    """Replace out-of-range values in one column with that column's mean.

    The fences are built from the 20th and 80th percentiles (not the classic
    quartiles). With ``spread = p80 - p20``, any value below
    ``p20 - 1.5 * spread`` or above ``p80 + 1.5 * spread`` is treated as an
    outlier. The replacement is the mean of the column computed before any
    replacement, so the outliers themselves contribute to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is not modified.
    column : str
        Label of the numeric column to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the outliers of ``column`` are replaced.
    """
    # Work on a copy so the caller's frame (which may itself be a filtered
    # slice of another frame) is never mutated behind its back.
    result = df.copy()
    values = result[column]

    lower_q = values.quantile(0.2)
    upper_q = values.quantile(0.8)
    spread = upper_q - lower_q
    lower_bound = lower_q - 1.5 * spread
    upper_bound = upper_q + 1.5 * spread

    # The fill value is the column mean (the previous local was named
    # `median` but always held the mean).
    fill_value = values.mean()

    is_outlier = (values < lower_bound) | (values > upper_bound)
    result[column] = np.where(is_outlier, fill_value, values)
    return result

# Columns whose outliers are replaced. This rebinds `columns_to_check`, so the
# list used for the z-score filter is no longer available afterwards.
columns_to_check = ['chlorides', 'residual sugar']

# Each column is cleaned on the result of the previous call.
for column in columns_to_check:
    df = df.pipe(impute_outliers, column)

In [487]:
import pickle

# NOTE(review): unpickling executes arbitrary code from the file, so these two
# artefacts must only ever come from a trusted source.

# Object later used for `.predict(...)`.
with open('wine_quality_gradient.pkl', 'rb') as file:
    gradient_boost = pickle.loads(file.read())

# Object later used for `.transform(...)` on the feature matrix.
with open('scaler_q.pkl', 'rb') as file:
    scaler = pickle.loads(file.read())

In [488]:
from sklearn.preprocessing import MinMaxScaler

# Creating an instance of MinMaxScaler with default range [0, 1]
scalers = MinMaxScaler()

# Selecting the specific features to scale
features_to_scale = ['chlorides', 'sulphates']

# Fitting the scaler and transforming the data
# NOTE(review): the min/max are learned from the rows of this frame itself.
# Confirm whether the ranges learned at training time should be reused here.
df[features_to_scale] = scalers.fit_transform(df[features_to_scale])

# The former `df['quality'] = le.transform(df['quality'])` step was removed:
# `le` is never defined or loaded anywhere in this notebook, so the cell raised
# NameError on a fresh kernel. `quality` has already been converted to 0/1 by
# the binarisation step earlier in the notebook, so no further encoding is
# needed.

In [489]:
# Separate the feature matrix from the binary target, both as NumPy arrays.
X_test = df.drop(columns='quality').to_numpy()
y_test = df['quality'].to_numpy()

In [490]:
# Apply the scaler unpickled from 'scaler_q.pkl' to the test feature matrix.
# NOTE(review): X_test is a bare array without column labels, so its column
# order presumably has to match what the scaler was fitted on — confirm.
X_test_scaled = scaler.transform(X_test)


In [491]:
# Predict a label for every scaled test row and print the full array.
predictions = gradient_boost.predict(X_test_scaled)
print(predictions) # quality was changed to a binary classification target

[0 1 1 0 1 1 1 0 0 1 0 1 0 1 0 0 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1
 0 1 1 1 1 0 1 1 1 0 0 1 0 1 0 0 1 1 1 1 0 1 0 1 1 0 1 1 1 1 0 1 0 1 1 0 0
 1 1 0 1 1 0 1 1 0 0 1 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 0 0 1 1 1 1 0 0 0 0 1
 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 1 1 1 0 0 0 0 0 1 1 1 0 1
 1 0 1 0 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 1 0
 1 1 1 0 0 1 1 0 0 0 1 1 0 0 0 0 1 1 1 0 1 0 1 1]


In [492]:

import evaluation

# Score the predictions against the true labels. The result is iterated as a
# mapping of metric name -> value.
metrics = evaluation.evaluate_classification(y_test, predictions)

for metric, value in metrics.items():
    if metric == 'Confusion Matrix':
        # Printed on its own lines, without the float format used for the
        # other metrics.
        print(f"{metric}:\n{value}")
    else:
        print(f"{metric}: {value:.4f}")

Accuracy: 0.7225
Precision: 0.7271
Recall: 0.7225
F1 Score: 0.7179
Confusion Matrix:
[[58 39]
 [19 93]]
