In [36]:
import numpy as np
import os
import pandas as pd 

In [37]:
red_wine_file_path=r'T:\TOSHITH\PROGRAMMING\wine+quality\winequality-red.csv'
white_wine_file_path=r'T:\TOSHITH\PROGRAMMING\wine+quality\winequality-white.csv'

### step 1 combine data and preprocess it for red and white wine seperately 

In [69]:
red_wine_df=pd.read_csv(red_wine_file_path,sep=';')
white_wine_df=pd.read_csv(white_wine_file_path,sep=';')

unwanted_qualities = [ 3, 9,8,4]

# Filter out those rows from red and white wine data
red_wine_df = red_wine_df[~red_wine_df['quality'].isin(unwanted_qualities)]
white_wine_df = white_wine_df[~white_wine_df['quality'].isin(unwanted_qualities)]

print(red_wine_df.shape,white_wine_df.shape)
#red_wine_df.head()
print(red_wine_df['quality'].value_counts())
print(white_wine_df['quality'].value_counts())

(1518, 12) (4535, 12)
quality
5    681
6    638
7    199
Name: count, dtype: int64
quality
6    2198
5    1457
7     880
Name: count, dtype: int64


So we see that the data is unbalanced 

In [70]:
from sklearn.preprocessing import StandardScaler
#function to scale our data to make it easier for our model to learn 

def scaler(wine_df):
    columns=list(wine_df.columns)

    wine_df=StandardScaler().fit_transform(wine_df)
    wine_df=pd.DataFrame(wine_df,columns=columns)
    return wine_df

'''is this really necessairy as some row values have become negative'''

'is this really necessairy as some row values have become negative'

### 1. Undersampling
* As white is more we will reduce it's numbers to equal reds
* Also we will do a train test split before sampling as we dont want to introduce redundant samples in the training and testing data 

In [71]:
from sklearn.model_selection import train_test_split
#70:30 split 
split_ratio=0.3
#the data frame in train and test still contains the quality column so we have garbage values which we will not use

red_wine_train, red_wine_test, garbage1,garbage2 = train_test_split(red_wine_df,red_wine_df['quality'],test_size=split_ratio)


#we want same no of samples in white wine test as red wine 

white_wine_test=white_wine_df.sample(n=red_wine_test.shape[0])
white_wine_train=white_wine_df.drop(white_wine_test.index)

red_wine_train.shape,red_wine_test.shape,white_wine_train.shape, white_wine_test.shape

((1062, 12), (456, 12), (4079, 12), (456, 12))

encoding red as 0 and white as 1 

In [72]:
def return_train_and_test(red_wine_tr,white_wine_tr,red_wine_test,white_wine_test):
    # Add 'colour' column
    red_wine_tr['colour'] = 0
    white_wine_tr['colour'] = 1
    red_wine_test['colour'] = 0
    white_wine_test['colour'] = 1

    # Drop 'quality' for X, but keep it for Y
    X_train = pd.concat([
        red_wine_tr.drop(columns=['quality']),
        white_wine_tr.drop(columns=['quality'])
    ], axis=0)

    Y_train = pd.concat([
        red_wine_tr['quality'],
        white_wine_tr['quality']
    ], axis=0)

    X_test = pd.concat([
        red_wine_test.drop(columns=['quality']),
        white_wine_test.drop(columns=['quality'])
    ], axis=0)

    Y_test = pd.concat([
        red_wine_test['quality'],
        white_wine_test['quality']
    ], axis=0)

    return X_train,X_test,Y_train,Y_test

US is for under sample , N is for normal with out sampling 

In [73]:
#need to undersample white 
no_of_red=red_wine_train.shape[0] #shape of train y

N_X_train,N_X_test,N_Y_train,N_Y_test=return_train_and_test(red_wine_train,white_wine_train,red_wine_test,white_wine_test)

sample_white= white_wine_train.sample(no_of_red) # we get equal no of columns 
US_X_train,US_X_test,US_Y_train,US_Y_test=return_train_and_test(red_wine_train,sample_white,red_wine_test,white_wine_test)

US_X_train.shape,US_X_test.shape,US_Y_train.shape,US_Y_test.shape,N_X_train.shape,

((2124, 12), (912, 12), (2124,), (912,), (5141, 12))

In [74]:
US_X_train=scaler(US_X_train)
US_X_test=scaler(US_X_test)
#as the colour column has changed because of the standard scaler 
US_X_train['colour'] = US_X_train['colour'].replace(-1, 0)
US_X_test['colour'] = US_X_test['colour'].replace(-1, 0)

US_X_train.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,colour
0,1.671752,0.777839,0.413276,2.608559,3.366086,1.717095,0.02764,2.915765,-0.402582,1.172799,-1.256578,0.0


In [75]:
N_X_train = scaler(N_X_train)
N_X_test = scaler(N_X_test)
# as these 2 are unbalenced data ill change the colour column to 0 and 1 manually 
#can apply this to above as well 

unique_values = N_X_train['colour'].unique()
lower_value = min(unique_values)

N_X_train['colour'] = (N_X_train['colour'] != lower_value).astype(int)

unique_values = N_X_test['colour'].unique()
lower_value = min(unique_values)
N_X_test['colour'] = (N_X_test['colour'] != lower_value).astype(int)

N_X_train.tail()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,colour
5136,-0.533573,-0.62922,0.412269,-0.898051,-0.660118,-0.15241,-0.151086,-0.555205,0.489557,0.104559,-0.644794,1
5137,-0.452325,-0.033458,0.26894,0.472229,-0.22315,1.486856,0.872255,0.080419,-0.394488,-0.453632,-0.729081,1
5138,-0.533573,-0.563024,-0.949355,-0.918503,-0.397938,-0.093865,-0.16936,-0.700869,-1.404825,-0.453632,-0.897655,1
5139,-1.346053,-0.232046,-0.161046,-0.938955,-0.95143,-0.679317,-0.187634,-1.975428,0.805287,-1.011824,1.968093,1
5140,-0.939813,-0.761612,0.412269,-1.000311,-1.009693,-0.562226,-0.406921,-1.737069,0.300119,-1.430468,1.125226,1


In [None]:
#perform train test split before sampling and sampling should be done only on training data 

#also verify if the data column has the same no of 1 and 0 (y values) values 11:57 in the video 


2 upsampling 

In [76]:
print(N_X_train['colour'].value_counts()) # w

colour
1    4079
0    1062
Name: count, dtype: int64


In [77]:
# Reset index to avoid index misalignment
N_X_train = N_X_train.reset_index(drop=True)
N_Y_train = N_Y_train.reset_index(drop=True)

# Split by colour
red_mask = N_X_train['colour'] == 0
red_X = N_X_train[red_mask]
red_Y = N_Y_train[red_mask]

white_mask = N_X_train['colour'] == 1
white_X = N_X_train[white_mask]
white_Y = N_Y_train[white_mask]

# Now you can upsample
from sklearn.utils import resample
red_X_upsampled, red_Y_upsampled = resample(
    red_X, red_Y,
    replace=True,
    n_samples=len(white_X),
    random_state=42
)

UP_X_train = pd.concat([red_X_upsampled, white_X])
UP_Y_train = pd.concat([red_Y_upsampled, white_Y])
UP_X_test = US_X_test.copy()
UP_Y_test = US_Y_test.copy()



In [78]:
print(UP_X_train['colour'].value_counts()) # w

colour
0    4079
1    4079
Name: count, dtype: int64


### apply smote 

In [79]:
from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd

In [80]:
print("Original class distribution:", Counter(N_Y_train))

Original class distribution: Counter({6: 2427, 5: 1791, 7: 923})


In [81]:
smote = SMOTE(random_state=42)
sampling_strategy = {4: 500, 8: 500}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
SM_X_train, SM_Y_train = smote.fit_resample(N_X_train, N_Y_train)
print("Resampled class distribution:", Counter(SM_Y_train))

ValueError: The {8, 4} target class is/are not present in the data.

In [82]:
print(SM_X_train['colour'].value_counts()) #

colour
1    4946
0    1204
Name: count, dtype: int64


### apply ensamble technique on your data 


### use different ML model for the same data 

XGBoost classifier 

In [83]:
'''from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Get unique labels and encode them
unique_labels = sorted(set(US_Y_train) | set(US_Y_test))
le = LabelEncoder()
le.fit(unique_labels)

US_Y_train_encoded = le.transform(US_Y_train)

model = XGBClassifier(objective='multi:softmax', num_class=len(unique_labels), random_state=42)
model.fit(US_X_train, US_Y_train_encoded)

y_pred_encoded = model.predict(US_X_test)
US_y_pred = le.inverse_transform(y_pred_encoded)'''
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Encode labels
unique_labels = sorted(set(US_Y_train) | set(US_Y_test))
le = LabelEncoder()
le.fit(unique_labels)

US_Y_train_encoded = le.transform(US_Y_train)
US_Y_test_encoded = le.transform(US_Y_test)  # optional, in case needed later

# Define base model
xgb = XGBClassifier(objective='multi:softmax', num_class=len(unique_labels), random_state=42)

# Define hyperparameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 2, 5],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0.1, 0.5, 1]
}

# Perform randomized search
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit on training data
random_search.fit(US_X_train, US_Y_train_encoded)

# Get best model
best_model = random_search.best_estimator_

# Predict and decode labels
y_pred_encoded = best_model.predict(US_X_test)
US_y_pred = le.inverse_transform(y_pred_encoded)

# Evaluate
print("Best Hyperparameters:", random_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 300, 'max_depth': 9, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 1.0}


In [84]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Get unique labels and encode them
unique_labels = sorted(set(N_Y_train) | set(N_Y_test))
le = LabelEncoder()
le.fit(unique_labels)

N_Y_train_encoded = le.transform(N_Y_train)
N_Y_test_encoded = le.transform(N_Y_test)

model = XGBClassifier(objective='multi:softmax', num_class=len(unique_labels), random_state=42)
model.fit(N_X_train, N_Y_train_encoded)

y_pred_encoded = model.predict(N_X_test)
N_y_pred = le.inverse_transform(y_pred_encoded)

In [85]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Get unique labels and encode them
unique_labels = sorted(set(UP_Y_train) | set(UP_Y_test))
le = LabelEncoder()
le.fit(unique_labels)

UP_Y_train_encoded = le.transform(UP_Y_train)
UP_Y_test_encoded = le.transform(UP_Y_test)

model = XGBClassifier(objective='multi:softmax', num_class=len(unique_labels), random_state=42)
model.fit(UP_X_train, UP_Y_train_encoded)

y_pred_encoded = model.predict(UP_X_test)
UP_y_pred = le.inverse_transform(y_pred_encoded)

In [88]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Get unique labels and encode them
unique_labels = sorted(set(SM_Y_train) | set(N_Y_test))
le = LabelEncoder()
le.fit(unique_labels)

SM_Y_train_encoded = le.transform(SM_Y_train)

model = XGBClassifier(objective='multi:softmax', num_class=len(unique_labels), random_state=42)
model.fit(SM_X_train, SM_Y_train_encoded)

y_pred_encoded = model.predict(N_X_test)
SM_y_pred = le.inverse_transform(y_pred_encoded)

In [89]:
print("Accuracy for Undersampled data:", accuracy_score(US_Y_test, US_y_pred))
print(classification_report(US_Y_test, US_y_pred))
print("Accuracy for normal data:", accuracy_score(N_Y_test, N_y_pred))
print(classification_report(N_Y_test, N_y_pred))
print("Accuracy for normal data:", accuracy_score(UP_Y_test, UP_y_pred))
print(classification_report(UP_Y_test, UP_y_pred))
'''
print("Accuracy for normal data:", accuracy_score(N_Y_test, SM_y_pred))
print(classification_report(N_Y_test, SM_y_pred))'''

Accuracy for Undersampled data: 0.6348684210526315
              precision    recall  f1-score   support

           5       0.70      0.71      0.70       347
           6       0.60      0.66      0.63       409
           7       0.59      0.40      0.48       156

    accuracy                           0.63       912
   macro avg       0.63      0.59      0.60       912
weighted avg       0.63      0.63      0.63       912

Accuracy for normal data: 0.5712719298245614
              precision    recall  f1-score   support

           5       0.65      0.61      0.63       347
           6       0.53      0.67      0.59       409
           7       0.54      0.22      0.32       156

    accuracy                           0.57       912
   macro avg       0.57      0.50      0.51       912
weighted avg       0.57      0.57      0.56       912

Accuracy for normal data: 0.5679824561403509
              precision    recall  f1-score   support

           5       0.64      0.57      0.6

'\nprint("Accuracy for normal data:", accuracy_score(N_Y_test, SM_y_pred))\nprint(classification_report(N_Y_test, SM_y_pred))'

In [90]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# 1️⃣ Cross-validation on TRAINING data
train_cv_scores = cross_val_score(
    best_model,
    US_X_train,
    US_Y_train_encoded,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

print("\n💡 Cross-Validation on TRAINING Set:")
print(f"Scores: {train_cv_scores}")
print(f"Mean Accuracy: {np.mean(train_cv_scores):.4f}")
print(f"Standard Deviation: {np.std(train_cv_scores):.4f}")

# 2️⃣ Evaluation on TEST data
y_test_pred_encoded = best_model.predict(US_X_test)
y_test_pred = le.inverse_transform(y_test_pred_encoded)

print("\n🧪 Evaluation on TEST Set:")
print("Accuracy:", accuracy_score(US_Y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(US_Y_test, y_test_pred))


💡 Cross-Validation on TRAINING Set:
Scores: [0.64941176 0.62588235 0.65176471 0.59529412 0.61084906]
Mean Accuracy: 0.6266
Standard Deviation: 0.0218

🧪 Evaluation on TEST Set:
Accuracy: 0.14912280701754385

Classification Report:
               precision    recall  f1-score   support

           4       0.00      0.00      0.00         0
           5       0.21      0.28      0.24       347
           6       0.37      0.10      0.15       409
           7       0.00      0.00      0.00       156

    accuracy                           0.15       912
   macro avg       0.15      0.09      0.10       912
weighted avg       0.25      0.15      0.16       912



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XGBoost Regressor 

In [91]:
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score, classification_report

model = XGBRegressor(random_state=42)
model.fit(US_X_train, US_Y_train)

y_pred = model.predict(US_X_test)
US_y_pred_rounded = np.clip(np.round(y_pred), min(US_Y_train), max(US_Y_train)).astype(int)


In [92]:
model = XGBRegressor(random_state=42)
model.fit(N_X_train, N_Y_train)

y_pred = model.predict(N_X_test)
N_y_pred_rounded = np.clip(np.round(y_pred), min(N_Y_train), max(N_Y_train)).astype(int)

In [93]:
print("Accuracy:", accuracy_score(US_Y_test, US_y_pred_rounded))
print(classification_report(US_Y_test, US_y_pred_rounded))
print("Accuracy:", accuracy_score(N_Y_test, N_y_pred_rounded))
print(classification_report(N_Y_test, N_y_pred_rounded))

Accuracy: 0.6162280701754386
              precision    recall  f1-score   support

           5       0.69      0.64      0.66       347
           6       0.57      0.69      0.63       409
           7       0.60      0.36      0.45       156

    accuracy                           0.62       912
   macro avg       0.62      0.56      0.58       912
weighted avg       0.62      0.62      0.61       912

Accuracy: 0.5712719298245614
              precision    recall  f1-score   support

           5       0.67      0.55      0.60       347
           6       0.53      0.70      0.61       409
           7       0.49      0.28      0.35       156

    accuracy                           0.57       912
   macro avg       0.57      0.51      0.52       912
weighted avg       0.58      0.57      0.56       912



### Random Forest regressor 

In [94]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [95]:
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(US_X_train, US_Y_train)
US_y_pred = rf_regressor.predict(US_X_test)

In [96]:
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(N_X_train, N_Y_train)
N_y_pred = rf_regressor.predict(N_X_test)

In [97]:
print("Mean Squared Error Under sampled:", mean_squared_error(US_Y_test, US_y_pred))
print("R² Score Under sampled:", r2_score(US_Y_test, US_y_pred))
print("Mean Squared Error Normal:", mean_squared_error(N_Y_test, N_y_pred))
print("R² Score Normal:", r2_score(N_Y_test, N_y_pred))

Mean Squared Error Under sampled: 0.2977084429824562
R² Score Under sampled: 0.4135837084226355
Mean Squared Error Normal: 0.353253399122807
R² Score Normal: 0.3041730821423073


### random forest classifier

In [98]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [99]:
le = LabelEncoder()
US_Y_train_encoded = le.fit_transform(US_Y_train)
US_Y_test_encoded = le.transform(US_Y_test)
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(US_X_train, US_Y_train_encoded)
y_pred_encoded = rf_classifier.predict(US_X_test)
US_y_pred = le.inverse_transform(y_pred_encoded)

In [100]:
N_Y_train_encoded = le.fit_transform(N_Y_train)
N_Y_test_encoded = le.transform(N_Y_test)
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(N_X_train, N_Y_train_encoded)
y_pred_encoded = rf_classifier.predict(N_X_test)
N_y_pred = le.inverse_transform(y_pred_encoded)

In [101]:
print("Accuracy for Undersampled data:", accuracy_score(US_Y_test, US_y_pred))
print(classification_report(US_Y_test, US_y_pred))
print("Accuracy for normal data:", accuracy_score(N_Y_test, N_y_pred))
print(classification_report(N_Y_test, N_y_pred))

Accuracy for Undersampled data: 0.6600877192982456
              precision    recall  f1-score   support

           5       0.72      0.71      0.71       347
           6       0.62      0.70      0.66       409
           7       0.65      0.45      0.53       156

    accuracy                           0.66       912
   macro avg       0.66      0.62      0.63       912
weighted avg       0.66      0.66      0.66       912

Accuracy for normal data: 0.5745614035087719
              precision    recall  f1-score   support

           5       0.67      0.57      0.62       347
           6       0.53      0.71      0.61       409
           7       0.53      0.22      0.31       156

    accuracy                           0.57       912
   macro avg       0.58      0.50      0.51       912
weighted avg       0.58      0.57      0.56       912

