In [1]:
import pandas as pd

In [2]:
# Read winequality-red.csv (relative path: resolved against the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='winequality-red.csv')

In [3]:
# Display the column index so the available feature columns and the 'quality' target are visible.
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [4]:
# Separate the target from the predictors:
#   y -> the 'quality' column
#   x -> every remaining column
y = df['quality']
x = df.drop('quality', axis=1)

In [5]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split, stratified on y so both splits keep the same class mix.
# The seed is fixed so the split is identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [7]:
# Model building

from sklearn.ensemble import GradientBoostingClassifier

# Baseline gradient-boosting classifier fitted on the original training split.
# random_state is pinned because max_features='sqrt' draws a random subset of
# features at each split; without a seed every re-run produces a different
# model and therefore a different classification report.
model=GradientBoostingClassifier(learning_rate=0.01, max_depth=7, max_features='sqrt',
                           n_estimators=300, verbose=1, random_state=42)

model.fit(x_train,y_train)

      Iter       Train Loss   Remaining Time 
         1           1.1554            6.39s
         2           1.1345            4.93s
         3           1.1160           13.77s
         4           1.0990           18.46s
         5           1.0828           22.12s
         6           1.0672           23.69s
         7           1.0522           25.53s
         8           1.0384           26.69s
         9           1.0248           26.78s
        10           1.0128           25.85s
        20           0.9038           23.48s
        30           0.8173           25.22s
        40           0.7460           24.44s
        50           0.6830           22.68s
        60           0.6270           22.02s
        70           0.5780           21.59s
        80           0.5357           20.73s
        90           0.4976           20.25s
       100           0.4631           19.57s
       200           0.2442           10.54s
       300           0.1375            0.00s


In [8]:
# Predict quality labels for the held-out test rows with the fitted model.
pred = model.predict(X=x_test)

In [9]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split.
# zero_division=0 reports 0.0 for classes the model never predicts (the same
# numbers the default produces) without emitting UndefinedMetricWarning.
print(classification_report(y_test,pred,zero_division=0))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        16
           5       0.72      0.75      0.74       204
           6       0.61      0.73      0.67       192
           7       0.67      0.40      0.50        60
           8       0.50      0.20      0.29         5

    accuracy                           0.66       480
   macro avg       0.42      0.35      0.37       480
weighted avg       0.64      0.66      0.65       480



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Hyperparameter tuning
---

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the grid search: 2 * 2 * 3 * 2 = 24 parameter combinations.
params = dict(
    learning_rate=[0.1, 0.01],
    n_estimators=[200, 300],
    max_depth=[3, 5, 7],
    max_features=['sqrt', None],
)

In [None]:
# Base estimator that the grid search clones for every parameter combination.
# random_state is pinned so differences in cross-validation score come from the
# hyperparameters being compared, not from a different random seed per fit.
# NOTE: this rebinds `model`, replacing the fitted baseline estimator above.
model=GradientBoostingClassifier(verbose=1, random_state=42)

In [None]:
# Exhaustive search over `params` using 3-fold cross-validation.
grid = GridSearchCV(estimator=model, param_grid=params, cv=3)

In [None]:
# Run the search. The following cells read grid.best_estimator_, grid.best_params_
# and grid.best_score_, which only exist after fit() has completed, so this call
# must not stay commented out or those cells fail on a fresh kernel.
# Cost: 24 parameter combinations x 3 folds = 72 fits, plus the final refit.
grid.fit(x_train,y_train)

In [None]:
# Display the estimator chosen by the search (requires grid.fit to have been run).
grid.best_estimator_

In [None]:
# Display the winning parameter combination from `params` (requires grid.fit to have been run).
grid.best_params_

In [None]:
# Display the cross-validated score of the winning combination (requires grid.fit to have been run).
grid.best_score_

SMOTE
----

In [10]:
from imblearn.over_sampling import SMOTE

In [11]:
# Oversample the training split only; x_test / y_test are left untouched.
# The seed is fixed so the synthetic samples are the same on every run.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(X=x_train, y=y_train)

In [12]:
# Same hyperparameters as the baseline model, now fitted on the SMOTE-resampled
# training data so the two classification reports can be compared.
# random_state is pinned because max_features='sqrt' draws a random subset of
# features at each split; without a seed the comparison is not reproducible.
# NOTE: this rebinds `model`; the previously bound estimator is no longer reachable by that name.
model=GradientBoostingClassifier(learning_rate=0.01, max_depth=7, max_features='sqrt',
                           n_estimators=300, verbose=1, random_state=42)

model.fit(x_train_resampled,y_train_resampled)

      Iter       Train Loss   Remaining Time 
         1           1.7606           15.12s
         2           1.7320           43.32s
         3           1.7037           52.81s
         4           1.6763           56.13s
         5           1.6496           59.53s
         6           1.6227            1.02m
         7           1.5984            1.03m
         8           1.5740            1.05m
         9           1.5505            1.07m
        10           1.5280            1.07m
        20           1.3305            1.05m
        30           1.1700            1.03m
        40           1.0397           54.43s
        50           0.9300           52.71s
        60           0.8365           51.18s
        70           0.7562           48.58s
        80           0.6860           44.78s
        90           0.6247           43.21s
       100           0.5712           41.56s
       200           0.2584           21.57s
       300           0.1367            0.00s


In [13]:
# Predict on the original (non-resampled) test rows with the SMOTE-trained model.
pred_2 = model.predict(X=x_test)

In [14]:
# Per-class precision / recall / F1 for the SMOTE-trained model on the test split.
# zero_division=0 keeps the numbers unchanged but prevents UndefinedMetricWarning
# if some class is never predicted.
print(classification_report(y_test,pred_2,zero_division=0))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.11      0.12      0.11        16
           5       0.76      0.72      0.74       204
           6       0.63      0.67      0.65       192
           7       0.54      0.50      0.52        60
           8       0.20      0.40      0.27         5

    accuracy                           0.64       480
   macro avg       0.37      0.40      0.38       480
weighted avg       0.65      0.64      0.64       480



In [16]:
# Sanity check on the resampling: count the training labels per class after SMOTE.
y_train_resampled.value_counts()

quality
6    477
5    477
7    477
4    477
3    477
8    477
Name: count, dtype: int64