In [28]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [29]:
dt= pd.read_csv("winequality-red.csv", sep=";")

In [30]:
dt

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [31]:
bins = (2, 5, 8)
group_names = ['bad', 'good']
dt['quality'] = pd.cut(dt['quality'], bins = bins, labels = group_names)

In [32]:
dt["quality"]

0        bad
1        bad
2        bad
3       good
4        bad
        ... 
1594     bad
1595    good
1596    good
1597     bad
1598    good
Name: quality, Length: 1599, dtype: category
Categories (2, object): ['bad' < 'good']

In [33]:
#Now lets assign a labels to our quality variable
label_quality = LabelEncoder()

In [34]:
#Bad becomes 0 and good becomes 1 
dt['quality'] = label_quality.fit_transform(dt['quality'])

In [35]:
dt['quality'].value_counts()

1    855
0    744
Name: quality, dtype: int64

In [36]:
X = dt.iloc[:, :-1]
y = dt.iloc[:, -1]

In [37]:
#Train and Test splitting of data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [38]:
#Applying Standard scaling to get optimized result
sc = StandardScaler()

In [39]:
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [59]:
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)


0.46770717334674267

0.11248464677681358

In [49]:
from sklearn.metrics import confusion_matrix, classification_report
#Let's see how our model performed
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.74      0.78      0.76       141
           1       0.82      0.78      0.80       179

    accuracy                           0.78       320
   macro avg       0.78      0.78      0.78       320
weighted avg       0.78      0.78      0.78       320



In [53]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(penalty=None)
sgd.fit(X_train, y_train)
pred_sgd = sgd.predict(X_test)


In [54]:
print(classification_report(y_test, pred_sgd))

              precision    recall  f1-score   support

           0       0.65      0.71      0.68       141
           1       0.75      0.70      0.73       179

    accuracy                           0.71       320
   macro avg       0.70      0.71      0.70       320
weighted avg       0.71      0.71      0.71       320



In [55]:
svc = SVC()
svc.fit(X_train, y_train)
pred_svc = svc.predict(X_test)

In [56]:
print(classification_report(y_test, pred_svc))

              precision    recall  f1-score   support

           0       0.72      0.77      0.74       141
           1       0.81      0.76      0.78       179

    accuracy                           0.77       320
   macro avg       0.76      0.77      0.76       320
weighted avg       0.77      0.77      0.77       320



In [61]:
from sklearn.model_selection import  cross_val_score
#Now lets try to do some evaluation for random forest model using cross validation.
rfc_eval = cross_val_score(estimator = rfc, X = X_train, y = y_train,cv=10)
rfc_eval.mean()


0.8202202263779528