In [0]:
#Importing required packages.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
%matplotlib inline

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'


In [0]:
#Loading dataset using url
data = pd.read_csv(url,sep=';', dtype='float')

In [3]:
#Displays how data is distributed
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6.0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6.0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0


In [4]:
#displays info about cols
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null float64
dtypes: float64(12)
memory usage: 459.3 KB


In [0]:
#stores unique values in 'quality' column and stores them in an array
bins = data['quality'].unique()

In [6]:
bins

array([6., 5., 7., 8., 4., 3., 9.])

In [0]:
#Making binary classificaion and labeling wine as good and bad.
bins = (3, 6, 9)
group_names = ['bad', 'good']
data['quality'] = pd.cut(data['quality'], bins = bins, labels = group_names)

In [8]:
#assigns a labels to our quality variable
label_quality = LabelEncoder()
data.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,bad
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,bad
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,bad
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,bad
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,bad


In [0]:
#This labels the string values bad, good to o, 1 respectively
data['quality'] = label_quality.fit_transform(data['quality'].astype(str))

In [10]:
data.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0


In [11]:
data['quality'].value_counts()

0    3818
1    1060
2      20
Name: quality, dtype: int64

In [0]:
#COnverting pandas dataframe to Numpy array
data_numpy = data.values

In [0]:
#Now we separate features and target values
X = data_numpy[:,:-1]
Y = data_numpy[:,-1]

In [14]:
X.shape

(4898, 11)

In [15]:
Y.shape

(4898,)

In [0]:
#Test and train values are divided
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size = 0.2, 
                                                    random_state = 42)

In [0]:
#Applying Standard scaling to get optimized result
S = StandardScaler()

X_train= S.fit_transform(X_train)
X_test = S.fit_transform(X_test)

Now we can use Machine Learning Algorithms  on our Training and Testing Data

In [0]:
#Random Forest Classifier
RF = RandomForestClassifier(n_estimators = 200)

In [19]:
RF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [0]:
y_pred = RF.predict(X_test)

In [21]:
#Model Performance
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92       748
         1.0       0.84      0.59      0.69       227
         2.0       0.00      0.00      0.00         5

   micro avg       0.87      0.87      0.87       980
   macro avg       0.57      0.52      0.54       980
weighted avg       0.87      0.87      0.86       980



  'precision', 'predicted', average, warn_for)


In [22]:
RF.score(X_test,y_test)

0.8734693877551021

In [23]:
#Stochastic Gradient Decent Classifier
sgd = SGDClassifier(penalty=None)
sgd.fit(X_train, y_train)
pred_sgd = sgd.predict(X_test)



In [24]:
#Model Perfromance
print(classification_report(y_test, pred_sgd))


              precision    recall  f1-score   support

         0.0       0.84      0.85      0.85       748
         1.0       0.51      0.51      0.51       227
         2.0       0.00      0.00      0.00         5

   micro avg       0.76      0.76      0.76       980
   macro avg       0.45      0.45      0.45       980
weighted avg       0.76      0.76      0.76       980



In [25]:
sgd.score(X_test,y_test)

0.763265306122449

In [26]:
#Now lets try to do some evaluation for random forest model using cross validation.
RF_eval = cross_val_score(estimator = RF, 
                           X = X_train, 
                           y = y_train, 
                           cv = 10)
RF_eval.mean()

0.8690486533146282

In [0]:
#SVM Model and Performance
svc = SVC()
svc.fit(X_train, y_train)
pred_svc = svc.predict(X_test)


In [28]:
print(classification_report(y_test, pred_svc))

              precision    recall  f1-score   support

         0.0       0.82      0.97      0.89       748
         1.0       0.75      0.34      0.47       227
         2.0       0.00      0.00      0.00         5

   micro avg       0.82      0.82      0.82       980
   macro avg       0.52      0.43      0.45       980
weighted avg       0.80      0.82      0.79       980



  'precision', 'predicted', average, warn_for)


In [0]:
#Using Grid CV we can increase accuracy of SVM model
param = {
    'C': [0.1,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6],
    'kernel':['linear', 'rbf'],
    'gamma' :[0.1,0.8,0.9,1,1.1,1.2,1.3,1.4]
}
grid_svc = GridSearchCV(svc, param_grid=param, scoring='accuracy', cv=10)

In [30]:
grid_svc.fit(X_train, y_train)
grid_svc.best_params_


{'C': 1.6, 'gamma': 1.2, 'kernel': 'rbf'}

In [31]:
#Running SVM with best parameters
svc2 = SVC(C = 1.2, gamma =  0.9, kernel= 'rbf')
svc2.fit(X_train, y_train)
pred_svc2 = svc2.predict(X_test)
print(classification_report(y_test, pred_svc2))

              precision    recall  f1-score   support

         0.0       0.86      0.96      0.91       748
         1.0       0.80      0.51      0.62       227
         2.0       0.00      0.00      0.00         5

   micro avg       0.85      0.85      0.85       980
   macro avg       0.55      0.49      0.51       980
weighted avg       0.84      0.85      0.84       980



  'precision', 'predicted', average, warn_for)
