###  Import Necessary Modules 

In [1]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('ignore')


# data visualisation and manipulation
import numpy as np
import pandas as pd
import math
import os 

#import the necessary modelling algorithms.

#classifiaction.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
 

#regression
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

#model selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#preprocessing
from sklearn.preprocessing import MinMaxScaler,StandardScaler,Imputer,LabelEncoder

#evaluation metrics
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  # for classification



### Loading the Data File  

In [2]:
wineq = pd.read_csv("WineQ.csv")

wineq.shape
wineq.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5


In [3]:
wineq = pd.read_csv("wineq.csv", sep=";")

In [4]:
wineq.shape

(1599, 12)

In [5]:
wineq.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [6]:
wineq.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [7]:
wineq.columns=wineq.columns.str.capitalize()
wineq.columns = wineq.columns.str.replace(' ', '_')


In [8]:
wineq.head(10)

Unnamed: 0,Fixed_acidity,Volatile_acidity,Citric_acid,Residual_sugar,Chlorides,Free_sulfur_dioxide,Total_sulfur_dioxide,Density,Ph,Sulphates,Alcohol,Quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


### Checking For Nulls 

In [9]:
wineq.isnull().any().any().sum()

0

### Checking for Duplicates

In [18]:
wineq.columns.duplicated().sum()

0

### Labeling the Quality as 1 and 0

In [11]:
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
wineq['Quality'] = pd.cut(wineq['Quality'], bins = bins, labels = group_names)

In [12]:
label_quality = LabelEncoder()
wineq['Quality'] = label_quality.fit_transform(wineq['Quality'])            ##Bad becomes 0 and good becomes 1 

In [13]:
wineq.head(10)

Unnamed: 0,Fixed_acidity,Volatile_acidity,Citric_acid,Residual_sugar,Chlorides,Free_sulfur_dioxide,Total_sulfur_dioxide,Density,Ph,Sulphates,Alcohol,Quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,0
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,0
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,1
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,1
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,0


### Splitting Data into Train and Test sets 

In [14]:
from sklearn.model_selection import train_test_split
from sklearn import metrics # for checking the model accuracy


train, test = train_test_split(wineq, test_size=0.3) 


print(train.shape , test.shape)

(1119, 12) (480, 12)


In [15]:
train_x = train[['Fixed_acidity', 'Volatile_acidity', 'Citric_acid', 'Residual_sugar','Chlorides', 'Free_sulfur_dioxide', 'Total_sulfur_dioxide', 'Density','Ph', 'Sulphates', 'Alcohol']]
train_y = train.Quality

test_x = test[['Fixed_acidity', 'Volatile_acidity', 'Citric_acid', 'Residual_sugar','Chlorides', 'Free_sulfur_dioxide', 'Total_sulfur_dioxide', 'Density','Ph', 'Sulphates', 'Alcohol']]
test_y = test.Quality


### Applying Macine Learning Algorithms  


In [16]:
models=  [LogisticRegression(),LinearSVC(),SVC(kernel='rbf'),KNeighborsClassifier(),RandomForestClassifier(),
          DecisionTreeClassifier(),GradientBoostingClassifier(),GaussianNB()]


model_names=   ['LogisticRegression','LinearSVM','rbfSVM','KNearestNeighbors','RandomForestClassifier','DecisionTree',
                'GradientBoostingClassifier','GaussianNB']

accuracy=[]
d={}

for model in range(len(models)):
    clf=models[model]
    clf.fit(train_x,train_y)
    pred=clf.predict(test_x)
    accuracy.append(accuracy_score(pred,test_y))
     
d={'Modelling Algo': model_names,'Accuracy':accuracy}
d

{'Modelling Algo': ['LogisticRegression',
  'LinearSVM',
  'rbfSVM',
  'KNearestNeighbors',
  'RandomForestClassifier',
  'DecisionTree',
  'GradientBoostingClassifier',
  'GaussianNB'],
 'Accuracy': [0.9020833333333333,
  0.8916666666666667,
  0.8854166666666666,
  0.8875,
  0.91875,
  0.8854166666666666,
  0.9083333333333333,
  0.8020833333333334]}

In [17]:
acc_frame=pd.DataFrame(d)
acc_frame

Unnamed: 0,Modelling Algo,Accuracy
0,LogisticRegression,0.902083
1,LinearSVM,0.891667
2,rbfSVM,0.885417
3,KNearestNeighbors,0.8875
4,RandomForestClassifier,0.91875
5,DecisionTree,0.885417
6,GradientBoostingClassifier,0.908333
7,GaussianNB,0.802083
