# Data Analysis of Wine Quality

### Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

### Reading the dataset

In [None]:
# Load the wine-quality dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv('wine quality.csv')

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Summarise the columns: dtypes and non-null counts.
df.info()

The df.info() output shows that there are no null values.

In [None]:
# Descriptive statistics, transposed so that each column of df becomes a row.
df.describe().T

### Exploratory Data Analysis on the dataset

#### Observations
- Here all the columns have numerical values (no categorical data).
- All the rows are filled so data imputation is not needed.

In [None]:
# Plot the pairwise relationships between the columns of df.
sns.pairplot(df)

In [None]:
# Pairwise correlation matrix of the columns.
df.corr()

In [None]:
# One histogram (with a KDE overlay) per column, each drawn on its own figure.
for fig_num, column_name in enumerate(df.columns):
    plt.figure(fig_num)
    sns.histplot(x=column_name, data=df, kde=True)

In [None]:
# One box plot per column, each drawn on its own figure.
for fig_num, column_name in enumerate(df.columns):
    plt.figure(fig_num)
    sns.boxplot(data=df, x=column_name)

#### Assumptions 
- pH values above 4 are considered basic and values below 3.5 are considered more acidic.
- Fixed acidity, volatile acidity, citric acid and pH are correlated, as they all give information about the acidity of the wine.

In [None]:
# Work on an independent copy: the helper columns added to finding_df during
# the analysis below must not end up in df, otherwise they would later be
# picked up as model features when X is built from df.
finding_df = df.copy()

In [None]:
# Correlation between the acidity-related columns, pH and quality.
finding_df[['fixed acidity', 'volatile acidity', 'citric acid', 'pH','quality']].corr()

From the above table we can see that they are somewhat correlated. And pH value is more correlated.

In [None]:
def pHDivision(x):
    """Return 1 when the pH value ``x`` is strictly above 3.5, otherwise 0."""
    return 1 if x > 3.5 else 0

In [None]:
# Add a 0/1 flag column: 1 where pH > 3.5, 0 otherwise (see pHDivision).
# NOTE(review): despite the column name, 1 marks the higher-pH (less acidic) rows.
finding_df['Acidic or not'] = finding_df['pH'].apply(pHDivision)

In [None]:
# Mean quality for each value of the pH flag.
finding_df.groupby('Acidic or not')['quality'].mean()

In [None]:
# Median quality for each value of the pH flag.
finding_df.groupby('Acidic or not')['quality'].median()

Here the mean and median are not much different between the two groups, so pH appears to be a less significant feature.

#### About sulfur dioxide
Free sulfur dioxide and total sulfur dioxide both give information about the sulfur content, so they are likely to be highly correlated with each other.

In [None]:
# Correlation between the two sulfur dioxide columns and quality.
finding_df[['free sulfur dioxide','total sulfur dioxide', 'quality']].corr()

From the above table we see that free sulfur dioxide and total sulfur dioxide are correlated with each other, but both are negatively correlated with quality.

From the data description we know that total sulfur dioxide is the amount of free and bound forms of SO2; in low concentrations, SO2 is mostly undetectable in wine, but at free SO2 concentrations over 50 ppm, SO2 becomes evident in the nose and taste of wine.

Let's assume that the given data are in ppm.

In [None]:
def ppm(x):
    """Return 1 when the concentration ``x`` is strictly above 50, otherwise 0."""
    return 1 if x > 50 else 0

# Flag rows whose total sulfur dioxide exceeds 50 (1) versus the rest (0).
# NOTE(review): the 50 ppm threshold quoted in the text above refers to *free*
# SO2, but the flag is computed from 'total sulfur dioxide' — confirm which
# column is intended.
finding_df['ppm above 50'] = finding_df['total sulfur dioxide'].apply(ppm)

In [None]:
# Mean quality for each value of the sulfur dioxide flag.
finding_df.groupby('ppm above 50')['quality'].mean()

In [None]:
# Median quality for each value of the sulfur dioxide flag.
finding_df.groupby('ppm above 50')['quality'].median()

In [None]:
# Number of rows (non-null quality values) in each sulfur dioxide group.
finding_df.groupby('ppm above 50')['quality'].count()

Here the mean and median are not much different between the two groups, so total sulfur dioxide appears to be a less significant feature.

#### About Alcohol 
Let's divide the alcohol content into three categories: low, medium and high alcohol concentration.

- Low alcohol concentration refers to around 8% to less than 11%
- Medium alcohol concentration refers to around 11% - 13%
- High alcohol concentration refers to something up to 15%

In [None]:
# Correlation between alcohol and quality.
finding_df[['alcohol', 'quality']].corr()

In [None]:
def alcohol_concentration(x):
    """Map an alcohol value ``x`` to a grade.

    Returns 0 for values below 11, 1 for values from 11 up to (but not
    including) 13, and 2 for everything else.
    """
    if x < 11:
        return 0
    return 1 if x < 13 else 2

In [None]:
# Bucket each wine's alcohol value into grade 0, 1 or 2 via alcohol_concentration.
finding_df['a_grade'] = finding_df['alcohol'].apply(alcohol_concentration)

In [None]:
# Mean quality per alcohol grade.
finding_df.groupby('a_grade')['quality'].mean()

In [None]:
# Median quality per alcohol grade.
finding_df.groupby('a_grade')['quality'].median()

In [None]:
# Number of rows (non-null quality values) per alcohol grade.
finding_df.groupby('a_grade')['quality'].count()

### Findings
- Although the acidity features and pH are correlated, they do not seem to affect wine quality.
- Total sulfur dioxide is a less significant feature.
- Quality of wine seems to have the strongest correlation with alcohol.

### ML model creation 

Importing necessary libraries

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Binarise the target: scores in (2, 6.5] are labelled 'bad' and scores in
# (6.5, 8] are labelled 'good' (pd.cut intervals are right-closed by default).
# NOTE(review): scores outside (2, 8] would become NaN — confirm the data
# never falls outside that range.
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
df['quality'] = pd.cut(df['quality'], bins = bins, labels = group_names)

In [None]:
# Encoder used to turn the 'bad'/'good' labels into integer codes.
label_quality = LabelEncoder()

Bad becomes 0 and good becomes 1 

In [None]:
# Replace the 'bad'/'good' labels with integer codes (bad -> 0, good -> 1).
df['quality'] = label_quality.fit_transform(df['quality'])

In [None]:
# Class balance of the encoded target.
df['quality'].value_counts()

In [None]:
# Features: every column of df except the target. Target: the encoded quality.
X = df.drop('quality', axis = 1)
y = df['quality']

### Train and Test splitting of data 

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Scaler used to standardise the feature columns before training.
sc = StandardScaler() 

### Standardizing Data

In [None]:
# Learn the scaling statistics from the training set only, then apply those
# same statistics to the test set. Re-fitting the scaler on the test data
# would leak test-set information and put the two sets on different scales.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Training SVM

In [None]:
# Baseline: an SVC with default hyper-parameters, trained on the scaled
# training set and used to predict the held-out test set.
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

### Model Performance

In [None]:
from math import sqrt
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, mean_squared_error

In [None]:
def score(a, b):
    """Print evaluation metrics comparing labels ``a`` with predictions ``b``.

    Prints, in order, the confusion matrix, the precision score, the root
    mean square error and the accuracy score. Returns None.
    """
    # Every metric is computed before anything is printed.
    report = (
        ('The confusion matrix is:\n', confusion_matrix(a, b)),
        ('The precision score is:', precision_score(a, b)),
        ('The root mean square error is:', sqrt(mean_squared_error(a, b))),
        ('The accuracy score is:', accuracy_score(a, b)),
    )
    for label, value in report:
        print(label, value)
    

In [None]:
# Report the metrics of the baseline SVC on the test set.
score(y_test, y_pred)

### Finding best parameters for our SVC model

In [None]:
# Hyper-parameter grid for the SVC. gamma is only used by the non-linear
# kernels, so it is searched for 'rbf' alone; pairing it with 'linear' would
# refit identical linear models once per gamma value.
param = [
    {
        'kernel': ['linear'],
        'C': [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4],
        'gamma': [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4],
    },
]
# 10-fold cross-validation, scoring each candidate by accuracy.
grid_search_vc = GridSearchCV(svc, param_grid=param, scoring='accuracy', cv=10)

In [None]:
# Run the cross-validated search on the scaled training data.
grid_search_vc.fit(X_train, y_train)

Best parameters for our SVC model

In [None]:
# Parameter combination selected by the grid search.
grid_search_vc.best_params_

Running SVC again with the best parameters

In [None]:
# Re-train with whatever the grid search actually selected rather than
# hard-coded values, so this cell stays correct if the data, the split or
# the grid change.
svc2 = SVC(**grid_search_vc.best_params_)
svc2.fit(X_train, y_train)
y_pred_2 = svc2.predict(X_test)

### Model Performance after best parameters

In [None]:
# Report the metrics of the tuned SVC on the test set.
score(y_test, y_pred_2)