Import Dependencies

In [None]:
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

Data Collection

In [None]:
# Read the red-wine quality CSV into a pandas DataFrame.
# NOTE: the location is an absolute path; adjust it to wherever the file lives
# in your environment.
DATASET_PATH = '/content/sample_data/winequality-red.csv'
wine_dataset = pd.read_csv(DATASET_PATH)

In [None]:
# Dimensions of the loaded dataset as a (rows, columns) tuple; being the last
# expression of the cell, the value is shown as the cell's output.
wine_dataset.shape

In [None]:
# Preview the first five rows to check the column names and value formats
wine_dataset.head(n=5)

In [None]:
# Count the missing entries in each column (isna is the same check as isnull)
wine_dataset.isna().sum()

Data Analysis and Visualization

In [None]:
# Summary statistics of the dataset's columns, as computed by DataFrame.describe()
wine_dataset.describe()

In [None]:
# Count plot: how many wines received each quality score
sns.catplot(data=wine_dataset, x='quality', kind='count')

In [None]:
# Volatile acidity vs. quality: one bar per quality score on a 5x5 inch figure
plot = plt.figure(figsize=(5, 5))
sns.barplot(data=wine_dataset, x='quality', y='volatile acidity')

In [None]:
# Citric acid vs. quality: one bar per quality score on a 5x5 inch figure
plot = plt.figure(figsize=(5, 5))
sns.barplot(data=wine_dataset, x='quality', y='citric acid')

Correlation

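1. Positive Correlation: the two variables tend to increase or decrease together
2. Negative Correlation: as one variable increases, the other tends to decrease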

In [None]:
# Pairwise correlation between the dataset's columns; 'pearson' is the method
# pandas uses by default, spelled out here for the reader.
correlation = wine_dataset.corr(method='pearson')

In [None]:
# Draw the correlation matrix as an annotated heatmap to understand how the
# columns relate to each other (values shown with one decimal place).
plt.figure(figsize=(10, 10))
sns.heatmap(
    correlation,
    cmap='Blues',
    square=True,
    cbar=True,
    annot=True,
    fmt='.1f',
    annot_kws={'size': 8},
)

Data Preprocessing

In [None]:
# Separate the features from the label: X holds every column except 'quality'
X = wine_dataset.drop(columns='quality')


In [None]:
# Preview the feature matrix. A bare last expression uses the notebook's rich
# table display instead of plain text, and head() keeps the output to a few rows.
X.head()

Label Binarization

In [None]:
# Binarize the target: wines scoring at or above the threshold are labelled 1
# (reported as "Good Quality Wine" by the predictive system), all others 0.
GOOD_QUALITY_THRESHOLD = 7

# Vectorized comparison instead of a row-by-row apply(lambda ...); the result is
# the same 0/1 integer Series, still named 'quality'.
Y = (wine_dataset['quality'] >= GOOD_QUALITY_THRESHOLD).astype('int64')

In [None]:
# Print the binarized labels (1 for quality >= 7, otherwise 0) to check the encoding
print(Y)

Train and Test Split

In [None]:
# Hold out 20% of the samples for testing.
# stratify=Y keeps the proportion of 1/0 labels the same in the training and
# test splits, so the test accuracy is not skewed by an unlucky class mix.
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=3
)

In [None]:
# Compare the number of labels in the full set, the training split and the test split
print(*(labels.shape for labels in (Y, Y_train, Y_test)))

Model Training


Random Forest Classifier

In [None]:
# Random forest with a fixed seed: the forest's training is randomized, so
# without random_state every Restart & Run All would produce a different model
# and a different accuracy. The seed matches the one used for the data split.
model = RandomForestClassifier(random_state=3)

In [None]:
# Fit the random forest on the training features and their labels
model.fit(X=X_train, y=Y_train)

In [None]:
# Show the hyperparameters the model is using (pprint is imported in the
# notebook's import cell at the top).
pprint(model.get_params())


Model Evaluation

Accuracy Score

In [None]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground-truth labels first,
# predictions second.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the accuracy that was computed on the held-out test split
print('Accuracy :', test_data_accuracy)

Building a Predictive System

In [None]:
# Measurements for a single wine, listed in the same order as the columns of X.
input_data = (7.8, 0.76, 0.04, 2.3, 0.092, 15.0, 54.0, 0.997, 3.26, 0.65, 9.8)

# changing the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape to a single row (we are predicting the label for only one instance)
# and attach X's column names. The model was fitted on a DataFrame, so passing
# a bare array would make scikit-learn warn about missing feature names; the
# DataFrame constructor also raises if the number of values does not match the
# number of feature columns.
input_data_reshaped = pd.DataFrame(
    input_data_as_numpy_array.reshape(1, -1),
    columns=X.columns,
)

prediction = model.predict(input_data_reshaped)
print(prediction)

# Label 1 is the "good" class produced by the label binarization step.
if prediction[0] == 1:
    print('Good Quality Wine')
else:
    print('Bad Quality Wine')

