<a href="https://colab.research.google.com/github/Shailesh0209/x_Machine_Learning_Practicals-diploma-IITM/blob/main/x_W1_Wine_Quality(ML_Projects)_MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get the data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Location of the red-wine quality CSV on the UCI repository.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
# The file is read with ';' as the field separator.
data = pd.read_csv(data_url, sep=";")

## Check data samples

In [None]:
# Preview the first few rows to sanity-check the parsed columns.
data.head()

## Features

In [None]:
# Every column except the last is an input feature; the last one is the target.
feature_list = data.columns.values[:-1]
label = list(data.columns[-1:])

print("Feature List:", feature_list)
print("Label:", label)

## Data statistics

In [None]:
# Summarise the columns (dtypes and non-null counts) of the dataset.
data.info()

In [None]:
# Descriptive statistics of the numeric attributes.
data.describe()

In [None]:
# Number of samples for each distinct quality score.
data['quality'].value_counts()

In [None]:
sns.set()

# Distribution of the quality label.
data.quality.hist()
plt.xlabel('Wine Quality')
plt.ylabel('Count')

# The column name contains spaces, so it cannot be reached through attribute
# access (data.total_sulfur_dioxide raises AttributeError); index by name.
# A fresh figure keeps this histogram off the axes labelled 'Wine Quality'.
plt.figure()
data['total sulfur dioxide'].hist()
plt.xlabel('Total Sulfur Dioxide')
plt.ylabel('Count')

## Create test set

In [None]:
def split_train_test(data, test_ratio, seed=42):
    """Randomly split ``data`` into a training set and a test set.

    Rows are shuffled with a private, seeded random generator, so the split
    is reproducible and the global NumPy random state is left untouched.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing
            (e.g. a pandas DataFrame).
        test_ratio: Fraction of rows to put in the test set, within [0, 1].
        seed: Seed for the shuffle. The default of 42 yields the same
            permutation as seeding the global generator with 42.

    Returns:
        A ``(train_set, test_set)`` tuple of row subsets of ``data``.

    Raises:
        ValueError: If ``test_ratio`` is not within [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be within [0, 1], got {test_ratio!r}")

    # A local RandomState avoids reseeding np.random for the rest of the session.
    rng = np.random.RandomState(seed)

    # Shuffle the row positions.
    shuffled_indices = rng.permutation(len(data))

    # Number of rows that go to the test set (truncated towards zero).
    test_set_size = int(len(data) * test_ratio)

    # The first chunk of the shuffled positions is the test set, the rest is training.
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Hold out 20% of the rows with the hand-written splitter.
# NOTE: train_set/test_set are reassigned by the train_test_split call below.
train_set, test_set = split_train_test(data, 0.2)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Purely random 80/20 split with scikit-learn; random_state fixes the shuffle
# so the split is reproducible.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

## Stratified sampling

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split that preserves the distribution of 'quality'.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields positional row indices, so the rows must be selected with
# .iloc; .loc would only be correct while the index is the default 0..n-1.
train_index, test_index = next(split.split(data, data["quality"]))
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

In [None]:
# Share of each quality score within the stratified test set.
strat_dist = strat_test_set["quality"].value_counts().div(len(strat_test_set))

In [None]:
# Share of each quality score within the full dataset.
overall_dist = data["quality"].value_counts().div(len(data))

In [None]:
# Put the overall and stratified label distributions next to each other,
# then add the absolute and the percentage deviation from the overall one.
dist_comparison = pd.DataFrame({'overall': overall_dist, 'stratified': strat_dist})
dist_comparison['diff(s-o)'] = dist_comparison['stratified'].sub(dist_comparison['overall'])
dist_comparison['diff(s-o)_pct'] = dist_comparison['diff(s-o)'].div(dist_comparison['overall']).mul(100)

In [None]:
# Display the comparison table.
dist_comparison

In [None]:
# For contrast: share of each quality score in the randomly sampled test set.
random_dist = test_set["quality"].value_counts().div(len(test_set))
random_dist

In [None]:
# Sampling bias comparison
# The random-sampling columns have to be added first; without them the
# 'diff(r-o)_pct' lookup below raises KeyError.
dist_comparison['random'] = random_dist
dist_comparison['diff(r-o)'] = dist_comparison['random'] - dist_comparison['overall']
dist_comparison['diff(r-o)_pct'] = 100*(dist_comparison['diff(r-o)']/dist_comparison['overall'])

# Percentage deviation from the overall distribution: stratified vs. random.
dist_comparison.loc[:, ['diff(s-o)_pct', 'diff(r-o)_pct']]

## Data Visualization

In [None]:
# Explore a copy so the stratified training set itself is never modified.
exploration_set = strat_train_set.copy()

### Scatter Visualization

In [None]:
# Fixed acidity against density, coloured by quality score.
sns.scatterplot(
    data=exploration_set,
    x='fixed acidity',
    y='density',
    hue='quality',
)

In [None]:
# The same scatter plot through the pandas/matplotlib plotting API.
exploration_set.plot(
    x='fixed acidity',
    y='density',
    kind='scatter',
    c="quality",
    cmap=plt.get_cmap("jet"),
    alpha=0.5,
)

In [None]:
# Pairwise correlation between all columns of the exploration set.
corr_matrix = exploration_set.corr()

In [None]:
# Correlation of every column with the quality label.
corr_matrix['quality']

In [None]:
# Draw the correlation matrix as a heatmap; annot=True writes the value
# into each cell. A wide figure keeps the annotations readable.
plt.figure(figsize=(14, 7))
sns.heatmap(corr_matrix, annot=True)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for a handful of attributes together with the label.
attribute_list = ['citric acid', 'pH', 'alcohol', 'sulphates', 'quality']
scatter_matrix(exploration_set.loc[:, attribute_list])

# Prepare data for ML algorithm

## Separate features and labels from the training set.

In [None]:
# Feature matrix: every column of the training set except the label.
wine_features = strat_train_set.drop(columns="quality")

# Target vector: an independent copy of the label column.
wine_labels = strat_train_set["quality"].copy()

## Data cleaning

In [None]:
# Count the missing (NaN) values in each column of wine_features.
wine_features.isna().sum()


In [None]:
from sklearn.impute import SimpleImputer
# Imputer configured to fill missing values with the column median.
imputer = SimpleImputer(strategy="median")

In [None]:
# Learn the fill values from the training features only.
imputer.fit(wine_features)

In [None]:
# Inspect the per-column statistics the imputer learnt from the training set.
imputer.statistics_

In [None]:
# Cross-check: column medians computed directly with pandas, for comparison
# with the imputer statistics.
wine_features.median()

In [None]:
# Apply the fitted imputer to the training features.
tr_features = imputer.transform(wine_features)

In [None]:
# Check the dimensions of the transformed feature array.
tr_features.shape

In [None]:
# Wrap the imputed array back into a DataFrame. The original row index is
# passed on so the rows stay aligned with wine_labels, which still carries
# the index of the stratified training set.
wine_features_tr = pd.DataFrame(tr_features, columns=wine_features.columns,
                                index=wine_features.index)

## Handling text and categorical attributes