In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine

In [2]:
# Loading the dataset
wine = load_wine()
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
target = wine.target
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [3]:
# Getting Information about the inbuilt sklearn's dataset 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
dtypes: fl

In [4]:
# Finding missing vaules throughout the dataset
missing_values = df.isnull()
for columns in df:
    print(columns)
    print(missing_values[columns].value_counts)
    print('')

alcohol
<bound method IndexOpsMixin.value_counts of 0      False
1      False
2      False
3      False
4      False
       ...  
173    False
174    False
175    False
176    False
177    False
Name: alcohol, Length: 178, dtype: bool>

malic_acid
<bound method IndexOpsMixin.value_counts of 0      False
1      False
2      False
3      False
4      False
       ...  
173    False
174    False
175    False
176    False
177    False
Name: malic_acid, Length: 178, dtype: bool>

ash
<bound method IndexOpsMixin.value_counts of 0      False
1      False
2      False
3      False
4      False
       ...  
173    False
174    False
175    False
176    False
177    False
Name: ash, Length: 178, dtype: bool>

alcalinity_of_ash
<bound method IndexOpsMixin.value_counts of 0      False
1      False
2      False
3      False
4      False
       ...  
173    False
174    False
175    False
176    False
177    False
Name: alcalinity_of_ash, Length: 178, dtype: bool>

magnesium
<bound method IndexOpsMi

In [5]:
# Categorization for Alcohol
df['Alcohol_Category'] = pd.cut(df['alcohol'], bins=3, labels=['High', 'Medium', 'Low'])
df['Alcohol_Category'].head()

0       Low
1    Medium
2    Medium
3       Low
4    Medium
Name: Alcohol_Category, dtype: category
Categories (3, object): ['High' < 'Medium' < 'Low']

In [6]:
df['Malic_Acid_Category'] = pd.cut(df['malic_acid'], bins=3, labels=['High', 'Medium', 'Low'])
df['Malic_Acid_Category'].head()

0      High
1      High
2      High
3      High
4    Medium
Name: Malic_Acid_Category, dtype: category
Categories (3, object): ['High' < 'Medium' < 'Low']

In [7]:
df['Color_Intensity_Category'] = pd.cut(df['color_intensity'], bins=3, labels=['High', 'Medium', 'Low'])
df['Color_Intensity_Category'].head()

0    Medium
1      High
2    Medium
3    Medium
4      High
Name: Color_Intensity_Category, dtype: category
Categories (3, object): ['High' < 'Medium' < 'Low']

In [8]:
# Importing Libraries for training and testing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [9]:
ingredient = ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 
               'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'od280/od315_of_diluted_wines', 'proline']
scaler = StandardScaler()
df[ingredient] = scaler.fit_transform(df[ingredient])
print(df.head())

    alcohol  malic_acid       ash  alcalinity_of_ash  magnesium  \
0  1.518613   -0.562250  0.232053          -1.169593   1.913905   
1  0.246290   -0.499413 -0.827996          -2.490847   0.018145   
2  0.196879    0.021231  1.109334          -0.268738   0.088358   
3  1.691550   -0.346811  0.487926          -0.809251   0.930918   
4  0.295700    0.227694  1.840403           0.451946   1.281985   

   total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
0       0.808997    1.034819             -0.659563         1.224884   
1       0.568648    0.733629             -0.820719        -0.544721   
2       0.808997    1.215533             -0.498407         2.135968   
3       2.491446    1.466525             -0.981875         1.032155   
4       0.808997    0.663351              0.226796         0.401404   

   color_intensity   hue  od280/od315_of_diluted_wines   proline  \
0         0.251717  1.04                      1.847920  1.013009   
1        -0.293321  1.05          

In [10]:
# Normalization for ingredient
scaler = MinMaxScaler()
df[ingredient] = scaler.fit_transform(df[ingredient])
print(df.head())

    alcohol  malic_acid       ash  alcalinity_of_ash  magnesium  \
0  0.842105    0.191700  0.572193           0.257732   0.619565   
1  0.571053    0.205534  0.417112           0.030928   0.326087   
2  0.560526    0.320158  0.700535           0.412371   0.336957   
3  0.878947    0.239130  0.609626           0.319588   0.467391   
4  0.581579    0.365613  0.807487           0.536082   0.521739   

   total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
0       0.627586    0.573840              0.283019         0.593060   
1       0.575862    0.510549              0.245283         0.274448   
2       0.627586    0.611814              0.320755         0.757098   
3       0.989655    0.664557              0.207547         0.558360   
4       0.627586    0.495781              0.490566         0.444795   

   color_intensity   hue  od280/od315_of_diluted_wines   proline  \
0         0.372014  1.04                      0.970696  0.561341   
1         0.264505  1.05          

In [11]:
# Importing Labelling
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [12]:
# Adding data labeling
label_encoder = LabelEncoder()
category_column = ['Alcohol_Category', 'Malic_Acid_Category', 'Color_Intensity_Category']
for columns in category_column:
    df[columns + '_Encoded'] = label_encoder.fit_transform(df[columns])
print(df.head())

    alcohol  malic_acid       ash  alcalinity_of_ash  magnesium  \
0  0.842105    0.191700  0.572193           0.257732   0.619565   
1  0.571053    0.205534  0.417112           0.030928   0.326087   
2  0.560526    0.320158  0.700535           0.412371   0.336957   
3  0.878947    0.239130  0.609626           0.319588   0.467391   
4  0.581579    0.365613  0.807487           0.536082   0.521739   

   total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
0       0.627586    0.573840              0.283019         0.593060   
1       0.575862    0.510549              0.245283         0.274448   
2       0.627586    0.611814              0.320755         0.757098   
3       0.989655    0.664557              0.207547         0.558360   
4       0.627586    0.495781              0.490566         0.444795   

   color_intensity   hue  od280/od315_of_diluted_wines   proline  \
0         0.372014  1.04                      0.970696  0.561341   
1         0.264505  1.05          

In [13]:
# Importing PCA
from sklearn.decomposition import PCA

In [14]:
# Applying PCA
pca = PCA(n_components=2)
x_pca = pca.fit_transform(df[ingredient])
df_pca = pd.DataFrame(data=x_pca,columns=['Principal Component 1', 'Principal Component 2'])
df_pca.target = pca
df_pca.head()

Unnamed: 0,Principal Component 1,Principal Component 2
0,0.721622,-0.20113
1,0.479469,0.020363
2,0.531448,-0.149592
3,0.892461,-0.484331
4,0.194431,-0.056663
