<a href="https://colab.research.google.com/github/OldmanHades/Analyzing-the-Wine-Data-Sets-using-Linear-Discriminant-Analysis/blob/main/WineAnaylsis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analyzing the Wine Data Sets using Linear Discriminant Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.datasets import load_wine
 
wine = load_wine()
df = pd.DataFrame(wine.data, columns=wine.feature_names)
df['target'] = wine.target
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [2]:
X = wine.data.astype('float32')
y = wine.target
n_samples, n_features = X.shape
classes = np.unique(y)
n_classes = len(classes)
n_components = 2
max_components = min(n_classes-1,n_features)
print("Number of classes: {}".format(n_classes))
print("Number of features: {}".format(n_features))
if n_components > max_components:
   raise ValueError("the number of components cannot be larger than min(n_features,n_classes-1)")


Number of classes: 3
Number of features: 13


In [3]:
mean = np.mean(X,axis=0)
Sw = np.zeros((n_features,n_features))
Sb = np.zeros((n_features,n_features))
for c in classes:
   Xc = X[y==c]
   class_means = np.mean(Xc,axis=0)
   #within-class variance
   Sw += (Xc-class_means).T.dot(Xc-class_means)
   mean_diff = (class_means-mean).reshape(n_features,1)
   #between-class variance
   Sb += n_classes * (mean_diff).dot(mean_diff.T)

In [4]:
A = np.linalg.inv(Sw).dot(Sb)
eigen_values, eigen_vectors = np.linalg.eig(A) 
eigen_vectors = eigen_vectors.T

In [10]:
sorted_idxs = np.argsort(abs(eigen_values))[::-1] 
eigen_values,eigen_vectors = eigen_values[sorted_idxs],eigen_vectors[sorted_idxs]
linear_discriminants = eigen_vectors[0:n_components]

In [8]:
explained_variance_ratio = np.sort(eigen_values / np.sum(eigen_values))[::-1][:max_components]
print(explained_variance_ratio)

[0.72817751 0.27182251]


# **Visualize the wine dataset in a two-dimensional space**

In [11]:
X_lda = np.dot(X,linear_discriminants.T)
X_lda_df = pd.DataFrame({'LDA_1':X_lda[:,0],'LDA_2':X_lda[:,1]})
X_lda_df['target'] = y
X_lda_df['target'] = X_lda_df['target'].apply(lambda y: str(y))

fig = px.scatter(X_lda_df, x='LDA_1', y='LDA_2', color=X_lda_df.target,labels={'0': 'LDA 1', '1': 'LDA 2'})
fig.show()
