# Linear Discriminant Analysis (LDA)

---

Name: Shantanu Shaji

PRN: 24070126165

AIML - C1

---

### Theory

#### LDA contains two criteria:

- Maximize the distance between means of classes.
- Minimize the variation within each class.

![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png)

#### Steps for LDA:

- Compute Mean Vectors for each Class.

- Compute Covariance for each Class.

- Compute Within Class Scatter Matrix (S<sub>W</sub>)

- Compute Between Class Scatter Matrix (S<sub>B</sub>)

- Find the EigenValues and Vectors

- Compute Dot Product

Transform d Dimensions of dataset into (Class - 1) Dimensions

### Implementation

##### Setup

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn import metrics

rState = 1

In [2]:
taxis_df = sns.load_dataset('taxis')
taxis_df.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,Manhattan
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,Manhattan
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Alphabet City,West Village,Manhattan,Manhattan
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,Manhattan
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Midtown East,Yorkville West,Manhattan,Manhattan


In [9]:
taxis_df.describe()
taxis_df.columns

Index(['pickup', 'dropoff', 'passengers', 'distance', 'fare', 'tip', 'tolls',
       'total', 'color', 'payment', 'pickup_zone', 'dropoff_zone',
       'pickup_borough', 'dropoff_borough'],
      dtype='object')

##### Preprocessing

In [4]:
df = taxis_df.dropna()

In [None]:
enco = LabelEncoder() #encoding output column
df.loc[:,'payment'] = enco.fit_transform(df.loc[:,'payment']) # Card or Cash

0       1
1       0
2       1
3       1
4       1
       ..
6428    1
6429    1
6430    0
6431    1
6432    1
Name: payment, Length: 6341, dtype: int64

In [None]:
X = df[['distance', 'fare', 'pickup_borough', 'dropoff_borough']]
X = pd.get_dummies(X, drop_first=True) #Encoding Categorical Columns if any.

y = df['payment']

In [20]:
#Splitting
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(X, y, random_state=rState, test_size=0.2)

#Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

##### Training the model

In [22]:
rfc = RandomForestClassifier(random_state=rState)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)
print("Accuracy on Raw Input: ", metrics.accuracy_score(y_test, y_pred))

Accuracy on Raw Input:  0.6579984239558707


#### LDA Through Library

In [50]:
def lda_lib(n):
    lda_transformer = LinearDiscriminantAnalysis(n_components = n)
    X_train_LDA = lda_transformer.fit_transform(X_train, y_train)
    X_test_LDA = lda_transformer.transform(X_test)

    rfc.fit(X_train_LDA, y_train)
    y_pred_LDA = rfc.predict(X_test_LDA)
    print(f"Accuracy on LDA {n} Component(s) Input: ", metrics.accuracy_score(y_test, y_pred_LDA))

max_components = min(X_train.shape[1], len(set(y_train)) - 1)

for i in range(1, max_components + 1):
    lda_lib(i)

Accuracy on LDA 1 Component(s) Input:  0.6367218282111899


#### LDA Through Manual Implementation

In [57]:
# Work with Numpy arrays, not Pandas DataFrames
X_np = X.values
y_np = y.values

In [61]:
# Finding Classes
classes = np.unique(y_np)
classes

array([0, 1, 2])

In [62]:
overall_mean = np.mean(X_np, axis=0)

class_means = {}
for c in classes:
    class_means[c] = np.mean(X_np[y_np == c], axis=0)

class_means

IndexError: boolean index did not match indexed array along axis 1; size of axis is 13 but size of corresponding boolean axis is 1

### On Wine Dataset

#### Setup

In [44]:
from sklearn.datasets import load_wine
wine = load_wine()

X = pd.DataFrame(wine.data)
y = pd.DataFrame(wine.target)

wine.feature_names
display(X.head() , y.value_counts())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


0
1    71
0    59
2    48
Name: count, dtype: int64

In [46]:
display(X.isnull().sum(), y.isnull().sum(), X.describe())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
dtype: int64

0    0
dtype: int64

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [None]:
#Splitting
from sklearn.model_selection import train_test_split
X_train_uns, X_test_uns, y_train_uns, y_test_uns = train_test_split(X, y)

#Scaled
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler
