In [1]:
import pandas as pd
import numpy as np
import time
from sklearn import datasets
from sklearn.datasets import fetch_openml
from sklearn.datasets import make_blobs
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Q1.) Dimensionality Reduction

In [2]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import KernelPCA

### Q2, Q3 & Q4.) Comparing the Dimenionality Reduction Techniques with Decision Tree by Analysing with the Metrics Scores

### Iris Dataset

In [3]:
iris = datasets.load_iris()
print('Iris keys:', list(iris.keys()))
print('Iris features:', iris.feature_names)
iris.data.shape
X = iris.data[:,[2,3]]
y = iris.target
print(np.unique(y))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

### PCA
begin_time = time.time()
pca = PCA(n_components = 2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
tree_model = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state = 1)
tree_model.fit(X_train_pca, y_train)
y_pred = tree_model.predict(X_test_pca)
pca.explained_variance_ratio_
end_time = time.time()
time_taken = end_time - begin_time
print("PCA Accuracy Analysis: \n")
print("PCA Accuracy =", accuracy_score(y_pred, y_test))
print("PCA F1 Score =", f1_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
print("PCA Precision Score =", precision_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
print("PCA Recall Score =", recall_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
print("Time Taken for PCA:", time_taken)
print("Variance Ratio =", pca.explained_variance_ratio_)
print(X_train_pca.shape)


### LDA
begin_time = time.time()
lda = LDA(n_components = 2)
X_train_lda = lda.fit_transform(X_train_std, y_train)
X_test_lda = lda.transform(X_test_std)
tree_model = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state = 1)
tree_model.fit(X_train_lda, y_train)
y_pred = tree_model.predict(X_test_lda)
end_time = time.time()
time_taken = end_time - begin_time
print("\n LDA Accuracy Analysis: \n")
print("LDA Accuracy =", accuracy_score(y_pred, y_test))
print("LDA F1 Score =", f1_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
print("LDA Precision Score =", precision_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
print("LDA Recall Score =", recall_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
print("Time Taken for LDA:", time_taken)
##X_test_lda = lda.transform(X_test_std)
print(X_train_lda.shape)

### KPCA
begin_time = time.time()
kpca = KernelPCA(n_components = 2, kernel = 'rbf', gamma = 15)
X_train_kpca = kpca.fit_transform(X_train)
X_test_kpca = kpca.transform(X_test_std)
tree_model1 = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state = 1)
tree_model1.fit(X_train, y_train)
y_pred1 = tree_model1.predict(X_test)
end_time = time.time()
time_taken = end_time - begin_time
print("\n")
print("Accuracy of DT (no KPCA) = ", sum(y_test==y_pred1)/y_test.shape[0])
print("F1 Score of DT (no KPCA) =", f1_score(y_test, y_pred1, pos_label = 'positive', average = 'micro'))
print("Precision Score of DT (no KPCA) =", precision_score(y_test, y_pred1, pos_label = 'positive', average = 'micro'))
print("Recall Score of DT (no KPCA) =", recall_score(y_test, y_pred1, pos_label = 'positive', average = 'micro'))
print("Time Taken for non-KPCA:", time_taken)
print("\n KPCA Accuracy Analysis: \n")
tree_model2 = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state = 1)
tree_model2.fit(X_train_kpca, y_train)
y_pred2 = tree_model2.predict(X_test_kpca)
end_time = time.time()
time_taken = end_time - begin_time
print("KPCA Accuracy =", sum(y_test==y_pred2)/y_test.shape[0])
print("KPCA F1 Score =", f1_score(y_test, y_pred2, pos_label = 'positive', average = 'micro'))
print("KPCA Precision Score =", precision_score(y_test, y_pred2, pos_label = 'positive', average = 'micro'))
print("KPCA Recall Score =", recall_score(y_test, y_pred2, pos_label = 'positive', average = 'micro'))
print("Time Taken for KPCA:", time_taken)
print("\n")
##X_test_kpca = kpca.transform(X_test)
print(X_train_kpca.shape)


Iris keys: ['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename']
Iris features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
[0 1 2]
PCA Accuracy Analysis: 

PCA Accuracy = 0.9777777777777777
PCA F1 Score = 0.9777777777777777
PCA Precision Score = 0.9777777777777777
PCA Recall Score = 0.9777777777777777
Time Taken for PCA: 0.0039823055267333984
Variance Ratio = [0.98174162 0.01825838]
(105, 2)

 LDA Accuracy Analysis: 

LDA Accuracy = 0.9777777777777777
LDA F1 Score = 0.9777777777777777
LDA Precision Score = 0.9777777777777777
LDA Recall Score = 0.9777777777777777
Time Taken for LDA: 0.002991199493408203
(105, 2)


Accuracy of DT (no KPCA) =  0.9777777777777777
F1 Score of DT (no KPCA) = 0.9777777777777777
Precision Score of DT (no KPCA) = 0.9777777777777777
Recall Score of DT (no KPCA) = 0.9777777777777777
Time Taken for non-KPCA: 0.018953800201416016

 KPCA Accuracy Analysis: 

KPCA Accuracy = 0.3333333333333333
KPCA F



### MNIST Dataset

In [4]:
mist = fetch_openml('mnist_784', version = 1)
print('MNIST keys:', list(mist.keys()))
print('MNIST features:', mist.feature_names)
mist.data.shape
X = mist.data
y = mist.target
X, y = make_blobs(n_samples = 2000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

### PCA
begin_time = time.time()
pca = PCA(n_components = 2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
y_pred = tree_model.predict(X_test_pca)
pca.explained_variance_ratio_
end_time = time.time()
time_taken = end_time - begin_time
print("PCA Accuracy Analysis: \n")
print("PCA Accuracy =", accuracy_score(y_pred, y_test))
print("PCA F1 Score =", f1_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
print("PCA Precision Score =", precision_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
print("PCA Recall Score =", recall_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
print("Time Taken for PCA:", time_taken)
print("Variance Ratio =", pca.explained_variance_ratio_)
print(X_train_pca.shape)

### LDA
begin_time = time.time()
lda = LDA(n_components = 2)
X_train_lda = lda.fit_transform(X_train_std, y_train)
X_test_lda = lda.transform(X_test_std)
tree_model = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state = 1)
tree_model.fit(X_train_lda, y_train)
y_pred = tree_model.predict(X_test_lda)
end_time = time.time()
time_taken = end_time - begin_time
print("\n LDA Accuracy Analysis: \n")
print("LDA Accuracy =", accuracy_score(y_pred, y_test))
print("LDA F1 Score =", f1_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
print("LDA Precision Score =", precision_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
print("LDA Recall Score =", recall_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
print("Time Taken for LDA:", time_taken)
##X_test_lda = lda.transform(X_test_std)
print(X_train_lda.shape)

### KPCA
begin_time = time.time()
kpca = KernelPCA(n_components = 2, kernel = 'rbf', gamma = 15)
X_train_kpca = kpca.fit_transform(X_train)
X_test_kpca = kpca.transform(X_test_std)
tree_model1 = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state = 1)
tree_model1.fit(X_train, y_train)
y_pred1 = tree_model1.predict(X_test)
end_time = time.time()
time_taken = end_time - begin_time
print("\n")
print("Accuracy of DT (no KPCA) = ", sum(y_test==y_pred1)/y_test.shape[0])
print("F1 Score of DT (no KPCA) =", f1_score(y_test, y_pred1, pos_label = 'positive', average = 'micro'))
print("Precision Score of DT (no KPCA) =", precision_score(y_test, y_pred1, pos_label = 'positive', average = 'micro'))
print("Recall Score of DT (no KPCA) =", recall_score(y_test, y_pred1, pos_label = 'positive', average = 'micro'))
print("Time Taken for non-KPCA:", time_taken)
print("\n KPCA Accuracy Analysis: \n")
tree_model2 = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state = 1)
tree_model2.fit(X_train_kpca, y_train)
y_pred2 = tree_model2.predict(X_test_kpca)
end_time = time.time()
time_taken = end_time - begin_time
print("KPCA Accuracy =", sum(y_test==y_pred2)/y_test.shape[0])
print("KPCA F1 Score =", f1_score(y_test, y_pred2, pos_label = 'positive', average = 'micro'))
print("KPCA Precision Score =", precision_score(y_test, y_pred2, pos_label = 'positive', average = 'micro'))
print("KPCA Recall Score =", recall_score(y_test, y_pred2, pos_label = 'positive', average = 'micro'))
print("Time Taken for KPCA:", time_taken)
print("\n")
##X_test_kpca = kpca.transform(X_test)
print(X_train_kpca.shape)

MNIST keys: ['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url']
MNIST features: ['pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6', 'pixel7', 'pixel8', 'pixel9', 'pixel10', 'pixel11', 'pixel12', 'pixel13', 'pixel14', 'pixel15', 'pixel16', 'pixel17', 'pixel18', 'pixel19', 'pixel20', 'pixel21', 'pixel22', 'pixel23', 'pixel24', 'pixel25', 'pixel26', 'pixel27', 'pixel28', 'pixel29', 'pixel30', 'pixel31', 'pixel32', 'pixel33', 'pixel34', 'pixel35', 'pixel36', 'pixel37', 'pixel38', 'pixel39', 'pixel40', 'pixel41', 'pixel42', 'pixel43', 'pixel44', 'pixel45', 'pixel46', 'pixel47', 'pixel48', 'pixel49', 'pixel50', 'pixel51', 'pixel52', 'pixel53', 'pixel54', 'pixel55', 'pixel56', 'pixel57', 'pixel58', 'pixel59', 'pixel60', 'pixel61', 'pixel62', 'pixel63', 'pixel64', 'pixel65', 'pixel66', 'pixel67', 'pixel68', 'pixel69', 'pixel70', 'pixel71', 'pixel72', 'pixel73', 'pixel74', 'pixel75', 'pixel76', 'pixel77', 'pixel78', 'pixel79', 'pixel80'





Accuracy of DT (no KPCA) =  1.0
F1 Score of DT (no KPCA) = 1.0
Precision Score of DT (no KPCA) = 1.0
Recall Score of DT (no KPCA) = 1.0
Time Taken for non-KPCA: 0.24434852600097656

 KPCA Accuracy Analysis: 

KPCA Accuracy = 0.3333333333333333
KPCA F1 Score = 0.3333333333333333
KPCA Precision Score = 0.3333333333333333
KPCA Recall Score = 0.3333333333333333
Time Taken for KPCA: 0.25531816482543945


(1400, 2)




### Q2 & Q4.) Comparing the Dimenionality Reduction Techniques with Decision Tree by Analysing with the Metrics Scores

In [5]:
# ### PCA

# X_test_pca = pca.transform(X_test_std)
# y_pred = tree_model.predict(X_test_pca)
# end_time = time.time()
# time_taken = end_time - begin_time
# print("PCA Accuracy Analysis: \n")
# print("PCA Accuracy =", accuracy_score(y_pred, y_test))
# print("PCA F1 Score =", f1_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
# print("PCA Precision Score =", precision_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
# print("PCA Recall Score =", recall_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
# print("Time Taken for PCA:", time_taken)

# ### LDA
# tree_model = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state = 1)
# tree_model.fit(X_train_lda, y_train)
# X_test_lda = lda.transform(X_test_std)
# y_pred = tree_model.predict(X_test_lda)
# end_time = time.time()
# time_taken = end_time - begin_time
# print("\n LDA Accuracy Analysis: \n")
# print("LDA Accuracy =", accuracy_score(y_pred, y_test))
# print("LDA F1 Score =", f1_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
# print("LDA Precision Score =", precision_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
# print("LDA Recall Score =", recall_score(y_test, y_pred, pos_label = 'positive', average = 'micro'))
# print("Time Taken for LDA:", time_taken)

# ### KPCA
# tree_model1 = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state = 1)
# tree_model1.fit(X_train, y_train)
# y_pred1 = tree_model1.predict(X_test)
# end_time = time.time()
# time_taken = end_time - begin_time
# print("\n")
# print("Accuracy of DT (no KPCA) = ", sum(y_test==y_pred1)/y_test.shape[0])
# print("F1 Score of DT (no KPCA) =", f1_score(y_test, y_pred1, pos_label = 'positive', average = 'micro'))
# print("Precision Score of DT (no KPCA) =", precision_score(y_test, y_pred1, pos_label = 'positive', average = 'micro'))
# print("Recall Score of DT (no KPCA) =", recall_score(y_test, y_pred1, pos_label = 'positive', average = 'micro'))
# print("Time Taken for non-KPCA:", time_taken)
# print("\n KPCA Accuracy Analysis: \n")
# tree_model2 = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, random_state = 1)
# tree_model2.fit(X_train_kpca, y_train)
# X_test_kpca = kpca.transform(X_test_std)
# y_pred2 = tree_model2.predict(X_test_kpca)
# end_time = time.time()
# time_taken = end_time - begin_time
# print("KPCA Accuracy =", sum(y_test==y_pred2)/y_test.shape[0])
# print("KPCA F1 Score =", f1_score(y_test, y_pred2, pos_label = 'positive', average = 'micro'))
# print("KPCA Precision Score =", precision_score(y_test, y_pred2, pos_label = 'positive', average = 'micro'))
# print("KPCA Recall Score =", recall_score(y_test, y_pred2, pos_label = 'positive', average = 'micro'))
# print("Time Taken for KPCA:", time_taken)
# print("\n")