In [76]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

## with PCA

In [77]:
m = [[1, 7], [8, 3]]
e_values, e_vectors = np.linalg.eig(m)
print(f'e_values = {e_values},\ne_vectors = {e_vectors}')

e_values = [-5.54983444  9.54983444],
e_vectors = [[-0.73019609 -0.63349151]
 [ 0.68323764 -0.77374964]]


In [78]:
df = pd.read_csv('../../../Datasets/ML/winequality-white.csv', sep = ';')

In [79]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [80]:
X = df.drop('quality', axis = 1)
y = df['quality']

In [81]:
sc = StandardScaler()
X_scaled = sc.fit_transform(X)

In [82]:
pca = PCA(n_components = 4)
pca_X = pca.fit_transform(X_scaled)

In [83]:
X_train, X_test, y_train, y_test = train_test_split(pca_X, y, train_size = 0.8, random_state = 42)

In [84]:
tree = DecisionTreeClassifier(criterion = 'gini',
                              max_depth = 2,
                              min_samples_leaf = 3,
                              max_features = 'sqrt')

tree.fit(X_train, y_train)
pca_acc = accuracy_score(y_test, tree.predict(X_test))


## without PCA

In [85]:
df = pd.read_csv('../../../Datasets/ML/winequality-white.csv', sep = ';')

In [86]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [87]:
X = df.drop('quality', axis = 1)
y = df['quality']

In [88]:
sc = StandardScaler()
X_scaled = sc.fit_transform(X)

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 42)

In [90]:
tree = DecisionTreeClassifier(criterion = 'gini',
                              max_depth = 2,
                              min_samples_leaf = 3,
                              max_features = 'sqrt')

tree.fit(X_train, y_train)
non_pca_acc = accuracy_score(y_test, tree.predict(X_test))

## final comparison

In [91]:
print(f'accuracy without PCA: {non_pca_acc * 100 :.2f}%\naccuracy with PCA: {pca_acc * 100:.2f}%')

accuracy without PCA: 44.80%
accuracy with PCA: 45.82%


In [92]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# 1. Load a sample dataset (e.g., Iris dataset)
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

# 2. Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. Apply PCA (reduce to 2 principal components for visualization)
pca = PCA(n_components=2)
pca_components = pca.fit_transform(X_scaled)

# 4. Analyze explained variance
explained_variance = pca.explained_variance_ratio_
print(f'PCA Explained Variance (PC1 & PC2): {explained_variance}')
print(f'Total Explained Variance: {sum(explained_variance):.2f}%')

X_train, X_test, y_train, y_test = train_test_split(pca_components, y, train_size = 0.8, random_state = 42)

tree = DecisionTreeClassifier(criterion = 'gini',
                              max_depth = 2,
                              min_samples_leaf = 3,
                              max_features = 'sqrt')

tree.fit(X_train, y_train)
non_pca_acc = accuracy_score(y_test, tree.predict(X_test))

print(f'accuracy without PCA: {non_pca_acc * 100 :.2f}%\naccuracy with PCA: {pca_acc * 100:.2f}%')
# 5. Visualize the results
# plt.figure(figsize=(8, 6))
# plt.scatter(pca_components[:, 0], pca_components[:, 1], c=y, cmap='viridis')
# plt.title('PCA - Scatter Plot of First Two Components')
# plt.xlabel(f'PC1 ({explained_variance[0]*100:.2f}% variance)')
# plt.ylabel(f'PC2 ({explained_variance[1]*100:.2f}% variance)')
# plt.colorbar(label='Target Class')
# plt.grid(True)
# plt.show()


PCA Explained Variance (PC1 & PC2): [0.72962445 0.22850762]
Total Explained Variance: 0.96%
accuracy without PCA: 46.67%
accuracy with PCA: 45.82%
