# Instructor Do: Speeding up ML algorithms with PCA

In [7]:
import pandas as pd
from path import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas



In [8]:
# Load the preprocessed iris dataset.
# A relative path keeps the notebook runnable on any machine instead of only
# on the original author's Windows profile.
# NOTE(review): assumes the notebook is run from the folder that contains the
# `Data/` directory — TODO confirm against the repository layout.
file_path = Path("Data/new_iris_data.csv")

df_iris = pd.read_csv(file_path)

# Preview the first rows to confirm the file loaded as expected
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## Using PCA

In [9]:
# Standardize the features with StandardScaler before applying PCA
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(df_iris)

# Preview the first ten scaled rows
print(iris_scaled[:10])

[[-0.90068117  1.03205722 -1.3412724  -1.31297673]
 [-1.14301691 -0.1249576  -1.3412724  -1.31297673]
 [-1.38535265  0.33784833 -1.39813811 -1.31297673]
 [-1.50652052  0.10644536 -1.2844067  -1.31297673]
 [-1.02184904  1.26346019 -1.3412724  -1.31297673]
 [-0.53717756  1.95766909 -1.17067529 -1.05003079]
 [-1.50652052  0.80065426 -1.3412724  -1.18150376]
 [-1.02184904  0.80065426 -1.2844067  -1.31297673]
 [-1.74885626 -0.35636057 -1.3412724  -1.31297673]
 [-1.14301691  0.10644536 -1.2844067  -1.4444497 ]]


In [13]:
# Reduce the scaled iris data from 4 dimensions to 2 with PCA.
# Build the PCA model that keeps two principal components
pca = PCA(n_components=2)

# Fit the model on the scaled data and project it onto the two components
iris_pca = pca.fit_transform(iris_scaled)

# Preview the first ten projected rows
iris_pca[:10]

array([[-2.26454173,  0.5057039 ],
       [-2.0864255 , -0.65540473],
       [-2.36795045, -0.31847731],
       [-2.30419716, -0.57536771],
       [-2.38877749,  0.6747674 ],
       [-2.07053681,  1.51854856],
       [-2.44571134,  0.07456268],
       [-2.23384186,  0.24761393],
       [-2.34195768, -1.09514636],
       [-2.18867576, -0.44862905]])

In [14]:
# Wrap the principal components in a DataFrame with labelled columns
pca_columns = ["PCA1", "PCA2"]
df_iris_pca = pd.DataFrame(data=iris_pca, columns=pca_columns)

df_iris_pca.head()

Unnamed: 0,PCA1,PCA2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [15]:
# Fetch the explained variance ratio of the fitted PCA model: one value per
# principal component, giving the fraction of the data's total variance that
# the component captures.
pca.explained_variance_ratio_

array([0.72770452, 0.23030523])

## Running KMeans with PCA Data

In [17]:
# Find the best value for k with the elbow method.
# Cluster on the principal-component columns only: df_iris_pca gains a "class"
# column later in the notebook, and selecting the features explicitly keeps
# this cell's result the same when it is re-run after that column exists.
pca_features = df_iris_pca[["PCA1", "PCA2"]]

inertia = []

k = list(range(1, 11))

# Fit one KMeans model per candidate k and record its inertia
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pca_features)
    inertia.append(km.inertia_)

# Assemble the elbow-curve data: one row per candidate k
elbow_data = {"k": k, "inertia": inertia}

df_elbow = pd.DataFrame(elbow_data)

# The plot is the cell's last expression so that it is rendered as output
df_elbow.hvplot.line(x="k",
                     y="inertia",
                     xticks=k,
                     title="Elbow Curve")

  f"KMeans is known to have a memory leak on Windows "


In [18]:
# Predicting clusters with k=3

# Select the feature columns explicitly. This cell adds a "class" column to
# df_iris_pca below, so fitting on the whole frame would feed the previous
# labels back into the model whenever the cell is run a second time.
pca_features = df_iris_pca[["PCA1", "PCA2"]]

# Initialize the K-Means model (fixed random_state for repeatable results)
model = KMeans(n_clusters=3, random_state=0)

# Fit the model on the two principal components
model.fit(pca_features)

# Predict the cluster of every observation
pred = model.predict(pca_features)

# Add the predicted cluster labels as the "class" column
df_iris_pca["class"] = pred

df_iris_pca.head(10)


Unnamed: 0,PCA1,PCA2,class
0,-2.264542,0.505704,0
1,-2.086426,-0.655405,0
2,-2.36795,-0.318477,0
3,-2.304197,-0.575368,0
4,-2.388777,0.674767,0
5,-2.070537,1.518549,0
6,-2.445711,0.074563,0
7,-2.233842,0.247614,0
8,-2.341958,-1.095146,0
9,-2.188676,-0.448629,0


In [19]:
# Scatter the two principal components, grouped by predicted cluster
df_iris_pca.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="class",
    hover_cols=["class"],
)