<a href="https://colab.research.google.com/github/WilliamShengYangHuang/AALU_Workshop_3/blob/main/Simple_PCA_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Please run all cells (Runtime → Run all, or Ctrl + F9).
-------------------------------------------

---



## Import Necessary Libraries

In [38]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from google.colab import files, drive
from ipywidgets import interact, IntSlider
import os

# Mount Google Drive so that results can be written under /content/drive
drive.mount('/content/drive')

# Set the output directory in Google Drive (created if it does not exist yet)
output_dir = "/content/drive/My Drive/PCA_results/"  #@param{type:'string'}
os.makedirs(output_dir, exist_ok=True)

# Upload the CSV file; the returned mapping is keyed by uploaded file name
uploaded = files.upload()
if not uploaded:
    # Nothing came back (e.g. the upload was cancelled). Fail with an
    # actionable message instead of an IndexError from indexing an empty list.
    raise ValueError("No file was uploaded. Please re-run this cell and choose a CSV file.")

# Only one file is analysed: the first key of the upload mapping
file_name = next(iter(uploaded))
if len(uploaded) > 1:
    print(f"Multiple files uploaded; using the first one: {file_name}")

# Load the dataset
data = pd.read_csv(file_name)

# Display the raw data for inspection
print("Dataset loaded successfully!")
print("Shape of the dataset:", data.shape)
print("Preview of the dataset:")
print(data.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


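## Upload the Source CSV File

The setup cell above prompts you to upload a CSV file; the first uploaded file is read into the `data` DataFrame that the PCA cell below uses.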

## Principal Component Analysis (PCA)

The `n_components` parameter of `PCA` specifies how many principal components to keep in the output, which sets the dimensionality of the transformed data. It cannot exceed the smaller of the number of rows and the number of numeric columns in the dataset.

In [42]:
# Drop non-numeric columns (e.g., 'Country' if present)
non_numeric_columns = data.select_dtypes(exclude=np.number).columns
data_numeric = data.drop(columns=non_numeric_columns, errors='ignore')

# Convert all columns to numeric and handle errors
# (defensive: values that cannot be parsed become NaN)
data_numeric = data_numeric.apply(pd.to_numeric, errors='coerce')

# A column with no valid values has a NaN mean, so the mean-imputation below
# could not fill it and the NaNs would reach PCA. Remove such columns first.
all_missing_columns = data_numeric.columns[data_numeric.isnull().all()]
if len(all_missing_columns) > 0:
    print(f"Dropping columns with no numeric values: {list(all_missing_columns)}")
    data_numeric = data_numeric.drop(columns=all_missing_columns)

# Handle missing values
if data_numeric.isnull().any().any():
    print("Handling missing values by replacing them with column mean.")
    data_numeric = data_numeric.fillna(data_numeric.mean())

# Upper limit offered by the slider; the plot only ever shows the first two components
MAX_SLIDER_COMPONENTS = 3

# Ensure data is not empty
if data_numeric.empty:
    print("The dataset is empty after preprocessing. Please check your input file.")
else:
    # Standardise the numeric data
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data_numeric)

    # PCA accepts at most min(n_samples, n_features) components, so clamp the
    # slider range to the data instead of always offering 3.
    max_components = min(MAX_SLIDER_COMPONENTS, *scaled_data.shape)

    # Define a function to perform PCA dynamically
    def perform_pca(n_components):
        """Fit PCA on the standardised data, report, plot and save the result.

        Parameters
        ----------
        n_components : int
            Number of principal components to keep.

        Side effects
        ------------
        Prints the explained-variance ratios, shows a scatter plot (component 1
        against zero when only one component is kept, otherwise component 1
        against component 2), and writes the transformed data to
        ``PCA_result_<n_components>_components.csv`` inside ``output_dir``.
        The file is rewritten on every call, i.e. on every slider change.
        """
        pca = PCA(n_components=n_components)
        pca_result = pca.fit_transform(scaled_data)

        # Explained variance
        explained_variance = pca.explained_variance_ratio_
        print(f"Explained variance by each component: {explained_variance}")
        print(f"Total explained variance: {np.sum(explained_variance):.2f}")

        # Visualise the PCA results
        fig, ax = plt.subplots(figsize=(8, 6))
        if n_components == 1:
            # Only one axis of information: draw the points along a flat line
            ax.scatter(pca_result[:, 0], np.zeros_like(pca_result[:, 0]), alpha=0.7)
            ax.set_title("PCA Visualization (Component 1)")
        else:
            # Only the first two components are drawn, however many are kept
            ax.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.7)
            ax.set_title(f"PCA Visualisation (First {n_components} Components)")
            ax.set_ylabel("Principal Component 2")
        ax.set_xlabel("Principal Component 1")
        ax.grid(True)
        plt.show()

        # Save PCA result to a CSV file in the output directory
        output_df = pd.DataFrame(pca_result, columns=[f"PC{i+1}" for i in range(n_components)])
        output_file = os.path.join(output_dir, f"PCA_result_{n_components}_components.csv")
        output_df.to_csv(output_file, index=False)
        print(f"PCA results saved to {output_file}")

    # Create an interactive slider for selecting n_components
    interact(
        perform_pca,
        n_components=IntSlider(min=1, max=max_components, step=1, value=min(2, max_components)),
    )


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Saving Country-Pavilions_Text_Matrix2 (2).csv to Country-Pavilions_Text_Matrix2 (2) (3).csv
Dataset loaded successfully!
Shape of the dataset: (63, 11)
Preview of the dataset:
                Country  History  Sustainability  Urban Transformation  \
0               Albania        6               4                     7   
1             Argentina        7              10                     6   
2             Australia        9               7                     9   
3               Austria        7               6                     8   
4  Bahrain (Kingdom of)        6               9                     7   

   Inclusivity  Collaboration  Ethics  Pluralism  Material Innovation  \
0            6              8       7          7                    9   
1            8              7       8          6                    6   
2            8              7      10          8                    5   
3            9              9       8          9                    5   
4            5

interactive(children=(IntSlider(value=2, description='n_components', max=3, min=1), Output()), _dom_classes=('…