Python databricks
=

In [0]:
%python
# ==========================================
# File: iris_pca_analysis.py
# Purpose: Perform PCA on the iris dataset
# Author: Rick Reijnders
# Summary: This script exports the iris dataset, performs PCA, saves a PCA plot, and extracts the most important features.
# Input Variables:
#   - iris: scikit-learn dataset with flower measurements
# Output Files:
#   - iris.csv: saved iris dataset
#   - iris_pca_plot.png: PCA plot image
#   - important_features: features ranked by absolute PC1 loading (printed to stdout, not written to a file)
# ==========================================

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the iris dataset: X holds the flower measurements (one column per
# feature name), y holds the class code of each sample.
iris_data = load_iris()
X = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)
y = pd.Series(iris_data.target, name='species')

# Export features and the species column side by side to CSV.
# The path is relative, so the file lands in the current working directory.
iris_df = pd.concat([X, y], axis=1)
iris_df.to_csv("iris.csv", index=False)

# Standardize the features before PCA so that no single measurement
# dominates the components purely because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project the standardized data onto its first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Save PCA plot
fig = plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PCA of Iris Dataset")
# target_names is a NumPy array. When handles= and labels= are both passed as
# keywords, matplotlib truth-tests `labels`, which raises "truth value of an
# array ... is ambiguous" for multi-element arrays on affected versions.
# A plain list is accepted everywhere.
plt.legend(
    handles=scatter.legend_elements()[0],
    labels=list(iris_data.target_names),
)
plt.grid(True)
plt.tight_layout()
plt.savefig("iris_pca_plot.png", dpi=300)
# Close this figure explicitly (by handle) so it is released even if another
# figure has become the "current" one in the meantime.
plt.close(fig)

# Rank features by the absolute value of their PC1 loading: the sign only
# encodes direction, the magnitude encodes how strongly a feature contributes.
loadings = pd.Series(abs(pca.components_[0]), index=iris_data.feature_names)
important_features = loadings.sort_values(ascending=False).index.tolist()

# Print results
print("Most important features (PC1):")
print(important_features)

# Improvements to Implement:
# - Add explained variance plot
# - Automate top-N feature selection
# - Enable CLI arguments for file paths
