In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

csv_path = Path(r"G:\Other computers\Dell G3\Universitat\2nd Semester\MLMM\New\feature_engineered_data.csv")

# Cumulative-variance level marked on the plot (90%).
target_variance = 0.90

# Load the semicolon-delimited feature table.
df_all = pd.read_csv(csv_path, sep=";")
print(df_all.shape)

# Use every column except the bucket column as PCA input.
# Prefer dropping it by name; otherwise fall back to dropping the last column.
# Track which column was actually excluded so the final report stays accurate.
if 'bucket' in df_all.columns:
    excluded_col = 'bucket'
    X_df = df_all.drop(columns=[excluded_col])
else:
    excluded_col = df_all.columns[-1]
    X_df = df_all.iloc[:, :-1]

# Convert to numpy
X = X_df.values

# Standardize so every feature contributes on the same scale.
scaler_all = StandardScaler()
X_scaled = scaler_all.fit_transform(X)

# Fit PCA without limiting the number of components (let PCA decide the max).
pca_full = PCA(n_components=None, random_state=42)
pca_full.fit(X_scaled)

explained = pca_full.explained_variance_ratio_
cum_explained = np.cumsum(explained)

# Plot cumulative explained variance with a reference line at the target level.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(range(1, len(cum_explained) + 1), cum_explained, marker='o')
ax.axhline(target_variance, linestyle='--')  # reference line at 90%
ax.set_xlabel("Number of components")
ax.set_ylabel("Cumulative explained variance")
ax.set_title("PCA: cumulative explained variance")
ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

# Print a quick table: the first 25 PCs plus the very last one.
print("PC  |  VarRatio  |  Cumulative")
for i, (vr, cv) in enumerate(zip(explained, cum_explained), start=1):
    if i <= 25 or i == len(explained):
        print(f"{i:>3} |  {vr:8.4f} |  {cv:9.4f}")
print(f"\nTotal components available: {len(explained)}")
print(f"Input features used: {X_df.shape[1]} (excluded column: {excluded_col!r})")


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

csv_path = Path(r"G:\Other computers\Dell G3\Universitat\2nd Semester\MLMM\New\feature_engineered_data.csv")
target_variance = 0.90

# Load the semicolon-delimited feature table.
df_all = pd.read_csv(csv_path, sep=";")

# Use every column except the bucket column as PCA input
# (dropped by name when present, otherwise the last column is dropped).
if 'bucket' in df_all.columns:
    feature_df = df_all.drop(columns=['bucket'])
else:
    feature_df = df_all.iloc[:, :-1]

X = feature_df.values

# Standardize so every feature contributes on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit full PCA to measure the variance curve.
pca_probe = PCA(n_components=None, random_state=42)
pca_probe.fit(X_scaled)

explained = pca_probe.explained_variance_ratio_
cum_explained = np.cumsum(explained)

# Pick the smallest k whose cumulative variance reaches the target.
# searchsorted (side='left') returns the first index with
# cum_explained[idx] >= target_variance, hence the +1 to turn it into a count.
k = int(np.searchsorted(cum_explained, target_variance) + 1)
# Safety clamp: PCA yields at most min(n_samples, n_features) components, and
# rounding can leave the last cumulative value just below the target, so bound
# k by the actual length of the variance curve rather than the feature count.
k = min(k, len(cum_explained))

print(f"Target variance = {target_variance:.2%}")
print(f"Optimal k = {k} components (cumulative variance = {cum_explained[k-1]:.4f})")

# Fit PCA with the chosen k and project the standardized data.
pca_opt = PCA(n_components=k, random_state=42)
pcs = pca_opt.fit_transform(X_scaled)

# Assemble output: original columns followed by PC1..PCk, aligned by row position.
pc_cols = [f"PC{i}" for i in range(1, k + 1)]
out_df = pd.concat(
    [df_all.reset_index(drop=True), pd.DataFrame(pcs, columns=pc_cols)],
    axis=1,
)

# Save next to the input file.
# NOTE(review): the input is read with sep=";" but this writes the default
# comma-separated format -- confirm downstream readers expect that.
pca_out_path = csv_path.with_name("feature_engineered_with_pca_optimal.csv")
out_df.to_csv(pca_out_path, index=False)

# Report
print("Wrote:", pca_out_path)
print("Rows:", len(out_df),
      "Original features (excluding 'bucket'):", X.shape[1],
      "PCs:", k)
print("Explained variance by PC (first 15):", [round(v, 4) for v in explained[:15]])
print("Cumulative explained variance at k:", round(cum_explained[k-1], 4))
