In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import os

In [None]:
# Path of the merged feature table consumed by this notebook.
INPUT_FILE = './data/yelp_reviews_merged_features.csv'

try:
    df = pd.read_csv(INPUT_FILE)
except FileNotFoundError as err:
    # Raise instead of calling exit(): a raised exception stops "Run All" with a
    # normal traceback and leaves the session usable, whereas exit() ends the
    # session and does not report a failure to the caller.
    raise FileNotFoundError(
        f"ERROR: File not found. Make sure '{INPUT_FILE}' exists and was generated by the visualization file."
    ) from err

print(f"Dataset loaded with {len(df)} rows and {len(df.columns)} columns.")

In [None]:
# Names of the non-feature columns: label, row identifier and raw review text.
TARGET_COL = 'legit'
ID_COL = 'id_review'
TEXT_COL = 'review'

# Label series pulled from the loaded frame.
y = df[TARGET_COL]

# These columns are excluded from the PCA features and copied to the output as-is.
cols_to_keep = [ID_COL, TEXT_COL, TARGET_COL]

In [None]:
# Keep every integer or float column that is not an id/text/target column.
# Dtype predicates are used instead of comparing against the literal names
# 'int64'/'float64', so other numeric widths (int32, float32, unsigned ints,
# nullable Int64/Float64) are not silently dropped. Boolean, datetime and
# timedelta columns are still excluded, as before.
feature_cols = [
    col
    for col, dtype in df.dtypes.items()
    if col not in cols_to_keep
    and (
        pd.api.types.is_integer_dtype(dtype)
        or pd.api.types.is_float_dtype(dtype)
    )
]

# Feature matrix handed to imputation, scaling and PCA.
X = df[feature_cols]

print(f"\nNumeric features selected for PCA: {X.shape[1]} columns.")
print(feature_cols)

In [None]:
# Feature columns containing at least one missing value.
nan_cols_impute = X.columns[X.isna().any(axis=0)].tolist()

if not nan_cols_impute:
    print("\nNo NaN values found, proceeding with scaling.")
else:
    print(f"\nPerforming median imputation for columns: {nan_cols_impute}")

    # Per-column medians of the affected columns; fillna matches them by column name.
    medians = X[nan_cols_impute].median()
    X = X.fillna(medians)

    # A column with no observed values has a NaN median and therefore stays NaN.
    still_missing = X.isna().to_numpy().any()
    if still_missing:
        print("WARNING: There are still NaN values after imputation. Please check the data.")
    else:
        print("Imputation completed successfully. No remaining NaN values.")

In [None]:
# Learn per-feature mean and standard deviation, then standardize the same matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print("\nFeatures standardized (mean ~0, standard deviation ~1).")

In [None]:
# Fit PCA with every component retained so the explained-variance curve is
# complete. Passing a variance fraction as n_components would truncate the fit
# at that fraction, leaving nothing to analyze beyond the threshold.
# Only the fitted attributes are needed here, so use fit() rather than
# fit_transform() and avoid computing a projection that is thrown away.
pca_analysis = PCA(n_components=None)
pca_analysis.fit(X_scaled)

# Total number of components in the full decomposition.
print(pca_analysis.n_components_)

# Cumulative explained variance: entry i is the share of variance captured by
# the first i + 1 components.
explained_variance_ratio_cumsum = np.cumsum(pca_analysis.explained_variance_ratio_)

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Plot against 1..k so the x axis really is the number of components; the
# default 0-based index would place every point one component too far left.
component_counts = np.arange(1, len(explained_variance_ratio_cumsum) + 1)
ax.plot(component_counts, explained_variance_ratio_cumsum, marker='o', linestyle='--')
ax.set_title('Cumulative Explained Variance per Principal Component')
ax.set_xlabel('Number of Principal Components (k)')
ax.set_ylabel('Cumulative Explained Variance')
ax.grid(True)

# Choose a target (e.g. 95% of total variance)
target_variance = 0.95
ax.axhline(
    y=target_variance,
    color='r',
    linestyle='-',
    label=f'{int(target_variance*100)}% Target Variance'
)
ax.legend()
plt.show()

In [None]:
# Find the number of components needed to reach the target variance.
# np.argmax returns 0 (it does not raise) when no entry is True, so the
# "target never reached" case must be tested explicitly; otherwise a single
# component would be selected silently.
reached_target = explained_variance_ratio_cumsum >= target_variance
if reached_target.any():
    # Index of the first True entry, converted to a 1-based component count.
    n_components_optimal = int(np.argmax(reached_target)) + 1
else:
    # Target not reached: use all components. PCA can produce at most
    # min(n_samples, n_features) of them.
    n_components_optimal = min(X_scaled.shape)

print(f"\nTarget: {int(target_variance*100)}% of explained variance.")
print(f"{n_components_optimal} principal components (PCs) are required to reach this target.")

In [None]:
# Fit the final PCA with the chosen component count and project the scaled features.
pca_final = PCA(n_components=n_components_optimal)
X_pca = pca_final.fit_transform(X_scaled)

# Wrap the projection in a DataFrame (columns PC1..PCk) that shares the row
# index of the source frame.
pca_cols = [f'PC{i+1}' for i in range(n_components_optimal)]
X_pca_df = pd.DataFrame(X_pca, index=df.index, columns=pca_cols)

print(f"\nFeatures reduced from {len(feature_cols)} to {n_components_optimal} components.")

In [None]:
# Place the id/text/target columns side by side with the principal components.
df_final_ml = pd.concat([df[cols_to_keep], X_pca_df], axis='columns')

# Write the result to disk, creating the destination folder if it is missing.
OUTPUT_FILE = './data/yelp_reviews_ml_ready_pca.csv'
output_dir = os.path.dirname(OUTPUT_FILE)
os.makedirs(output_dir, exist_ok=True)
df_final_ml.to_csv(OUTPUT_FILE, index=False)

print(f"\nFinal ML-ready dataset with {df_final_ml.shape[1]} columns saved to {OUTPUT_FILE}.")
print("Numeric features are now represented by Principal Components (PCs).")