In [None]:
from pathlib import Path

import pandas as pd
from joblib import dump
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Connect Google Drive: mount it at /content/drive so that the
# /content/drive/MyDrive/... paths used by the other cells resolve.
# NOTE(review): google.colab is presumably only importable inside Colab — confirm
# before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the dataset from the CSV file on the mounted Drive
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/dataset_extended_gradient_full.csv')
# read_csv already returns a DataFrame, so this wrap is redundant; it is kept so
# that `data` and `df` remain two distinct objects, exactly as before.
df = pd.DataFrame(data)

# Prepare features and target.
# The first three columns are file-path columns and are skipped by position.
# The target is excluded by *name* rather than by "last column" position, so the
# label cannot silently leak into the features (and a real feature cannot be
# silently dropped) if the CSV column order ever changes.
ftr = df.iloc[:, 3:].drop(columns=['DeepFake'])
target = df['DeepFake']  # Target feature

In [None]:
# Standardize the features in two explicit steps: first learn the scaling
# statistics from `ftr`, then apply them to obtain the scaled feature matrix.
# The fitted `scaler` is kept so the same scaling can be reused later.
scaler = StandardScaler()
scaler.fit(ftr)
ftr_scaled = scaler.transform(ftr)

In [None]:
# Initialize and fit PCA on the standardized features (`ftr_scaled`).
# A float n_components between 0 and 1 asks scikit-learn to keep as many
# components as are needed for the cumulative explained variance to exceed
# that fraction, so the number of output columns is decided by the data.
pca = PCA(n_components=0.9)  # Retain 90% of the variance
# Fit and project in one call; `ftr_pca` has one column per retained component.
ftr_pca = pca.fit_transform(ftr_scaled)

In [None]:
# Wrap the PCA projection in a DataFrame, labelling the columns
# 'Principal Component 1', 'Principal Component 2', ... (one per retained component)
pc_labels = [f'Principal Component {k}' for k in range(1, ftr_pca.shape[1] + 1)]
df_pca = pd.DataFrame(ftr_pca, columns=pc_labels)

In [None]:
# Add the target feature back to the new DataFrame.
# `.values` hands over the raw array, so the labels are attached by row position
# instead of being aligned on index labels.
df_pca['DeepFake'] = target.values

In [None]:
# Write the PCA-transformed DataFrame to a new CSV on Drive; the row index is
# omitted so the file contains only the data columns.
transformed_csv = '/content/drive/MyDrive/Colab Notebooks/Data/transformed_data.csv'
df_pca.to_csv(transformed_csv, index=False)

In [None]:
# Save the PCA components so the contribution of each original feature to each
# principal component can be inspected.
# `pca.components_` is transposed so that rows are the original feature columns
# and columns are the principal components.
loading_labels = [f'Principal Component {k}' for k in range(1, ftr_pca.shape[1] + 1)]
loadings = pd.DataFrame(pca.components_.T, index=ftr.columns, columns=loading_labels)
loadings.to_csv('/content/drive/MyDrive/Colab Notebooks/Data/pca_loadings.csv')

In [None]:
# Save the fitted scaler and PCA objects as joblib files so the exact same
# transformation can be re-applied later without refitting.
artifacts_dir = Path('/content/drive/MyDrive/Colab Notebooks/Data Transformation/artifacts')
# joblib.dump does not create missing parent directories; make sure the target
# folder exists so a fresh run does not fail at the very last step.
artifacts_dir.mkdir(parents=True, exist_ok=True)
dump(scaler, artifacts_dir / 'scaler.joblib')
dump(pca, artifacts_dir / 'pca.joblib')