# Principal Component Analysis (PCA) and Principal Component Regression (PCR)

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

# `cut_outliers` must already exist in the session; keep only its 'original'
# entry and work on an independent copy of it.
# NOTE(review): because the name is rebound here, re-running this cell indexes
# the already-extracted object with 'original' again — confirm that is safe.
cut_outliers = cut_outliers['original']
df = cut_outliers.copy()

# "Price" is the regression target; every remaining column is a predictor.
y = df["Price"]
X = df.drop(columns=["Price"])

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [None]:
# Fit a PCA with no component limit on the full predictor matrix and keep
# the projected scores.
pca = PCA()
X_pca = pca.fit_transform(X)

# One label per extracted component: PC1, PC2, ...
pc_labels = [f'PC{i+1}' for i in range(X_pca.shape[1])]

# Component scores, one column per principal component.
pca_df = pd.DataFrame(X_pca, columns=pc_labels)

# Fraction of variance carried by each component and its running total.
variance_explained = pca.explained_variance_ratio_
cumulative_variance = variance_explained.cumsum()

# Summary table: component label, individual share and cumulative share.
variance_df = pd.DataFrame({
    'PC': [f'PC{i+1}' for i in range(variance_explained.size)],
    'Variance Explained': variance_explained,
    'Cumulative Variance': cumulative_variance,
})

print("\nVariance Explained by each Principal Component:", variance_df, sep="\n")

In [None]:
# Show the per-component and the cumulative explained variance side by side.
# Only two panels are drawn, so a single row of two slots is used; a 2x2 grid
# would leave the bottom half of the figure blank.
plt.figure(figsize=(15, 5))

# Scree Plot: variance share of each individual component.
plt.subplot(1, 2, 1)
plt.bar(range(1, len(variance_explained) + 1), variance_explained)
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.title('Scree Plot')

# Cumulative Variance Plot: running total as components are added.
plt.subplot(1, 2, 2)
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, 'bo-')
# Dashed reference line at 80% cumulative variance.
plt.axhline(y=0.8, color='r', linestyle='--')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Variance Explained')
plt.title('Cumulative Variance Explained')

plt.tight_layout()
plt.show()

## Principal Component Regression (PCR) Analysis

In [None]:
def calculate_pcr_metrics(X_train, y_train, X_test, y_test, n_components):
    """Fit a principal component regression and score it on the test split.

    The PCA projection is learned from ``X_train`` only and then applied to
    ``X_test``; a linear regression is fitted on the projected training data
    and evaluated on the projected test data.

    Args:
        X_train: Training predictors used to fit the PCA and the regression.
        y_train: Training target values.
        X_test: Test predictors, projected with the PCA fitted on ``X_train``.
        y_test: Test target values used for scoring.
        n_components: Value forwarded to ``PCA(n_components=...)``.

    Returns:
        A ``(r2, rmse)`` tuple computed from the test-set predictions.
    """
    # Learn the projection on the training split, then reuse it for the test
    # split so no test information enters the fit.
    projector = PCA(n_components=n_components)
    train_scores = projector.fit_transform(X_train)
    test_scores = projector.transform(X_test)

    # Ordinary least squares on the component scores.
    regressor = LinearRegression().fit(train_scores, y_train)
    predictions = regressor.predict(test_scores)

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    return r2_score(y_test, predictions), rmse

# Sweep the number of principal components used by the regression.
# PCA is fitted on the training split, and it cannot extract more components
# than min(n_samples, n_features) of that split, so the sweep is capped there.
# When the training split has at least as many rows as columns this equals
# the number of predictors, i.e. the same range as sweeping X.shape[1].
max_components = min(X_train.shape)
n_components_range = range(1, max_components + 1)

# Test-set scores, one entry per value in n_components_range (same order).
metrics = {
    'R2': [],
    'RMSE': []
}

# Perform PCR for different numbers of components
for n in n_components_range:
    r2, rmse = calculate_pcr_metrics(X_train, y_train, X_test, y_test, n)
    metrics['R2'].append(r2)
    metrics['RMSE'].append(rmse)
    print(f"\nNumber of components: {n}")
    print(f"R²: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")

In [None]:
def _draw_metric_panel(position, metric_key, line_format, title, y_label):
    """Draw one metric-versus-components line chart in slot `position` of a 1x2 grid."""
    plt.subplot(1, 2, position)
    plt.plot(n_components_range, metrics[metric_key], line_format)
    plt.title(title)
    plt.xlabel('Number of Components')
    plt.ylabel(y_label)
    plt.grid(True)


# PCR test metrics as a function of the number of components:
# R² on the left, RMSE on the right.
plt.figure(figsize=(15, 5))
_draw_metric_panel(1, 'R2', 'b-o', 'R² vs Number of Components', 'R²')
_draw_metric_panel(2, 'RMSE', 'r-o', 'RMSE vs Number of Components', 'RMSE')

plt.tight_layout()
plt.show()

El resultado con tantas variables no es deseable: la cantidad de componentes principales necesaria para representar un 80% de la varianza debería estar entre 2 y 4, y recién lo estamos logrando con 14 componentes. Vamos a probar PCA nuevamente, pero quitando variables y dejando solo las que seleccionamos a mano.

In [None]:
# Columns removed before the second PCA run: "Price" and the other columns
# listed below. Whatever remains in the frame is what the PCA is fitted on.
columns = [
    "Price", "Central_Lock", "Met_Color", "Airbag_2", "ABS",
    "Backseat_Divider", "Metallic_Rim", "Radio", "Diesel", "Airbag_1",
    "Sport_Model", "m_16v", "m_vvti", "Automatic",
    "Gears", "m_sedan", "m_bns", "m_wagon", "Power_Steering", "Mistlamps",
    "Tow_Bar", "m_matic4", "m_matic3", "m_g6", "m_gtsi", "m_sport",
    "Boardcomputer",
    "m_terra", "m_luna", "m_sol", "m_comfort", "CD_Player",
    "Powered_Windows", "BOVAG_Guarantee", "Airco", "Mfr_Guarantee",
    "m_hatch_b", "m_liftb", "m_d4d", "Five_Doors",
    "Trunk", "m_exec",
]
# drop() returns a new frame, so `cut_outliers` itself is left untouched.
df_cut = cut_outliers.drop(columns=columns)

# Refit an unrestricted PCA on the reduced set of columns.
pca = PCA()
X_pca = pca.fit_transform(df_cut)

# Fraction of variance carried by each component and its running total.
variance_explained = pca.explained_variance_ratio_
cumulative_variance = variance_explained.cumsum()

# Summary table: component label, individual share and cumulative share.
variance_df = pd.DataFrame({
    'PC': [f'PC{i+1}' for i in range(variance_explained.size)],
    'Variance Explained': variance_explained,
    'Cumulative Variance': cumulative_variance,
})

print("\nVariance Explained by each Principal Component:", variance_df, sep="\n")