In [11]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path

# The notebook is expected to run from a directory one level below the project
# root; putting the root first on sys.path makes the `utils` package importable.
project_root = Path.cwd().parent
_project_root_entry = str(project_root)
# Guard the insert so that re-running this cell does not keep stacking duplicate
# entries at the front of sys.path.
if not sys.path or sys.path[0] != _project_root_entry:
    sys.path.insert(0, _project_root_entry)

from utils.matrices import MyMatrixSolver
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [100]:
# First example: a 3x4 integer matrix, written one row per line.
input_array = np.array(
    [
        [1, 0, -2, 6],
        [-3, 6, 6, -6],
        [2, -3, -4, 6],
    ]
)

In [103]:
# Build the project's solver for `input_array` and display its `basis`.
matrix_solver = MyMatrixSolver(input_array)
matrix_solver.basis

# matrix_solver.summary()



array([[ 1, -3,  2],
       [ 0,  6, -3]])

In [52]:
# NOTE(review): `_truncate_matrix` is underscore-prefixed (private), so this cell is
# coupled to MyMatrixSolver internals. The meaning of the second argument (1) is not
# visible here — confirm against utils.matrices.
result_truncated = matrix_solver._truncate_matrix(input_array, 1)

In [53]:
# Display the value returned by `_truncate_matrix` in the previous cell.
result_truncated

array([[3]])

In [54]:
# NOTE(review): `input_array` as defined in this notebook is 3x4 (non-square) — confirm
# how MyMatrixSolver defines `determinant` for it, or re-run to refresh this result.
matrix_solver.determinant

np.int64(-2)

In [55]:
# Second example: a 5x5 integer matrix wrapped in its own solver instance.
input_array_2 = np.array(
    [
        [0, 1, 0, -2, 1],
        [1, 0, 3, 1, 1],
        [1, -1, 1, 1, 1],
        [2, 2, 1, 0, 1],
        [3, 1, 1, 1, 2],
    ]
)
matrix_solver_2 = MyMatrixSolver(input_array_2)

In [56]:
# Display the 5x5 matrix defined in the previous cell.
input_array_2

array([[ 0,  1,  0, -2,  1],
       [ 1,  0,  3,  1,  1],
       [ 1, -1,  1,  1,  1],
       [ 2,  2,  1,  0,  1],
       [ 3,  1,  1,  1,  2]])

In [57]:
# NOTE(review): this inspects `matrix_solver` (built from the 3x4 `input_array`), not the
# `matrix_solver_2` created above for the 5x5 `input_array_2` — confirm which was intended.
matrix_solver.inverse

array([[-2. ,  1. ],
       [ 1.5, -0.5]])

In [2]:
import pandas as pd

# Locate the dataset relative to `project_root` (defined in the setup cell) instead of
# a machine-specific absolute path, so the notebook runs from any checkout of the repo.
car_details_csv = project_root / "utils" / "data" / "Car_details_v3.csv"
df = pd.read_csv(car_details_csv)

# 2. Clean numerical columns (remove units like 'kmpl', 'CC', 'bhp')
def extract_numeric(series):
    """Pull the first run of digits/dots out of each entry and convert it to a number.

    Every entry is stringified first, so unit-suffixed text such as "23.4 kmpl"
    yields 23.4. Entries with no digit/dot run, or whose captured text is not a
    valid number, become NaN.
    """
    as_text = series.astype(str)
    first_number = as_text.str.extract(r'([\d.]+)')[0]
    return pd.to_numeric(first_number, errors='coerce')

# Derive numeric "<name>_clean" columns from the unit-suffixed text columns.
for raw_column in ('mileage', 'engine', 'max_power'):
    df[f'{raw_column}_clean'] = extract_numeric(df[raw_column])

# 3. Choose the predictors and the response, then discard incomplete rows
features = ['year', 'km_driven', 'mileage_clean', 'engine_clean', 'max_power_clean', 'seats']
target = 'selling_price'

# Only rows with every selected column present survive
df_clean = df[[*features, target]].dropna()

print(f"Data shape after cleaning: {df_clean.shape}")

# 4. Assemble the design matrix X and the response vector Y
# A leading column of ones lets the first coefficient act as the intercept
X = np.column_stack([np.ones(len(df_clean)), df_clean[features].values])
Y = df_clean[target].values.reshape(-1, 1)      # column vector, shape (N, 1)

Data shape after cleaning: (7906, 7)


In [None]:
from utils.linear_regression import calculate_beta, calculate_residual_sum_of_squares

# Estimate the coefficient vector from the design matrix X (intercept column first)
# and the column-vector target Y.
# NOTE(review): presumably the closed-form OLS solution — confirm in utils.linear_regression.
Beta = calculate_beta(X, Y)
print(f"\nBeta coefficients shape: {Beta.shape}")
print(f"Beta values:\n{Beta.flatten()}")


Beta coefficients shape: (7, 1)
Beta values:
[-8.34562759e+07  4.11101700e+04 -1.41000000e+00  1.05143600e+04
  1.14650000e+02  1.56545400e+04 -7.32929600e+04]


In [9]:
# Pair every fitted coefficient with the column it belongs to
feature_names = ['intercept', *features]
for name, coef in zip(feature_names, Beta.flatten()):
    print(f"  {name}: {coef:.4f}")

# 6. Residual sum of squares (the helper's result is indexed as a 1x1 array)
RSS = calculate_residual_sum_of_squares(X, Y, Beta)
SS_residual = RSS[0, 0]
print(f"\nResidual Sum of Squares (RSS): {SS_residual:.2f}")

# 7. Fitted values and R-squared as a sanity check on the fit
Y_pred = np.matmul(X, Beta)
SS_total = np.square(Y - Y.mean()).sum()
R_squared = 1 - SS_residual / SS_total
print(f"R-squared: {R_squared:.4f}")

  intercept: -83456275.8843
  year: 41110.1734
  km_driven: -1.4070
  mileage_clean: 10514.3603
  engine_clean: 114.6536
  max_power_clean: 15654.5386
  seats: -73292.9561

Residual Sum of Squares (RSS): 1882969454267974.00
R-squared: 0.6401


In [10]:
# ========================================================
# Compare with sklearn OLS
# ========================================================
from sklearn.linear_model import LinearRegression

# sklearn fits its own intercept, so it receives the raw features without the
# leading column of ones used by the custom implementation.
X_sklearn = df_clean[features].values  # No intercept column
y_sklearn = df_clean[target].values    # 1D array

# Fit sklearn model
sklearn_model = LinearRegression()
sklearn_model.fit(X_sklearn, y_sklearn)

# Get sklearn coefficients
sklearn_intercept = sklearn_model.intercept_
sklearn_coefs = sklearn_model.coef_

# Line both coefficient vectors up as [intercept, feature_1, ..., feature_k]
your_all_coefs = Beta.flatten()
your_intercept = your_all_coefs[0]
sklearn_all_coefs = np.concatenate([[sklearn_intercept], sklearn_coefs])

# Compare coefficients
print("=" * 60)
print("COMPARISON: Your Implementation vs sklearn")
print("=" * 60)
print(f"\n{'Feature':<20} {'Your Beta':>18} {'sklearn':>18} {'Diff':>12}")
print("-" * 68)

# One row per coefficient; the intercept is simply the first row
for name, your_coef, sk_coef in zip(['intercept'] + features, your_all_coefs, sklearn_all_coefs):
    diff = your_coef - sk_coef
    print(f"{name:<20} {your_coef:>18.4f} {sk_coef:>18.4f} {diff:>12.2e}")

# Compare R-squared
sklearn_r2 = sklearn_model.score(X_sklearn, y_sklearn)
print(f"\n{'Metric':<20} {'Your Value':>18} {'sklearn':>18}")
print("-" * 56)
print(f"{'R-squared':<20} {R_squared:>18.6f} {sklearn_r2:>18.6f}")

# Check if coefficients match (within floating point tolerance). On failure,
# report the measured gap instead of assuming it is small.
max_abs_diff = np.max(np.abs(your_all_coefs - sklearn_all_coefs))
if np.allclose(your_all_coefs, sklearn_all_coefs, rtol=1e-10):
    print("\n✅ SUCCESS: Your coefficients match sklearn perfectly!")
else:
    print(f"\n⚠️  Coefficients differ beyond tolerance (max abs diff: {max_abs_diff:.2e})")


COMPARISON: Your Implementation vs sklearn

Feature                       Your Beta            sklearn         Diff
--------------------------------------------------------------------
intercept                -83456275.8843     -83456275.8833    -1.08e-03
year                         41110.1734         41110.1734     5.36e-07
km_driven                       -1.4070            -1.4070     1.12e-11
mileage_clean                10514.3603         10514.3603    -2.91e-07
engine_clean                   114.6536           114.6536     1.61e-09
max_power_clean              15654.5386         15654.5386    -4.37e-08
seats                       -73292.9561        -73292.9561    -8.88e-07

Metric                       Your Value            sklearn
--------------------------------------------------------
R-squared                      0.640136           0.640136

‚úÖ SUCCESS: Your coefficients match sklearn perfectly!


In [13]:
"""
PCA Comparison Demo: Custom Implementation vs Sklearn
======================================================
This script compares the custom PCA implementation with sklearn's PCA
and displays visual outputs including scree plots and summary tables.
"""

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend for saving plots
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA as SklearnPCA
from sklearn.preprocessing import StandardScaler

# Import custom PCA functions
from utils.PCA import (
    calculate_pca_parameters,
    apply_pca_transform,
    explained_variance_summary,
    scree_plot
)

# Pandas display configuration: six-decimal floats, no column limit, no width limit
for option_name, option_value in (
    ('display.float_format', '{:.6f}'.format),
    ('display.max_columns', None),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

def print_header(title: str):
    """Emit a section banner: a blank line, a 70-character rule, the indented title, and a closing rule."""
    rule = "=" * 70
    print(f"\n{rule}\n  {title}\n{rule}")

def print_subheader(title: str):
    """Emit a subsection marker: a blank line followed by the title wrapped in '---'."""
    print("\n--- {} ---".format(title))

def compare_pca_implementations(tolerance: float = 1e-8):
    """Run PCA comparison between custom implementation and sklearn.

    Generates a seeded synthetic data set with five features, fits both the
    project's PCA helpers and sklearn's PCA on it, prints side-by-side
    comparisons, saves a two-panel scree plot, and finally reports whether the
    two implementations agree.

    Args:
        tolerance: Largest absolute difference still reported as a match in the
            overall summary. Each comparison passes only if its maximum
            absolute difference is at or below this value.

    Returns:
        Tuple ``(X, principal_components, explained_variance, mean_vector)``:
        the generated data and the three results taken from
        ``calculate_pca_parameters``.

    Side effects:
        Seeds NumPy's global random generator with 42, prints to stdout, and
        writes ``pca_scree_comparison.png`` to the current working directory.
    """

    def pc_labels(count):
        # Column labels "PC1".."PCcount", shared by every table below.
        return [f'PC{i+1}' for i in range(count)]

    # =========================================================================
    # SECTION 1: Generate Sample Data
    # =========================================================================
    print_header("1. SAMPLE DATA GENERATION")

    # Global seeding is kept (rather than a local Generator) so the generated
    # data, and therefore every printed number, stays the same as before.
    np.random.seed(42)
    n_samples = 100

    # Create correlated features (ideal for PCA demonstration)
    x1 = np.random.randn(n_samples) * 3  # High variance
    x2 = x1 * 0.8 + np.random.randn(n_samples) * 0.5  # Correlated with x1
    x3 = np.random.randn(n_samples) * 2  # Medium variance, independent
    x4 = x3 * 0.5 + np.random.randn(n_samples) * 0.8  # Correlated with x3
    x5 = np.random.randn(n_samples) * 0.5  # Low variance

    X = np.column_stack([x1, x2, x3, x4, x5])

    print(f"Data shape: {X.shape} (samples x features)")
    print(f"Feature variances: {np.var(X, axis=0, ddof=1).round(3)}")
    print("Data preview (first 5 rows):")
    print(pd.DataFrame(X, columns=[f'Feature_{i+1}' for i in range(5)]).head())

    # =========================================================================
    # SECTION 2: Run Both PCA Implementations
    # =========================================================================
    print_header("2. PCA COMPUTATION")

    # Custom PCA
    print_subheader("Custom Implementation")
    principal_components, explained_variance, mean_vector, _ = calculate_pca_parameters(X)
    X_custom = apply_pca_transform(X, principal_components, mean_vector)

    print(f"Mean vector: {mean_vector.round(6)}")
    print(f"Explained variance (eigenvalues): {explained_variance.round(6)}")

    # Sklearn PCA
    print_subheader("Sklearn Implementation")
    sklearn_pca = SklearnPCA()
    X_sklearn = sklearn_pca.fit_transform(X)

    print(f"Mean vector: {sklearn_pca.mean_.round(6)}")
    print(f"Explained variance (eigenvalues): {sklearn_pca.explained_variance_.round(6)}")

    # =========================================================================
    # SECTION 3: Compare Results
    # =========================================================================
    print_header("3. COMPARISON: CUSTOM vs SKLEARN")

    # Mean vectors
    print_subheader("Mean Vectors")
    mean_diff = np.abs(mean_vector - sklearn_pca.mean_)
    print(f"Custom:  {mean_vector.round(8)}")
    print(f"Sklearn: {sklearn_pca.mean_.round(8)}")
    print(f"Max difference: {mean_diff.max():.2e}")

    # Explained variance
    print_subheader("Explained Variance (Eigenvalues)")
    var_diff = np.abs(explained_variance - sklearn_pca.explained_variance_)
    comparison_df = pd.DataFrame({
        'PC': pc_labels(len(explained_variance)),
        'Custom': explained_variance,
        'Sklearn': sklearn_pca.explained_variance_,
        'Difference': var_diff
    })
    print(comparison_df.to_string(index=False))
    print(f"\nMax difference: {var_diff.max():.2e}")

    # Variance ratios
    print_subheader("Explained Variance Ratio")
    custom_ratio = explained_variance / np.sum(explained_variance)
    ratio_diff = np.abs(custom_ratio - sklearn_pca.explained_variance_ratio_)
    ratio_df = pd.DataFrame({
        'PC': pc_labels(len(custom_ratio)),
        'Custom (%)': custom_ratio * 100,
        'Sklearn (%)': sklearn_pca.explained_variance_ratio_ * 100,
        'Difference': ratio_diff
    })
    print(ratio_df.to_string(index=False))
    print(f"\nMax difference: {ratio_diff.max():.2e}")

    # Principal components (compare absolute values due to sign ambiguity)
    print_subheader("Principal Components (Eigenvectors)")
    print("Note: Eigenvectors may differ by sign (both are valid)")
    sklearn_components = sklearn_pca.components_.T  # Transpose to match our format
    pc_diff = np.abs(np.abs(principal_components) - np.abs(sklearn_components))
    print(f"\nCustom Principal Components (shape {principal_components.shape}):")
    print(pd.DataFrame(principal_components.round(4),
                       columns=pc_labels(principal_components.shape[1]),
                       index=[f'Feature{i+1}' for i in range(principal_components.shape[0])]))
    print("\nSklearn Principal Components:")
    print(pd.DataFrame(sklearn_components.round(4),
                       columns=pc_labels(sklearn_components.shape[1]),
                       index=[f'Feature{i+1}' for i in range(sklearn_components.shape[0])]))
    print(f"\nMax absolute difference: {pc_diff.max():.2e}")

    # Transformed data (first 5 samples)
    print_subheader("Transformed Data (First 5 Samples)")
    print("Note: Signs may be flipped per component")
    print("\nCustom transformed data:")
    print(pd.DataFrame(X_custom[:5].round(4),
                       columns=pc_labels(X_custom.shape[1])))
    print("\nSklearn transformed data:")
    print(pd.DataFrame(X_sklearn[:5].round(4),
                       columns=pc_labels(X_sklearn.shape[1])))

    transform_diff = np.abs(np.abs(X_custom) - np.abs(X_sklearn))
    print(f"\nMax absolute difference: {transform_diff.max():.2e}")

    # =========================================================================
    # SECTION 4: Explained Variance Summary Table
    # =========================================================================
    print_header("4. EXPLAINED VARIANCE SUMMARY (Custom Implementation)")

    summary_df = explained_variance_summary(explained_variance)
    print(summary_df.to_string(index=False))

    # =========================================================================
    # SECTION 5: Scree Plot
    # =========================================================================
    print_header("5. SCREE PLOT")
    print("Generating scree plot...")

    # Create side-by-side scree plots for comparison
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Custom implementation scree plot
    x = np.arange(1, len(explained_variance) + 1)
    variance_ratio = custom_ratio * 100
    cumulative = np.cumsum(variance_ratio)

    ax1 = axes[0]
    ax1.bar(x, variance_ratio, alpha=0.7, color='steelblue', label='Individual')
    ax1.plot(x, cumulative, 'ro-', linewidth=2, markersize=8, label='Cumulative')
    ax1.axhline(y=80, color='gray', linestyle='--', alpha=0.7, label='80% threshold')
    ax1.set_xlabel('Principal Component', fontsize=12)
    ax1.set_ylabel('Explained Variance (%)', fontsize=12)
    ax1.set_title('Custom PCA Implementation', fontsize=14, fontweight='bold')
    ax1.set_xticks(x)
    ax1.legend(loc='center right')
    ax1.grid(True, alpha=0.3)

    # Sklearn scree plot
    sklearn_variance_ratio = sklearn_pca.explained_variance_ratio_ * 100
    sklearn_cumulative = np.cumsum(sklearn_variance_ratio)

    ax2 = axes[1]
    ax2.bar(x, sklearn_variance_ratio, alpha=0.7, color='darkorange', label='Individual')
    ax2.plot(x, sklearn_cumulative, 'ro-', linewidth=2, markersize=8, label='Cumulative')
    ax2.axhline(y=80, color='gray', linestyle='--', alpha=0.7, label='80% threshold')
    ax2.set_xlabel('Principal Component', fontsize=12)
    ax2.set_ylabel('Explained Variance (%)', fontsize=12)
    ax2.set_title('Sklearn PCA Implementation', fontsize=14, fontweight='bold')
    ax2.set_xticks(x)
    ax2.legend(loc='center right')
    ax2.grid(True, alpha=0.3)

    # Operate on `fig` explicitly so an unrelated "current figure" can never be
    # the one that gets titled, saved, or closed.
    fig.tight_layout()
    fig.suptitle('PCA Scree Plot Comparison', fontsize=16, fontweight='bold', y=1.02)
    fig.savefig('pca_scree_comparison.png', dpi=150, bbox_inches='tight')
    print("Scree plot saved to: pca_scree_comparison.png")
    plt.close(fig)  # Close instead of show for non-interactive mode

    # =========================================================================
    # SECTION 6: Overall Summary
    # =========================================================================
    print_header("6. OVERALL SUMMARY")

    # Each claim is backed by the maximum absolute difference measured above;
    # it is reported as a match only when that difference is within tolerance.
    checks = [
        ("Mean vectors match", mean_diff.max()),
        ("Explained variances (eigenvalues) match", var_diff.max()),
        ("Explained variance ratios match", ratio_diff.max()),
        ("Principal components match (up to sign)", pc_diff.max()),
        ("Transformed data matches (up to sign)", transform_diff.max()),
    ]
    all_match = True
    for claim, max_diff in checks:
        # A NaN difference fails this comparison and is therefore flagged too.
        if max_diff <= tolerance:
            print(f"✅ {claim}")
        else:
            all_match = False
            print(f"❌ NOT verified: {claim} (max difference {max_diff:.2e} > {tolerance:.0e})")

    if all_match:
        print("\n🎉 Custom PCA implementation is equivalent to sklearn!")
    else:
        print("\n⚠️ Custom PCA implementation differs from sklearn beyond tolerance; see failed checks above.")

    return X, principal_components, explained_variance, mean_vector


# Run the comparison when this code executes as the top-level module; the tuple
# returned by compare_pca_implementations() is discarded here.
if __name__ == "__main__":
    compare_pca_implementations()






  1. SAMPLE DATA GENERATION
Data shape: (100, 5) (samples x features)
Feature variances: [7.423 4.694 4.703 1.676 0.283]
Data preview (first 5 rows):
   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5
0   1.490142   0.484429   0.715575  -0.305409  -0.797214
1  -0.414793  -0.542157   1.121569   0.112640  -0.299688
2   1.943066   1.383095   2.166102   1.680886   0.002622
3   4.569090   3.254133   2.107604   1.542098   0.023490
4  -0.702460  -0.642611  -2.755339  -1.394391  -0.225033

  2. PCA COMPUTATION

--- Custom Implementation ---
Mean vector: [-0.31154  -0.238079  0.129793  0.150368 -0.028002]
Explained variance (eigenvalues): [12.302895  5.683085  0.411867  0.253305  0.12733 ]

--- Sklearn Implementation ---
Mean vector: [-0.31154  -0.238079  0.129793  0.150368 -0.028002]
Explained variance (eigenvalues): [12.302895  5.683085  0.411867  0.253305  0.12733 ]

  3. COMPARISON: CUSTOM vs SKLEARN

--- Mean Vectors ---
Custom:  [-0.31153955 -0.23807935  0.12979251  0.15036835 -0.0