In [3]:
import os
os.chdir('C:\\Users\\josep\\OneDrive\\Desktop\\perceptron-from-scratch')
print("Working directory:", os.getcwd())

import pandas as pd
from labs.feature_engineering.src.feature_engineering import load_auto_mpg, preprocess_features, preprocess_car_name, evaluate_representation

# Load data
df = load_auto_mpg('labs/feature_engineering/data/auto-mpg.tsv')
y = df['mpg'].values

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'C:\\Users\\josep\\OneDrive\\Desktop\\perceptron-from-scratch'

# 1A: Problems with raw features
"""
Using raw features may lead to:
- Poor generalization: Features like model_year (e.g., 70–82) or displacement (in cubic inches) have large ranges, dominating the perceptron's weights.
- Sensitivity to scale: Features with different units (e.g., horsepower vs. acceleration) affect weight updates unevenly.
- No handling of categorical data: origin and cylinders need encoding.
- Temporal mismatch: 2019 cars may have different feature distributions (e.g., hybrid engines), reducing model relevance.
"""

# 1B: Feature representation choices


In [None]:
feature_configs = {
    'cylinders': 'one-hot',  # Discrete (3,4,5,6,8); one-hot captures categories without assuming order
    'displacement': 'standard',  # Continuous; standardize to normalize scale
    'horsepower': 'standard',  # Continuous; standardize for consistency
    'weight': 'standard',  # Continuous; standardize to balance contribution
    'acceleration': 'standard',  # Continuous; standardize to normalize
    'model_year': 'one-hot',  # Discrete (70–82); one-hot avoids ordinal assumption
    'origin': 'one-hot'  # Categorical (1,2,3); one-hot for proper encoding
}

"""
Tradeoffs:
- cylinders: one-hot vs. raw (ordinal assumption incorrect); drop loses info.
- displacement/horsepower/weight/acceleration: standard vs. raw (scale issues); drop loses key predictors.
- model_year: one-hot vs. raw (non-linear effect); drop ignores temporal trends.
- origin: one-hot vs. raw (categorical, not ordinal); drop loses origin effect.
"""

# 1C: Car name transformation


In [5]:
car_name_feature = preprocess_car_name(df)

NameError: name 'preprocess_car_name' is not defined

"""
Car name (text) can be transformed by:
- Length of name: Simple numeric feature, may correlate with brand complexity.
- Brand extraction: One-hot encode manufacturer (e.g., Ford, Toyota), but requires parsing.
- Word embeddings: Advanced, but complex for perceptron.
Tradeoff: Length is simple but less informative; brand is more meaningful but needs cleaning.
"""

# 1D: Evaluate feature representations


In [6]:
representations = [
    {'cylinders': 'raw', 'displacement': 'raw', 'horsepower': 'raw', 'weight': 'raw', 'acceleration': 'raw', 'model_year': 'raw', 'origin': 'raw'},
    feature_configs,
    {'cylinders': 'one-hot', 'displacement': 'standard', 'horsepower': 'standard', 'weight': 'standard', 'acceleration': 'drop', 'model_year': 'one-hot', 'origin': 'one-hot'}
]
for i, config in enumerate(representations):
    X = preprocess_features(df, config)
    accuracy = evaluate_representation(X, y, method='cross_val')
    print(f"Representation {i+1} accuracy (10-fold CV): {accuracy:.4f}")

NameError: name 'preprocess_features' is not defined

"""
Best method: 10-fold cross-validation (option 4).
- Full data (option 1): Overfits, no generalization estimate.
- Train+test sum (option 2): Biases toward training performance.
- Test split (option 3): Single split may be unrepresentative.
- Cross-val (option 4): Robust estimate of generalization, uses all data efficiently.
"""