#### MODEL EVALUATION AND REFINEMENT

##### UNDERFITTING

In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Read the automobile data set from disk.
df = pd.read_csv("/Users/cansezgin/Python-Output/automobile.csv")

# Coerce the two columns used below to numeric dtype. errors='coerce' turns
# any entry that cannot be parsed into NaN, so the dropna() that follows
# removes those rows together with genuinely missing ones.
for column in ('horsepower', 'price'):
    df[column] = pd.to_numeric(df[column], errors='coerce')
df_cleaned = df.dropna(subset=['horsepower', 'price'])

# Single predictor (kept 2-D for scikit-learn) and the regression target.
X = df_cleaned[['horsepower']]
y = df_cleaned['price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Polynomial degrees to compare, and the test-set R² obtained for each of them
# (filled in the same order as `order`).
order = [1, 2, 3, 4]
Rsqu_test = []

for n in order:
    # The polynomial expansion is fitted on the training split only and then
    # re-applied unchanged to the test split.
    pr = PolynomialFeatures(degree=n)
    x_train_pr = pr.fit_transform(x_train)
    x_test_pr = pr.transform(x_test)

    # Ordinary least squares on the expanded features; fit() returns the estimator.
    lr = LinearRegression().fit(x_train_pr, y_train)

    # score() is R² on the held-out data.
    Rsqu_test.append(lr.score(x_test_pr, y_test))

# Report one line per degree.
for degree, r2 in zip(order, Rsqu_test):
    print(f"Polynomial Degree {degree}: R^2 = {r2:.4f}")

Polynomial Degree 1: R^2 = 0.7557
Polynomial Degree 2: R^2 = 0.7629
Polynomial Degree 3: R^2 = 0.7756
Polynomial Degree 4: R^2 = 0.7818


##### OVERFITTING & RIDGE REGRESSION

In [79]:
from sklearn.linear_model import Ridge

# Predictor and target are taken from the cleaned frame built in an earlier cell.
X = df_cleaned[['horsepower']]
y = df_cleaned['price']

# Ridge regression with regularisation strength alpha = 0.1.
RidgeModel = Ridge(alpha=0.1)

# fit() returns the fitted estimator, so fitting and predicting can be chained.
# Note that the predictions are made on the same rows the model was fitted on.
Yhat = RidgeModel.fit(X, y).predict(X)

# Peek at the first five predicted prices.
print(Yhat[:5])

[14552.71841567 14552.71841567 21957.5845643  13002.86271014
 15241.54317368]


##### GRID SEARCH

In [98]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Grid search over the Ridge regularisation strength using four numeric
# predictors of price, evaluated with 4-fold cross-validation.

# Load the dataset
# NOTE(review): absolute, user-specific path — the notebook will not run on
# another machine without editing this line.
df = pd.read_csv("/Users/cansezgin/Python-Output/automobile.csv")

# Clean column names (remove leading/trailing spaces)
df.columns = df.columns.str.strip()

# Coerce every column used below to numeric (unparsable entries become NaN)
# and drop rows where any of them is missing.
cols = ['horsepower', 'curb-weight', 'engine-size', 'highway-mpg', 'price']
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
# NOTE(review): this rebinds `df_cleaned`, which an earlier cell built from only
# 'horsepower' and 'price'; re-running that earlier cell's consumers after this
# one will see a (possibly smaller) frame.
df_cleaned = df.dropna(subset=cols)

# Define features and target
x_data = df_cleaned[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]
y_data = df_cleaned['price']

# Ridge regression with grid search.
# NOTE(review): the features are passed to Ridge without standardisation, and
# the L2 penalty is sensitive to feature scale — consider wrapping the model in
# a StandardScaler pipeline. Also, an integer `cv` uses unshuffled folds for
# regressors; if the CSV rows are grouped (e.g. by make) that can depress the
# cross-validated score — confirm against the data.
parameters1 = [{'alpha': [0.001, 0.1, 1, 10, 100, 1000, 10000, 100000]}]
RR = Ridge()
Grid1 = GridSearchCV(RR, parameters1, cv=4)
Grid1.fit(x_data, y_data)

# Output best estimator and scores
print("Best Ridge Estimator:", Grid1.best_estimator_)

scores = Grid1.cv_results_
print("\nMean test scores for each alpha:")
# Take each alpha from cv_results_['params'] (the exact setting that produced
# the score) instead of zipping against the grid literal, so the labels stay
# correct even if the grid gains more parameters or more than one dict.
for candidate, score in zip(scores['params'], scores['mean_test_score']):
    alpha = candidate['alpha']
    print(f"alpha = {alpha:<8} --> R² mean score = {score:.4f}")

Best Ridge Estimator: Ridge(alpha=10000)

Mean test scores for each alpha:
alpha = 0.001    --> R² mean score = 0.6470
alpha = 0.1      --> R² mean score = 0.6470
alpha = 1        --> R² mean score = 0.6470
alpha = 10       --> R² mean score = 0.6471
alpha = 100      --> R² mean score = 0.6475
alpha = 1000     --> R² mean score = 0.6498
alpha = 10000    --> R² mean score = 0.6525
alpha = 100000   --> R² mean score = 0.6419


In [108]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths for the Ridge model.
parameters1 = [{'alpha': [0.001, 0.1, 1, 10, 100, 1000, 10000, 100000]}]
RR = Ridge()

# Same 4-fold grid search as before, but additionally recording the score on
# the training folds so that train and validation R² can be compared.
# x_data / y_data come from the previous cell.
Grid1 = GridSearchCV(
    estimator=RR,
    param_grid=parameters1,
    cv=4,
    return_train_score=True,
)
Grid1.fit(x_data, y_data)

scores = Grid1.cv_results_

# One line per candidate: the parameter dict followed by the mean validation
# and mean training R² across the folds.
for idx, candidate in enumerate(scores['params']):
    test_r2 = scores['mean_test_score'][idx]
    train_r2 = scores['mean_train_score'][idx]
    print(f"{candidate} --> R² test: {test_r2:.4f}, R² train: {train_r2:.4f}")

{'alpha': 0.001} --> R² test: 0.6470, R² train: 0.8164
{'alpha': 0.1} --> R² test: 0.6470, R² train: 0.8164
{'alpha': 1} --> R² test: 0.6470, R² train: 0.8164
{'alpha': 10} --> R² test: 0.6471, R² train: 0.8164
{'alpha': 100} --> R² test: 0.6475, R² train: 0.8164
{'alpha': 1000} --> R² test: 0.6498, R² train: 0.8163
{'alpha': 10000} --> R² test: 0.6525, R² train: 0.8151
{'alpha': 100000} --> R² test: 0.6419, R² train: 0.7925
