In [10]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Location of the Auto.csv data file and the columns used for modelling.
data_url = "https://raw.githubusercontent.com/jtao/AdvancedML/main/data/Auto.csv"
feature_columns = ["cylinders", "displacement", "weight"]
target_columns = ["mpg"]

df = pd.read_csv(data_url)

# Predictors and target are both kept as DataFrames: the target is selected
# with a one-element list, so it stays two-dimensional.
X = df[feature_columns]
y = df[target_columns]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# Restrict to columns pandas parsed as numeric and discard rows with missing values.
# NOTE(review): any column read as object dtype (for example a numeric column
# that contains placeholder strings) is silently excluded here -- confirm that
# no candidate feature is lost this way.
numeric_df = df.select_dtypes(include=['number']).dropna()

# Correlation matrix of the numeric columns.
# Note: this is computed over every row of df, not only the training split.
correlation_matrix = numeric_df.corr()

# Absolute correlation of each column with the target, strongest first.
target_column = 'mpg'
mpg_correlation = correlation_matrix[target_column].abs().sort_values(ascending=False)

# Remove the target by label instead of assuming it sits at position 0
# (a column tied at |r| = 1 could otherwise take its place and let the
# target leak into the selection), then keep the three strongest features.
top_features = mpg_correlation.drop(target_column).iloc[:3]
top_features_names = top_features.index.tolist()

print("Top 3 features correlated with mpg:")
print(top_features)

Top 3 features correlated with mpg:
weight          0.831739
displacement    0.804443
cylinders       0.776260
Name: mpg, dtype: float64


In [12]:
from sklearn.linear_model import LinearRegression

# Columns used by this model, listed once and applied to both splits.
top_columns = ['weight', 'displacement', 'cylinders']
X_train_top, X_test_top = X_train[top_columns], X_test[top_columns]

# Fit a linear regression on the training split
# (fit returns the estimator itself, so construction and fitting are chained).
model = LinearRegression().fit(X_train_top, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test_top)

# Score the predictions against the true test targets.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)



Mean Squared Error: 22.863675694575868
R^2 Score: 0.6274423284683546


In [13]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Only the 'weight' column is used as a predictor for this model.
ridge_columns = ['weight']
X_train_ridge = X_train[ridge_columns]
X_test_ridge = X_test[ridge_columns]

# Two-step pipeline: standardize the input, then apply Ridge regression
# with its default settings.
ridge_steps = [
    ('scaler', StandardScaler()),
    ('ridge', Ridge()),
]
pipeline = Pipeline(ridge_steps)

# Fit on the training split and predict the held-out rows
# (Pipeline.fit returns the pipeline, so the two calls are chained).
y_pred_ridge = pipeline.fit(X_train_ridge, y_train).predict(X_test_ridge)

# Score the predictions against the true test targets.
mse_ridge, r2_ridge = (
    mean_squared_error(y_test, y_pred_ridge),
    r2_score(y_test, y_pred_ridge),
)

print("Ridge Regression Mean Squared Error:", mse_ridge)
print("Ridge Regression R^2 Score:", r2_ridge)


Ridge Regression Mean Squared Error: 23.419094078225463
Ridge Regression R^2 Score: 0.6183919298140623
