In [10]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Read the Auto dataset from the GitHub-hosted CSV.
df = pd.read_csv(
    "https://raw.githubusercontent.com/jtao/AdvancedML/main/data/Auto.csv"
)

# Predictors are three columns of the frame; the target (mpg) is selected with
# a list so it stays a one-column DataFrame rather than a Series.
X = df.loc[:, ["cylinders", "displacement", "weight"]]
y = df.loc[:, ["mpg"]]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Drop non-numeric columns, then drop every row that still has a missing value.
# NOTE(review): select_dtypes silently skips any column pandas parsed as
# object/string -- confirm no intended numeric feature is excluded this way.
numeric_df = df.select_dtypes(include=['number']).dropna()

# calculate pairwise correlation coefficients between the numeric columns
correlation_matrix = numeric_df.corr()

# absolute correlation of every column with the target (mpg), strongest first
mpg_correlation = correlation_matrix['mpg'].abs().sort_values(ascending=False)

# Select the top 3 features. The target is removed by label rather than by
# assuming it sorts to position 0: a column tied with mpg at |r| = 1.0 could
# otherwise take its place and the slice would drop a real feature (or keep
# the target itself).
top_features = mpg_correlation.drop(labels='mpg').head(3)
top_features_names = top_features.index.tolist()

print("Top 3 features correlated with mpg:")
print(top_features)

Top 3 features correlated with mpg:
weight          0.831739
displacement    0.804443
cylinders       0.776260
Name: mpg, dtype: float64


In [17]:
from sklearn.linear_model import LinearRegression

# select top 3 features
X_train_top = X_train[['weight', 'displacement', 'cylinders']]
X_test_top = X_test[['weight', 'displacement', 'cylinders']]

# initialize linear regression model
linear_model = LinearRegression()

# Train on the selected feature columns. The selection above used to be
# created and then ignored in favour of the full X_train.
linear_model.fit(X_train_top, y_train)

# Predict on the test set, using the same columns in the same order as in fit.
y_pred_lr = linear_model.predict(X_test_top)

# Evaluate the model. Both metrics score this cell's own predictions; the MSE
# previously read `y_pred`, which this cell never defines, so it either raised
# NameError on a fresh kernel or scored a stale array left over from another cell.
mse = mean_squared_error(y_test, y_pred_lr)
lr_score = r2_score(y_test, y_pred_lr)

print("Mean Squared Error:", mse)
print("R^2 Score:", lr_score)

Mean Squared Error: 22.863675694575868
R^2 Score: 0.6274423284683568


In [23]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Single-column frames holding only the 'weight' feature.
# NOTE(review): these two frames are not used by the fit/predict calls below,
# which run on the full X_train / X_test -- confirm which input was intended.
X_train_ridge = X_train.loc[:, ['weight']]
X_test_ridge = X_test.loc[:, ['weight']]

# Standardize the inputs, then apply ridge regression with its default
# settings. fit() returns the fitted pipeline, so it is built and trained in
# one expression.
ridge_model = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('ridge', Ridge()),
    ]
).fit(X_train, y_train)

# Predict on the held-out rows and score those predictions.
y_pred_ridge = ridge_model.predict(X_test)
ridge_score = r2_score(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

print("Ridge Regression Mean Squared Error:", mse_ridge)
print("Ridge Regression R^2 Score:", ridge_score)


Ridge Regression Mean Squared Error: 22.81872013350512
Ridge Regression R^2 Score: 0.628174867688148


In [24]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# define neural network architecture
def create_neural_network(input_shape):
    """Build an uncompiled feed-forward network with a single linear output.

    Layer stack: Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu) -> Dense(1).

    Args:
        input_shape: Shape of one input sample without the batch dimension,
            e.g. ``(n_features,)``.

    Returns:
        The ``Sequential`` model. It is not compiled here; the caller chooses
        the optimizer and loss.
    """
    model = Sequential([
        # Declare the input with an explicit Input object instead of passing
        # input_shape= to the first Dense layer, which Keras warns against
        # for Sequential models.
        tf.keras.Input(shape=input_shape),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1)  # Output layer
    ])
    return model

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout and batch shuffling are repeatable from run to run.
# 42 matches the random_state used for the train/test split.
tf.keras.utils.set_random_seed(42)

# define pipeline for preprocessing (standardization only)
preprocessing_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# Preprocess data: the scaler is fitted on the training rows only and then
# applied unchanged to the test rows, so no test statistics leak into training.
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)
X_test_preprocessed = preprocessing_pipeline.transform(X_test)

# create neural network model; shape[1:] drops the sample axis
input_shape = X_train_preprocessed.shape[1:]
model = create_neural_network(input_shape)

# compile model
model.compile(optimizer='adam', loss='mean_squared_error')

# train model; 20% of the training rows are held back for validation
history = model.fit(X_train_preprocessed, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

# predict on test set
y_pred_nn = model.predict(X_test_preprocessed)

# calculate mse and r^2 score
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print("Neural Network Mean Squared Error:", mse_nn)
print("Neural Network R^2 Score:", r2_nn)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Neural Network Mean Squared Error: 22.818204075698276
Neural Network R^2 Score: 0.6281832767164015
