PART 2
Multivariate Regression Implementation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the preprocessed dataset for cars.
# "cars.csv" is a relative path, so it is resolved against the current working
# directory. The code further down reads its "Horsepower" and
# "Price in Thousands" columns.
cars_data = pd.read_csv("cars.csv")

# Function to normalize features
def normalize_features(X):
    """Standardise each feature column to zero mean and unit variance.

    Args:
        X: Array-like of shape (m,) or (m, n) holding the raw feature values.

    Returns:
        A float ndarray of the same shape as ``X`` with every column
        z-scored independently. Columns with zero variance are returned as
        all zeros instead of NaN/inf.
    """
    X = np.asarray(X, dtype=float)
    # Statistics are taken per column (axis=0) so that features measured on
    # different scales are each standardised on their own.
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    # A constant column has std == 0; divide by 1 instead to avoid 0/0.
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std

# Function to initialize parameters (weights and bias)
def initialize_parameters(num_features):
    """Return an all-zero parameter column vector.

    Args:
        num_features: Number of rows in the returned vector.

    Returns:
        A float ndarray of shape (num_features, 1) filled with zeros.
    """
    shape = (num_features, 1)
    return np.zeros(shape)

# Function to compute the cost (loss) function
def compute_cost(X, y, theta):
    """Compute the halved mean squared error of the linear model ``X . theta``.

    Args:
        X: Design matrix.
        y: Target values; ``len(y)`` is used as the sample count.
        theta: Model parameters.

    Returns:
        ``(1 / (2 * m)) * sum((X . theta - y) ** 2)`` where ``m = len(y)``.
    """
    residuals = np.dot(X, theta) - y
    scale = 1 / (2 * len(y))
    return scale * np.sum(np.square(residuals))

# Function to perform gradient descent
def gradient_descent(X, y, theta, learning_rate, num_iterations):
    """Run batch gradient descent and record the cost after each step.

    Args:
        X: Design matrix.
        y: Target values; ``len(y)`` is used as the sample count.
        theta: Initial parameters. The array passed in is left untouched.
        learning_rate: Step size multiplied with the gradient.
        num_iterations: Number of update steps to perform.

    Returns:
        A tuple ``(theta, cost_history)`` with the updated parameters and a
        list holding the value of ``compute_cost`` after every step.
    """
    m = len(y)
    # Work on a float copy: the in-place update below would otherwise modify
    # the caller's array, and would fail outright on an integer-typed theta.
    theta = np.array(theta, dtype=float)
    cost_history = []
    for _ in range(num_iterations):
        errors = np.dot(X, theta) - y
        gradient = (1 / m) * np.dot(X.T, errors)
        theta -= learning_rate * gradient
        cost_history.append(compute_cost(X, y, theta))
    return theta, cost_history

# Function to train the model
def train_model(X_train, y_train, learning_rate, num_iterations):
    """Fit the linear model by gradient descent starting from zero parameters.

    Args:
        X_train: Training design matrix; its column count sets the number of
            parameters.
        y_train: Training targets.
        learning_rate: Step size passed on to ``gradient_descent``.
        num_iterations: Number of steps passed on to ``gradient_descent``.

    Returns:
        The ``(theta, cost_history)`` pair produced by ``gradient_descent``.
    """
    initial_theta = initialize_parameters(X_train.shape[1])
    fitted_theta, history = gradient_descent(
        X_train, y_train, initial_theta, learning_rate, num_iterations
    )
    return fitted_theta, history

# Feature selection
# "Horsepower" is the single input feature; "Price in Thousands" is the target.
selected_features = ["Horsepower"]
X = cars_data[selected_features].values
y = cars_data["Price in Thousands"].values.reshape(-1, 1)

# Normalize features
# NOTE(review): the statistics are computed on the full dataset before the
# split, so the held-out rows influence the scaling.
X_normalized = normalize_features(X)

# Add bias term (intercept)
X_normalized_with_bias = np.c_[np.ones((X_normalized.shape[0], 1)), X_normalized]

# Split data into training and testing sets.
# Manual 80/20 shuffle split with a fixed seed so that runs are reproducible.
test_size = 0.2
rng = np.random.default_rng(42)
shuffled_indices = rng.permutation(X_normalized_with_bias.shape[0])
num_test = int(round(test_size * shuffled_indices.shape[0]))
test_indices = shuffled_indices[:num_test]
train_indices = shuffled_indices[num_test:]
X_train = X_normalized_with_bias[train_indices]
X_test = X_normalized_with_bias[test_indices]
y_train = y[train_indices]
y_test = y[test_indices]

# Hyperparameters
learning_rate = 0.01
num_iterations = 1000

# Train the model
theta, cost_history = train_model(X_train, y_train, learning_rate, num_iterations)

# Visualize the cost history
plt.plot(cost_history)
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.title('Cost vs. Iterations')
plt.show()

# Predictions on the held-out rows
predictions = np.dot(X_test, theta)

# Visualize the predicted vs. actual values
plt.scatter(y_test, predictions)
plt.xlabel('Actual Price in Thousands')
plt.ylabel('Predicted Price in Thousands')
plt.title('Actual vs. Predicted Price in Thousands')
plt.show()


PART 3
Manual K-Fold Cross Validation Implementation

In [None]:
import numpy as np
import pandas as pd

# Load the preprocessed dataset for cars.
# "cars.csv" is a relative path, so it is resolved against the current working
# directory. The code further down reads its "Horsepower" and
# "Price in Thousands" columns.
cars_data = pd.read_csv("cars.csv")

# Function to normalize features
def normalize_features(X):
    """Standardise each feature column to zero mean and unit variance.

    Args:
        X: Array-like of shape (m,) or (m, n) holding the raw feature values.

    Returns:
        A float ndarray of the same shape as ``X`` with every column
        z-scored independently. Columns with zero variance are returned as
        all zeros instead of NaN/inf.
    """
    X = np.asarray(X, dtype=float)
    # Statistics are taken per column (axis=0) so that features measured on
    # different scales are each standardised on their own.
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    # A constant column has std == 0; divide by 1 instead to avoid 0/0.
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std

# Function to initialize parameters (weights and bias)
def initialize_parameters(num_features):
    """Return an all-zero parameter column vector.

    Args:
        num_features: Number of rows in the returned vector.

    Returns:
        A float ndarray of shape (num_features, 1) filled with zeros.
    """
    shape = (num_features, 1)
    return np.zeros(shape)

# Function to compute the cost (loss) function
def compute_cost(X, y, theta):
    """Compute the halved mean squared error of the linear model ``X . theta``.

    Args:
        X: Design matrix.
        y: Target values; ``len(y)`` is used as the sample count.
        theta: Model parameters.

    Returns:
        ``(1 / (2 * m)) * sum((X . theta - y) ** 2)`` where ``m = len(y)``.
    """
    residuals = np.dot(X, theta) - y
    scale = 1 / (2 * len(y))
    return scale * np.sum(np.square(residuals))

# Function to perform gradient descent
def gradient_descent(X, y, theta, learning_rate, num_iterations):
    """Run batch gradient descent and record the cost after each step.

    Args:
        X: Design matrix.
        y: Target values; ``len(y)`` is used as the sample count.
        theta: Initial parameters. The array passed in is left untouched.
        learning_rate: Step size multiplied with the gradient.
        num_iterations: Number of update steps to perform.

    Returns:
        A tuple ``(theta, cost_history)`` with the updated parameters and a
        list holding the value of ``compute_cost`` after every step.
    """
    m = len(y)
    # Work on a float copy: the in-place update below would otherwise modify
    # the caller's array, and would fail outright on an integer-typed theta.
    theta = np.array(theta, dtype=float)
    cost_history = []
    for _ in range(num_iterations):
        errors = np.dot(X, theta) - y
        gradient = (1 / m) * np.dot(X.T, errors)
        theta -= learning_rate * gradient
        cost_history.append(compute_cost(X, y, theta))
    return theta, cost_history

# Function to perform K-Fold Cross-Validation
def k_fold_cross_validation(X, y, k, learning_rate, num_iterations):
    """Estimate the validation MSE of the linear model with K-fold CV.

    The samples are partitioned, in their given order, into ``k`` contiguous
    folds. Each fold is held out once while the model is trained on the
    remaining samples.

    Args:
        X: Raw (un-normalised) feature matrix of shape (m, n).
        y: Targets of shape (m, 1).
        k: Number of folds; must satisfy ``2 <= k <= m``.
        learning_rate: Step size passed on to ``gradient_descent``.
        num_iterations: Number of steps passed on to ``gradient_descent``.

    Returns:
        The mean of the per-fold mean squared errors.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than the number of
            samples (either would leave an empty training or validation set).
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    m = len(y)
    if not 2 <= k <= m:
        raise ValueError(
            "k must be between 2 and the number of samples ({}), got {}".format(m, k)
        )

    mse_scores = []
    # array_split spreads the m % k leftover samples over the first folds, so
    # every sample is validated exactly once even when m is not a multiple of k.
    for val_indices in np.array_split(np.arange(m), k):
        train_mask = np.ones(m, dtype=bool)
        train_mask[val_indices] = False

        X_train, y_train = X[train_mask], y[train_mask]
        X_val, y_val = X[val_indices], y[val_indices]

        # Normalize features. The validation fold is scaled with the TRAINING
        # statistics so that the model sees inputs on the scale it was fit on.
        train_mean = X_train.mean(axis=0)
        train_std = X_train.std(axis=0)
        train_std = np.where(train_std == 0, 1.0, train_std)  # avoid 0/0
        X_train_normalized = (X_train - train_mean) / train_std
        X_val_normalized = (X_val - train_mean) / train_std

        # Add bias term (intercept)
        X_train_normalized_with_bias = np.c_[np.ones((X_train_normalized.shape[0], 1)), X_train_normalized]
        X_val_normalized_with_bias = np.c_[np.ones((X_val_normalized.shape[0], 1)), X_val_normalized]

        # Initialize parameters
        num_features = X_train_normalized_with_bias.shape[1]
        theta = initialize_parameters(num_features)

        # Perform gradient descent
        theta, _ = gradient_descent(X_train_normalized_with_bias, y_train, theta, learning_rate, num_iterations)

        # Compute mean squared error for this fold
        predictions = np.dot(X_val_normalized_with_bias, theta)
        mse_scores.append(np.mean(np.square(predictions - y_val)))

    return np.mean(mse_scores)

# Hyperparameters
k = 5  # Number of folds for cross-validation
num_iterations = 1000
learning_rate = 0.01

# Feature selection: "Horsepower" is the input, "Price in Thousands" the target
selected_features = ["Horsepower"]
X = cars_data.loc[:, selected_features].values
y = cars_data.loc[:, "Price in Thousands"].values.reshape(-1, 1)

# Run the manual K-fold cross-validation and report the averaged error
mse_score = k_fold_cross_validation(
    X,
    y,
    k,
    learning_rate=learning_rate,
    num_iterations=num_iterations,
)

print("Mean Squared Error (MSE) across {} folds: {:.4f}".format(k, mse_score))


PART 4
Comparison with Built-in Python Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Read the preprocessed cars dataset
cars_data = pd.read_csv("cars.csv")

# Feature selection: "Horsepower" is the input, "Price in Thousands" the target
selected_features = ["Horsepower"]
X = cars_data.loc[:, selected_features].values
y = cars_data.loc[:, "Price in Thousands"].values.reshape(-1, 1)

# Fit scikit-learn's linear regression on the full dataset and predict on the
# same rows (fit() returns the estimator itself, so the calls can be chained)
model_lr = LinearRegression()
predictions_lr = model_lr.fit(X, y).predict(X)

# Training-set mean squared error
mse_lr = mean_squared_error(y, predictions_lr)

print("Mean Squared Error (MSE) using scikit-learn Linear Regression: {:.4f}".format(mse_lr))

# 5-fold cross-validation; the scorer reports negated MSE, so flip the sign
cv_scores = cross_val_score(
    model_lr,
    X,
    y,
    cv=5,
    scoring='neg_mean_squared_error',
)
mse_cv = -np.mean(cv_scores)

print("Mean Squared Error (MSE) using scikit-learn K-Fold Cross-Validation: {:.4f}".format(mse_cv))
