In [12]:
%pip install ucimlrepo

import numpy as np
import pandas as pd
import requests
from io import StringIO
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import KFold

#-----------------------------------------------------------------
# 1. LINEAR REGRESSION IMPLEMENTATION USING NORMAL EQUATIONS
#-----------------------------------------------------------------

class LinearRegression:
    def __init__(self, fit_intercept=True):
        """
        Linear Regression using normal equations

        Parameters
        ----------
        fit_intercept : bool, default=True
            Whether to calculate the intercept for this model.
            If set to False, no intercept will be used in calculations.
        """
        self.fit_intercept = fit_intercept
        self.coefficients = None
        self.intercept = None

    def fit(self, X, y):
        """
        Fit the model using the normal equations.

        Parameters
        ----------
        X : training array, shape = [n_samples, n_features]
        y : target values, shape = [n_samples]
        """
        X_copy = X.copy()

        if self.fit_intercept:
            X_copy = np.column_stack((np.ones(X.shape[0]), X_copy))

        # calculate X^TX -1
        XTX = np.dot(X_copy.T, X_copy)
        XTX_inv = np.linalg.inv(XTX)

        # calculate X^Ty
        XTy = np.dot(X_copy.T, y)

        # calculate coefficients
        theta = np.dot(XTX_inv, XTy)

        if self.fit_intercept:
            self.intercept = theta[0]
            self.coefficients = theta[1:]
        else:
            self.intercept = 0
            self.coefficients = theta

        return self

    def predict(self, X):
        """
        Predict using the linear model.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Samples.

        Returns
        -------
        y : array of shape = [n_samples]
            Returns predicted values.
        """

        if self.fit_intercept:
            return self.intercept + np.dot(X, self.coefficients)
        else:
            return np.dot(X, self.coefficients)

    def predict_class(self, X, threshold=0.5):
        """
        Predict class labels for samples in X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Samples.

        Returns
        -------
        y : array of shape = [n_samples]
            predict class labels. (0 or 1)
        """
        y_predict_continous = self.predict(X)
        return (y_predict_continous >= threshold).astype(int)

#-----------------------------------------------------------------
# 2. LOAD HOUSING DATASET AND SPAMBASE DATASET
#-----------------------------------------------------------------

def fetch_housing_data():
    """
    Fetch Housing dataset from predefined training and testing files

    Returns:
    --------
    X_train : numpy array
        Training features
    y_train : numpy array
        Training targets
    X_test : numpy array
        Testing features
    y_test : numpy array
        Testing targets
    feature_names : list
        List of feature names
    """
    # URLs for training and testing data
    train_url = "https://www.khoury.northeastern.edu/home/vip/teach/MLcourse/data/housing_train.txt"
    test_url = "https://www.khoury.northeastern.edu/home/vip/teach/MLcourse/data/housing_test.txt"

    # Feature names
    feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
                    'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

    # Load training data
    train_response = requests.get(train_url)
    train_response.raise_for_status()  # Raise an exception for HTTP errors
    train_data = pd.read_csv(StringIO(train_response.text), sep="\s+", header=None)
    train_data.columns = feature_names

    # Load testing data
    test_response = requests.get(test_url)
    test_response.raise_for_status()
    test_data = pd.read_csv(StringIO(test_response.text), sep="\s+", header=None)
    test_data.columns = feature_names

    # Extract features and targets
    X_train = train_data.iloc[:, :-1].values
    y_train = train_data.iloc[:, -1].values

    X_test = test_data.iloc[:, :-1].values
    y_test = test_data.iloc[:, -1].values

    return X_train, y_train, X_test, y_test, feature_names

def fetch_spambase_data():
    """
    Fetch Spambase dataset from UCI repository

    Returns:
    --------
    X : numpy array
        Features
    y : numpy array
        Targets
    """
    spambase = fetch_ucirepo(id=94)

    # Extract features and targets
    X = spambase.data.features.values
    y = spambase.data.targets.values.ravel()

    return X, y

#-----------------------------------------------------------------
# 3. MAIN FUNCTION: HOUSING DATASET ANALYSIS
#-----------------------------------------------------------------

def main():
    print("LINEAR REGRESSION ON HOUSING DATASET")
    print("=" * 70)
    print("\nLoading Housing dataset from predefined train/test files...")

    # Load the Housing dataset
    X_train, y_train, X_test, y_test, feature_names = fetch_housing_data()

    print(f"Dataset loaded successfully:")
    print(f"  Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
    print(f"  Testing set: {X_test.shape[0]} samples, {X_test.shape[1]} features")

    # Normalize the data
    scaler = MinMaxScaler()
    X_train_norm = scaler.fit_transform(X_train)
    X_test_norm = scaler.transform(X_test)

    # Linear Regression Training
    model = LinearRegression()
    model.fit(X_train_norm, y_train)

    # Predict
    y_train_pred = model.predict(X_train_norm)
    y_test_pred = model.predict(X_test_norm)

    # Calculate MSE & R2
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print(f"   Training MSE: {train_mse:.4f}, R²: {train_r2:.4f}")
    print(f"   Testing MSE: {test_mse:.4f}, R²: {test_r2:.4f}")

    # Print model coefficients
    print(f"\nIntercept: {model.intercept:.4f}")
    print("\nFeature coefficients:")
    for i, feature in enumerate(feature_names[:-1]):
        print(f"   {feature}: {model.coefficients[i]:.4f}")

    print("\n\n")
    print("LINEAR REGRESSION ON SPAMBASE DATASET")
    print("=" * 70)
    print("\nLoading spambase dataset from predefined train/test files...")

    # Load spambase dataset
    X, y = fetch_spambase_data()

    print(f"Dataset loaded successfully:")
    print(f"  Features: {X.shape[0]} samples, {X.shape[1]} features")

    # k-fold cross validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    fold_train_accuracies = []
    fold_test_accuracies = []

    for train_index, test_index in kf.split(X):
        # Split data
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Apply normalization
        scaler = MinMaxScaler()
        X_train_norm = scaler.fit_transform(X_train)
        X_test_norm = scaler.transform(X_test)

        # Training with Linear Regression Model
        model = LinearRegression()
        model.fit(X_train_norm, y_train)

        best_threshold, best_accuracy = None, 0

        # Find an optimal threshold
        for threshold in thresholds:
            # Predict
            y_pred_class = model.predict_class(X_test_norm, threshold=threshold)
            # Calculate accuray
            accuracy = accuracy_score(y_test, y_pred_class)

            print(f"   Threshold: {threshold}, Accuracy: {accuracy:.4f}")

            # Update best
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_threshold = threshold

        print(f"   Best threshold: {best_threshold}, Best accuracy: {best_accuracy:.4f}")

        y_train_pred = model.predict_class(X_train_norm, threshold=best_threshold)
        y_test_pred = model.predict_class(X_test_norm, threshold=best_threshold)

        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)

        fold_train_accuracies.append(train_accuracy)
        fold_test_accuracies.append(test_accuracy)

    # Calculate accuracy average and standard deviations
    print(fold_train_accuracies)
    avg_train_accuracy = np.mean(fold_train_accuracies)
    avg_test_accuracy = np.mean(fold_test_accuracies)
    std_train_accuracy = np.std(fold_train_accuracies)
    std_test_accuracy = np.std(fold_test_accuracies)


    print("\n" + "=" * 70)
    print(f"CROSS-VALIDATION RESULTS (5 FOLDS)")
    print("=" * 70)

    print("\nAverage Performance Metrics:")
    print(f"  Training Accuracy: {avg_train_accuracy:.4f} ± {std_train_accuracy:.4f}")
    print(f"  Validation Accuracy: {avg_test_accuracy:.4f} ± {std_test_accuracy:.4f}")


if __name__ == "__main__":
    main()





LINEAR REGRESSION ON HOUSING DATASET

Loading Housing dataset from predefined train/test files...
Dataset loaded successfully:
  Training set: 433 samples, 13 features
  Testing set: 74 samples, 13 features
   Training MSE: 22.0813, R²: 0.7546
   Testing MSE: 22.6383, R²: 0.4577

Intercept: 27.7723

Feature coefficients:
   CRIM: -8.9982
   ZN: 4.5894
   INDUS: -0.0688
   CHAS: 3.0720
   NOX: -8.2510
   RM: 19.3690
   AGE: 0.6951
   DIS: -17.5841
   RAD: 8.5933
   TAX: -7.5473
   PTRATIO: -8.8079
   B: 3.8441
   LSTAT: -20.6554



LINEAR REGRESSION ON SPAMBASE DATASET

Loading spambase dataset from predefined train/test files...
Dataset loaded successfully:
  Features: 4601 samples, 57 features
   Threshold: 0.1, Accuracy: 0.5939
   Threshold: 0.2, Accuracy: 0.7231
   Threshold: 0.3, Accuracy: 0.8534
   Threshold: 0.4, Accuracy: 0.9077
   Threshold: 0.5, Accuracy: 0.8817
   Threshold: 0.6, Accuracy: 0.8371
   Threshold: 0.7, Accuracy: 0.7818
   Threshold: 0.8, Accuracy: 0.7318
   Thres

# Decision(Regression) Tree vs. Linear Regression
## Housing Data Comparison
For housing data, regression tree shows the result of training MSE: 6.6557 and testing MSE: 32.3248, while linear regression shows the result of training MSE: 22.0813 and testing MSE: 22.6383.

Linear Regregssion is clearly better on this dataset. Lower testing MSE means better predictive performance on new data.

## Spambase Data Comparison
For spambase data, decision tree shows the result of training accuracy: 96.60% and testing accuracy: 92.48%, while linear regression shows the result of training accuracy: 91.12% and testing accuracy: 90.87%.

Both models demonstrate viable approaches to spam classification:
The Decision Tree achieves higher absolute accuracy at the cost of some generalization capability, making it suitable for applications where correctly identifying as many spam emails as possible is the top priority. The Linear Regression offers exceptional consistency and stability, making it appropriate for scenarios where reliable, consistent performance is more important than maximizing raw accuracy.