In [6]:
import numpy as np  # Importing numpy for numerical operations
import pandas as pd  # Importing pandas for data manipulation and analysis
import matplotlib.pyplot as plt  # Importing matplotlib for plotting
from sklearn.model_selection import train_test_split  # Importing train_test_split to split data into train and test


In [7]:
# Load the advertising dataset from a path relative to the current working directory
data = pd.read_csv("Data/Advertising.csv")  # The notebook must be run from the folder that contains Data/

# Print the first rows to check the column names and value ranges.
# The preview shows an "Unnamed: 0" column next to TV, radio, newspaper and sales;
# that column is not selected as a feature in the cells below.
print(data.head())


   Unnamed: 0     TV  radio  newspaper  sales
0           1  230.1   37.8       69.2   22.1
1           2   44.5   39.3       45.1   10.4
2           3   17.2   45.9       69.3    9.3
3           4  151.5   41.3       58.5   18.5
4           5  180.8   10.8       58.4   12.9


In [8]:
# Predictors (independent variables): the TV, radio and newspaper columns, kept as a DataFrame
X = data.loc[:, ["TV", "radio", "newspaper"]]

# Response (dependent variable): the sales column, kept as a Series
y = data.loc[:, "sales"]

# Preview both objects, features first and target second
print(X.head(), y.head(), sep="\n")


      TV  radio  newspaper
0  230.1   37.8       69.2
1   44.5   39.3       45.1
2   17.2   45.9       69.3
3  151.5   41.3       58.5
4  180.8   10.8       58.4
0    22.1
1    10.4
2     9.3
3    18.5
4    12.9
Name: sales, dtype: float64


In [9]:
# Split features and target into a training portion and a held-out test portion.
# No test_size/train_size is passed, so scikit-learn's default split proportions apply.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)  # Fixed seed: the same rows land in each portion on every run


In [10]:
# Build the design matrices: a leading column of ones (the intercept term)
# followed by the predictor columns, for the training and the test portion alike
X_train_bias = np.column_stack((np.ones(len(X_train)), X_train))
X_test_bias = np.column_stack((np.ones(len(X_test)), X_test))


In [11]:
# Convert the pandas target Series to plain NumPy arrays; the matrix algebra
# in the following cells operates on ndarrays
y_train_np = y_train.to_numpy()  # Training targets as an ndarray
y_test_np = y_test.to_numpy()  # Test targets as an ndarray (not used in the cells shown here)


In [12]:
# Apply the Normal Equation (XᵀX) β = Xᵀy to obtain the regression coefficients.
# The system is solved directly with np.linalg.solve instead of forming the explicit
# inverse of XᵀX: same solution, one fewer matrix product, and less rounding error.
beta = np.linalg.solve(X_train_bias.T @ X_train_bias, X_train_bias.T @ y_train_np)  # One coefficient per column of X_train_bias, intercept first


In [13]:
# In-sample predictions: multiply the training design matrix (intercept column included)
# by the fitted coefficient vector
y_train_pred = X_train_bias @ beta  # One fitted value per training row


In [14]:
# Residual Sum of Squares: squared gaps between observed and fitted training targets
RSS = np.square(y_train_np - y_train_pred).sum()
# Total Sum of Squares: squared deviations of the training targets from their own mean
TSS = np.square(y_train_np - np.mean(y_train_np)).sum()


In [15]:
# R-squared (R²) on the training data: the share of the total variation in the
# target (TSS) that is not left over in the residuals (RSS)
R_squared = 1 - (RSS / TSS)  # R² = 1 - RSS/TSS


In [16]:
# Residual Standard Error (RSE): square root of RSS per residual degree of freedom
n_train = X_train_bias.shape[0]  # rows = number of training samples
p_train = X_train_bias.shape[1]  # columns = number of fitted coefficients, intercept included
RSE = np.sqrt(RSS / (n_train - p_train))


In [17]:
# F-statistic for the overall regression: explained variation per predictor divided by
# residual variation per residual degree of freedom
F_stat = ((TSS - RSS) / (p_train - 1)) / (RSS / (n_train - p_train))  # p_train - 1 leaves the intercept column out of the predictor count


In [18]:
# Report the fitted coefficients and the training-set fit statistics, one per line
print(
    f"Coefficients (beta): {beta}",
    f"RSS: {RSS:.4f}",
    f"R²: {R_squared:.4f}",
    f"RSE: {RSE:.4f}",
    f"F-statistic: {F_stat:.4f}",
    sep="\n",
)


Coefficients (beta): [2.87696662 0.04656457 0.17915812 0.00345046]
RSS: 463.0187
R²: 0.8903
RSE: 1.7808
F-statistic: 394.9981


In [19]:
# Function to perform regression and print results for a single feature
def simple_regression(feature_column):
    """Fit the target on a single predictor by least squares and print the fit statistics.

    The function reads the notebook-level ``data`` DataFrame and ``y`` target,
    splits them with ``train_test_split(..., random_state=1)`` and fits an
    intercept-plus-slope model on the training portion only, so every statistic
    printed is an in-sample (training) figure.

    Parameters
    ----------
    feature_column : str
        Name of the column of ``data`` used as the only predictor.

    Returns
    -------
    None
        Coefficients, RSS, R², RSE and the F-statistic are printed, not returned.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the normal-equation matrix XᵀX is singular.
    """
    # Only the training portion is used; the held-out portion is discarded here.
    x_train, _, target_train, _ = train_test_split(data[feature_column], y, random_state=1)

    # Work on plain ndarrays, as the multi-predictor cells do for their target.
    x_train = x_train.to_numpy()
    target_train = target_train.to_numpy()

    # Design matrix: a column of ones (intercept) next to the predictor values.
    # np.c_ stacks 1-D inputs as columns, so no explicit reshape is needed.
    design = np.c_[np.ones(x_train.shape[0]), x_train]

    # Normal equation (XᵀX) β = Xᵀy, solved as a linear system rather than by
    # forming the explicit inverse of XᵀX.
    beta_single = np.linalg.solve(design.T @ design, design.T @ target_train)

    # Fitted values on the training rows
    fitted = design @ beta_single

    # Residual and total sums of squares
    RSS_single = np.sum((target_train - fitted) ** 2)
    TSS_single = np.sum((target_train - target_train.mean()) ** 2)

    # R-squared (R²)
    R_squared_single = 1 - (RSS_single / TSS_single)

    # Residual degrees of freedom: samples minus fitted coefficients (intercept included)
    n_single, p_single = design.shape
    residual_dof = n_single - p_single

    # Residual Standard Error
    RSE_single = np.sqrt(RSS_single / residual_dof)

    # F-statistic; p_single - 1 is the number of predictors without the intercept
    F_stat_single = ((TSS_single - RSS_single) / (p_single - 1)) / (RSS_single / residual_dof)

    # Display the results for the single predictor
    print(f"Results for {feature_column}:")
    print(f"Coefficients: {beta_single}")
    print(f"RSS: {RSS_single:.4f}")
    print(f"R²: {R_squared_single:.4f}")
    print(f"RSE: {RSE_single:.4f}")
    print(f"F-statistic: {F_stat_single:.4f}")
    print("\n")


In [20]:
# Fit and report a separate one-predictor model for each of the three feature columns
for feature in ["TV", "radio", "newspaper"]:
    simple_regression(feature)  # Prints the statistics; nothing is returned or stored


Results for TV:
Coefficients: [6.91197262 0.04802945]
RSS: 1587.8472
R²: 0.6238
RSE: 3.2755
F-statistic: 245.4367


Results for radio:
Coefficients: [9.33859455 0.19338969]
RSS: 2937.6264
R²: 0.3041
RSE: 4.4552
F-statistic: 64.6606


Results for newspaper:
Coefficients: [11.76557671  0.06888299]
RSS: 3898.6081
R²: 0.0764
RSE: 5.1324
F-statistic: 12.2411


