# Advertising Data

### Linear Regression

#### Prepping Data

In [None]:
import numpy as np
import pandas as pd

class LinearRegression:
    """Linear regression fitted with batch gradient descent.

    The model is ``y = X @ weights + bias``. Training repeatedly moves
    ``weights`` and ``bias`` against the gradient of the halved mean squared
    error computed over the whole training set.
    """

    def __init__(self, learning_rate=0.01, epochs=1000):
        """Store the hyper-parameters; the model parameters are set by ``fit``.

        Args:
            learning_rate: Step size applied to every gradient update.
            epochs: Number of full-batch gradient steps performed by ``fit``.
        """
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None  # one coefficient per feature, created in fit()
        self.bias = None     # intercept, created in fit()

    def fit(self, X, y):
        """Fit the line of best fit to ``X`` (n_samples, n_features) and ``y``.

        Args:
            X: 2-D array-like of features.
            y: Array-like of targets; flattened to 1-D so that a column
                vector does not broadcast the residuals into a matrix.

        Raises:
            ValueError: If ``X`` contains no samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.epochs):
            # Residuals of the current model on the whole training set.
            residuals = np.dot(X, self.weights) + self.bias - y

            # Gradients of the halved MSE with respect to weights and bias.
            dw = (1 / n_samples) * np.dot(X.T, residuals)
            db = (1 / n_samples) * np.sum(residuals)

            # Gradient-descent step.
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """Return the predicted target for every row of ``X``.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        if self.weights is None:
            raise RuntimeError("fit must be called before predict")
        return np.dot(np.asarray(X, dtype=float), self.weights) + self.bias

    def mean_squared_error(self, y_true, y_pred):
        """Return the mean of the squared differences between the two inputs."""
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        return np.mean((y_true - y_pred) ** 2)

a

### Logistic Regression

In [19]:
# Logistic regression: binary classifier trained with batch gradient descent
class LogisticRegression:
    """Binary logistic regression fitted with batch gradient descent.

    The predicted probability of class 1 is ``sigmoid(X @ weights + bias)``.
    """

    def __init__(self, learning_rate=0.01, epochs=1000):
        """Store the hyper-parameters; the model parameters are set by ``fit``.

        Args:
            learning_rate: Step size applied to every gradient update.
            epochs: Number of full-batch gradient steps performed by ``fit``.
        """
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None  # one coefficient per feature, created in fit()
        self.bias = None     # intercept, created in fit()

    def sigmoid(self, z):
        """Return ``1 / (1 + exp(-z))`` element-wise without overflowing.

        ``exp`` is only evaluated on ``-|z|``, which lies in (0, 1], so very
        negative inputs no longer overflow; the two branches are algebraically
        the same function.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

    def fit(self, X, y):
        """Fit the model to ``X`` (n_samples, n_features) and 0/1 labels ``y``.

        ``y`` is flattened to 1-D so that a column vector does not broadcast
        the residuals into a matrix.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        # Training loop - update weights and bias once per epoch.
        for _ in range(self.epochs):
            y_predicted = self.sigmoid(np.dot(X, self.weights) + self.bias)
            residuals = y_predicted - y

            # Gradients averaged over the training set.
            dw = (1 / n_samples) * np.dot(X.T, residuals)
            db = (1 / n_samples) * np.sum(residuals)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """Return a list of 0/1 labels: 1 where the probability exceeds 0.5."""
        linear_model = np.dot(np.asarray(X, dtype=float), self.weights) + self.bias
        return (self.sigmoid(linear_model) > 0.5).astype(int).tolist()

    def accuracy(self, y_true, y_pred):
        """Return the fraction of positions where the two label sequences agree.

        Both inputs are converted to arrays first; comparing two plain lists
        with ``==`` would yield a single boolean instead of a per-item result.
        """
        return np.mean(np.asarray(y_true) == np.asarray(y_pred))


In [20]:
# Load and preprocess the dataset
def load_data(path="Advertising.csv", test_fraction=0.2, seed=42):
    """Load the advertising CSV and return train/test splits for both models.

    Features are the ``TV``, ``Radio`` and ``Newspaper`` columns, standardised
    with the mean and standard deviation of the full data set. The regression
    target is ``Sales``; the classification target is 1 where ``Sales`` is
    above its median and 0 otherwise. Both targets share the same row split.

    Args:
        path: CSV file to read.
        test_fraction: Fraction of rows placed in the test split.
        seed: Value passed to ``np.random.seed`` before shuffling.

    Returns:
        ``(X_train, X_test, y_train, y_test,
        X_train_log, X_test_log, y_train_log, y_test_log)``.
    """
    data = pd.read_csv(path)
    X = data[["TV", "Radio", "Newspaper"]].values
    y = data["Sales"].values

    # Standardise features; a constant column has zero spread, so divide it
    # by 1 instead of 0 and let it become all zeros.
    spread = np.std(X, axis=0)
    X = (X - np.mean(X, axis=0)) / np.where(spread == 0, 1.0, spread)

    # Convert sales into binary classes for logistic regression.
    y_logistic = (y > np.median(y)).astype(int)

    # Manual shuffle split. The global NumPy generator is seeded on purpose:
    # this matches the original behaviour, and later np.random calls in the
    # same session continue from this state.
    np.random.seed(seed)
    indices = np.random.permutation(len(X))
    test_size = int(test_fraction * len(X))
    train_idx = indices[test_size:]
    test_idx = indices[:test_size]

    # Linear-regression split.
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Logistic-regression split (same rows, independent array copies).
    X_train_log, X_test_log = X[train_idx], X[test_idx]
    y_train_log, y_test_log = y_logistic[train_idx], y_logistic[test_idx]

    return X_train, X_test, y_train, y_test, X_train_log, X_test_log, y_train_log, y_test_log

# Train and evaluate models
# load_data() returns one split for the regression target and one for the
# binary target; both are unpacked into module-level names here.
X_train, X_test, y_train, y_test, X_train_log, X_test_log, y_train_log, y_test_log = load_data()

# Linear Regression Model
# Fit on the training rows, then report the mean squared error on the test rows.
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)
y_pred = linear_regression.predict(X_test)
mse = linear_regression.mean_squared_error(y_test, y_pred)
print(f"Linear Regression MSE: {mse}")

# Logistic Regression Model
# Fit on the binary labels, then report the fraction of correct test predictions.
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train_log, y_train_log)
y_pred_log = logistic_regression.predict(X_test_log)
accuracy = logistic_regression.accuracy(y_test_log, y_pred_log)
print(f"Logistic Regression Accuracy: {accuracy}")


Linear Regression MSE: 3.175711859938966
Logistic Regression Accuracy: 0.925


### KNN

In [18]:
import pandas as pd
import numpy as np

# Load the dataset
# The path is relative, so the CSV is looked up in the current working directory.
df = pd.read_csv('Advertising.csv')

# Standardize the features (TV, Radio, Newspaper) using Z-score standardization
def standardize_data(data):
    """Return ``data`` shifted by its mean and divided by its standard deviation."""
    centered = data - data.mean()
    return centered / data.std()

# Standardize the three advertising-channel columns in place; no other
# column of df is rescaled.
df['TV'] = standardize_data(df['TV'])
df['Radio'] = standardize_data(df['Radio'])
df['Newspaper'] = standardize_data(df['Newspaper'])

# Split the data into training and testing sets (80-20 split)
def train_test_split(data, test_size=0.2, random_state=42):
    """Shuffle a DataFrame and split it into train/test feature and target arrays.

    Every column except the last is treated as a feature; the last column is
    the target.

    Args:
        data: DataFrame whose last column is the target.
        test_size: Fraction of rows placed in the test split (rounded down).
        random_state: Seed forwarded to ``DataFrame.sample`` for the shuffle.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.
    """
    # Shuffle the rows reproducibly.
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    test_data_size = int(len(shuffled) * test_size)

    # Slice with an explicit boundary. Negative slicing breaks when the test
    # size rounds down to 0: ``[:-0]`` is empty and ``[-0:]`` is everything,
    # which would swap the roles of the two splits.
    boundary = len(shuffled) - test_data_size

    X_train = shuffled.iloc[:boundary, :-1].values
    y_train = shuffled.iloc[:boundary, -1].values
    X_test = shuffled.iloc[boundary:, :-1].values
    y_test = shuffled.iloc[boundary:, -1].values

    return X_train, X_test, y_train, y_test

# Split the standardized frame with the default 80-20 ratio.
# NOTE(review): train_test_split uses every column except the last as a
# feature; if the CSV carries extra columns besides TV/Radio/Newspaper
# (e.g. a saved row index) they would enter the distance computation
# unscaled - confirm against the file.
X_train, X_test, y_train, y_test = train_test_split(df)


# Function to compute Euclidean distance between two points
def euclidean_distance(x1, x2):
    """Return the straight-line (L2) distance between points ``x1`` and ``x2``."""
    delta = x1 - x2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

# Function to predict the target value for a given test set
def knn_predict(X_train, y_train, X_test, k=5):
    """Predict a target for every test point with k-nearest-neighbour regression.

    Each prediction is the mean target of the ``k`` training points closest to
    the test point in Euclidean distance.

    Args:
        X_train: Training features, shape (n_train, n_features); a 1-D input
            is treated as a single feature.
        y_train: Training targets, one per training row.
        X_test: Test features, shape (n_test, n_features).
        k: Number of neighbours to average; if it exceeds the number of
            training rows, all training rows are used.

    Returns:
        NumPy array with one prediction per test row.

    Raises:
        ValueError: If ``k`` is smaller than 1 or there is no training data
            (both would otherwise average an empty set and yield NaN).
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    X_train = np.asarray(X_train, dtype=float)
    X_test = np.asarray(X_test, dtype=float)
    y_train = np.asarray(y_train)
    if X_train.ndim == 1:
        X_train = X_train[:, None]
    if X_test.ndim == 1:
        X_test = X_test[:, None]
    if len(X_train) == 0:
        raise ValueError("X_train must contain at least one sample")

    y_pred = []
    for test_point in X_test:
        # Distances to all training rows in one vectorised step.
        distances = np.sqrt(np.sum((X_train - test_point) ** 2, axis=1))

        # Stable sort keeps the earlier training row first on equal distance.
        k_neighbors_indices = np.argsort(distances, kind="stable")[:k]

        # Predict by averaging the targets of the k nearest neighbours.
        y_pred.append(np.mean(y_train[k_neighbors_indices]))

    return np.array(y_pred)

# Calculate R² (R-squared) to evaluate accuracy
def r_squared(y_true, y_pred):
    """Return the coefficient of determination ``1 - SS_res / SS_tot``.

    Args:
        y_true: Observed target values (array-like).
        y_pred: Predicted target values (array-like, same length).

    Returns:
        The R² score. When ``y_true`` is constant the ratio is undefined
        (0/0 or x/0); in that case 1.0 is returned for a perfect prediction
        and 0.0 otherwise, instead of NaN or -inf.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Residual sum of squares and total sum of squares around the mean.
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)

    if ss_tot == 0:
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)

# Try different k values
# Change k here to compare neighbourhood sizes. The printed score is R²
# on the test split, shown under the label "Accuracy".
k = 3
y_pred = knn_predict(X_train, y_train, X_test, k)
r2 = r_squared(y_test, y_pred)
print(f'KNN Accuracy k={k}: {r2}')


KNN Accuracy k=3: 0.26695776243494773


### Decision Tree

In [16]:
import pandas as pd
import numpy as np

# Load the dataset
# Re-read from disk so this cell starts from unscaled values.
df = pd.read_csv('Advertising.csv')

# Standardize the features
def standardize_data(data):
    """Rescale ``data`` by subtracting its mean and dividing by its standard deviation."""
    mean, std = data.mean(), data.std()
    return (data - mean) / std

# Standardize the three advertising-channel columns in place.
df['TV'] = standardize_data(df['TV'])
df['Radio'] = standardize_data(df['Radio'])
df['Newspaper'] = standardize_data(df['Newspaper'])

# Split the data into features and target variable
# Features are selected by name, so any other column in the CSV is ignored.
X = df[['TV', 'Radio', 'Newspaper']].values
y = df['Sales'].values

# Train-Test Split function (80-20 split)
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Shuffle ``X`` and ``y`` together and split them into train and test parts.

    Args:
        X: Feature rows (array-like, first axis indexes samples).
        y: Targets, one per row of ``X``.
        test_size: Fraction of samples placed in the test split (rounded down).
        random_state: Optional seed for a reproducible shuffle. When ``None``
            the global NumPy generator is used, exactly as before.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.

    Raises:
        ValueError: If ``X`` and ``y`` have different lengths.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError("X and y must contain the same number of samples")

    # Shuffle the row indices.
    if random_state is None:
        indices = np.random.permutation(len(X))
    else:
        indices = np.random.RandomState(random_state).permutation(len(X))

    # Keep the fraction and the row count in separate names.
    n_test = int(len(X) * test_size)
    train_indices = indices[n_test:]
    test_indices = indices[:n_test]

    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

# 80-20 split. train_test_split draws from the global NumPy random state,
# so the split depends on whatever seeding happened earlier in the session.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Gini Impurity function
def gini_impurity(y):
    """Return the Gini impurity ``1 - sum(p_c ** 2)`` of the labels in ``y``.

    ``p_c`` is the fraction of entries equal to the distinct value ``c``.
    All counts come from a single ``np.unique`` call rather than one scan
    of ``y`` per distinct value.

    Args:
        y: Array-like of labels.

    Returns:
        The impurity as a float; 0.0 for an empty input, since a node with
        no samples has nothing mixed in it.
    """
    y = np.asarray(y)
    if y.size == 0:
        return 0.0
    _, counts = np.unique(y, return_counts=True)
    proportions = counts / y.size
    return 1.0 - np.sum(proportions ** 2)

# Split function based on a feature and threshold
def split_data(X, y, feature_index, threshold):
    """Partition ``X`` and ``y`` by comparing one feature column to ``threshold``.

    Rows whose value in column ``feature_index`` is ``<= threshold`` go to the
    left part; all remaining rows go to the right part.

    Returns:
        ``(X_left, y_left, X_right, y_right)``.
    """
    goes_left = X[:, feature_index] <= threshold
    goes_right = np.logical_not(goes_left)
    return X[goes_left], y[goes_left], X[goes_right], y[goes_right]

# Decision tree model (internal nodes are stored as nested tuples)
class DecisionTree:
    """Binary decision tree whose leaves predict the mean target of their samples.

    Splits are chosen by minimising the weighted ``gini_impurity`` of the two
    children, treating every distinct target value as a class. An internal
    node is the tuple ``((feature_index, threshold), left, right)``; a leaf
    is the plain number returned by ``np.mean``.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        """Store the growth limits.

        Args:
            max_depth: Depth at which growth stops; ``None`` means no limit.
            min_samples_split: Nodes with fewer samples become leaves.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None  # root node, built by fit()

    def fit(self, X, y):
        """Build the tree from ``X`` (n_samples, n_features) and targets ``y``."""
        self.tree = self._build_tree(np.asarray(X), np.asarray(y))

    def _build_tree(self, X, y, depth=0):
        """Recursively grow and return the subtree for the samples ``X``, ``y``."""
        num_samples, num_features = X.shape
        unique_classes = np.unique(y)

        # Stopping condition: pure node, max depth reached, or too few samples.
        if len(unique_classes) == 1 or depth == self.max_depth or num_samples < self.min_samples_split:
            return np.mean(y)

        best_gini = float('inf')
        best_split = None
        best_left = None
        best_right = None

        # Try every distinct value of every feature as a threshold.
        for feature_index in range(num_features):
            for threshold in np.unique(X[:, feature_index]):
                X_left, y_left, X_right, y_right = split_data(X, y, feature_index, threshold)

                # A split that leaves one side empty separates nothing.
                # Accepting it would create a NaN leaf for the empty side and
                # recurse on an unchanged sample set (unbounded when
                # max_depth is None).
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                gini = (len(X_left) / len(X)) * gini_impurity(y_left) + (len(X_right) / len(X)) * gini_impurity(y_right)

                if gini < best_gini:
                    best_gini = gini
                    best_split = (feature_index, threshold)
                    best_left, best_right = (X_left, y_left), (X_right, y_right)

        # No threshold separates the samples (all feature rows identical).
        if best_split is None:
            return np.mean(y)

        left_node = self._build_tree(best_left[0], best_left[1], depth + 1)
        right_node = self._build_tree(best_right[0], best_right[1], depth + 1)

        return (best_split, left_node, right_node)

    def predict(self, X):
        """Return a list with one prediction per row of ``X``.

        Raises:
            RuntimeError: If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("fit must be called before predict")
        return [self._predict_sample(x, self.tree) for x in X]

    def _predict_sample(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        # Only internal nodes are tuples, so anything else is a leaf. This
        # also covers NumPy scalars that are not subclasses of ``float``.
        if not isinstance(node, tuple):
            return node
        feature_index, threshold = node[0]
        if x[feature_index] <= threshold:
            return self._predict_sample(x, node[1])
        return self._predict_sample(x, node[2])

# R-Squared Accuracy function
def r_squared(y_true, y_pred):
    """Return the R² score ``1 - residual_variance / total_variance``.

    Args:
        y_true: Observed target values (array-like).
        y_pred: Predicted target values (array-like, same length).

    Returns:
        The R² score. For a constant ``y_true`` the ratio is undefined, so
        1.0 is returned when the predictions are exact and 0.0 otherwise,
        rather than NaN or -inf.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    total_variance = np.sum((y_true - np.mean(y_true)) ** 2)
    residual_variance = np.sum((y_true - y_pred) ** 2)

    if total_variance == 0:
        return 1.0 if residual_variance == 0 else 0.0
    return 1 - (residual_variance / total_variance)

# Train the decision tree model
# Growth stops at depth 5 or when a node holds fewer than 10 samples.
tree = DecisionTree(max_depth=5, min_samples_split=10)
tree.fit(X_train, y_train)

# Make predictions on the test set
# The printed score is R² on the test split, shown under the label "Accuracy".
y_pred = tree.predict(X_test)
r2 = r_squared(y_test, y_pred)
print(f' Decision Tree (Accuracy): {r2}')


 Decision Tree (Accuracy): 0.8765723773254737


### Bayesian Classifier

In [21]:
import numpy as np
import pandas as pd

# Bayesian Classifier 
class BayesianClassifier:
    """Gaussian naive Bayes classifier.

    Each feature is modelled as an independent normal distribution per class.
    Prediction picks the class with the largest log prior plus log likelihood.
    """

    def __init__(self, min_variance=1e-9):
        """Create an unfitted classifier.

        Args:
            min_variance: Lower bound applied to every per-class feature
                variance. A feature that is constant within a class has
                variance 0, which would otherwise give ``log(0)`` and a
                division by zero in ``predict``.
        """
        self.min_variance = min_variance
        self.classes = None   # sorted distinct labels, set by fit()
        self.means = {}       # class label -> per-feature mean
        self.variances = {}   # class label -> per-feature variance (floored)
        self.priors = {}      # class label -> fraction of training samples

    def fit(self, X, y):
        """Estimate per-class means, variances and priors from the training data."""
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.classes = np.unique(y)
        # Start from empty tables so a refit leaves no stale classes behind.
        self.means, self.variances, self.priors = {}, {}, {}
        for c in self.classes:
            X_c = X[y == c]
            self.means[c] = np.mean(X_c, axis=0)
            self.variances[c] = np.maximum(np.var(X_c, axis=0), self.min_variance)
            self.priors[c] = X_c.shape[0] / X.shape[0]

    def _log_posterior(self, X, c):
        """Return the unnormalised log posterior of class ``c`` for every row of ``X``."""
        variance = self.variances[c]
        log_normaliser = -0.5 * np.sum(np.log(2 * np.pi * variance))
        log_exponent = -0.5 * np.sum(((X - self.means[c]) ** 2) / variance, axis=1)
        return np.log(self.priors[c]) + (log_normaliser + log_exponent)

    def predict(self, X):
        """Return a list with the most probable class label for every row of ``X``.

        Raises:
            RuntimeError: If the classifier has not been fitted yet.
        """
        if self.classes is None:
            raise RuntimeError("fit must be called before predict")
        X = np.asarray(X, dtype=float)
        if X.shape[0] == 0:
            return []
        # One column of scores per class; the per-class constants are
        # computed once instead of once per sample.
        scores = np.column_stack([self._log_posterior(X, c) for c in self.classes])
        return list(self.classes[np.argmax(scores, axis=1)])

    def accuracy(self, y_true, y_pred):
        """Return the fraction of positions where the two label sequences agree.

        Both inputs are converted to arrays first; comparing two plain lists
        with ``==`` would yield a single boolean instead of a per-item result.
        """
        return np.mean(np.asarray(y_true) == np.asarray(y_pred))

# Load and preprocess the dataset
def load_data(path="Advertising.csv", test_fraction=0.2, seed=42):
    """Load the advertising CSV and return a train/test split for classification.

    Features are the ``TV``, ``Radio`` and ``Newspaper`` columns, standardised
    with the mean and standard deviation of the full data set. The label is 1
    where ``Sales`` is above its median and 0 otherwise.

    Args:
        path: CSV file to read.
        test_fraction: Fraction of rows placed in the test split.
        seed: Value passed to ``np.random.seed`` before shuffling.

    Returns:
        ``(X_train_class, X_test_class, y_train_class, y_test_class)``.
    """
    data = pd.read_csv(path)
    X = data[["TV", "Radio", "Newspaper"]].values
    y = data["Sales"].values

    # Standardise features; a constant column has zero spread, so divide it
    # by 1 instead of 0 and let it become all zeros.
    spread = np.std(X, axis=0)
    X = (X - np.mean(X, axis=0)) / np.where(spread == 0, 1.0, spread)

    # Convert sales into binary classes.
    y_classification = (y > np.median(y)).astype(int)

    # Manual shuffle split; the global NumPy generator is seeded to keep the
    # original, reproducible row assignment.
    np.random.seed(seed)
    indices = np.random.permutation(len(X))
    test_size = int(test_fraction * len(X))
    train_idx = indices[test_size:]
    test_idx = indices[:test_size]

    X_train_class, X_test_class = X[train_idx], X[test_idx]
    y_train_class, y_test_class = y_classification[train_idx], y_classification[test_idx]

    return X_train_class, X_test_class, y_train_class, y_test_class

# Train and evaluate Bayesian Classifier model
# load_data() here is the four-value version defined in this cell.
X_train_class, X_test_class, y_train_class, y_test_class = load_data()

# Bayesian Classifier Model
# Fit on the training rows, then report the fraction of correct test predictions.
bayesian_classifier = BayesianClassifier()
bayesian_classifier.fit(X_train_class, y_train_class)
y_pred_bayes = bayesian_classifier.predict(X_test_class)
accuracy_bayes = bayesian_classifier.accuracy(y_test_class, y_pred_bayes)
print(f"Bayesian Classifier Accuracy: {accuracy_bayes}")


Bayesian Classifier Accuracy: 0.9
