### Mushroom

### Linear & Logistic Regression

In [16]:
import numpy as np
import pandas as pd
import random

# sigmoid 
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Logistic Regression
class LogisticRegression:
    def __init__(self, lr=0.01, epochs=1000):
        # Initialize, the learning rate, number of iterations
        self.lr = lr
        self.epochs = epochs
        self.weights = None
        self.bias = None

        # fit logistic regression model
    def fit(self, X, y):
        # Get number of samples and features
        m, n = X.shape
        # initialize weights and bias
        self.weights = np.zeros(n)
        self.bias = 0

        # Training loop        
        for _ in range(self.epochs):
            # Calculate the linear combination of inputs and weight
            linear_model = np.dot(X, self.weights) + self.bias
            # apply the sigmoid
            predictions = sigmoid(linear_model)

            # compute the gradients 
            dw = (1/m) * np.dot(X.T, (predictions - y))
            db = (1/m) * np.sum(predictions - y)

            # update the weights and bias
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    # make predictions 
    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        predictions = sigmoid(linear_model)
        return np.array([1 if p >= 0.5 else 0 for p in predictions])

        


In [None]:
class LinearRegression:
    def __init__(self, lr=0.01, epochs=1000):
        # initialize the learning rate
        self.lr = lr
        self.epochs = epochs
        self.weights = None
        self.bias = None
    
    def fit(self, X, y):
        # fit the linear regression
        # get the number, initialize weights to zeeros
        m, n = X.shape
        self.weights = np.zeros(n)
        self.bias = 0

        # run a loop for gradient descent
        for _ in range(self.epochs):
            predictions = np.dot(X, self.weights) + self.bias
            error = predictions - y

            # compute the gradients with respect to the weights
            dw = (2/m) * np.dot(X.T, error)
            db = (2/m) * np.sum(error)

            # update the weights and bias
            self.weights -= self.lr * dw
            self.bias -= self.lr * db
    
    def predict(self, X):
        return np.dot(X, self.weights) + self.bias

# train test split
def train_test_split(X, y, test_size=0.2):
    indices = list(range(len(X)))
    random.shuffle(indices)
    split_idx = int(len(X) * (1 - test_size))
    train_indices, test_indices = indices[:split_idx], indices[split_idx:]
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

# Load and preprocess dataset
data = pd.read_csv("mushroom.data", header=None)
labels = np.array([1 if val == 'p' else 0 for val in data[0]])
features = np.array(data.iloc[:, 1:])
unique_values = [list(set(col)) for col in zip(*features)]
encoded_features = np.array([[unique_values[i].index(val) for i, val in enumerate(row)] for row in features])

# Avoid division by zero in normalization
max_vals = np.max(encoded_features, axis=0)
max_vals[max_vals == 0] = 1  # Replace zeros with ones to prevent division errors
encoded_features = encoded_features / max_vals

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(encoded_features, labels, test_size=0.2)

# Training and testing logistic regression
log_reg = LogisticRegression(lr=0.01, epochs=1000)
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
log_accuracy = np.mean(y_pred_log == y_test)

# Training and testing linear regression
lin_reg = LinearRegression(lr=0.01, epochs=1000)
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)
y_pred_lin = np.clip(y_pred_lin, -1, 1)  # Clipping to prevent large MSE
lin_mse = np.mean(np.square(y_pred_lin - y_test))


log_accuracy, lin_mse

In [10]:
import numpy as np
import pandas as pd
import random

class NaiveBayesClassifier:
    def __init__(self):
        self.class_priors = {}
        self.feature_probs = {}
        self.classes = None
    
    def fit(self, X, y):
        self.classes = np.unique(y)
        feature_count = X.shape[1]
        
        for c in self.classes:
            X_c = X[y == c]
            self.class_priors[c] = len(X_c) / len(X)
            
            self.feature_probs[c] = {}
            for j in range(feature_count):
                values, counts = np.unique(X_c[:, j], return_counts=True)
                self.feature_probs[c][j] = {v: (counts[i] + 1) / (len(X_c) + len(values)) for i, v in enumerate(values)}
    
    def predict(self, X):
        predictions = []
        for x in X:
            class_probs = {}
            for c in self.classes:
                class_probs[c] = np.log(self.class_priors[c])
                for j, value in enumerate(x):
                    if value in self.feature_probs[c][j]:
                        class_probs[c] += np.log(self.feature_probs[c][j][value])
                    else:
                        class_probs[c] += np.log(1 / (len(self.feature_probs[c][j]) + 1))
            
            predictions.append(max(class_probs, key=class_probs.get))
        return np.array(predictions)

def train_test_split(X, y, test_size=0.2):
    indices = list(range(len(X)))
    random.shuffle(indices)
    split_idx = int(len(X) * (1 - test_size))
    train_indices, test_indices = indices[:split_idx], indices[split_idx:]
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

# Load and preprocess dataset
data = pd.read_csv("mushroom.data", header=None)
labels = np.array([1 if val == 'p' else 0 for val in data[0]])
features = np.array(data.iloc[:, 1:])
unique_values = [list(set(col)) for col in zip(*features)]
encoded_features = np.array([[unique_values[i].index(val) for i, val in enumerate(row)] for row in features])

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(encoded_features, labels, test_size=0.2)

# Training and testing Naive Bayes classifier
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train, y_train)
y_pred_nb = nb_classifier.predict(X_test)
nb_accuracy = np.mean(y_pred_nb == y_test)

nb_accuracy


np.float64(0.8812307692307693)

In [11]:
import numpy as np
import pandas as pd
import random
from collections import Counter

class KNNClassifier:
    def __init__(self, k=3):
        self.k = k
        self.X_train = None
        self.y_train = None
    
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
    
    def predict(self, X):
        predictions = []
        for x in X:
            distances = np.linalg.norm(self.X_train - x, axis=1)
            k_nearest_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_nearest_indices]
            most_common = Counter(k_nearest_labels).most_common(1)[0][0]
            predictions.append(most_common)
        return np.array(predictions)

def train_test_split(X, y, test_size=0.2):
    indices = list(range(len(X)))
    random.shuffle(indices)
    split_idx = int(len(X) * (1 - test_size))
    train_indices, test_indices = indices[:split_idx], indices[split_idx:]
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

# Load and preprocess dataset
data = pd.read_csv("mushroom.data", header=None)
labels = np.array([1 if val == 'p' else 0 for val in data[0]])
features = np.array(data.iloc[:, 1:])
unique_values = [list(set(col)) for col in zip(*features)]
encoded_features = np.array([[unique_values[i].index(val) for i, val in enumerate(row)] for row in features])

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(encoded_features, labels, test_size=0.2)

# Training and testing KNN classifier
knn_classifier = KNNClassifier(k=5)
knn_classifier.fit(X_train, y_train)
y_pred_knn = knn_classifier.predict(X_test)
knn_accuracy = np.mean(y_pred_knn == y_test)

knn_accuracy


np.float64(0.9993846153846154)

In [13]:
import numpy as np
import pandas as pd
import random
from collections import Counter

class DecisionTreeNode:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
    
    def is_leaf(self):
        return self.value is not None

class DecisionTreeClassifier:
    def __init__(self, max_depth=10):
        self.max_depth = max_depth
        self.root = None
    
    def fit(self, X, y):
        self.root = self._grow_tree(X, y, depth=0)
    
    def _grow_tree(self, X, y, depth):
        num_samples, num_features = X.shape
        
        if depth >= self.max_depth or len(set(y)) == 1:
            return DecisionTreeNode(value=Counter(y).most_common(1)[0][0])
        
        best_feature, best_threshold = self._best_split(X, y)
        if best_feature is None:
            return DecisionTreeNode(value=Counter(y).most_common(1)[0][0])
        
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = ~left_indices
        left_child = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right_child = self._grow_tree(X[right_indices], y[right_indices], depth + 1)
        
        return DecisionTreeNode(feature=best_feature, threshold=best_threshold, left=left_child, right=right_child)
    
    def _best_split(self, X, y):
        best_gini = float('inf')
        best_feature = None
        best_threshold = None
        
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_mask = X[:, feature] <= threshold
                right_mask = ~left_mask
                gini = self._gini_index(y[left_mask], y[right_mask])
                
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold
        
        return best_feature, best_threshold
    
    def _gini_index(self, left_labels, right_labels):
        total_samples = len(left_labels) + len(right_labels)
        gini = 0
        for labels in [left_labels, right_labels]:
            if len(labels) == 0:
                continue
            class_counts = np.array(list(Counter(labels).values()))
            probs = class_counts / len(labels)
            gini += (len(labels) / total_samples) * (1 - np.sum(probs ** 2))
        return gini
    
    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])
    
    def _traverse_tree(self, x, node):
        if node.is_leaf():
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

def train_test_split(X, y, test_size=0.2):
    indices = list(range(len(X)))
    random.shuffle(indices)
    split_idx = int(len(X) * (1 - test_size))
    train_indices, test_indices = indices[:split_idx], indices[split_idx:]
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

# Load and preprocess dataset
data = pd.read_csv("mushroom.data", header=None)
labels = np.array([1 if val == 'p' else 0 for val in data[0]])
features = np.array(data.iloc[:, 1:])
unique_values = [list(set(col)) for col in zip(*features)]
encoded_features = np.array([[unique_values[i].index(val) for i, val in enumerate(row)] for row in features])

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(encoded_features, labels, test_size=0.2)

# Training and testing Decision Tree classifier
dt_classifier = DecisionTreeClassifier(max_depth=10)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)
dt_accuracy = np.mean(y_pred_dt == y_test)

dt_accuracy


np.float64(1.0)

In [14]:
print(f"MSE - Linear: {lin_mse * 100:.2f}%")
print(f"Logistic Regression Accuracy: {log_accuracy * 100:.2f}%")
print(f"Decision Tree Accuracy: {dt_accuracy * 100:.2f}%")
print(f"KNN Classification Accuracy: {knn_accuracy * 100:.2f}%")
print(f"Naive Bayes: {nb_accuracy * 100:.2f}%")

MSE - Linear: 3.28%
Logistic Regression Accuracy: 93.78%
Decision Tree Accuracy: 100.00%
KNN Classification Accuracy: 99.94%
Naive Bayes: 88.12%
