In [72]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, normalized_mutual_info_score


## Gaussian Naive Bayes classifier


In [73]:
class NaiveBayesClassifier:
    """
    Gaussian Naive Bayes for continuous features.

    Every feature is modelled, per class, as an independent univariate
    Gaussian. `fit` estimates the class priors and the per-class feature
    means/variances; `predict` returns the class with the largest
    log-posterior (log prior + summed per-feature log likelihood).

    Attributes (all None until `fit` has been called):
        classes_:     unique class labels as returned by np.unique, shape (n_classes,)
        class_prior_: empirical class frequencies, shape (n_classes,)
        mean_:        per-class feature means, shape (n_classes, n_features)
        var_:         per-class feature variances plus 1e-9, shape (n_classes, n_features)
    """
    def __init__(self):
        self.classes_     = None
        self.class_prior_ = None
        self.mean_        = None
        self.var_         = None

    def fit(self, X, y):
        """
        Estimate priors, means and variances from the training data.

        Parameters:
          - X: array-like of shape (n_samples, n_features).
          - y: array-like of shape (n_samples,) with the class labels.

        Returns:
          - self, so that calls can be chained (e.g. clf.fit(X, y).predict(X)).
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes_, counts = np.unique(y, return_counts=True)
        n_classes  = len(self.classes_)
        n_features = X.shape[1]

        # estimate P(c)
        self.class_prior_ = counts / counts.sum()

        # estimate featurewise mean & variance per class
        self.mean_ = np.zeros((n_classes, n_features))
        self.var_  = np.zeros((n_classes, n_features))
        for idx, c in enumerate(self.classes_):
            Xc = X[y == c]
            self.mean_[idx] = Xc.mean(axis=0)
            # add small epsilon to variance for numeric stability
            # (a constant feature within a class would otherwise divide by zero)
            self.var_[idx]  = Xc.var(axis=0) + 1e-9
        return self

    def _check_is_fitted(self):
        """Raise a clear error when the model is used before `fit`."""
        if self.classes_ is None:
            raise RuntimeError(
                "NaiveBayesClassifier is not fitted yet; call fit(X, y) first."
            )

    def _log_gaussian(self, X):
        """
        Compute log P(X|c) for each class.
        Returns an array of shape (n_samples, n_classes).

        Raises:
          - ValueError: if X is not two-dimensional.
        """
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got {X.ndim} dimension(s)."
            )
        log_probs = []
        for idx in range(len(self.classes_)):
            mu  = self.mean_[idx]
            var = self.var_[idx]
            # normalisation term: sum over features of log(1 / sqrt(2*pi*var))
            const = -0.5 * np.sum(np.log(2 * np.pi * var))
            # exponent term: summed squared deviations scaled by the variance
            expo  = -0.5 * np.sum(((X - mu) ** 2) / var, axis=1)
            log_probs.append(const + expo)
        return np.vstack(log_probs).T

    def predict(self, X):
        """
        Predict the most probable class for every row of X.

        Parameters:
          - X: array-like of shape (n_samples, n_features).

        Returns:
          - Array of shape (n_samples,) holding entries of `classes_`.

        Raises:
          - RuntimeError: if called before `fit`.
        """
        self._check_is_fitted()
        X = np.asarray(X)
        log_likelihood = self._log_gaussian(X)              # (n_samples, n_classes)
        log_prior      = np.log(self.class_prior_)          # (n_classes,)
        scores         = log_likelihood + log_prior[np.newaxis, :]
        class_idxs     = np.argmax(scores, axis=1)
        return self.classes_[class_idxs]

    def score(self, X, y):
        """Return the fraction of rows in X whose prediction equals y."""
        return np.mean(self.predict(X) == np.asarray(y))


## Q2 Random picking


In [76]:
def run_wine_percentage_dataset(file_path, training_percentages=(0.20, 0.60, 0.90)):
    """
    For each of the wine quality datasets, generate a training set by randomly picking
    X% of the data (X = 20, 60, 90) for training, use the rest for testing, train a
    Naive Bayes classifier, and report the test accuracy.
    Accuracy = (correctly classified samples in test set) / (total test samples).

    Parameters:
      - file_path: Path to a ';'-separated CSV file containing a "quality" column,
        which is used as the class label; all other columns are features.
      - training_percentages: Iterable of training fractions passed to
        train_test_split as train_size.

    Returns:
      - results: Dictionary mapping each training fraction to its test accuracy.
    """
    results = {}
    df = pd.read_csv(file_path, sep=";")
    X = df.drop("quality", axis=1).values
    y = df["quality"].values

    for pct in training_percentages:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=pct, random_state=42
        )
        # Fit the scaler on the training split only, then apply it to the test
        # split, so that no test-set statistics leak into the preprocessing.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        clf = NaiveBayesClassifier()
        clf.fit(X_train, y_train)
        test_accuracy = clf.score(X_test, y_test)
        results[pct] = test_accuracy
        # round() instead of plain int(): int(0.57 * 100) would truncate to 56.
        print(f"Training percentage: {int(round(pct * 100))}% -> Test Accuracy: {test_accuracy*100:.2f}%")
    return results

# Red wine: Naive Bayes test accuracy with 20%, 60% and 90% of the data used for training.
print("=== Wine Quality (Red) Dataset Splits ===")
results_red = run_wine_percentage_dataset(
    file_path="data/winequality-red.csv", training_percentages=[0.20, 0.60, 0.90]
)

# White wine: same protocol on the second dataset.
print("\n=== Wine Quality (White) Dataset Splits ===")
results_white = run_wine_percentage_dataset(
    file_path="data/winequality-white.csv", training_percentages=[0.20, 0.60, 0.90]
)


=== Wine Quality (Red) Dataset Splits ===
Training percentage: 20% -> Test Accuracy: 53.12%
Training percentage: 60% -> Test Accuracy: 53.44%
Training percentage: 90% -> Test Accuracy: 55.62%

=== Wine Quality (White) Dataset Splits ===
Training percentage: 20% -> Test Accuracy: 44.09%
Training percentage: 60% -> Test Accuracy: 43.01%
Training percentage: 90% -> Test Accuracy: 43.88%


## Q4

In [68]:


def initialize_centroids(X, k):
    """
    Choose the starting centroids: k distinct rows of X picked at random.

    The draw uses NumPy's global random state (np.random), so seed it
    beforehand for reproducible results.
    """
    chosen_rows = np.random.choice(X.shape[0], size=k, replace=False)
    return X[chosen_rows]

def assign_clusters(X, centroids):
    """
    Return, for every row of X, the index of its closest centroid (Euclidean distance).
    """
    # Broadcasting yields one offset vector per (sample, centroid) pair.
    offsets = X[:, np.newaxis] - centroids
    distances = np.linalg.norm(offsets, axis=2)
    nearest = np.argmin(distances, axis=1)
    return nearest

def update_centroids(X, labels, k):
    """
    Recompute the k centroids from the current cluster assignment.

    A cluster with members gets the mean of its points; a cluster without any
    members is re-seeded with one randomly chosen row of X (drawn from NumPy's
    global random state).
    """
    refreshed = np.zeros((k, X.shape[1]))
    for cluster_id in range(k):
        members = labels == cluster_id
        if not np.any(members):
            # Empty cluster: restart it from a random data point.
            refreshed[cluster_id] = X[np.random.choice(X.shape[0])]
            continue
        refreshed[cluster_id] = np.mean(X[members], axis=0)
    return refreshed

def kmeans(X, k, max_iter=100, tol=1e-4):
    """
    Run the standard K-means algorithm.

    Parameters:
      - X: Data array of shape (n_samples, n_features).
      - k: Number of clusters.
      - max_iter: Maximum number of centroid updates; 0 returns the assignment
        to the initial centroids.
      - tol: Stop once the norm of the centroid shift between two consecutive
        updates falls below this value.

    Returns:
      - labels: Cluster assignments for each sample, always computed against
        the returned centroids.
      - centroids: Final centroids.
    """
    centroids = initialize_centroids(X, k)
    # Assign before the loop so `labels` exists even when max_iter == 0.
    labels = assign_clusters(X, centroids)
    for _ in range(max_iter):
        new_centroids = update_centroids(X, labels, k)
        shift = np.linalg.norm(new_centroids - centroids)
        centroids = new_centroids
        # Re-assign against the updated centroids so the returned pair stays
        # consistent whether we stop by convergence or by exhausting max_iter.
        labels = assign_clusters(X, centroids)
        if shift < tol:
            break
    return labels, centroids

def modified_kmeans(X, ground_truth, k_range=range(5, 16), random_state=42):
    """
    Run K-means for a range of k values, choose the best k by maximizing the silhouette score,
    and compute the normalized mutual information (NMI) with the ground-truth labels.

    Parameters:
      - X: Data array (scaled) of shape (n_samples, n_features).
      - ground_truth: Ground truth labels.
      - k_range: Iterable of k values to try.
      - random_state: Seed for reproducibility. Note that this seeds NumPy's
        global random state via np.random.seed.

    Returns:
      - best_k: The chosen number of clusters.
      - best_labels: Cluster assignments with best k.
      - nmi: Normalized mutual information between best_labels and ground_truth.
      - sil_scores: Dictionary of k to its silhouette score (-1 is recorded for
        clusterings on which the silhouette score is undefined).

    Raises:
      - ValueError: if k_range yields no k values.
    """
    np.random.seed(random_state)
    best_k = None
    # Start below every possible score so that even a degenerate clustering
    # (recorded as -1) is selected when nothing better exists; starting at -1
    # would leave best_labels as None and crash in the NMI computation.
    best_sil_score = -np.inf
    best_labels = None
    sil_scores = {}
    n_samples = X.shape[0]

    for k in k_range:
        labels, _ = kmeans(X, k)
        n_found = len(np.unique(labels))
        # silhouette_score needs between 2 and n_samples - 1 distinct labels.
        if 1 < n_found < n_samples:
            sil = silhouette_score(X, labels)
        else:
            sil = -1
        sil_scores[k] = sil
        if sil > best_sil_score:
            best_sil_score = sil
            best_k = k
            best_labels = labels

    if best_labels is None:
        raise ValueError("k_range must contain at least one value of k.")

    nmi = normalized_mutual_info_score(ground_truth, best_labels)
    return best_k, best_labels, nmi, sil_scores

def run_wine_modified_kmeans(file_path, k_range=range(5, 16), random_state=42):
    """
    Cluster one wine quality dataset with the modified K-means and report the outcome.

    The CSV is read with ';' as separator, the "quality" column is set aside as
    ground truth, the remaining columns are standardized, and modified_kmeans is
    run over k_range. The selected k and its NMI against the ground truth are printed.

    Parameters:
      - file_path: Path to the wine quality CSV file.
      - k_range: Range of k values to try.
      - random_state: Seed for reproducibility.

    Returns:
      - best_k: The final number of clusters chosen.
      - nmi: Normalized mutual information value with ground truth.
      - sil_scores: Silhouette scores for all k values.
    """
    wine = pd.read_csv(file_path, sep=";")
    # "quality" is the ground truth; every other column is a clustering feature.
    quality = wine["quality"].values
    features = wine.drop(columns="quality").values
    features_scaled = StandardScaler().fit_transform(features)

    # The per-sample labels are not needed here, only the summary statistics.
    best_k, _, nmi, sil_scores = modified_kmeans(
        features_scaled,
        quality,
        k_range=k_range,
        random_state=random_state,
    )

    print(f"Dataset: {file_path}")
    print(f"  Best k: {best_k}")
    print(f"  Normalized Mutual Information (NMI): {nmi:.4f}")
    return best_k, nmi, sil_scores

if __name__ == "__main__":
    # Red wine: search k = 5..15 and report the best k with its NMI.
    print("=== Modified K-means on Wine Quality (Red) Dataset ===")
    best_k_red, nmi_red, sil_red = run_wine_modified_kmeans(
        "data/winequality-red.csv",
        k_range=range(5, 16),
    )

    # White wine: same search on the second dataset.
    print("\n=== Modified K-means on Wine Quality (White) Dataset ===")
    best_k_white, nmi_white, sil_white = run_wine_modified_kmeans(
        "data/winequality-white.csv",
        k_range=range(5, 16),
    )


=== Modified K-means on Wine Quality (Red) Dataset ===
Dataset: data/winequality-red.csv
  Best k: 7
  Normalized Mutual Information (NMI): 0.0981

=== Modified K-means on Wine Quality (White) Dataset ===
Dataset: data/winequality-white.csv
  Best k: 6
  Normalized Mutual Information (NMI): 0.0741


## Q6

Adult Numeric Dataset Test Accuracy: 63.34%
