In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

def sigmoid(x):
    return 1 / (1 + np.exp(-np.clip(x, -709, 709)))

def sigmoid_derivative(x):
    return x * (1 - x)

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

class MLP:
    def __init__(self, input_size, hidden_sizes, output_size, activation='relu', learning_rate=0.01):
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.learning_rate = learning_rate
        
        self.weights = []
        self.biases = []
        
        layer_sizes = [input_size] + hidden_sizes + [output_size]
        for i in range(1, len(layer_sizes)):
            self.weights.append(np.random.randn(layer_sizes[i-1], layer_sizes[i]) * 0.01)
            self.biases.append(np.zeros((1, layer_sizes[i])))
        
        if activation == 'relu':
            self.activation = relu
            self.activation_derivative = relu_derivative
        elif activation == 'sigmoid':
            self.activation = sigmoid
            self.activation_derivative = sigmoid_derivative
    
    def forward_propagation(self, X):
        self.layer_outputs = [X]
        for i in range(len(self.weights)):
            z = np.dot(self.layer_outputs[-1], self.weights[i]) + self.biases[i]
            a = self.activation(z)
            self.layer_outputs.append(a)
        return self.layer_outputs[-1]
    
    def backward_propagation(self, X, y):
        m = X.shape[0]
        delta = self.layer_outputs[-1] - y
        gradients = []
        
        for i in reversed(range(len(self.weights))):
            dW = np.dot(self.layer_outputs[i].T, delta) / m
            db = np.sum(delta, axis=0, keepdims=True) / m
            gradients.append((dW, db))
            if i > 0:
                delta = np.dot(delta, self.weights[i].T) * self.activation_derivative(self.layer_outputs[i])
        
        return list(reversed(gradients))
    
    def update_parameters(self, gradients):
        for i, (dW, db) in enumerate(gradients):
            self.weights[i] -= self.learning_rate * dW
            self.biases[i] -= self.learning_rate * db
    
    def fit(self, X, y, epochs=1000, batch_size=32):
        for epoch in range(epochs):
            for i in range(0, X.shape[0], batch_size):
                X_batch = X[i:i+batch_size]
                y_batch = y[i:i+batch_size]
                
                self.forward_propagation(X_batch)
                gradients = self.backward_propagation(X_batch, y_batch)
                self.update_parameters(gradients)
            
            if epoch % 100 == 0:
                loss = np.mean((self.forward_propagation(X) - y) ** 2)
                print(f"Epoch {epoch}, Loss: {loss:.4f}")
    
    def predict(self, X):
        return self.forward_propagation(X)

class AutoEncoder:
    def __init__(self, input_size, hidden_sizes, latent_size):
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.latent_size = latent_size
        
        encoder_sizes = hidden_sizes + [latent_size]
        decoder_sizes = hidden_sizes[::-1] + [input_size]
        
        self.encoder = MLP(input_size, encoder_sizes[:-1], encoder_sizes[-1])
        self.decoder = MLP(latent_size, decoder_sizes[:-1], decoder_sizes[-1])
    
    def fit(self, X, epochs=1000, batch_size=32):
        for epoch in range(epochs):
            for i in range(0, X.shape[0], batch_size):
                X_batch = X[i:i+batch_size]
                
                # Forward pass
                latent = self.encoder.forward_propagation(X_batch)
                reconstructed = self.decoder.forward_propagation(latent)
                
                # Backward pass
                decoder_gradients = self.decoder.backward_propagation(latent, X_batch)
                encoder_gradients = self.encoder.backward_propagation(X_batch, latent)
                
                # Update parameters
                self.decoder.update_parameters(decoder_gradients)
                self.encoder.update_parameters(encoder_gradients)
            
            if epoch % 100 == 0:
                loss = np.mean((self.decoder.forward_propagation(self.encoder.forward_propagation(X)) - X) ** 2)
                print(f"Epoch {epoch}, Loss: {loss:.4f}")
    
    def get_latent(self, X):
        return self.encoder.predict(X)

class KNN:
    def __init__(self, k=3):
        self.k = k
    
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
    
    def predict(self, X):
        predictions = []
        for x in X:
            distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            most_common = np.bincount(k_nearest_labels).argmax()
            predictions.append(most_common)
        return np.array(predictions)

# Load and preprocess the data
data = pd.read_csv('../../data/external/spotify.csv')  
X = data.drop('track_genre', axis=1)
y = data['track_genre']

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Encode categorical columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train KNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Make predictions
y_pred = knn_classifier.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("AutoEncoder + KNN Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Train MLP classifier
input_size = X_train.shape[1]
mlp_classifier = MLPClassifier(hidden_layer_sizes=(64, 32), activation='logistic', max_iter=1000)
mlp_classifier.fit(X_train, y_train)

# Make predictions
y_pred_mlp = mlp_classifier.predict(X_test)

# Calculate metrics
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
precision_mlp = precision_score(y_test, y_pred_mlp, average='weighted')
recall_mlp = recall_score(y_test, y_pred_mlp, average='weighted')
f1_mlp = f1_score(y_test, y_pred_mlp, average='weighted')

print("\nMLP Classifier Results:")
print(f"Accuracy: {accuracy_mlp:.4f}")
print(f"Precision: {precision_mlp:.4f}")
print(f"Recall: {recall_mlp:.4f}")
print(f"F1 Score: {f1_mlp:.4f}")

# Compare results
print("\nComparison:")
print(f"AutoEncoder + KNN F1 Score: {f1:.4f}")
print(f"MLP Classifier F1 Score: {f1_mlp:.4f}")
print(f"Difference: {abs(f1 - f1_mlp):.4f}")




AutoEncoder + KNN Results:
Accuracy: 0.2954
Precision: 0.2963
Recall: 0.2954
F1 Score: 0.2885





MLP Classifier Results:
Accuracy: 0.9426
Precision: 0.9435
Recall: 0.9426
F1 Score: 0.9427

Comparison:
AutoEncoder + KNN F1 Score: 0.2885
MLP Classifier F1 Score: 0.9427
Difference: 0.6542


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

def sigmoid(x):
    return 1 / (1 + np.exp(-np.clip(x, -709, 709)))

def sigmoid_derivative(x):
    return x * (1 - x)

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

class MLP:
    def __init__(self, input_size, hidden_sizes, output_size, activation='relu', learning_rate=0.01):
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.learning_rate = learning_rate
        
        self.weights = []
        self.biases = []
        
        layer_sizes = [input_size] + hidden_sizes + [output_size]
        for i in range(1, len(layer_sizes)):
            self.weights.append(np.random.randn(layer_sizes[i-1], layer_sizes[i]) * np.sqrt(2. / layer_sizes[i-1]))
            self.biases.append(np.zeros((1, layer_sizes[i])))
        
        if activation == 'relu':
            self.activation = relu
            self.activation_derivative = relu_derivative
        elif activation == 'sigmoid':
            self.activation = sigmoid
            self.activation_derivative = sigmoid_derivative
    
    def forward_propagation(self, X):
        self.layer_outputs = [X]
        for i in range(len(self.weights)):
            z = np.dot(self.layer_outputs[-1], self.weights[i]) + self.biases[i]
            if i < len(self.weights) - 1:
                a = self.activation(z)
            else:
                a = sigmoid(z) if self.output_size == 1 else z
            self.layer_outputs.append(a)
        return self.layer_outputs[-1]
    
    def backward_propagation(self, X, y):
        m = X.shape[0]
        delta = self.layer_outputs[-1] - y
        gradients = []
        
        for i in reversed(range(len(self.weights))):
            dW = np.dot(self.layer_outputs[i].T, delta) / m
            db = np.sum(delta, axis=0, keepdims=True) / m
            gradients.append((dW, db))
            if i > 0:
                delta = np.dot(delta, self.weights[i].T) * self.activation_derivative(self.layer_outputs[i])
        
        return list(reversed(gradients))
    
    def update_parameters(self, gradients):
        for i, (dW, db) in enumerate(gradients):
            self.weights[i] -= self.learning_rate * dW
            self.biases[i] -= self.learning_rate * db
    
    def fit(self, X, y, epochs=1000, batch_size=32):
        for epoch in range(epochs):
            for i in range(0, X.shape[0], batch_size):
                X_batch = X[i:i+batch_size]
                y_batch = y[i:i+batch_size]
                
                self.forward_propagation(X_batch)
                gradients = self.backward_propagation(X_batch, y_batch)
                self.update_parameters(gradients)
            
            if epoch % 100 == 0:
                loss = np.mean((self.forward_propagation(X) - y) ** 2)
                print(f"Epoch {epoch}, Loss: {loss:.4f}")
    
    def predict(self, X):
        return self.forward_propagation(X)

class AutoEncoder:
    def __init__(self, input_size, hidden_sizes, latent_size):
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.latent_size = latent_size
        
        encoder_sizes = hidden_sizes + [latent_size]
        decoder_sizes = hidden_sizes[::-1] + [input_size]
        
        self.encoder = MLP(input_size, encoder_sizes[:-1], encoder_sizes[-1])
        self.decoder = MLP(latent_size, decoder_sizes[:-1], decoder_sizes[-1])
    
    def fit(self, X, epochs=1000, batch_size=32):
        for epoch in range(epochs):
            for i in range(0, X.shape[0], batch_size):
                X_batch = X[i:i+batch_size]
                
                # Forward pass
                latent = self.encoder.forward_propagation(X_batch)
                reconstructed = self.decoder.forward_propagation(latent)
                
                # Backward pass
                decoder_gradients = self.decoder.backward_propagation(latent, X_batch)
                encoder_gradients = self.encoder.backward_propagation(X_batch, latent)
                
                # Update parameters
                self.decoder.update_parameters(decoder_gradients)
                self.encoder.update_parameters(encoder_gradients)
            
            if epoch % 100 == 0:
                loss = np.mean((self.decoder.forward_propagation(self.encoder.forward_propagation(X)) - X) ** 2)
                print(f"Epoch {epoch}, Loss: {loss:.4f}")
    
    def get_latent(self, X):
        return self.encoder.predict(X)

class KNN:
    def __init__(self, k=3, distance_metric='euclidean', batch_size=100):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None
        self.batch_size = batch_size
        self.distance_functions = {
            'euclidean': self.euclidean_distance,
            'manhattan': self.manhattan_distance,
            'cosine': self.cosine_distance
        }

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        num_samples = X.shape[0]
        predictions = np.zeros(num_samples, dtype=self.y_train.dtype)
        
        for i in range(0, num_samples, self.batch_size):
            batch = X[i:i+self.batch_size]
            distances = self.distance_functions[self.distance_metric](batch, self.X_train)
            k_indices = np.argpartition(distances, self.k, axis=1)[:, :self.k]
            k_nearest_labels = self.y_train[k_indices]
            batch_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=k_nearest_labels)
            predictions[i:i+self.batch_size] = batch_predictions
        
        return predictions

    @staticmethod
    def euclidean_distance(X1, X2):
        return np.sqrt(np.sum((X1[:, np.newaxis, :] - X2) ** 2, axis=2))

    @staticmethod
    def manhattan_distance(X1, X2):
        return np.sum(np.abs(X1[:, np.newaxis, :] - X2), axis=2)

    @staticmethod
    def cosine_distance(X1, X2):
        dot_product = np.einsum('ijk,jk->ij', X1[:, np.newaxis, :], X2)
        norm_X1 = np.linalg.norm(X1, axis=1)
        norm_X2 = np.linalg.norm(X2, axis=1)
        return 1 - (dot_product / (norm_X1[:, np.newaxis] * norm_X2))

def load_and_preprocess_data(file_path, target_column):
    data = pd.read_csv(file_path)
    
    # Separate features and target
    X = data.drop([target_column], axis=1)
    y = data[target_column]
    
    # Convert non-numeric columns to numeric where possible
    X = X.apply(pd.to_numeric, errors='coerce')
    
    # Drop columns that cannot be converted to numeric
    X = X.select_dtypes(include=[np.number]).dropna(axis=1, how='any')

    # Normalize the features (Min-Max scaling)
    X = (X - X.min()) / (X.max() - X.min())
    
    # Handle categorical labels
    if y.dtype == 'object':
        label_mapping = {label: idx for idx, label in enumerate(np.unique(y))}
        y = y.map(label_mapping)
    
    return X.values, y.values

def custom_train_test_split(X, y, test_size=0.2, random_state=None):
    if random_state is not None:
        np.random.seed(random_state)
    
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    
    test_size = int(X.shape[0] * test_size)
    train_indices = indices[:-test_size]
    test_indices = indices[-test_size:]
    
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    
    return X_train, X_test, y_train, y_test

class PCA:
    def __init__(self, n_components):
        self.n_components = n_components
        self.components = None
        self.mean = None

    def fit(self, X):
        # Center the data
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean

        # Compute the covariance matrix
        cov = np.cov(X_centered, rowvar=False)

        # Compute eigenvalues and eigenvectors
        eigenvalues, eigenvectors = np.linalg.eigh(cov)

        # Sort eigenvectors by decreasing eigenvalues
        idx = np.argsort(eigenvalues)[::-1]
        eigenvectors = eigenvectors[:, idx]

        # Store the first n_components eigenvectors
        self.components = eigenvectors[:, :self.n_components]

    def transform(self, X):
        # Check if mean has been computed
        if self.mean is None:
            raise ValueError("PCA has not been fitted. Call fit() before using transform().")
        
        # Center the data
        X_centered = X - self.mean

        # Project the data onto the principal components
        return np.dot(X_centered, self.components)

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

def knn_classification(X_train, X_test, y_train, y_test, k, distance_metric):
    knn = KNN(k=k, distance_metric=distance_metric)
    knn.fit(X_train, y_train)
    
    # Predict and evaluate performance
    y_pred = knn.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    f1_micro = f1_score(y_test, y_pred, average='micro')

    return accuracy, precision, recall, f1_macro, f1_micro

def Q9_main():
    # Load the dataset
    file_path = '../../data/external/spotify.csv'  # Update with your dataset path
    target_column = 'track_genre'  # Replace with the actual target column
    
    # Load and preprocess data
    X, y = load_and_preprocess_data(file_path, target_column)
    
    # Step 1: Perform PCA
    optimal_components = 3
    pca = PCA(n_components=optimal_components)
    X_reduced = pca.fit_transform(X)
    
    # Split the data into training and test sets using custom function
    X_train, X_test, y_train, y_test = custom_train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_reduced, X_test_reduced, _, _ = custom_train_test_split(X_reduced, y, test_size=0.2, random_state=42)
    
    # Step 2: Apply KNN with the best {k, distance metric} pair (k=19, distance='manhattan')
    k_best = 19
    distance_best = 'manhattan'
    
    print("\nPerforming KNN on full dataset...")
    full_accuracy, full_precision, full_recall, full_f1_macro, full_f1_micro = knn_classification(
        X_train, X_test, y_train, y_test, k_best, distance_best)
    
    print("\nPerforming KNN on PCA-reduced dataset...")
    reduced_accuracy, reduced_precision, reduced_recall, reduced_f1_macro, reduced_f1_micro = knn_classification(
        X_train_reduced, X_test_reduced, y_train, y_test, k_best, distance_best)
    
    # Step 3: Print evaluation metrics for full and reduced datasets
    print("\n---- KNN Performance on Full Dataset ----")
    print("Accuracy: {:.2f}".format(full_accuracy))
    print("Precision: {:.2f}".format(full_precision))
    print("Recall: {:.2f}".format(full_recall))
    print("F1 Score (Macro): {:.2f}".format(full_f1_macro))
    print("F1 Score (Micro): {:.2f}".format(full_f1_micro))
    
    print("\n---- KNN Performance on PCA-Reduced Dataset ----")
    print("Accuracy: {:.2f}".format(reduced_accuracy))
    print("Precision: {:.2f}".format(reduced_precision))
    print("Recall: {:.2f}".format(reduced_recall))
    print("F1 Score (Macro): {:.2f}".format(reduced_f1_macro))
    print("F1 Score (Micro): {:.2f}".format(reduced_f1_micro))
    
    # Train AutoEncoder on PCA-reduced dataset
    print("\nTraining AutoEncoder on PCA-reduced dataset...")
    autoencoder = AutoEncoder(input_size=optimal_components, hidden_sizes=[64, 32], latent_size=optimal_components)
    autoencoder.fit(X_train_reduced, epochs=1000, batch_size=32)
    
    # Get latent representation
    X_train_latent = autoencoder.get_latent(X_train_reduced)
    X_test_latent = autoencoder.get_latent(X_test_reduced)
    
    # Apply KNN on latent representation
    print("\nPerforming KNN on AutoEncoder latent representation...")
    latent_accuracy, latent_precision, latent_recall, latent_f1_macro, latent_f1_micro = knn_classification(
        X_train_latent, X_test_latent, y_train, y_test, k_best, distance_best)
    
    # Print evaluation metrics for latent representation
    print("\n---- KNN Performance on AutoEncoder Latent Representation ----")
    print("Accuracy: {:.2f}".format(latent_accuracy))
    print("Precision: {:.2f}".format(latent_precision))
    print("Recall: {:.2f}".format(latent_recall))
    print("F1 Score (Macro): {:.2f}".format(latent_f1_macro))
    print("F1 Score (Micro): {:.2f}".format(latent_f1_micro))
    
    # Train MLP classifier on original dataset
    print("\nTraining MLP classifier on original dataset...")
    input_size = X_train.shape[1]
    mlp_classifier = MLP(input_size=input_size, hidden_sizes=[64, 32], output_size=1, activation='sigmoid', learning_rate=0.01)
    mlp_classifier.fit(X_train, y_train, epochs=1000, batch_size=32)
    
    # Make predictions
    y_pred_mlp = mlp_classifier.predict(X_test)
    y_pred_mlp = (y_pred_mlp > 0.5).astype(int).flatten()
    
    # Calculate metrics
    accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
    precision_mlp = precision_score(y_test, y_pred_mlp, average='weighted')
    recall_mlp = recall_score(y_test, y_pred_mlp, average='weighted')
    f1_mlp = f1_score(y_test, y_pred_mlp, average='weighted')
    
    print("\nMLP Classifier Results:")
    print(f"Accuracy: {accuracy_mlp:.4f}")
    print(f"Precision: {precision_mlp:.4f}")
    print(f"Recall: {recall_mlp:.4f}")
    print(f"F1 Score: {f1_mlp:.4f}")
    
    # Compare results
    print("\nComparison:")
    print(f"AutoEncoder + KNN F1 Score: {latent_f1_macro:.4f}")
    print(f"MLP Classifier F1 Score: {f1_mlp:.4f}")
    print(f"Difference: {abs(latent_f1_macro - f1_mlp):.4f}")

# Run the main function
Q9_main()






Performing KNN on full dataset...

Performing KNN on PCA-reduced dataset...

---- KNN Performance on Full Dataset ----
Accuracy: 0.51
Precision: 0.51
Recall: 0.51
F1 Score (Macro): 0.50
F1 Score (Micro): 0.51

---- KNN Performance on PCA-Reduced Dataset ----
Accuracy: 0.13
Precision: 0.12
Recall: 0.13
F1 Score (Macro): 0.12
F1 Score (Micro): 0.13

Training AutoEncoder on PCA-reduced dataset...
Epoch 0, Loss: 0.0257
Epoch 100, Loss: 0.0064
Epoch 200, Loss: 0.0056
Epoch 300, Loss: 0.0052
Epoch 400, Loss: 0.0049
Epoch 500, Loss: 0.0047
Epoch 600, Loss: 0.0046
Epoch 700, Loss: 0.0045
Epoch 800, Loss: 0.0044
Epoch 900, Loss: 0.0043

Performing KNN on AutoEncoder latent representation...

---- KNN Performance on AutoEncoder Latent Representation ----
Accuracy: 0.12
Precision: 0.11
Recall: 0.12
F1 Score (Macro): 0.11
F1 Score (Micro): 0.12

Training MLP classifier on original dataset...


ValueError: shapes (32,32) and (1,32) not aligned: 32 (dim 1) != 1 (dim 0)

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

def sigmoid(x):
    return 1 / (1 + np.exp(-np.clip(x, -709, 709)))

def sigmoid_derivative(x):
    return x * (1 - x)

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

class MLP:
    def __init__(self, input_size, hidden_sizes, output_size, activation='relu', learning_rate=0.01):
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.learning_rate = learning_rate
        
        self.weights = []
        self.biases = []
        
        layer_sizes = [input_size] + hidden_sizes + [output_size]
        for i in range(1, len(layer_sizes)):
            self.weights.append(np.random.randn(layer_sizes[i-1], layer_sizes[i]) * np.sqrt(2. / layer_sizes[i-1]))
            self.biases.append(np.zeros((1, layer_sizes[i])))
        
        if activation == 'relu':
            self.activation = relu
            self.activation_derivative = relu_derivative
        elif activation == 'sigmoid':
            self.activation = sigmoid
            self.activation_derivative = sigmoid_derivative
    
    def forward_propagation(self, X):
        self.layer_outputs = [X]
        for i in range(len(self.weights)):
            z = np.dot(self.layer_outputs[-1], self.weights[i]) + self.biases[i]
            if i < len(self.weights) - 1:
                a = self.activation(z)
            else:
                a = sigmoid(z) if self.output_size == 1 else z
            self.layer_outputs.append(a)
        return self.layer_outputs[-1]
    
    def backward_propagation(self, X, y):
        m = X.shape[0]
        delta = self.layer_outputs[-1] - y.reshape(-1, 1)
        gradients = []
        
        for i in reversed(range(len(self.weights))):
            dW = np.dot(self.layer_outputs[i].T, delta) / m
            db = np.sum(delta, axis=0, keepdims=True) / m
            gradients.append((dW, db))
            if i > 0:
                delta = np.dot(delta, self.weights[i].T) * self.activation_derivative(self.layer_outputs[i])
        
        return list(reversed(gradients))
    
    def update_parameters(self, gradients):
        for i, (dW, db) in enumerate(gradients):
            self.weights[i] -= self.learning_rate * dW
            self.biases[i] -= self.learning_rate * db
    
    def fit(self, X, y, epochs=1000, batch_size=32):
        for epoch in range(epochs):
            for i in range(0, X.shape[0], batch_size):
                X_batch = X[i:i+batch_size]
                y_batch = y[i:i+batch_size]
                
                self.forward_propagation(X_batch)
                gradients = self.backward_propagation(X_batch, y_batch)
                self.update_parameters(gradients)
            
            if epoch % 100 == 0:
                loss = np.mean((self.forward_propagation(X) - y.reshape(-1, 1)) ** 2)
                print(f"Epoch {epoch}, Loss: {loss:.4f}")
    
    def predict(self, X):
        return self.forward_propagation(X)

class AutoEncoder:
    def __init__(self, input_size, hidden_sizes, latent_size):
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.latent_size = latent_size
        
        encoder_sizes = hidden_sizes + [latent_size]
        decoder_sizes = hidden_sizes[::-1] + [input_size]
        
        self.encoder = MLP(input_size, encoder_sizes[:-1], encoder_sizes[-1])
        self.decoder = MLP(latent_size, decoder_sizes[:-1], decoder_sizes[-1])
    
    def fit(self, X, epochs=1000, batch_size=32):
        for epoch in range(epochs):
            for i in range(0, X.shape[0], batch_size):
                X_batch = X[i:i+batch_size]
                
                # Forward pass
                latent = self.encoder.forward_propagation(X_batch)
                reconstructed = self.decoder.forward_propagation(latent)
                
                # Backward pass
                decoder_gradients = self.decoder.backward_propagation(latent, X_batch)
                encoder_gradients = self.encoder.backward_propagation(X_batch, latent)
                
                # Update parameters
                self.decoder.update_parameters(decoder_gradients)
                self.encoder.update_parameters(encoder_gradients)
            
            if epoch % 100 == 0:
                loss = np.mean((self.decoder.forward_propagation(self.encoder.forward_propagation(X)) - X) ** 2)
                print(f"Epoch {epoch}, Loss: {loss:.4f}")
    
    def get_latent(self, X):
        return self.encoder.predict(X)

class KNN:
    def __init__(self, k=3, distance_metric='euclidean', batch_size=100):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None
        self.batch_size = batch_size
        self.distance_functions = {
            'euclidean': self.euclidean_distance,
            'manhattan': self.manhattan_distance,
            'cosine': self.cosine_distance
        }

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        num_samples = X.shape[0]
        predictions = np.zeros(num_samples, dtype=self.y_train.dtype)
        
        for i in range(0, num_samples, self.batch_size):
            batch = X[i:i+self.batch_size]
            distances = self.distance_functions[self.distance_metric](batch, self.X_train)
            k_indices = np.argpartition(distances, self.k, axis=1)[:, :self.k]
            k_nearest_labels = self.y_train[k_indices]
            batch_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=k_nearest_labels)
            predictions[i:i+self.batch_size] = batch_predictions
        
        return predictions

    @staticmethod
    def euclidean_distance(X1, X2):
        return np.sqrt(np.sum((X1[:, np.newaxis, :] - X2) ** 2, axis=2))

    @staticmethod
    def manhattan_distance(X1, X2):
        return np.sum(np.abs(X1[:, np.newaxis, :] - X2), axis=2)

    @staticmethod
    def cosine_distance(X1, X2):
        dot_product = np.einsum('ijk,jk->ij', X1[:, np.newaxis, :], X2)
        norm_X1 = np.linalg.norm(X1, axis=1)
        norm_X2 = np.linalg.norm(X2, axis=1)
        return 1 - (dot_product / (norm_X1[:, np.newaxis] * norm_X2))

def load_and_preprocess_data(file_path, target_column):
    data = pd.read_csv(file_path)
    
    # Separate features and target
    X = data.drop([target_column], axis=1)
    y = data[target_column]
    
    # Convert non-numeric columns to numeric where possible
    X = X.apply(pd.to_numeric, errors='coerce')
    
    # Drop columns that cannot be converted to numeric
    X = X.select_dtypes(include=[np.number]).dropna(axis=1, how='any')

    # Normalize the features (Min-Max scaling)
    X = (X - X.min()) / (X.max() - X.min())
    
    # Handle categorical labels
    if y.dtype == 'object':
        label_mapping = {label: idx for idx, label in enumerate(np.unique(y))}
        y = y.map(label_mapping)
    
    return X.values, y.values

def custom_train_test_split(X, y, test_size=0.2, random_state=None):
    if random_state is not None:
        np.random.seed(random_state)
    
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    
    test_size = int(X.shape[0] * test_size)
    train_indices = indices[:-test_size]
    test_indices = indices[-test_size:]
    
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    
    return X_train, X_test, y_train, y_test

class PCA:
    def __init__(self, n_components):
        self.n_components = n_components
        self.components = None
        self.mean = None

    def fit(self, X):
        # Center the data
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean

        # Compute the covariance matrix
        cov = np.cov(X_centered, rowvar=False)

        # Compute eigenvalues and eigenvectors
        eigenvalues, eigenvectors = np.linalg.eigh(cov)

        # Sort eigenvectors by decreasing eigenvalues
        idx = np.argsort(eigenvalues)[::-1]
        eigenvectors = eigenvectors[:, idx]

        # Store the first n_components eigenvectors
        self.components = eigenvectors[:, :self.n_components]

    def transform(self, X):
        # Check if mean has been computed
        if self.mean is None:
            raise ValueError("PCA has not been fitted. Call fit() before using transform().")
        
        # Center the data
        X_centered = X - self.mean

        # Project the data onto the principal components
        return np.dot(X_centered, self.components)

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

def knn_classification(X_train, X_test, y_train, y_test, k, distance_metric):
    knn = KNN(k=k, distance_metric=distance_metric)
    knn.fit(X_train, y_train)
    
    # Predict and evaluate performance
    y_pred = knn.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    f1_micro = f1_score(y_test, y_pred, average='micro')

    return accuracy, precision, recall, f1_macro, f1_micro

def Q9_main():
    # Load the dataset
    file_path = '../../data/external/spotify.csv'  # Update with your dataset path
    target_column = 'track_genre'  # Replace with the actual target column
    
    # Load and preprocess data
    X, y = load_and_preprocess_data(file_path, target_column)
    
    # Step 1: Perform PCA
    optimal_components = 3
    pca = PCA(n_components=optimal_components)
    X_reduced = pca.fit_transform(X)
    
    # Split the data into training and test sets using custom function
    X_train, X_test, y_train, y_test = custom_train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_reduced, X_test_reduced, _, _ = custom_train_test_split(X_reduced, y, test_size=0.2, random_state=42)
    
    # Step 2: Apply KNN with the best {k, distance metric} pair (k=19, distance='manhattan')
    k_best = 19
    distance_best = 'manhattan'
    
    print("\nPerforming KNN on full dataset...")
    full_accuracy, full_precision, full_recall, full_f1_macro, full_f1_micro = knn_classification(
        X_train, X_test, y_train, y_test, k_best, distance_best)
    
    print("\nPerforming KNN on PCA-reduced dataset...")
    reduced_accuracy, reduced_precision, reduced_recall, reduced_f1_macro, reduced_f1_micro = knn_classification(
        X_train_reduced, X_test_reduced, y_train, y_test, k_best, distance_best)
    
    # Step 3: Print evaluation metrics for full and reduced datasets
    print("\n---- KNN Performance on Full Dataset ----")
    print("Accuracy: {:.2f}".format(full_accuracy))
    print("Precision: {:.2f}".format(full_precision))
    print("Recall: {:.2f}".format(full_recall))
    print("F1 Score (Macro): {:.2f}".format(full_f1_macro))
    print("F1 Score (Micro): {:.2f}".format(full_f1_micro))
    
    print("\n---- KNN Performance on PCA-Reduced Dataset ----")
    print("Accuracy: {:.2f}".format(reduced_accuracy))
    print("Precision: {:.2f}".format(reduced_precision))
    print("Recall: {:.2f}".format(reduced_recall))
    print("F1 Score (Macro): {:.2f}".format(reduced_f1_macro))
    print("F1 Score (Micro): {:.2f}".format(reduced_f1_micro))
    
    # Train AutoEncoder on PCA-reduced dataset
    print("\nTraining AutoEncoder on PCA-reduced dataset...")
    autoencoder = AutoEncoder(input_size=optimal_components, hidden_sizes=[64, 32], latent_size=optimal_components)
    autoencoder.fit(X_train_reduced, epochs=1000, batch_size=32)
    
    # Get latent representation
    X_train_latent = autoencoder.get_latent(X_train_reduced)
    X_test_latent = autoencoder.get_latent(X_test_reduced)
    
    # Apply KNN on latent representation
    print("\nPerforming KNN on AutoEncoder latent representation...")
    latent_accuracy, latent_precision, latent_recall, latent_f1_macro, latent_f1_micro = knn_classification(
        X_train_latent, X_test_latent, y_train, y_test, k_best, distance_best)
    
    # Print evaluation metrics for latent representation
    print("\n---- KNN Performance on AutoEncoder Latent Representation ----")
    print("Accuracy: {:.2f}".format(latent_accuracy))
    print("Precision: {:.2f}".format(latent_precision))
    print("Recall: {:.2f}".format(latent_recall))
    print("F1 Score (Macro): {:.2f}".format(latent_f1_macro))
    print("F1 Score (Micro): {:.2f}".format(latent_f1_micro))
    
    # Train MLP classifier on original dataset
    print("\nTraining MLP classifier on original dataset...")
    input_size = X_train.shape[1]
    mlp_classifier = MLP(input_size=input_size, hidden_sizes=[64, 32], output_size=1, activation='sigmoid', learning_rate=0.01)
    mlp_classifier.fit(X_train, y_train, epochs=1000, batch_size=32)
    
    # Make predictions
    y_pred_mlp = mlp_classifier.predict(X_test)
    y_pred_mlp = (y_pred_mlp > 0.5).astype(int).flatten()
    
    # Calculate metrics
    accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
    precision_mlp = precision_score(y_test, y_pred_mlp, average='weighted')
    recall_mlp = recall_score(y_test, y_pred_mlp, average='weighted')
    f1_mlp = f1_score(y_test, y_pred_mlp, average='weighted')
    
    print("\nMLP Classifier Results:")
    print(f"Accuracy: {accuracy_mlp:.4f}")
    print(f"Precision: {precision_mlp:.4f}")
    print(f"Recall: {recall_mlp:.4f}")
    print(f"F1 Score: {f1_mlp:.4f}")
    
    # Compare results
    print("\nComparison:")
    print(f"AutoEncoder + KNN F1 Score: {latent_f1_macro:.4f}")
    print(f"MLP Classifier F1 Score: {f1_mlp:.4f}")
    print(f"Difference: {abs(latent_f1_macro - f1_mlp):.4f}")

# Run the main function
Q9_main()




Performing KNN on full dataset...

Performing KNN on PCA-reduced dataset...

---- KNN Performance on Full Dataset ----
Accuracy: 0.51
Precision: 0.51
Recall: 0.51
F1 Score (Macro): 0.50
F1 Score (Micro): 0.51

---- KNN Performance on PCA-Reduced Dataset ----
Accuracy: 0.13
Precision: 0.12
Recall: 0.13
F1 Score (Macro): 0.12
F1 Score (Micro): 0.13

Training AutoEncoder on PCA-reduced dataset...


ValueError: operands could not be broadcast together with shapes (32,3) (96,1) 

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

class MLP:
    def __init__(self, input_size, hidden_sizes, output_size, task='classification', activation='relu', 
                 learning_rate=0.01, epochs=100, batch_size=32, optimizer='sgd', patience=5, min_delta=0.001):
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.task = task
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.patience = patience
        self.min_delta = min_delta

        self.weights = []
        self.biases = []
        layer_sizes = [input_size] + hidden_sizes + [output_size]
        for i in range(1, len(layer_sizes)):
            self.weights.append(np.random.randn(layer_sizes[i-1], layer_sizes[i]) * np.sqrt(2. / layer_sizes[i-1]))
            self.biases.append(np.zeros((1, layer_sizes[i])))

        self.set_activation(activation)

    def set_activation(self, activation):
        if activation == 'sigmoid':
            self.activation = self.sigmoid
            self.activation_derivative = self.sigmoid_derivative
        elif activation == 'tanh':
            self.activation = self.tanh
            self.activation_derivative = self.tanh_derivative
        elif activation == 'relu':
            self.activation = self.relu
            self.activation_derivative = self.relu_derivative
        elif activation == 'linear':
            self.activation = self.linear
            self.activation_derivative = self.linear_derivative
        else:
            raise ValueError("Unsupported activation function")

    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-np.clip(x, -709, 709)))

    @staticmethod
    def sigmoid_derivative(x):
        return x * (1 - x)

    @staticmethod
    def tanh(x):
        return np.tanh(x)

    @staticmethod
    def tanh_derivative(x):
        return 1 - np.tanh(x)**2

    @staticmethod
    def relu(x):
        return np.maximum(0, x)

    @staticmethod
    def relu_derivative(x):
        return np.where(x > 0, 1, 0)

    @staticmethod
    def linear(x):
        return x

    @staticmethod
    def linear_derivative(x):
        return np.ones_like(x)

    def forward_propagation(self, X):
        self.layer_outputs = [X]
        for i in range(len(self.weights)):
            z = np.dot(self.layer_outputs[-1], self.weights[i]) + self.biases[i]
            if i < len(self.weights) - 1:
                a = self.activation(z)
            else:
                if self.task == 'classification':
                    a = self.sigmoid(z)
                else:
                    a = z
            self.layer_outputs.append(a)
        return self.layer_outputs[-1]

    def backward_propagation(self, X, y):
        m = X.shape[0]
        if self.task == 'classification':
            delta = self.layer_outputs[-1] - y
        else:
            delta = self.layer_outputs[-1] - y.reshape(-1, 1)
        gradients = []
        for i in reversed(range(len(self.weights))):
            dW = np.dot(self.layer_outputs[i].T, delta) / m
            db = np.sum(delta, axis=0, keepdims=True) / m
            gradients.append((dW, db))
            if i > 0:
                delta = np.dot(delta, self.weights[i].T) * self.activation_derivative(self.layer_outputs[i])
        return list(reversed(gradients))

    def update_parameters(self, gradients):
        for i, (dW, db) in enumerate(gradients):
            self.weights[i] -= self.learning_rate * dW
            self.biases[i] -= self.learning_rate * db

    def fit(self, X, y, X_val=None, y_val=None):
        self.losses = []
        self.val_losses = []
        best_val_loss = float('inf')
        patience_counter = 0

        for epoch in range(self.epochs):
            for i in range(0, X.shape[0], self.batch_size):
                batch_X = X[i:i+self.batch_size]
                batch_y = y[i:i+self.batch_size]
                
                y_pred = self.forward_propagation(batch_X)
                gradients = self.backward_propagation(batch_X, batch_y)
                self.update_parameters(gradients)
            
            loss = self.compute_loss(X, y)
            self.losses.append(loss)
            
            if X_val is not None and y_val is not None:
                val_loss = self.compute_loss(X_val, y_val)
                self.val_losses.append(val_loss)
                
                if val_loss < best_val_loss - self.min_delta:
                    best_val_loss = val_loss
                    patience_counter = 0
                else:
                    patience_counter += 1

                if patience_counter >= self.patience:
                    print(f"Early stopping at epoch {epoch}")
                    break
            
            if (epoch + 1) % 100 == 0:
                print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {loss:.4f}")

    def predict(self, X):
        return self.forward_propagation(X)

    def compute_loss(self, X, y):
        y_pred = self.forward_propagation(X)
        if self.task == 'classification':
            return -np.mean(y * np.log(y_pred + 1e-8) + (1 - y) * np.log(1 - y_pred + 1e-8))
        else:
            return np.mean((y_pred - y.reshape(-1, 1))**2)

# Load the Spotify dataset for classification
spotify_df = pd.read_csv('../../data/external/spotify.csv')

# Preprocess the dataset
X_spotify = spotify_df.drop('track_genre', axis=1)
y_spotify = spotify_df['track_genre']

# Convert non-numeric columns to numeric where possible
X_spotify = X_spotify.apply(pd.to_numeric, errors='coerce')

# Drop columns that cannot be converted to numeric
X_spotify = X_spotify.select_dtypes(include=[np.number]).dropna(axis=1, how='any')

# Encode the target labels
label_encoder = LabelEncoder()
y_spotify_encoded = label_encoder.fit_transform(y_spotify)

# One-hot encode the target labels
onehot_encoder = OneHotEncoder(sparse=False)
y_spotify_onehot = onehot_encoder.fit_transform(y_spotify_encoded.reshape(-1, 1))

# Split the data into training, validation, and test sets
X_train_s, X_temp_s, y_train_s, y_temp_s = train_test_split(X_spotify, y_spotify_onehot, test_size=0.3, random_state=42)
X_val_s, X_test_s, y_val_s, y_test_s = train_test_split(X_temp_s, y_temp_s, test_size=0.5, random_state=42)

# Standardize the data
scaler_s = StandardScaler()
X_train_s_standardized = scaler_s.fit_transform(X_train_s)
X_val_s_standardized = scaler_s.transform(X_val_s)
X_test_s_standardized = scaler_s.transform(X_test_s)

# Convert back to DataFrame for consistency
X_train_s = pd.DataFrame(X_train_s_standardized, columns=X_spotify.columns)
X_val_s = pd.DataFrame(X_val_s_standardized, columns=X_spotify.columns)
X_test_s = pd.DataFrame(X_test_s_standardized, columns=X_spotify.columns)

# Train model for classification
output_size = y_spotify_onehot.shape[1]
model_classification = MLP(input_size=X_train_s.shape[1], hidden_sizes=[64, 32], output_size=output_size, task='classification', activation='relu', epochs=1000)
model_classification.fit(X_train_s, y_train_s, X_val_s, y_val_s)

# Evaluate classification model
y_pred_classification = model_classification.predict(X_test_s)
y_pred_classification_labels = np.argmax(y_pred_classification, axis=1)
y_test_labels = np.argmax(y_test_s, axis=1)

accuracy = accuracy_score(y_test_labels, y_pred_classification_labels)
precision = precision_score(y_test_labels, y_pred_classification_labels, average='weighted')
recall = recall_score(y_test_labels, y_pred_classification_labels, average='weighted')
f1 = f1_score(y_test_labels, y_pred_classification_labels, average='weighted')

print(f"Classification Accuracy: {accuracy:.4f}")
print(f"Classification Precision: {precision:.4f}")
print(f"Classification Recall: {recall:.4f}")
print(f"Classification F1 Score: {f1:.4f}")

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [4]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score





# Example data (replace with actual data)
    # Load the dataset
file_path = '../../data/external/spotify.csv'  # Update with your dataset path
target_column = 'track_genre'  # Replace with the actual target column
    
    # Load and preprocess data
X, y = load_and_preprocess_data(file_path, target_column)
X_train, X_test, y_train, y_test = custom_train_test_split(X, y, test_size=0.2, random_state=42)
# X_train_reduced, X_test_reduced, _, _ = custom_train_test_split(X_reduced, y, test_size=0.2, random_state=42)

# Run the MLP classification
mlp_classification(X_train, X_test, y_train, y_test)


ValueError: cannot reshape array of size 32 into shape (114)

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from models.MLP.MLP import MLP  # Assuming MLP and AutoEncoder are defined in models/MLP/MLP.py
from models.KNN.KNN import KNN  # Assuming KNN is defined in models/KNN/KNN.py
from models.PCA.PCA import PCA  # Assuming PCA is defined in models/PCA/PCA.py


class AutoEncoder:
    def __init__(self, input_size, hidden_sizes, latent_size, activation='relu', learning_rate=0.01):
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.latent_size = latent_size
        
        # Define the encoder and decoder using the MLP class
        encoder_sizes = hidden_sizes + [latent_size]
        decoder_sizes = hidden_sizes[::-1] + [input_size]
        
        self.encoder = MLP(input_size, encoder_sizes[:-1], encoder_sizes[-1], activation=activation, learning_rate=learning_rate)
        self.decoder = MLP(latent_size, decoder_sizes[:-1], decoder_sizes[-1], activation=activation, learning_rate=learning_rate)
    
    def fit(self, X, epochs=1000, batch_size=32):
        for epoch in range(epochs):
            for i in range(0, X.shape[0], batch_size):
                X_batch = X[i:i+batch_size]
                
                # Forward pass
                latent = self.encoder.forward_propagation(X_batch)
                reconstructed = self.decoder.forward_propagation(latent)
                
                # Backward pass
                decoder_gradients = self.decoder.backward_propagation(latent, X_batch)
                encoder_gradients = self.encoder.backward_propagation(X_batch, latent)
                
                # Update parameters
                self.decoder.update_parameters(decoder_gradients)
                self.encoder.update_parameters(encoder_gradients)
            
            if epoch % 100 == 0:
                loss = np.mean((self.decoder.forward_propagation(self.encoder.forward_propagation(X)) - X) ** 2)
                print(f"Epoch {epoch}, Loss: {loss:.4f}")
    
    def get_latent(self, X):
        return self.encoder.predict(X)


def load_and_preprocess_data(file_path, target_column):
    data = pd.read_csv(file_path)
    X = data.drop([target_column], axis=1)
    y = data[target_column]
    X = X.apply(pd.to_numeric, errors='coerce')
    X = X.select_dtypes(include=[np.number]).dropna(axis=1, how='any')
    X = (X - X.min()) / (X.max() - X.min())
    if y.dtype == 'object':
        label_mapping = {label: idx for idx, label in enumerate(np.unique(y))}
        y = y.map(label_mapping)
    return X.values, y.values

def custom_train_test_split(X, y, test_size=0.2, random_state=None):
    if random_state is not None:
        np.random.seed(random_state)
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    test_size = int(X.shape[0] * test_size)
    train_indices = indices[:-test_size]
    test_indices = indices[-test_size:]
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    return X_train, X_test, y_train, y_test

def knn_classification(X_train, X_test, y_train, y_test, k, distance_metric):
    knn = KNN(k=k, distance_metric=distance_metric)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    f1_micro = f1_score(y_test, y_pred, average='micro')
    return accuracy, precision, recall, f1_macro, f1_micro

def Q9_main():
    file_path = '../../data/external/spotify.csv'
    target_column = 'track_genre'
    X, y = load_and_preprocess_data(file_path, target_column)
    optimal_components = 3
    pca = PCA(n_components=optimal_components)
    X_reduced = pca.fit_transform(X)
    X_train, X_test, y_train, y_test = custom_train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_reduced, X_test_reduced, _, _ = custom_train_test_split(X_reduced, y, test_size=0.2, random_state=42)
    k_best = 19
    distance_best = 'manhattan'
    
    print("\nPerforming KNN on full dataset...")
    full_accuracy, full_precision, full_recall, full_f1_macro, full_f1_micro = knn_classification(
        X_train, X_test, y_train, y_test, k_best, distance_best)
    
    print("\nPerforming KNN on PCA-reduced dataset...")
    reduced_accuracy, reduced_precision, reduced_recall, reduced_f1_macro, reduced_f1_micro = knn_classification(
        X_train_reduced, X_test_reduced, y_train, y_test, k_best, distance_best)
    
    print("\n---- KNN Performance on Full Dataset ----")
    print("Accuracy: {:.2f}".format(full_accuracy))
    print("Precision: {:.2f}".format(full_precision))
    print("Recall: {:.2f}".format(full_recall))
    print("F1 Score (Macro): {:.2f}".format(full_f1_macro))
    print("F1 Score (Micro): {:.2f}".format(full_f1_micro))
    
    print("\n---- KNN Performance on PCA-Reduced Dataset ----")
    print("Accuracy: {:.2f}".format(reduced_accuracy))
    print("Precision: {:.2f}".format(reduced_precision))
    print("Recall: {:.2f}".format(reduced_recall))
    print("F1 Score (Macro): {:.2f}".format(reduced_f1_macro))
    print("F1 Score (Micro): {:.2f}".format(reduced_f1_micro))
    
    print("\nTraining AutoEncoder on PCA-reduced dataset...")
    autoencoder = AutoEncoder(input_size=optimal_components, hidden_sizes=[64, 32], latent_size=optimal_components)
    autoencoder.fit(X_train_reduced, epochs=1000, batch_size=32)
    X_train_latent = autoencoder.get_latent(X_train_reduced)
    X_test_latent = autoencoder.get_latent(X_test_reduced)
    
    print("\nPerforming KNN on AutoEncoder latent representation...")
    latent_accuracy, latent_precision, latent_recall, latent_f1_macro, latent_f1_micro = knn_classification(
        X_train_latent, X_test_latent, y_train, y_test, k_best, distance_best)
    
    print("\n---- KNN Performance on AutoEncoder Latent Representation ----")
    print("Accuracy: {:.2f}".format(latent_accuracy))
    print("Precision: {:.2f}".format(latent_precision))
    print("Recall: {:.2f}".format(latent_recall))
    print("F1 Score (Macro): {:.2f}".format(latent_f1_macro))
    print("F1 Score (Micro): {:.2f}".format(latent_f1_micro))
    
    print("\nTraining MLP classifier on original dataset...")
    input_size = X_train.shape[1]
    mlp_classifier = MLP(input_size=input_size, hidden_sizes=[64, 32], output_size=1, activation='sigmoid', learning_rate=0.01)
    mlp_classifier.fit(X_train, y_train, epochs=1000, batch_size=32)
    y_pred_mlp = mlp_classifier.predict(X_test)
    y_pred_mlp = (y_pred_mlp > 0.5).astype(int).flatten()
    
    accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
    precision_mlp = precision_score(y_test, y_pred_mlp, average='weighted')
    recall_mlp = recall_score(y_test, y_pred_mlp, average='weighted')
    f1_mlp = f1_score(y_test, y_pred_mlp, average='weighted')
    
    print("\nMLP Classifier Results:")
    print(f"Accuracy: {accuracy_mlp:.4f}")
    print(f"Precision: {precision_mlp:.4f}")
    print(f"Recall: {recall_mlp:.4f}")
    print(f"F1 Score: {f1_mlp:.4f}")
    
    print("\nComparison:")
    print(f"AutoEncoder + KNN F1 Score: {latent_f1_macro:.4f}")
    print(f"MLP Classifier F1 Score: {f1_mlp:.4f}")
    print(f"Difference: {abs(latent_f1_macro - f1_mlp):.4f}")

# Run the main function
Q9_main()

ModuleNotFoundError: No module named 'models'