# Number 1 :

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import mahalanobis
import matplotlib.pyplot as plt

# Load MNIST data from CSV files
train_data = pd.read_csv('mnist_train.csv')
test_data = pd.read_csv('mnist_test.csv')


In [2]:

# Separate features and labels
X_train, y_train = train_data.iloc[:, 1:].values, train_data.iloc[:, 0].values
X_test, y_test = test_data.iloc[:, 1:].values, test_data.iloc[:, 0].values

# Reduce dimensions using PCA (adjust n_components as needed)
pca = PCA(n_components=50)  # 50 is arbitrary; adjust based on testing
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Define function to compute Mahalanobis distance for KNN
def mahalanobis_distance(X_train, X_test):
    cov_matrix = np.cov(X_train, rowvar=False)
    inv_cov_matrix = np.linalg.inv(cov_matrix)
    distances = np.array([
        [mahalanobis(x_test, x_train, inv_cov_matrix) for x_train in X_train]
        for x_test in X_test
    ])
    return distances



In [3]:
from scipy.stats import mode

# Evaluate KNN with various values of k and both Euclidean and Mahalanobis distances
k_values = range(1, 21)
errors_euclidean = []
errors_mahalanobis = []

for k in k_values:
    # Euclidean distance
    knn_euclidean = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    knn_euclidean.fit(X_train_pca, y_train)
    y_pred_euclidean = knn_euclidean.predict(X_test_pca)
    error_euclidean = 1 - accuracy_score(y_test, y_pred_euclidean)
    errors_euclidean.append(error_euclidean)

    # Mahalanobis distance
    distances = mahalanobis_distance(X_train_pca, X_test_pca)
    y_pred_mahalanobis = np.array([mode(y_train[distances[i].argsort()[:k]]).mode[0] for i in range(len(X_test_pca))])
    error_mahalanobis = 1 - accuracy_score(y_test, y_pred_mahalanobis)
    errors_mahalanobis.append(error_mahalanobis)


# Plot error rates for different values of k
plt.figure(figsize=(12, 6))
plt.plot(k_values, errors_euclidean, marker='o', label='Euclidean Distance')
plt.plot(k_values, errors_mahalanobis, marker='s', label='Mahalanobis Distance')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Error Rate')
plt.title('Error Rate vs. Number of Neighbors (k) for Euclidean and Mahalanobis Distances')
plt.legend()
plt.show()


IndexError: invalid index to scalar variable.

# Number 2 :

In [None]:
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from scipy.cluster.hierarchy import linkage, fcluster
import matplotlib.pyplot as plt

# Load MNIST data from CSV files
train_data = pd.read_csv('mnist_train.csv')
test_data = pd.read_csv('mnist_test.csv')

# Separate features and labels
X_train, y_train = train_data.iloc[:, 1:].values, train_data.iloc[:, 0].values
X_test, y_test = test_data.iloc[:, 1:].values, test_data.iloc[:, 0].values

In [None]:
# Reduce dimensions using PCA
pca = PCA(n_components=50)  # Adjust n_components based on testing
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Define function to train GMM and calculate accuracy
def gmm_accuracy(X_train, X_test, y_train, y_test, covariance_type):
    gmm = GaussianMixture(n_components=10, covariance_type=covariance_type, random_state=0)
    gmm.fit(X_train)
    
    y_train_pred = gmm.predict(X_train)
    y_test_pred = gmm.predict(X_test)
    
    # Map predicted labels to actual labels using majority voting
    label_map = {}
    for label in np.unique(y_train):
        mask = (y_train_pred == label)
        label_map[label] = np.bincount(y_train[mask]).argmax()

    y_test_mapped = np.vectorize(label_map.get)(y_test_pred)
    accuracy = accuracy_score(y_test, y_test_mapped)
    return accuracy

# Evaluate GMM with different covariance types
covariance_types = ['full', 'tied', 'diag', 'spherical']
accuracies = {}

for cov_type in covariance_types:
    accuracy = gmm_accuracy(X_train_pca, X_test_pca, y_train, y_test, cov_type)
    accuracies[cov_type] = accuracy
    print(f"Covariance Type: {cov_type}, Accuracy: {accuracy:.4f}")

In [None]:
# Use hierarchical clustering to separate classes into layers
def hierarchical_clustering(X, num_clusters=10):
    Z = linkage(X, method='ward')
    clusters = fcluster(Z, num_clusters, criterion='maxclust')
    return clusters

# Perform hierarchical clustering on the training data
clusters = hierarchical_clustering(X_train_pca)

In [None]:
# Train GMM within each cluster
cluster_accuracies = []
for cluster_id in np.unique(clusters):
    X_train_cluster = X_train_pca[clusters == cluster_id]
    y_train_cluster = y_train[clusters == cluster_id]
    
    # Train GMM on this subset
    gmm = GaussianMixture(n_components=10, covariance_type='full', random_state=0)
    gmm.fit(X_train_cluster)
    
    y_test_pred_cluster = gmm.predict(X_test_pca)
    accuracy_cluster = accuracy_score(y_test, y_test_pred_cluster)
    cluster_accuracies.append(accuracy_cluster)

print(f"Average accuracy after hierarchical clustering: {np.mean(cluster_accuracies):.4f}")

# Plotting the results
plt.figure(figsize=(10, 6))
plt.bar(accuracies.keys(), accuracies.values())
plt.xlabel('Covariance Type')
plt.ylabel('Accuracy')
plt.title('Accuracy of GMM with Different Covariance Types on MNIST')
plt.show()

# Number 3 : 

In [None]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Load MNIST data from CSV files
train_data = pd.read_csv('mnist_train.csv')
test_data = pd.read_csv('mnist_test.csv')

# Separate features and labels
X_train, y_train = train_data.iloc[:, 1:].values, train_data.iloc[:, 0].values
X_test, y_test = test_data.iloc[:, 1:].values, test_data.iloc[:, 0].values

# Initialize the Gaussian Naive Bayes classifier
nb_classifier = GaussianNB()

In [None]:
# Train the model on the training data
nb_classifier.fit(X_train, y_train)

In [None]:
# Predict labels on the training and testing data
y_train_pred = nb_classifier.predict(X_train)
y_test_pred = nb_classifier.predict(X_test)

In [None]:
# Calculate and display confusion matrices
train_cm = confusion_matrix(y_train, y_train_pred)
test_cm = confusion_matrix(y_test, y_test_pred)

In [None]:
# Plot confusion matrices
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
ConfusionMatrixDisplay(train_cm).plot(ax=ax[0], cmap='Blues')
ax[0].set_title("Training Confusion Matrix")

ConfusionMatrixDisplay(test_cm).plot(ax=ax[1], cmap='Blues')
ax[1].set_title("Testing Confusion Matrix")
plt.show()

# Number 4 :

In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Load MNIST data from CSV files
train_data = pd.read_csv('mnist_train.csv')
test_data = pd.read_csv('mnist_test.csv')

# Separate features and labels
X_train, y_train = train_data.iloc[:, 1:].values, train_data.iloc[:, 0].values
X_test, y_test = test_data.iloc[:, 1:].values, test_data.iloc[:, 0].values

# Initialize and train a Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=0)
dt_classifier.fit(X_train, y_train)

In [None]:
# Predict labels for both training and testing data
y_train_pred_dt = dt_classifier.predict(X_train)
y_test_pred_dt = dt_classifier.predict(X_test)

# Initialize and train a Random Forest classifier
rf_classifier = RandomForestClassifier(
    n_estimators=100,           # Number of trees in the forest
    criterion='gini',           # Function to measure the quality of a split
    max_depth=None,             # Maximum depth of the tree (None means nodes are expanded until pure)
    min_samples_split=2,        # Minimum samples required to split a node
    min_samples_leaf=1,         # Minimum samples required to be at a leaf node
    max_features='sqrt',        # Number of features to consider when looking for the best split
    bootstrap=True,             # Whether bootstrap samples are used when building trees
    random_state=0
)
rf_classifier.fit(X_train, y_train)

In [None]:
# Predict labels for both training and testing data
y_train_pred_rf = rf_classifier.predict(X_train)
y_test_pred_rf = rf_classifier.predict(X_test)

# Calculate and display confusion matrices for Decision Tree
train_cm_dt = confusion_matrix(y_train, y_train_pred_dt)
test_cm_dt = confusion_matrix(y_test, y_test_pred_dt)

In [None]:
# Calculate and display confusion matrices for Random Forest
train_cm_rf = confusion_matrix(y_train, y_train_pred_rf)
test_cm_rf = confusion_matrix(y_test, y_test_pred_rf)

# Plot confusion matrices
fig, ax = plt.subplots(2, 2, figsize=(14, 12))
ConfusionMatrixDisplay(train_cm_dt).plot(ax=ax[0, 0], cmap='Blues')
ax[0, 0].set_title("Decision Tree - Training Confusion Matrix")

ConfusionMatrixDisplay(test_cm_dt).plot(ax=ax[0, 1], cmap='Blues')
ax[0, 1].set_title("Decision Tree - Testing Confusion Matrix")

ConfusionMatrixDisplay(train_cm_rf).plot(ax=ax[1, 0], cmap='Greens')
ax[1, 0].set_title("Random Forest - Training Confusion Matrix")

ConfusionMatrixDisplay(test_cm_rf).plot(ax=ax[1, 1], cmap='Greens')
ax[1, 1].set_title("Random Forest - Testing Confusion Matrix")

plt.show()

# Number 5 :

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Load MNIST data from CSV files
train_data = pd.read_csv('mnist_train.csv')
test_data = pd.read_csv('mnist_test.csv')

# Separate features and labels
X_train, y_train = train_data.iloc[:, 1:].values, train_data.iloc[:, 0].values
X_test, y_test = test_data.iloc[:, 1:].values, test_data.iloc[:, 0].values

# Initialize lists to store results for plotting
c_values = [0.1, 1, 10, 100]
train_errors_poly, test_errors_poly = [], []
train_errors_rbf, test_errors_rbf = [], []

In [None]:
# Polynomial kernel SVM
for C in c_values:
    svm_poly = SVC(kernel='poly', degree=3, C=C, random_state=0)
    svm_poly.fit(X_train, y_train)
    
    # Predictions and error calculation
    y_train_pred_poly = svm_poly.predict(X_train)
    y_test_pred_poly = svm_poly.predict(X_test)
    train_errors_poly.append(np.mean(y_train != y_train_pred_poly))
    test_errors_poly.append(np.mean(y_test != y_test_pred_poly))

    # Display confusion matrix for C=1
    if C == 1:
        train_cm_poly = confusion_matrix(y_train, y_train_pred_poly)
        test_cm_poly = confusion_matrix(y_test, y_test_pred_poly)
        fig, ax = plt.subplots(1, 2, figsize=(12, 5))
        ConfusionMatrixDisplay(train_cm_poly).plot(ax=ax[0], cmap='Blues')
        ax[0].set_title("SVM Polynomial Kernel - Training Confusion Matrix (C=1)")
        ConfusionMatrixDisplay(test_cm_poly).plot(ax=ax[1], cmap='Blues')
        ax[1].set_title("SVM Polynomial Kernel - Testing Confusion Matrix (C=1)")

In [None]:
# RBF kernel SVM
for C in c_values:
    svm_rbf = SVC(kernel='rbf', C=C, random_state=0)
    svm_rbf.fit(X_train, y_train)
    
    # Predictions and error calculation
    y_train_pred_rbf = svm_rbf.predict(X_train)
    y_test_pred_rbf = svm_rbf.predict(X_test)
    train_errors_rbf.append(np.mean(y_train != y_train_pred_rbf))
    test_errors_rbf.append(np.mean(y_test != y_test_pred_rbf))

    # Display confusion matrix for C=1
    if C == 1:
        train_cm_rbf = confusion_matrix(y_train, y_train_pred_rbf)
        test_cm_rbf = confusion_matrix(y_test, y_test_pred_rbf)
        fig, ax = plt.subplots(1, 2, figsize=(12, 5))
        ConfusionMatrixDisplay(train_cm_rbf).plot(ax=ax[0], cmap='Greens')
        ax[0].set_title("SVM RBF Kernel - Training Confusion Matrix (C=1)")
        ConfusionMatrixDisplay(test_cm_rbf).plot(ax=ax[1], cmap='Greens')
        ax[1].set_title("SVM RBF Kernel - Testing Confusion Matrix (C=1)")

In [None]:
# Plotting loss for different values of C
plt.figure(figsize=(10, 6))
plt.plot(c_values, train_errors_poly, label="Train Error (Poly Kernel)", marker='o')
plt.plot(c_values, test_errors_poly, label="Test Error (Poly Kernel)", marker='o')
plt.plot(c_values, train_errors_rbf, label="Train Error (RBF Kernel)", marker='x')
plt.plot(c_values, test_errors_rbf, label="Test Error (RBF Kernel)", marker='x')
plt.xscale('log')
plt.xlabel("C (Soft Margin)")
plt.ylabel("Error Rate")
plt.title("Error Rate vs. C for Polynomial and RBF Kernels")
plt.legend()
plt.show()