In [100]:
#importing all requried libraries
import pandas as pd
import numpy as np

#Preprocessing and model selection libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Tesnor flow for neural networks
import tensorflow as tf
from tensorflow.keras import layers, optimizers

#Classification  and accuracy scoring libraries
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#iterative library to generate all possible combinations 
from itertools import combinations

In [101]:
#loading the data
data = pd.read_csv('tmnst DATA SET.csv')

In [102]:
#Shape of the data frame
data.shape

(74724, 785)

In [103]:
data

Unnamed: 0,labels,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,D,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,F,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,J,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,H,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74719,U,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74720,R,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74721,N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74722,J,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [104]:
#Data types in the data frame
data.dtypes

labels    object
1          int64
2          int64
3          int64
4          int64
           ...  
780        int64
781        int64
782        int64
783        int64
784        int64
Length: 785, dtype: object

In [105]:
#Removing columns which are all zeros
data = data.loc[:, (data != 0).any(axis=0)]
data.shape

(74724, 690)

In [106]:
#Please split each class into 70% train and 30% test split
X = data.iloc[:,1:]
y = data['labels']
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.3)

In [107]:
print('X_train Shape',X_train.shape)
print('X_test Shape',X_test.shape)
print('Y_train Shape',Y_train.shape)
print('Y_test Shape',Y_test.shape)

#Normalizing the data frame
X_train = X_train/255
X_test = X_test/255


X_train Shape (52306, 689)
X_test Shape (22418, 689)
Y_train Shape (52306,)
Y_test Shape (22418,)


# PROBLEM 1  Neural Network Classifier

In [108]:
# Convert Y_train from pandas DataFrame to numpy array
Y_train_array = Y_train.to_numpy()
Y_test_array = Y_test.to_numpy()
# Create an instance of OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform Y_train to one-hot encode it
Y_train_encoded = encoder.fit_transform(Y_train_array.reshape(-1, 1)).toarray()
Y_test_encoded = encoder.fit_transform(Y_test_array.reshape(-1, 1)).toarray()

# Now you can proceed with the rest of your code as before
Y_train_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [109]:
#Function to build neural network
#Paramaters: hidden_layers = number of hidden layers
def build_model(hidden_layers):
    model = tf.keras.models.Sequential()
    for idx, neurons in enumerate(hidden_layers):
        model.add(layers.Dense(neurons, name=f'Hidden_Layer_{idx + 1}', activation='relu', input_shape=(X_train.shape[1],)))
    model.add(layers.Dense(26, name='Output_Layer', activation='softmax'))  # Assuming 26 output classes
    return model

In [110]:
# Define architectures to try along with their names
architectures = [
    ([5], "1-5"),
    ([10], "1-10"),
    ([20], "1-20"),
    ([25], "1-25"),
    ([5, 5], "2-5-5"),
    ([5, 10], "2-5-10"),
    ([10, 5], "2-10-5"),
    ([10, 10], "2-10-10")
]

results = []

In [None]:
# Iterate over each architecture
for architecture, name in architectures:
    # Build the model
    model = build_model(architecture)
    
    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    # Fit the model with one-hot encoded Y_train
    history = model.fit(X_train, Y_train_encoded, epochs=50, batch_size=250, validation_data=(X_test, Y_test_encoded))
    
    # Compute the number of parameters
    num_params = model.count_params()
    
    # Get final accuracy
    final_accuracy = history.history['val_accuracy'][-1]
    
    # Store results along with architecture name
    results.append((name, num_params, final_accuracy))


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



Epoch 1/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.1356 - loss: 2.9506 - val_accuracy: 0.4243 - val_loss: 2.0909
Epoch 2/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4720 - loss: 1.9026 - val_accuracy: 0.6390 - val_loss: 1.4073
Epoch 3/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6611 - loss: 1.3129 - val_accuracy: 0.7541 - val_loss: 1.0489
Epoch 4/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7699 - loss: 0.9899 - val_accuracy: 0.8051 - val_loss: 0.8856
Epoch 5/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8078 - loss: 0.8516 - val_accuracy: 0.8246 - val_loss: 0.8005
Epoch 6/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8227 - loss: 0.7782 - val_accuracy: 0.8373 - val_loss: 0.7459
Epoch 7/50
[1m210/210[0m 

[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.3007 - loss: 2.6186 - val_accuracy: 0.7986 - val_loss: 1.0187
Epoch 2/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8287 - loss: 0.8553 - val_accuracy: 0.8714 - val_loss: 0.6027
Epoch 3/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8791 - loss: 0.5596 - val_accuracy: 0.8912 - val_loss: 0.4916
Epoch 4/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8932 - loss: 0.4727 - val_accuracy: 0.9014 - val_loss: 0.4382
Epoch 5/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9061 - loss: 0.4189 - val_accuracy: 0.9087 - val_loss: 0.4091
Epoch 6/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9116 - loss: 0.3848 - val_accuracy: 0.9130 - val_loss: 0.3893
Epoch 7/50
[1m210/210[0m [32m━━━━━━

[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.4829 - loss: 2.1671 - val_accuracy: 0.8817 - val_loss: 0.5878
Epoch 2/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8957 - loss: 0.5089 - val_accuracy: 0.9127 - val_loss: 0.4030
Epoch 3/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9185 - loss: 0.3784 - val_accuracy: 0.9234 - val_loss: 0.3405
Epoch 4/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9301 - loss: 0.3187 - val_accuracy: 0.9325 - val_loss: 0.3077
Epoch 5/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9351 - loss: 0.2914 - val_accuracy: 0.9367 - val_loss: 0.2865
Epoch 6/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9404 - loss: 0.2630 - val_accuracy: 0.9399 - val_loss: 0.2727
Epoch 7/50
[1m210/210[0m [32m━━━━━━━

[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.5051 - loss: 2.0687 - val_accuracy: 0.8884 - val_loss: 0.5379
Epoch 2/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9003 - loss: 0.4649 - val_accuracy: 0.9185 - val_loss: 0.3713
Epoch 3/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9243 - loss: 0.3415 - val_accuracy: 0.9293 - val_loss: 0.3184
Epoch 4/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9337 - loss: 0.2990 - val_accuracy: 0.9365 - val_loss: 0.2855
Epoch 5/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9400 - loss: 0.2666 - val_accuracy: 0.9396 - val_loss: 0.2718
Epoch 6/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9438 - loss: 0.2516 - val_accuracy: 0.9434 - val_loss: 0.2559
Epoch 7/50
[1m210/210[0m [32m━━━━━━━

[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.0919 - loss: 3.0085 - val_accuracy: 0.3379 - val_loss: 2.1844
Epoch 2/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4461 - loss: 1.8894 - val_accuracy: 0.6571 - val_loss: 1.2691
Epoch 3/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6832 - loss: 1.1444 - val_accuracy: 0.7647 - val_loss: 0.9169
Epoch 4/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7792 - loss: 0.8621 - val_accuracy: 0.8119 - val_loss: 0.7645
Epoch 5/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8250 - loss: 0.7213 - val_accuracy: 0.8346 - val_loss: 0.6933
Epoch 6/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8394 - loss: 0.6784 - val_accuracy: 0.8439 - val_loss: 0.6557
Epoch 7/50
[1m210/210[0m [32m━━━━━━━

[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.1416 - loss: 2.9552 - val_accuracy: 0.5253 - val_loss: 1.7488
Epoch 2/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6213 - loss: 1.4568 - val_accuracy: 0.7549 - val_loss: 0.9847
Epoch 3/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7827 - loss: 0.8995 - val_accuracy: 0.8142 - val_loss: 0.7775
Epoch 4/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8246 - loss: 0.7451 - val_accuracy: 0.8355 - val_loss: 0.6958
Epoch 5/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8413 - loss: 0.6795 - val_accuracy: 0.8474 - val_loss: 0.6528
Epoch 6/50
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8517 - loss: 0.6462 - val_accuracy: 0.8596 - val_loss: 0.6229
Epoch 7/50
[1m210/210[0m [32m━━━━━━━

In [None]:
modelsdf = pd.DataFrame(results,columns = ['Architecture Name', 'Parameters','Accuracy'])

In [None]:
modelsdf

In [None]:
plt.scatter(modelsdf['Architecture Name'], modelsdf['Accuracy'], color='blue')
plt.title('Architecture vs. Accuracy')
plt.xlabel('Architecture')
plt.ylabel('Accuracy')
plt.grid(True)
plt.show()

Inference:
In the above code, different neural network architectures were created with varying numbers of layers and neurons, all using the same activation function (ReLU, Sigmoid(output)), optimizer (Adam), and loss function (Categorical Crossentropy). Each model was trained for 50 epochs with a batch size of 250.

Upon plotting a scatter plot of the accuracies achieved by these models, it was observed that as the number of neurons increased, the model accuracies also increased. Initially, when increasing the number of hidden layers, there was a slight drop in accuracy, but as the number of neurons in these layers increased, the accuracy also increased.

From this analysis, it can be inferred that increasing the number of neurons aids in improving the model's accuracy by allowing it to capture more complex patterns in the data. Additionally, increasing the depth of the network (i.e., adding more hidden layers) can lead to better performance, provided that an adequate number of neurons are present in these layers to effectively learn from the data.

# PROBLEM 2  SVM Classifier

In [None]:
#Function to load and get the accuraices of linear SVM model
#Parameters: X_train, Y_train, X_test, Y_test : split data, c = hyperparameter
def svm_linear(X_train, X_test, Y_train, Y_test,c):
    svm = SVC(kernel = 'linear', C=c)
    svm.fit(X_train, Y_train)
    num_vectors = len(svm.support_vectors_)
    train_accuracy = accuracy_score(Y_train, svm.predict(X_train))
    test_accuracy = accuracy_score(Y_test, svm.predict(X_test))
    return num_vectors, train_accuracy, test_accuracy

#Function to load and get the accuraices of Polynomial SVM model
#Parameters: X_train, Y_train, X_test, Y_test : split data, c = hyperparameter
def svm_polynomial(X_train, X_test, Y_train, Y_test, d,  c):
    svm = SVC(kernel ='poly',degree = d, C=c)
    svm.fit(X_train, Y_train)
    num_vectors = len(svm.support_vectors_)
    train_accuracy = accuracy_score(Y_train, svm.predict(X_train))
    test_accuracy = accuracy_score(Y_test, svm.predict(X_test))
    return num_vectors, train_accuracy, test_accuracy

#Function to load and get the accuraices of rbf SVM model
#Parameters: X_train, Y_train, X_test, Y_test : split data, c = hyperparameter
def svm_rbf(X_train, X_test, Y_train, Y_test, sigma):
    svm = SVC(kernel = 'rbf',gamma = 1/(2*sigma**2))
    svm.fit(X_train, Y_train)
    num_vectors = len(svm.support_vectors_)
    train_accuracy = accuracy_score(Y_train, svm.predict(X_train))
    test_accuracy = accuracy_score(Y_test, svm.predict(X_test))
    return num_vectors, train_accuracy, test_accuracy

In [None]:
#Funtion to run all the models, return accuracies 
#Parameters: c1, c2, are the classes need to be tested.
def svm_explore(X_train, X_test, Y_train, Y_test, c1, c2):
    Y_train_indices = (Y_train== c1) | (Y_train==c2)
    Y_test_indices = (Y_test==c1)| (Y_test==c2)
    X_train_filtered = X_train[Y_train_indices]
    X_test_filtered = X_test[Y_test_indices]
    Y_train_filtered = Y_train[Y_train_indices]
    Y_test_filtered = Y_test[Y_test_indices]
    results = []
    for i in range(5,51,5):
        Num_vectors, Train_accuracy, Test_accuracy = svm_linear(X_train= X_train_filtered, X_test=X_test_filtered, Y_train=Y_train_filtered, Y_test=Y_test_filtered,c=i)
        results.append((f"Linear/C={i}",Num_vectors, Train_accuracy, Test_accuracy))
    for i in range(2,6):
        Num_Vectors, Train_accuracy, Test_accuracy = svm_polynomial(X_train= X_train_filtered, X_test=X_test_filtered, Y_train=Y_train_filtered, Y_test=Y_test_filtered, d = i,c=10)
        results.append((f"Poly/d={i}",Num_vectors, Train_accuracy, Test_accuracy))
    for i in range(2,10,2):
        Num_vectors, Train_accuracy, Test_accuracy = svm_rbf(X_train= X_train_filtered, X_test=X_test_filtered, Y_train=Y_train_filtered, Y_test=Y_test_filtered, sigma = i)
        results.append((f"RBF/sigma={i}",Num_vectors, Train_accuracy, Test_accuracy))
    return pd.DataFrame(results, columns = ['Name', 'Number of Vectors','Train Accuracy', 'Test Accuracy'])


In [None]:
c1= 'A'
c2 = 'Z'
res = svm_explore(X_train, X_test, Y_train, Y_test, c1, c2)

In [None]:
res

In [None]:
# Extracting data for plotting
linear_data = res.iloc[:10]
poly_data = res.iloc[10:14]
rbf_data = res.iloc[14:]

# Plotting training and test accuracies vs. complexity parameters for Linear SVM
plt.figure(figsize=(12, 20))
plt.subplot(3, 1, 1)
plt.plot(range(len(linear_data)), linear_data['Train Accuracy'], marker='o', label='Train Accuracy')
plt.plot(range(len(linear_data)), linear_data['Test Accuracy'], marker='o', label='Test Accuracy')
plt.xticks(range(len(linear_data)), linear_data['Name'], rotation=45)
plt.xlabel('Linear SVM Models')
plt.ylabel('Accuracy')
plt.title('Linear SVM Accuracy vs. Complexity Parameters')
plt.legend()

# Plotting training and test accuracies vs. complexity parameters for Polynomial SVM
plt.subplot(3, 1, 2)
plt.plot(range(len(poly_data)), poly_data['Train Accuracy'], marker='o', label='Train Accuracy')
plt.plot(range(len(poly_data)), poly_data['Test Accuracy'], marker='o', label='Test Accuracy')
plt.xticks(range(len(poly_data)), poly_data['Name'], rotation=45)
plt.xlabel('Polynomial SVM Models')
plt.ylabel('Accuracy')
plt.title('Polynomial SVM Accuracy vs. Complexity Parameters')
plt.legend()

# Plotting training and test accuracies vs. complexity parameters for RBF SVM
plt.subplot(3, 1, 3)
plt.plot(range(len(rbf_data)), rbf_data['Train Accuracy'], marker='o', label='Train Accuracy')
plt.plot(range(len(rbf_data)), rbf_data['Test Accuracy'], marker='o', label='Test Accuracy')
plt.xticks(range(len(rbf_data)), rbf_data['Name'], rotation=45)
plt.xlabel('RBF SVM Models')
plt.ylabel('Accuracy')
plt.title('RBF SVM Accuracy vs. Complexity Parameters')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
#Scatter plot of Support vectors vs Training Accuracy
scatter = res.iloc[:9]
plt.scatter(res.iloc[:,1],res.iloc[:,3])
plt.xlabel('Accuracy')
plt.ylabel('Number of support Vectors')
plt.title('Number of support Vectors Vs Training Accuracy')
plt.show()

Inferences:
1. In the Linear model SVM, the model is robust to changes in the hyperparameter, with consistent high accuracies and a constant number of support vectors.
2. In the Polynomial model SVM, increasing the polynomial degree leads to decreased accuracies, potentially indicating overfitting, while the number of support vectors remains constant.
3. In the RBF model SVM, increasing the sigma value generally leads to improved accuracies, and the number of support vectors varies, indicating changes in the complexity of the decision boundary.

# PROBLEM 3  Random Forrest

In [None]:
#Funtion to call random forrest and run it for different hyperparameters:
#Parameters: c1, c2 classes need to be tested.
def random_forrest_explore(X_train, Y_train, Y_test, X_test, c1, c2):
    Y_train_indices = (Y_train==c1)|(Y_train==c2)
    Y_test_indices = (Y_test==c1)|(Y_test==c2)
    X_train_filtered = X_train[Y_train_indices]
    X_test_filtered = X_test[Y_test_indices]
    Y_train_filtered = Y_train[Y_train_indices]
    Y_test_filtered = Y_test[Y_test_indices]
    results = []
    for i in range(5,101,5): #Hyperparameter loop to increase the trees.
        for j in range(3,10,1): #To change the depths
            randomForest = RandomForestClassifier(n_estimators=i,max_depth=j)
            randomForest.fit(X_train_filtered,Y_train_filtered)
            train_accuracy = accuracy_score(Y_train_filtered, randomForest.predict(X_train_filtered))
            test_accuracy = accuracy_score(Y_test_filtered, randomForest.predict(X_test_filtered))
            results.append((i,j, train_accuracy, test_accuracy))
    return pd.DataFrame(results, columns=['Number of Trees','Depth','Train Accuracy', 'Test Accuracy'])

In [None]:
res3 = random_forrest_explore(X_train, Y_train, Y_test, X_test, c1='A', c2='Z')

In [None]:
res3

In [None]:
res3

In [None]:
# Plot 1: Depth on the x-axis with one line for training and one for test for different numbers of trees
plt.figure(figsize=(15, 10))
for n_trees, group in res3.groupby('Number of Trees'):
    plt.plot(group['Depth'], group['Train Accuracy'], label=f'Train (n_trees={n_trees})')
    plt.plot(group['Depth'], group['Test Accuracy'], label=f'Test (n_trees={n_trees})')
plt.xlabel('Depth')
plt.ylabel('Accuracy')
plt.title('Accuracy vs Depth')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize='large')
plt.grid(True)
plt.show()

# Plot 2: Number of trees on the x-axis with one line for training and one for test for different depths
plt.figure(figsize=(12, 6))
for depth, group in res3.groupby('Depth'):
    plt.plot(group['Number of Trees'], group['Train Accuracy'], label=f'Train (depth={depth})')
    plt.plot(group['Number of Trees'], group['Test Accuracy'], label=f'Test (depth={depth})')
plt.xlabel('Number of Trees')
plt.ylabel('Accuracy')
plt.title('Accuracy vs Number of Trees')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
#plotting using plotly expresses just to get labels on hovering
fig1 = px.line(res3, x='Depth', y=['Train Accuracy', 'Test Accuracy'], color='Number of Trees',
               labels={'value': 'Accuracy', 'Depth': 'Depth', 'Number of Trees': 'Number of Trees'},
               title='Accuracy vs Depth')
fig2 = px.line(res3, x='Number of Trees', y=['Train Accuracy', 'Test Accuracy'], color='Depth',
               labels={'value': 'Accuracy', 'Number of Trees': 'Number of Trees', 'Depth': 'Depth'},
               title='Accuracy vs Number of Trees')

fig1.show()
fig2.show()


1. Increasing Depth Effect:
    As the depth of trees increases, both training and test accuracies tend to improve.
    This indicates that deeper trees can capture more intricate patterns in the training data, leading to better performance on     both training and test sets.
2. Increasing Number of Trees Effect:
    Similarly, increasing the number of trees generally results in higher accuracies for both training and test sets.
    This suggests that having more trees in the forest improves the model's ability to generalize to unseen data.
3. Tradeoff Analysis:
    Deeper trees (higher depth) can lead to overfitting, where the model learns to capture noise in the training data rather       than generalizable patterns. 
    Increasing the number of trees helps reduce overfitting by averaging predictions from multiple trees, leading to better         generalization performance.
4. Optimal Parameters:
    it seems that moderate depths and a larger number of trees lead to better generalization performance.

Summary:

From the obtained accuracies for all the models, the tradeoff between the number of trees and the depth of trees in Random Forrest involves balancing model complexity and generalization performance. Increasing the number of trees generally improves generalization, while increasing the depth of trees can lead to higher accuracy on the training set but may risk overfitting. 

# PROBLEM 4  Pair-wise Classifier


In [None]:
#Fisher discriminant function to calculate fisher scores:
def fisher_discriminant(X, y):
    class_means = []
    for class_label in np.unique(y):
        class_means.append(np.mean(X[y == class_label], axis=0))
    
    overall_mean = np.mean(X, axis=0)
    
    S_within = np.zeros(X.shape[1])
    for class_label, class_mean in zip(np.unique(y), class_means):
        class_data = X[y == class_label]
        diff = class_data - class_mean
        S_within += np.sum(diff ** 2, axis=0)
    
    S_between = np.zeros(X.shape[1])
    for class_label, class_mean in zip(np.unique(y), class_means):
        n = np.sum(y == class_label)
        diff = (class_mean - overall_mean) ** 2
        S_between += n * diff
    
    fisher_values = np.abs(S_between / S_within)
    return fisher_values

In [None]:
data.shape

In [None]:
#Get all the unique classes in the dataset.
unique_classes = sorted(data['labels'].unique())
class_pairs = list(combinations(unique_classes, 2))
len(class_pairs)

In [None]:
#Store results in the results list
results = []

#Iterate over pairs and calculate fisher score and SVM.
for class_pair in class_pairs:
    class_1, class_2 = class_pair
    
    # Extract data for the current class pair
    X_train_pair = X_train[(Y_train == class_1) | (Y_train == class_2)]
    y_train_pair = Y_train[(Y_train == class_1) | (Y_train == class_2)]
    
    X_val_pair = X_test[(Y_test == class_1) | (Y_test == class_2)]
    y_val_pair = Y_test[(Y_test == class_1) | (Y_test == class_2)]
    
    # Calculate Fisher discriminant for the current pair
    fisher_values = fisher_discriminant(X_train_pair, y_train_pair)
    
    # Select top 30 Fisher dimensions with positive values
    positive_fisher_indices = np.where(fisher_values > 0)[0]
    top_fisher_indices = np.argsort(fisher_values[positive_fisher_indices])[::-1][:30]
    
    # Map the indices back to the original indices
    top_fisher_indices = positive_fisher_indices[top_fisher_indices]
    
    # Train Linear SVM
    svm = SVC(kernel='linear', C=1.0)
    svm.fit(X_train_pair.iloc[:, top_fisher_indices], y_train_pair)
    y_pred = svm.predict(X_val_pair.iloc[:, top_fisher_indices])
    
    # Calculate accuracy
    accuracy = accuracy_score(y_val_pair, y_pred)
    print(accuracy)
    # Append results
    results.append((class_1, class_2, accuracy))

In [None]:
result4 = pd.DataFrame(results, columns = ['class-1','class-2','Validation Accuracy'])

In [None]:
result4['pairs'] = result4['class-1'] + '-' + result4['class-2']
result4

In [None]:
#Top features in the data set
top_fisher_indices

In [None]:
#Funtion to categorize diffcult vs easy:
#Parameters: Validationa accuracies. threshold can be any 
def categorize_difficulty(validation_accuracies, threshold=0.95):
    difficult_pairs = []
    easy_pairs = []
    
    for i, accuracy in enumerate(validation_accuracies):
        if accuracy < threshold:
            difficult_pairs.append(i)
        else:
            easy_pairs.append(i)
    
    return difficult_pairs, easy_pairs

In [None]:
#calculate the diffcult and easy pairs
validation_accuracies=result4['Validation Accuracy']
difficult_pairs, easy_pairs = categorize_difficulty(validation_accuracies)

In [None]:
difficult_pairs

In [None]:
easy_pairs

1. Pairwise Comparison:
   The results allows for a pairwise comparison of validation accuracies between different pairs of classes.
   This pairwise comparison provides insights into how well a classifier performs when distinguishing between each pair of        classes.

2. Variation in Accuracies:
    The validation accuracies vary across different pairs of classes, ranging from 0.975 to 0.993.
    This variation suggests that the difficulty of distinguishing between different pairs of classes may vary, with some pairs     being easier to classify than others.
    
3. Identification of Challenging Class Pairs:
    Pairs with lower validation accuracies may indicate more challenging class distinctions for the classifier.

Summary:
       
  The provided table offers insights into the classifier's performance when distinguishing between different pairs of classes. It highlights variations in classification difficulty across class pairs and provides valuable information for understanding and potentially improving the classifier's overall performance.






