# CIFAR-10 Dataset Classification

## Import the dataset

In [1]:
import tensorflow_datasets as tfds
from tensorflow.keras.datasets import cifar10  # Import cifar10 from keras.datasets 

# Fetch CIFAR-10 through tensorflow_datasets together with its metadata; the
# metadata (`info`) is what supplies the human-readable label names.
# NOTE(review): the image arrays are loaded again later via
# `cifar10.load_data()`, so this looks like a second fetch of the same
# dataset -- confirm whether `dataset` itself is needed.
dataset, info = tfds.load('cifar10', with_info=True)

# Label names; later cells index this list with the integer class labels.
class_names = info.features['label'].names

# Show the label vocabulary (header line, then the list on its own line).
print("Class names in CIFAR-10 dataset:", class_names, sep="\n")



Class names in CIFAR-10 dataset:
['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']


## Import necessary libraries 

In [2]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import numpy as np
from sklearn.model_selection import GridSearchCV

In [3]:
# Pull the CIFAR-10 train and test splits from Keras, then unpack each split
# into its image array and label array.
train_split, test_split = cifar10.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

# Sanity-check the array shapes before any preprocessing.
print(f"Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape}, Test labels shape: {y_test.shape}")

Training data shape: (50000, 32, 32, 3), Training labels shape: (50000, 1)
Test data shape: (10000, 32, 32, 3), Test labels shape: (10000, 1)


In [4]:
# Preprocess the images: keep only channel 0 of every image and flatten each
# 32x32 plane into a 1024-element feature vector.
# NOTE(review): this keeps a single colour channel (presumably red, if the
# arrays are RGB-ordered); it is not a true grayscale conversion -- confirm
# that this is the intended simplification.
n_pixels = 32 * 32

x_images_training = X_train[..., 0]  # drop the channel axis: (N, 32, 32, 3) -> (N, 32, 32)
x_train = x_images_training.reshape((len(X_train), n_pixels))  # one row per image
y_train = y_train.flatten()  # labels: (N, 1) -> (N,)

x_images_test = X_test[..., 0]  # same channel selection for the test split
x_test = x_images_test.reshape((len(X_test), n_pixels))
y_test = y_test.flatten()

# Report the shapes of the flattened feature matrices.
print(f"Transformed training data shape: {x_train.shape}")
print(f"Transformed test data shape: {x_test.shape}")

Transformed training data shape: (50000, 1024)
Transformed test data shape: (10000, 1024)


## Apply PCA to reduce dimensionality

In [5]:
# Apply PCA to reduce dimensionality, retaining 99% of the variance.
VARIANCE_TO_KEEP = 0.99
PCA_SEED = 42  # pins any randomized SVD solver so reruns give the same projection

# Fit an unrestricted PCA once to obtain the full explained-variance spectrum.
pca = PCA(whiten=True)
pca.fit(x_train)

# k = smallest number of leading components whose cumulative share of the
# total variance reaches the target. searchsorted(side="left") returns the
# first index whose cumulative share is >= the target; +1 turns that index
# into a component count.
cumulative_share = np.cumsum(pca.explained_variance_ratio_)
k = int(np.searchsorted(cumulative_share, VARIANCE_TO_KEEP, side="left")) + 1
# Guard against floating-point round-off leaving the final cumulative share
# marginally below the target, which would otherwise push k past the spectrum.
k = min(k, cumulative_share.size)

print(f"Number of components after PCA: {k}")

# Refit with exactly k components and project both splits. The test split is
# only transformed (never fitted) so no test information leaks into the basis.
# random_state only has an effect when scikit-learn picks a randomized or
# ARPACK solver; without it such a solver makes the projection vary per run.
pca_cifar = PCA(n_components=k, whiten=True, random_state=PCA_SEED)
x_train_transformed = pca_cifar.fit_transform(x_train)
x_test_transformed = pca_cifar.transform(x_test)

# Print shapes after PCA transformation
print(f"PCA-transformed training data shape: {x_train_transformed.shape}")
print(f"PCA-transformed test data shape: {x_test_transformed.shape}")

Number of components after PCA: 429
PCA-transformed training data shape: (50000, 429)
PCA-transformed test data shape: (10000, 429)


## Initialize Classifiers

In [6]:
# Initialize the four base classifiers whose predictions are combined by the
# majority vote later in the notebook.

# random_state pins the forest's bootstrap and feature sampling so that a
# Restart & Run All reproduces the same model.
# NOTE(review): n_estimators reuses k, the PCA component count from the
# previous cell; the two quantities look unrelated -- confirm this coupling is
# deliberate rather than a leftover.
clf1 = RandomForestClassifier(
    n_estimators=k,
    n_jobs=-1,
    max_depth=2000,
    max_leaf_nodes=2350,
    random_state=42,
)

# `multi_class="auto"` is intentionally omitted: the parameter is deprecated in
# recent scikit-learn releases, and "auto" was its default, so behaviour is
# unchanged while the deprecation warning (and eventual removal) is avoided.
clf2 = LogisticRegression(n_jobs=-1, solver='lbfgs', max_iter=1000)

clf3 = KNeighborsClassifier(n_jobs=-1)
clf4 = SVC(tol=0.0001)

## Training the classifiers

In [7]:
# Fit each base classifier, in order, on the PCA-reduced training features.
print("Training classifiers...")
for model in (clf1, clf2, clf3, clf4):
    model.fit(x_train_transformed, y_train)
print("Training complete.")

Training classifiers...
Training complete.


## Testing the classifier

In [8]:
# Collect each classifier's label predictions for the PCA-reduced test set.
# The unpacking order matches the classifier order clf1..clf4.
print("Predicting on test data...")
(
    y_test_predicted1,
    y_test_predicted2,
    y_test_predicted3,
    y_test_predicted4,
) = [model.predict(x_test_transformed) for model in (clf1, clf2, clf3, clf4)]

Predicting on test data...


## Final Prediction

In [9]:
# Hard majority vote across the four classifiers' predictions.
dic = dict(enumerate(class_names))  # integer label -> class name

# One row per test sample, one column per classifier.
votes = np.column_stack(
    [y_test_predicted1, y_test_predicted2, y_test_predicted3, y_test_predicted4]
)

# vote_counts[s, c] = how many classifiers predicted class c for sample s.
vote_counts = (votes[:, :, None] == np.arange(len(class_names))).sum(axis=1)

# argmax returns the first maximum, so a tie is resolved in favour of the
# lowest class index.
# NOTE(review): with an even number of voters ties (2-2, 1-1-1-1) can occur,
# and this rule biases them toward low-numbered classes -- consider an explicit
# tie-break policy.
winners = vote_counts.argmax(axis=1)

# Final ensemble prediction per test sample, as class-name strings.
ans = [dic[label] for label in winners]

## Calculate accuracy for the ensemble

In [11]:
# NOTE(review): this import would normally live with the other imports near
# the top of the notebook; it is kept here so the cell stays self-contained.
from sklearn.metrics import accuracy_score

# `ans` holds class-name strings, so translate the integer ground-truth labels
# into names as well before comparing the two sequences.
y_test_class_names = [class_names[label] for label in y_test]
accuracy = accuracy_score(y_test_class_names, ans)

# Report the ensemble's test accuracy as a percentage with two decimals.
print(f"Accuracy of the ensemble model: {accuracy * 100:.2f}%")

Accuracy of the ensemble model: 35.79%


## Save predictions to CSV

In [12]:
# Write the ensemble's predicted class names to disk, one prediction per line
# ("%s" formats each entry as plain text).
output_path = "answers.csv"
np.savetxt(output_path, ans, fmt="%s", delimiter=',')
print("Predictions saved to 'answers.csv'.")

Predictions saved to 'answers.csv'.
