In [12]:
# To run these scripts you need have folder 'data' with sample images on the same level as main.ipynb
# Students: Vladislav Gorokhov & Assan Kabayev

In [None]:
import numpy as np

def svm(X, y):
    n_samples, n_features = X.shape
    lambda_parameter = 0.01
    learning_rate = 0.0001

    y_ = np.where(y <= 0, -1, 1)

    weight = np.zeros(n_features)
    bias = 0

    for _ in range(1000):
        for idx, x_i in enumerate(X):
            condition = y_[idx] * (np.dot(x_i, weight) - bias) >= 1
            if condition:
                weight -= learning_rate * (2 * lambda_parameter * weight)
            else:
                weight -= learning_rate * (2 * lambda_parameter * weight - np.dot(x_i, y_[idx]))
                bias -= learning_rate * y_[idx]
    return weight, bias

def predict_svm(X, weight, bias):
    return np.sign(np.dot(X, weight) - bias)

In [3]:
def logistic_regression(X, y):
    n_samples, n_features = X.shape
    learning_rate = 0.0001

    weights = np.zeros(n_features)
    bias = 0

    for _ in range(1000):
        linear_model = np.dot(X, weights) + bias
        y_predicted = 1 / (1 + np.exp(-linear_model))

        # gradients
        dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
        db = (1 / n_samples) * np.sum(y_predicted - y)
        
        weights -= learning_rate * dw
        bias -= learning_rate * db
    return weights, bias
        
def predict_logistic_regression(X, weights, bias):
    linear_model = np.dot(X, weights) + bias
    y_predicted = 1 / (1 + np.exp(-linear_model))
    y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
    return np.array(y_predicted_cls)

In [4]:
from sklearn.metrics import accuracy_score

def svn_cross_validation(X, y, k):
    n_samples = len(X)
    fold_size = n_samples // k
    accuracies = []

    for fold in range(k):
        # Split the data into training and validation sets
        val_start = fold * fold_size
        val_end = val_start + fold_size
        val_indices = list(range(val_start, val_end))
        train_indices = [i for i in range(n_samples) if i not in val_indices]

        X_train = X[train_indices]
        y_train = y[train_indices]
        X_val = X[val_indices]
        y_val = y[val_indices]

        # train
        w, b = svm(X_train, y_train)

        val_predictions = predict_svm(X_val, w, b)

        # Calculate accuracy for this fold
        accuracy = accuracy_score(y_val, val_predictions)
        accuracies.append(accuracy)

    # Calculate the average accuracy over all folds
    avg_accuracy = np.mean(accuracies)
    return avg_accuracy

def logistic_regression_cross_validation(X, y, k):
    n_samples = len(X)
    fold_size = n_samples // k
    accuracies = []

    for fold in range(k):
        # Split the data into training and validation sets
        val_start = fold * fold_size
        val_end = val_start + fold_size
        val_indices = list(range(val_start, val_end))
        train_indices = [i for i in range(n_samples) if i not in val_indices]

        X_train = X[train_indices]
        y_train = y[train_indices]
        X_val = X[val_indices]
        y_val = y[val_indices]

        # train
        w, b = logistic_regression(X_train, y_train)

        val_predictions = predict_logistic_regression(X_val, w, b)

        # Calculate accuracy for this fold
        accuracy = accuracy_score(y_val, val_predictions)
        accuracies.append(accuracy)

    # Calculate the average accuracy over all folds
    avg_accuracy = np.mean(accuracies)
    return avg_accuracy

In [8]:
import os
from skimage.io import imread
from skimage.transform import resize
from sklearn.model_selection import train_test_split

input_dir = 'data'

# Define categories
#get all file names from the folder
categories = os.listdir(input_dir)
categories.remove(".DS_Store") 

# Load the model
data = []
labels = []

# loop over the input images
for category_idx, category in enumerate(categories):
    for file in os.listdir(os.path.join(input_dir, category)):
        if file == ".DS_Store":
            continue
            
        img_path = os.path.join(input_dir, category, file)
        img = imread(img_path)
        img = resize(img, (32, 32, 3))
        data.append(img.flatten())
        labels.append(category_idx)


data = np.array(data)
labels = np.array(labels)

# Split the data into a training and testing set
X_train , X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, shuffle=True, stratify=labels)

# Perform 5-fold cross-validation
avg_accuracy = svn_cross_validation(data, labels, 5)
print(f"SVM Accuracy: {avg_accuracy * 100:.4f}%")

# Perform 5-fold cross-validation
avg_accuracy = logistic_regression_cross_validation(data, labels, 5)
print(f"Logistic Regression Accuracy: {avg_accuracy * 100:.4f}%")


['gram', 'sugarcane', 'Tobacco-plant', 'Lemon', 'rice', 'Pearl_millet(bajra)', 'cotton', 'Cucumber', 'chilli', 'Cherry', 'cardamom', 'tea', 'jowar', 'Olive-tree', 'wheat', 'vigna-radiati(Mung)', 'coconut', 'Fox_nut(Makhana)', 'almond', 'clove', 'Coffee-plant', 'mustard-oil', 'jute', 'banana', 'soyabean', 'papaya', 'pineapple', 'tomato', 'sunflower', 'maize']
Average SVM Accuracy (5-Fold CV): 3.0303%
Logistic Regression Accuracy: 3.0303%


In [10]:
from sklearn.model_selection import GridSearchCV

# For SVM
class SVM:
    def __init__(self):
        self.b = None
        self.w = None

    def fit(self, X, y):
        self.w, self.b = svm(X, y)
        
    def predict(self, X):
        return predict_svm(X, self.w, self.b)

param_grid_svm = {'C': [0.01, 0.1, 1, 10, 100], 'lambda_param': [0.001, 0.01, 0.1, 1]}
grid_search_svm = GridSearchCV(SVM(), param_grid=param_grid_svm, cv=5)
grid_search_svm.fit(X_train, y_train)

# For Logistic Regression
class LogisticRegression:
    def __init__(self):
        self.b = None
        self.w = None

    def fit(self, X, y):
        self.w, self.b = logistic_regression(X, y)

    def predict(self, X):
        return predict_logistic_regression(X, self.w, self.b)

param_grid_lr = {'learning_rate': [0.0001, 0.001, 0.01, 0.1], 'n_iters': [1000, 2000, 5000]}
grid_search_lr = GridSearchCV(LogisticRegression(), param_grid=param_grid_lr, cv=5)
grid_search_lr.fit(X_train, y_train)

# Print the best hyperparameters for both models
print("Best SVM Hyperparameters:", grid_search_svm.best_params_)
print("Best Logistic Regression Hyperparameters:", grid_search_lr.best_params_)


TypeError: Cannot clone object '<__main__.SVM object at 0x164a61bd0>' (type <class '__main__.SVM'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.

In [11]:
# Train the SVM and Logistic Regression models with the best hyperparameters
best_svm = grid_search_svm.best_estimator_
best_lr = grid_search_lr.best_estimator_

# Train and evaluate the models on the test set
best_svm.fit(X_train, y_train)
best_lr.fit(X_train, y_train)

svm_test_predictions = best_svm.predict(X_test)
lr_test_predictions = best_lr.predict(X_test)

svm_test_accuracy = accuracy_score(y_test, svm_test_predictions)
lr_test_accuracy = accuracy_score(y_test, lr_test_predictions)

print(f"SVM Test Accuracy: {svm_test_accuracy * 100:.2f}%")
print(f"Logistic Regression Test Accuracy: {lr_test_accuracy * 100:.2f}%")

# Compare the accuracies and explain the results
if svm_test_accuracy > lr_test_accuracy:
    print("SVM performs better.")
    # Provide an explanation for the better performance.
else:
    print("Logistic Regression performs better.")
    # Provide an explanation for the better performance.


AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'