# Programming Assignment 2

## Two changes to be made in PRML library before you begin:
 * Go to prml --> linear --> logistic_regression.py and change astype(np.int) to astype(int) under `classify` function.
 * Go to prml --> linear --> fishers_linear_discriminant.py and change astype(np.int) to astype(int) under `classify` function.


In [1]:
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from prml.preprocess import PolynomialFeature
from prml.linear import (
    BayesianLogisticRegression,
    LeastSquaresClassifier,
    FishersLinearDiscriminant,
    LogisticRegression,
    Perceptron,
    SoftmaxRegression
)

## Section 1: Binary Classification

In [2]:
def create_data(add_outliers=False, add_class=False):
    x0 = np.random.normal(size=1000).reshape(-1, 2) - 1.5
    x1 = np.random.normal(size=1000).reshape(-1, 2) + 1.5
    if add_outliers:
        x_1 = np.random.normal(size=200).reshape(-1, 2) + np.array([6.5, 9.5])
        return np.concatenate([x0, x1, x_1]), np.concatenate([np.zeros(700), np.ones(400)]).astype(int)
    if add_class:
        x2 = np.random.normal(size=1000).reshape(-1, 2) + 3.5
        return np.concatenate([x0, x1, x2]), np.concatenate([np.zeros(500), np.ones(500), 2 + np.zeros(500)]).astype(int)
    return np.concatenate([x0, x1]), np.concatenate([np.zeros(500), np.ones(500)]).astype(int)


## 1a. Create a dataset using the `create_data` function with `add_class` set to `False` and `add_outliers` set to `False`.
   * Classify the output using Least Squares Classifier, Logistic Regression, Fisher's Linear Discriminant
   * Plot the data points and the decision boundary for all the 3 models
   * Write your observations


In [3]:
# your code goes here

## 1b. Create a dataset using the `create_data` function with `add_class` set to `False` and `add_outliers` set to `True`.
   * Classify the output using Least Squares Classifier, Logistic Regression, Fisher's Linear Discriminant
   * Plot the data points and the decision boundary for all the 3 models
   * Write your observations

In [4]:
# your code goes here

## Section 2: Multi-class Classification

## 2a. Create a dataset using the `create_data` function with `add_class` set to `True` and `add_outliers` set to `False`
   * Classify the output using Least Squares Classifier and Logistic Regression
   * Plot the data points and the decision boundary for above models
   * Write your observations

In [5]:
# your code goes here

## 2b. Use `abalone.csv`
   * Consider any two columns as x_train
   * Consider `class` column as y_train
   * Classify the output using Least Squares Classifier and Logistic Regression
   * Plot the data points and the decision boundary for above models
   * Write your observations

In [6]:
import pandas as pd
df = pd.read_csv("abalone.csv")
sampled_df = df.sample(n=500)

# x_train = sampled_df[['feature1','feature2']].values
# y_train = sampled_df['target_variable'].values

In [7]:
# your code goes here

## Section 3: Neural Networks

In [4]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml, make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score

from prml import nn

np.random.seed(1234)

## 3a. Use `abalone.csv`
* Consider `class` as the target variable and implement NN Classification without Regularization
* Calculate accuracy metrics (accuracy, precision, recall, F1 score, confusion matrix)

In [9]:
# your code goes here

## 3b. Use `abalone.csv`
* Consider `class` as the target variable and implement NN Classification with Regularization
* Calculate accuracy metrics (accuracy, precision, recall, F1 score, confusion matrix)

In [10]:
# your code goes here

## Section 4: Convolution Neural Network Using `cnn_data.zip`
* Download the `cnn_data.zip` from UBLearns and extract the contents. It contains `train` and `test` folders.
* The images in this dataset are 240X240 pixels RGB (3 channels).
* Implement CNN on this dataset 
* Calculate accuracy metrics (accuracy, precision, recall, F1 score, confusion matrix)
* Write your observations

In [7]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml, make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score
from PIL import Image
import os

from prml import nn

np.random.seed(1234)

# Function to load the images from train and test folders
def load_images_from_folder(folder_path):
    images = []
    labels = []
    for class_name in os.listdir(folder_path):
        class_path = os.path.join(folder_path, class_name)
        if os.path.isdir(class_path):
            for filename in os.listdir(class_path):
                img = Image.open(os.path.join(class_path, filename))
                img = img.resize((64, 64))
                if img is not None:
                    images.append(np.array(img))
                    labels.append(class_name)
    return np.array(images), np.array(labels)

train_dir = 'cnn_data/train'
test_dir = 'cnn_data/test'
train_images, train_labels = load_images_from_folder(train_dir)
test_images, test_labels = load_images_from_folder(test_dir)
train_images = train_images / 255.0
test_images = test_images / 255.0
label_binarizer = LabelBinarizer()
train_labels_one_hot = label_binarizer.fit_transform(train_labels)
test_labels_one_hot = label_binarizer.transform(test_labels)

In [2]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16
from tensorflow.keras import layers, models

# Define data paths
train_data_dir = "cnn_data/train"
test_data_dir = "cnn_data/test"

# Image dimensions
img_height, img_width = 240, 240
batch_size = 32

# Data Augmentation
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255,
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

test_datagen = ImageDataGenerator(rescale=1.0 / 255)

train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical'
)

test_generator = test_datagen.flow_from_directory(
    test_data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical'
)

# Create and compile a CNN model (VGG16 as an example)
base_model = VGG16(include_top=False, weights='imagenet', input_shape=(img_height, img_width, 3))
base_model.trainable = False

model = models.Sequential()
model.add(base_model)
model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(3, activation='softmax'))  # 3 classes (adidas, converse, nike)

model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.0001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(train_generator, epochs=10, validation_data=test_generator)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(test_generator)
print("Test Accuracy:", test_accuracy)


Found 711 images belonging to 3 classes.
Found 114 images belonging to 3 classes.
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.7982456088066101


In [9]:
model

<__main__.ConvolutionalNeuralNetwork at 0x2a0db01d0>

In [8]:
class ConvolutionalNeuralNetwork(nn.Network):
    
    def __init__(self):
        super().__init__()
        with self.set_parameter():
            self.conv1 = nn.image.Convolve2d(
                nn.random.truncnormal(-2, 2, 1, (5, 5, 1, 20)),
                stride=(1, 1), pad=(0, 0))
            self.b1 = nn.array([0.1] * 20)
            self.conv2 = nn.image.Convolve2d(
                nn.random.truncnormal(-2, 2, 1, (5, 5, 20, 20)),
                stride=(1, 1), pad=(0, 0))
            self.b2 = nn.array([0.1] * 20)
            self.w3 = nn.random.truncnormal(-2, 2, 1, (4 * 4 * 20, 100))
            self.b3 = nn.array([0.1] * 100)
            self.w4 = nn.random.truncnormal(-2, 2, 1, (100, 10))
            self.b4 = nn.array([0.1] * 10)
        
    def __call__(self, x):
        h = nn.relu(self.conv1(x) + self.b1)
        h = nn.max_pooling2d(h, (2, 2), (2, 2))        
        h = nn.relu(self.conv2(h) + self.b2)
        h = nn.max_pooling2d(h, (2, 2), (2, 2))
        h = h.reshape(-1, 4 * 4 * 20)
        h = nn.relu(h @ self.w3 + self.b3)
        return h @ self.w4 + self.b4

model = ConvolutionalNeuralNetwork()
optimizer = nn.optimizer.Adam(model.parameter, 1e-3)
x_train = train_images
y_train = train_labels_one_hot
x_test = test_images
label_test = test_labels_one_hot

while True:
    indices = np.random.permutation(len(x_train))
    for index in range(0, len(x_train), 50):
        model.clear()
        x_batch = x_train[indices[index: index + 50]]
        y_batch = y_train[indices[index: index + 50]]
        logit = model(x_batch)
        log_likelihood = -nn.loss.softmax_cross_entropy(logit, y_batch).mean(0).sum()
        if optimizer.iter_count % 100 == 0:
            accuracy = accuracy_score(
                np.argmax(y_batch, axis=-1), np.argmax(logit.value, axis=-1)
            )
            print("step {:04d}".format(optimizer.iter_count), end=", ")
            print("accuracy {:.2f}".format(accuracy), end=", ")
            print("Log Likelihood {:g}".format(log_likelihood.value[0]))
        optimizer.maximize(log_likelihood)
        if optimizer.iter_count == 1000:
            break
    else:
        continue
    break

ValueError: cannot reshape array of size 10800000 into shape (50,60,60,20)

In [None]:
# Accuracy, precision, recall, F1 score and Confusion Matrix
predictions = np.argmax(model(x_test).value, axis=-1)
accuracy = accuracy_score(np.argmax(model(x_test).value, axis=-1), label_test)
precision = precision_score(label_test, predictions, average='weighted')
recall = recall_score(label_test, predictions, average='weighted')
f1 = f1_score(label_test, predictions, average='weighted')
confusion = confusion_matrix(label_test, predictions)

# Print the metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", confusion)

In [12]:
# your code goes here

## Section 5: (Bonus) Multi-class Classification using sklearn

## 5a. Use `bonus.csv`
   * Consider `quality` column as target variable
   * Split the dataset into train and test sets (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)
   * Train the model using Logistic Regression (with and without Regularization)
   (https://scikit-learn.org/stable/modules/linear_model.html)
   * Test the model on test set
   * Calculate accuracy, precision, recall, F-1 score, confusion matrix for both models (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics)
   * Write your observations

In [13]:
# your code goes here