# Question 2: 

## Introduction

## Imports

In [43]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

## Initialising the dataset

As in the Q1 notebook, we first need to load the data into a usable form (i.e. a pandas DataFrame).

In [36]:
# Location of the SDSS csv file.
#   Set the SDSS_DATA_PATH environment variable to point at your own copy of the file;
#   if it is not set, the original author's local path below is used as the default.
path = os.environ.get('SDSS_DATA_PATH', '/Users/ryanu/Documents/Uni/ACT/SDSS-DR14-Classification/SDSS Data.csv')

# Read the whole csv file into a pandas DataFrame
data = pd.read_csv(path)

I am going to start off using the same features as in Q1: the `u`, `g`, `r`, `i` and `z` columns, with the `class` column as the label.

In [35]:
# Input columns fed to the model, and the target column to predict
feature_columns = ['u', 'g', 'r', 'i', 'z']
features = data[feature_columns]
labels = data['class']

## Data preprocessing

In [44]:
# Labels are encoded in alphabetical order as integers starting from 0 (e.g. Galaxy is 0, QSO is 1, Star is 2)
    # This is because neural networks can only work with numerical data
    # LabelEncoder() encodes the labels as integers
    # fit_transform() is two steps in one: it learns the set of distinct labels, and then replaces each label with its integer code
    # The fitted encoder is kept so that integer predictions can be mapped back to class names with label_encoder.inverse_transform()
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Features are normalized to have a mean of 0 and a standard deviation of 1
    # This is important because neural networks tend to converge faster and perform better when the data is normalized.
    # StandardScaler() creates the scaler that will perform the normalization
    # fit_transform() is two steps in one: it calculates the mean and standard deviation of each column, and then normalizes the data
    # The fitted scaler is kept so that the same scaling can be applied to new data with feature_scaler.transform()
    # NOTE(review): the scaler is fitted on every row here, including rows that later end up in the
    # validation/test sets; consider fitting it on the training rows only to avoid information leakage.
feature_scaler = StandardScaler()
normalized_features = feature_scaler.fit_transform(features)

# Convert features and labels into PyTorch tensors
    # PyTorch tensors are similar to NumPy arrays, but they can be used on a GPU to accelerate computing
    # torch.tensor() creates a tensor from a NumPy array
    # dtype=torch.float32 and dtype=torch.long specify the data type of the tensor
features_tensor = torch.tensor(normalized_features, dtype=torch.float32)
labels_tensor = torch.tensor(encoded_labels, dtype=torch.long) # long is used for integer labels

In [54]:
# Split the preprocessed data into training, validation, and testing sets
    # train_test_split() shuffles the rows and splits them into two subsets
    # test_size=0.2 specifies that 20% of the rows passed in are held out
    # random_state=42 is the random seed used to shuffle the data, so the split is reproducible
    # First, the data is split into (training + validation) and testing sets in a 80:20 ratio
    # The (training + validation) set is then split into training and validation sets in a 80:20 ratio
    # The final data is split into training, validation, and testing sets in a 64:16:20 ratio
    # The normalized features tensor and encoded labels tensor are split here (not the raw
    # `features` / `labels`), so that the preprocessing is what actually reaches the model
features_train_val, features_test, label_train_val, label_test = train_test_split(
    features_tensor, labels_tensor, test_size=0.2, random_state=42
)
features_train, features_val, label_train, label_val = train_test_split(
    features_train_val, label_train_val, test_size=0.2, random_state=42
)