<a href="https://colab.research.google.com/github/Nishaviii/TestDashboard/blob/master/Asteroid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import numpy as np 
import pandas as pd
import tensorflow as tf
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader, Dataset
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
class AsteroidDataset(Dataset):
    """Asteroid table read from a CSV file and preprocessed into a single tensor.

    The ``name`` column is dropped and rows without a ``pha`` value are removed.
    Every remaining non-``pha`` column is median-imputed and standardised, and
    ``pha`` is ordinal-encoded and placed in the last column of ``self.data``.

    NOTE(review): the imputer and scaler are fitted on the whole table before
    any train/test split is made, so held-out rows influence the scaling.
    """

    def __init__(self, csv_path, transform=None):
        """Load ``csv_path`` and build the preprocessed tensor.

        Args:
            csv_path: Path of the CSV file; it must contain ``name`` and ``pha`` columns.
            transform: Optional callable kept on ``self.transform``. It is only
                stored; nothing in this class applies it.
        """
        table = pd.read_csv(csv_path).drop("name", axis=1)
        # Rows without a label cannot be used for supervised training.
        table = table.dropna(subset=["pha"])

        num_attribs = list(table.drop("pha", axis=1))
        cat_attribs = ["pha"]

        num_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")),
                                 ('std_scaler', StandardScaler())])
        # Output column order is numeric attributes first, encoded "pha" last.
        full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs),
                                           ("cat", OrdinalEncoder(), cat_attribs)])

        self.data = torch.from_numpy(full_pipeline.fit_transform(table))
        self.transform = transform

    def __getitem__(self, index):
        """Return the single preprocessed row stored at ``index``."""
        return self.data[index]

    def __len__(self):
        """Return the number of preprocessed rows."""
        return len(self.data)

    def _full_batch_loader(self, subset_indices):
        """Return a loader that yields all rows of ``subset_indices`` as one batch."""
        # DataLoader rejects batch_size=0; an empty subset gets batch size 1
        # and simply yields no batches.
        return DataLoader(self.data,
                          batch_size=max(1, len(subset_indices)),
                          sampler=SubsetRandomSampler(subset_indices))

    def _getsplit_(self, batch_size=16, test_split=.3, validation_split=1/3,
                   shuffle_dataset=True, random_seed=42):
        """Build train/test/validation loaders over disjoint subsets of the rows.

        A ``test_split`` fraction of the rows is held out from training. A
        ``validation_split`` fraction of that hold-out becomes the validation
        set and the remainder becomes the test set.

        Args:
            batch_size: Accepted for backward compatibility but not used; each
                loader emits its whole subset as one batch, which ``load_data``
                relies on.
            test_split: Fraction of all rows held out from training.
            validation_split: Fraction of the hold-out used for validation.
            shuffle_dataset: Shuffle the row indices before splitting.
            random_seed: Seed passed to ``np.random.seed`` before shuffling.

        Returns:
            Tuple ``(train_loader, test_loader, validation_loader)``.
        """
        dataset_size = len(self.data)
        indices = list(range(dataset_size))
        if shuffle_dataset:
            np.random.seed(random_seed)
            np.random.shuffle(indices)

        holdout_size = int(np.floor(test_split * dataset_size))
        train_indices = indices[holdout_size:]
        holdout_indices = indices[:holdout_size]

        # Slice the held-out indices themselves. Building a fresh range over
        # their count would select rows 0..k-1 of the data and overlap the
        # training rows.
        validation_size = int(np.floor(validation_split * len(holdout_indices)))
        test_indices = holdout_indices[validation_size:]
        validation_indices = holdout_indices[:validation_size]

        return (self._full_batch_loader(train_indices),
                self._full_batch_loader(test_indices),
                self._full_batch_loader(validation_indices))

    def load_data(self):
        """Return ``(train, test, validation)`` tensors, one full batch per split.

        A split that received no rows is returned as an empty tensor with the
        same column layout as ``self.data``.
        """
        empty = self.data[:0]
        return tuple(next(iter(loader), empty) for loader in self._getsplit_())

In [None]:
# Location of the asteroid CSV. Set the ASTEROID_CSV environment variable to
# run on another machine; the original local path is kept as the fallback.
import os

CSV_PATH = os.environ.get("ASTEROID_CSV", r"C:\Users\Nishavi Ranaweera\results (3).csv")
dataset = AsteroidDataset(CSV_PATH)

  if (yield from self.run_code(code, result)):


In [None]:
# Peek at the first preprocessed row (indexing dispatches to __getitem__).
dataset[0]

tensor([-0.8431, -0.0028,  0.2310, -0.8579, -1.0385,  0.5461,  0.0000],
       dtype=torch.float64)

In [None]:
# Materialise each split as one tensor holding every row of that split.
train, test, validation = dataset.load_data()

In [None]:
# Inspect the (rows, columns) shape of the validation tensor.
validation.shape

torch.Size([103409, 7])

In [None]:
# Mini-batch loaders over the split tensors: 10 rows per batch. Only the
# training loader reshuffles its rows on every pass.
trainset = DataLoader(train, batch_size=10, shuffle=True)
testset = DataLoader(test, batch_size=10, shuffle=False)
validationset = DataLoader(validation, batch_size=10, shuffle=False)

In [None]:
# Collect the training rows whose label column (index 6) is non-zero.
# presumably non-zero means "potentially hazardous" — confirm against the encoder categories
pha_train = []
for batch in trainset:  # each batch is one 2-D tensor, not an (index, data) pair
    is_pha = batch[:, 6] != 0
    pha_train.extend(batch[is_pha])

ValueError: too many values to unpack (expected 2)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class MLP(nn.Module):
    """Fully connected classifier: three ReLU hidden layers and a log-softmax output.

    Args:
        in_features: Number of input features per sample.
        hidden_size: Width of each of the three hidden layers.
        num_classes: Number of output classes.
    """

    def __init__(self, in_features=6, hidden_size=64, num_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return per-class log-probabilities for ``x``.

        ``x`` may be a batch ``(N, in_features)`` or a single sample
        ``(in_features,)``; normalisation runs over the last dimension.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        # dim=-1 equals dim=1 for batched input and also accepts 1-D input.
        return F.log_softmax(x, dim=-1)


mlp = MLP()
print(mlp)

MLP(
  (fc1): Linear(in_features=6, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=2, bias=True)
)


In [None]:
# Smoke test: push one random 6-feature sample through the untrained network.
X = torch.randn(1, 6)
output = mlp(X)
output

tensor([[-0.6630, -0.7243]], grad_fn=<LogSoftmaxBackward>)

In [None]:
# Loss criterion and optimizer for the network.
import torch.optim as optim

# The loss function measures how far the classifications are from the targets.
loss_function = nn.CrossEntropyLoss()

# The optimizer adjusts the model's trainable parameters (e.g. the weights)
# step by step so that the model gradually fits the data.
optimizer = optim.Adam(mlp.parameters(), lr=1e-3)

In [None]:
# Train for 3 full passes over the training batches.
for epoch in range(3):
    for data in trainset:
        # Columns 0-5 feed the model; column 6 is used as the integer class target.
        X, y = data[:, :6].float(), data[:, 6].long()

        mlp.zero_grad()               # clear gradients left from the previous step
        output = mlp(X)               # per-class log-probabilities for the batch
        loss = F.nll_loss(output, y)  # negative log-likelihood of the true classes
        loss.backward()               # back-propagate through the network's parameters
        optimizer.step()              # update the weights from the gradients
    # Only the loss of the last batch of the epoch is shown. We hope it declines!
    print(loss)

tensor(0.0001, grad_fn=<NllLossBackward>)
tensor(0.0010, grad_fn=<NllLossBackward>)
tensor(0.0006, grad_fn=<NllLossBackward>)


In [None]:
# Accuracy on the held-out test batches, computed with batch-level tensor ops
# instead of a per-sample Python loop.
correct = 0
total = 0
y_pred = []
with torch.no_grad():
    for data in testset:
        X = data[:, :6].float()
        y = data[:, 6].long()
        output = mlp(X)
        batch_pred = output.argmax(dim=1)  # predicted class per row
        y_pred.extend(batch_pred)          # keeps one 0-dim tensor per sample
        correct += int((batch_pred == y).sum())
        total += y.numel()

# Guard against an empty test set, which would otherwise divide by zero.
print("Accuracy: ", round(correct/total, 3) if total else float("nan"))

Accuracy:  0.999


In [None]:
# Predict every row of `train` in its stored order. Iterating `trainset`
# (created with shuffle=True) would return predictions in shuffled order,
# which would not line up row-for-row with `train[:, 6]`.
y_pred = []
with torch.no_grad():
    for chunk in train.split(1024):  # consecutive, order-preserving slices
        X = chunk[:, :6].float()
        y = chunk[:, 6].long()
        output = mlp(X)
        y_pred.extend(output.argmax(dim=1))
            

In [None]:
# Number of predictions collected in the previous cell.
len(y_pred)

382254

In [None]:
# Shape of column 6 of `train` (the column the training loop uses as target).
train[:,6].shape

torch.Size([382254])

99% accuracy is not a suitable performance measure here because the dataset is skewed: a model that always predicts the majority class would score just as high.

Confusion matrix – the general idea is to count the number of times instances of class A are classified as class B.

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the actual classes (column 6 of `train`), columns the predicted ones.
confusion_matrix(y_true=train[:, 6], y_pred=y_pred)

array([[381727,      0],
       [   527,      0]], dtype=int64)

Rows – actual class
Columns – predicted class
381727 (true negatives) were correctly classified as non-PHAs, 0 (false positives) were wrongly classified as PHAs
527 (false negatives) were wrongly classified as non-PHAs, 0 (true positives) were correctly classified as PHAs

We will write the transforms as callable classes instead of simple functions so that the parameters of a transform need not be
passed every time it is called.
For this, we just need to implement the `__call__` method and, if required, the `__init__` method.
We can then use a transform as shown in the PyTorch data loading tutorial: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html - Transforms: write separate classes for each pre-processing method.
See also: https://stackoverflow.com/questions/55588201/pytorch-transforms-on-tensordataset/55593757