<a href="https://colab.research.google.com/github/AmrMohamadSalah/Data-Classification/blob/main/DataClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Code to read file into Colaboratory:
! pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# Colab-side authentication step (presumably prompts the notebook user to
# authorize access — confirm against the google.colab docs).
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default credentials instead of running its own
# OAuth flow.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client object later cells use to fetch files.
drive = GoogleDrive(gauth)

In [None]:
link = 'https://drive.google.com/file/d/1RqBOAVQyZOQLPBN6eNDbBOWLDNIB_LR6/view?usp=sharing' # The shareable link
# The file id is the second-to-last '/'-separated segment of the link
# (".../d/<file id>/view?usp=sharing"). It is kept in `file_id` rather than
# `id` so the builtin id() is not shadowed for the rest of the notebook.
file_id = link.split("/")[-2]
downloaded = drive.CreateFile({'id': file_id})
# Save the Drive file's content locally under the name that is read later on.
downloaded.GetContentFile('magic04.data')

In [None]:
import pandas as pd
import numpy as np
import io
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

In [None]:
# Load the downloaded dataset file into a DataFrame.
# NOTE(review): read_csv treats the first line as a header by default; if
# 'magic04.data' has no header row, the first sample is consumed as column
# names — confirm, and pass header=None if so.
dataframe = pd.read_csv('magic04.data')

In [None]:
# Features are every column but the last; the last column is the class label.
data = dataframe.iloc[:, :-1]
labels = dataframe.iloc[:, -1]

# Randomly undersample so the classes are balanced before splitting.
rus = RandomUnderSampler()
data_res, labels_res = rus.fit_resample(data, labels)
data_res = np.array(data_res)
labels_res = np.array(labels_res)

# 70/30 split, stratified on the label so both parts keep the class balance.
x_train, x_test, y_train, y_test = train_test_split(
    data_res,
    labels_res,
    test_size=0.3,
    stratify=labels_res,
)

In [None]:
# Sanity check: print the distinct labels in the test split and how many
# samples carry each one.
print(np.unique(y_test, return_counts=True))

(array(['g', 'h'], dtype=object), array([2007, 2006]))


In [None]:
import torch 
import torch.nn.functional as F
from torch import flatten
from sklearn.metrics import classification_report, accuracy_score
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn
import time
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import math

In [None]:
class MagicDataset(Dataset):
    """Indexable dataset pairing feature rows with their 'g'/'h' class labels."""

    def __init__(self, data, labels):
        # Lookup table from the string class label to its integer class id.
        self.dict_labels = {'g': 0, 'h': 1}
        self.data = data
        self.labels = labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``(features, class_id)`` for the sample at ``index``, both as tensors."""
        features = torch.tensor(self.data[index])
        class_id = self.dict_labels[self.labels[index]]
        return features, torch.tensor(class_id)

In [None]:
class TestModel(nn.Module):
    """Fully connected 10 -> 32 -> 16 -> 1 network with a sigmoid output."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=10, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=16)
        self.fc3 = nn.Linear(in_features=16, out_features=1)

    def forward(self, x):
        """Apply two ReLU hidden layers, then squash the single output with a sigmoid."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return torch.sigmoid(self.fc3(hidden))

In [None]:
# Wrap the train and test splits in MagicDataset so they can be indexed
# sample-by-sample.
training_data = MagicDataset(x_train, y_train)
testing_data = MagicDataset(x_test, y_test)

In [None]:
# Training hyperparameters.
EPOCHS = 50       # passes over the training data
BATCH_SIZE = 64   # samples per batch for both loaders
INIT_LR = 1e-5    # learning rate

# Only the training loader shuffles its samples.
trainDataLoader = DataLoader(
    training_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
testDataLoader = DataLoader(
    testing_data,
    batch_size=BATCH_SIZE,
)

# Count of full-size batches in the training set (floor division, so a
# trailing partial batch is not included).
trainSteps = len(trainDataLoader.dataset) // BATCH_SIZE

In [None]:
def tvt(optimizer, mode, model, dataloader):
    """Run one training or evaluation pass of a binary classifier.

    Args:
        optimizer: Optimizer that updates ``model``'s parameters. Only used
            when ``mode == 'train'``; may be ``None`` otherwise.
        mode: ``'train'`` to update the weights; any other value runs a
            gradient-free evaluation pass.
        model: Module whose output for a float batch has shape ``(batch, 1)``
            and holds probabilities (it is fed straight to ``nn.BCELoss``).
        dataloader: Sized iterable yielding ``(data, labels)`` batches, with
            labels encoded as 0/1.

    Returns:
        A tuple ``(avgLoss, correctPred, preds)``: the mean per-batch loss as
        a float, the number of correctly classified samples, and the list of
        predicted class ids (0 or 1) in iteration order.
    """
    isTraining = mode == 'train'
    if isTraining:
        model.train()
    else:
        model.eval()

    totalLoss = 0.0
    correctPred = 0
    preds = []
    lossFn = nn.BCELoss()
    # Track gradients only when the weights are actually going to be updated.
    with torch.set_grad_enabled(isTraining):
        for (data, labels) in dataloader:
            data = data.float()
            pred = model(data).squeeze(1)
            loss = lossFn(pred, labels.float())

            if isTraining:
                # Step the optimizer that was passed in, not a module-level one.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # .item() detaches the value so the running total does not keep
            # every batch's autograd graph alive.
            totalLoss += loss.item()

            # Probability above 0.5 -> class 1, otherwise class 0.
            predicted = (pred.detach() > 0.5).long()
            correctPred += int((predicted == labels.long()).sum().item())
            # Collect the predicted classes themselves (not right/wrong flags)
            # so callers can score them against the true labels.
            preds.extend(predicted.tolist())

    return totalLoss / len(dataloader), correctPred, preds

In [None]:
# Build a fresh network and an Adam optimizer over its parameters.
model = TestModel()
opt = Adam(model.parameters(), lr=INIT_LR)

print("Training Model")
for e in range(EPOCHS):
  print('Epoch {0}/{1}'.format(e+1,EPOCHS))
  trainResult = tvt(opt, 'train', model, trainDataLoader)
  avgTrainLoss, correctPred = trainResult[0], trainResult[1]
  # Accuracy is the correct-sample count over the size of the training set.
  trainAccuracy = correctPred/len(trainDataLoader.dataset)
  print("Train loss: {:.6f}, Train accuracy: {:.4f}".format(avgTrainLoss, trainAccuracy))

print('\nTesting:')
# Only the third value returned by tvt is needed for the report.
preds = tvt(opt, 'test', model, testDataLoader)[2]
# Map the test set's string labels to their integer ids for scoring.
enumLabels = [testing_data.dict_labels[label] for label in testing_data.labels]
print(classification_report(np.array(enumLabels), np.array(preds)))
accuracy = accuracy_score(enumLabels, preds)
print('Accuracy = {0} '.format(accuracy))