<a href="https://colab.research.google.com/github/ShafikDissou/DataTrek2022/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import pandas as pd
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import time

In [None]:
# print (f"Python version: {sys.version}\n")
# print (f"Matplotlib version: {matplotlib.__version__}")
# print (f"Numpy version: {np.__version__}")

In [2]:
# Move into the project folder unless the current working directory already
# contains its path (presumably a mounted Google Drive folder - confirm).
_current_dir = os.getcwd()
if _current_dir.find('drive/MyDrive/DataTrek/Chest_XRay') == -1:
    os.chdir('drive/MyDrive/DataTrek/Chest_XRay')

In [14]:
# When True the image dataset is rebuilt from the raw files; when False the
# cached 'training_data.npy' is loaded instead (see the cell that reads it).
BUILD_DATASET = False
class make_dataset():
    """Build, cache and split the image/label training data.

    The class body runs once, when the class is defined: it reads
    'final_set.csv' from the current working directory and integer-encodes
    the VIEW and GENDER columns. The first four columns are kept as
    `features`; every remaining column is treated as a target (`label`).
    """
    _DATA_DIR = 'drive/MyDrive/DataTrek/Chest_XRay'
    # Same "are we already inside the project folder?" test as the notebook's
    # bootstrap cell. A bare `if not '<non-empty string>'` can never be true,
    # so the check has to look at the working directory. The isdir() guard
    # keeps this a no-op when the folder is not reachable from here.
    if _DATA_DIR not in os.getcwd() and os.path.isdir(_DATA_DIR):
        os.chdir(_DATA_DIR)
    IMG_SIZE = 224

    df = pd.read_csv('final_set.csv')
    view = {'AP': 0,'PA': 1}
    gender = {'M': 0,'F':1}

    # Replace the categorical strings with their integer codes.
    df.VIEW = df.VIEW.map(view)
    df.GENDER = df.GENDER.map(gender)

    features = df.columns[:4]
    label = df.columns[4:]
    # Class-level default; make_training_data() stores its result on the instance.
    data = []

    def make_training_data(self):
        """Load every image listed in the CSV and pair it with its targets.

        Each image is read from 'images/<IMG>', converted to grayscale,
        resized to IMG_SIZE x IMG_SIZE and stored as uint8 next to the uint8
        target vector taken from the `label` columns. The samples are
        shuffled with a fixed seed (42) and written to 'training_data.npy'.

        Returns:
            list of [image, targets] pairs in shuffled order.
        """
        size = (self.IMG_SIZE, self.IMG_SIZE)
        # Build into a fresh list so repeated calls (or several instances)
        # never accumulate duplicates in the shared class-level list.
        samples = []
        # Class attributes are not visible as bare names inside a method;
        # they must be reached through `self`.
        for i in tqdm(range(len(self.df))):
            path = os.path.join('images/', self.df.loc[i, 'IMG'])
            # Context manager closes the file handle once the pixels are read.
            with Image.open(path) as raw:
                img = np.array(raw.convert('L').resize(size)).astype('uint8')
            targets = np.array(self.df.loc[i, self.label]).astype('uint8')
            samples.append([img, targets])

        np.random.seed(42)
        np.random.shuffle(samples)
        self.data = samples

        # Image and target arrays have different shapes, so the pairs are
        # packed explicitly into an object array; recent NumPy versions
        # refuse to build such a ragged array implicitly.
        packed = np.empty((len(samples), 2), dtype=object)
        for row, (img, targets) in enumerate(samples):
            packed[row, 0] = img
            packed[row, 1] = targets
        np.save('training_data.npy', packed)
        return self.data

    def split(self,data,split_ratio, cv = False):
        """Turn [image, targets] pairs into scaled tensors and split them.

        Images are scaled to [0, 1] by dividing by 255. The split is
        positional (no shuffling happens here). All returned tensors are
        moved to the module-level `device`, which must exist before the call.

        Args:
            data: sequence of [image, targets] pairs.
            split_ratio: fraction of samples used for training, within [0, 1].
            cv: when True, the remainder is divided into a validation part
                and a test part of (roughly) equal size.

        Returns:
            X_train, y_train, X_val, y_val                  if cv is False
            X_train, y_train, X_cv, y_cv, X_test, y_test    if cv is True

        Raises:
            ValueError: if split_ratio is outside [0, 1].
        """
        if not 0 <= split_ratio <= 1:
            raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio!r}")

        X = torch.Tensor(np.array([i[0] for i in data])).view(-1,self.IMG_SIZE,self.IMG_SIZE)
        X = X/255.0 #scaling the tensor
        y = torch.Tensor(np.array([i[1] for i in data]))

        train_size = int(len(X) * split_ratio)
        X_train = X[:train_size].to(device)
        y_train = y[:train_size].to(device)

        if not cv:
            X_val = X[train_size:].to(device)
            y_val = y[train_size:].to(device)
            return X_train, y_train, X_val, y_val

        cv_ratio = (1-split_ratio)/2
        cv_size = int(len(X) * cv_ratio)
        cv_index = train_size + cv_size

        X_cv = X[train_size : cv_index].to(device)
        y_cv = y[train_size : cv_index].to(device)

        # Whatever is left after the train and validation slices is the test set.
        X_test = X[cv_index:].to(device)
        y_test = y[cv_index:].to(device)

        return X_train, y_train, X_cv, y_cv, X_test, y_test

class Net(nn.Module):
    """Small CNN: three conv/ReLU/max-pool stages, then two linear layers.

    Takes single-channel images and returns 15 raw (un-activated) scores per
    image. The flattened feature size is hard-wired for 224x224 inputs.
    """

    def __init__(self):
        super().__init__()
        # With 5x5 kernels (no padding, stride 1) and 2x2 pooling, the side
        # length goes 224 -> 220 -> 110 -> 106 -> 53 -> 49 -> 24.
        flat_features = 24 * 24 * 128  # 73728
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5)
        self._to_linear = flat_features
        self.fc1 = nn.Linear(in_features=flat_features, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=15)

    def forward(self, x):
        """Return the 15 output scores for a batch of (N, 1, H, W) images."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.max_pool2d(F.relu(conv(x)), (2, 2))

        flat = x.view(-1, self._to_linear)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

def fwd_pass(X, y, loss_func, train = False):
    """Run one forward pass through the global `net`, optionally training.

    The loss returned by `loss_func` is re-weighted element-wise: positions
    where `y` is 1 count ten times as much as positions where `y` is 0. The
    weighted loss is then averaged to a single scalar.

    Args:
        X: image batch; reshaped to (-1, 1, 224, 224) before the forward pass.
        y: target tensor with the same shape as the network output.
        loss_func: callable(output, y) returning either a per-element loss
            (e.g. reduction='none') or an already reduced scalar.
        train: when True, gradients are reset, back-propagated and the global
            `optimizer` takes one step. When False, no graph is recorded.

    Returns:
        (output, loss): the network output and the scalar weighted loss.
    """
    if train:
        net.zero_grad()

    # Skip graph construction for evaluation passes to save memory.
    with torch.set_grad_enabled(train):
        output = net(X.view(-1,1,224,224))
        loss = loss_func(output, y)
        # Weight 10 for positive targets, 1 for zero targets.
        weights = (y*10 + (y == 0)).float()
        # backward() needs a scalar; a per-element loss multiplied by the
        # weights is still a tensor, so reduce it explicitly.
        loss = (loss * weights).mean()

    if train:
        loss.backward()
        optimizer.step()

    return output,loss


def train(net,loss_func, BATCH_SIZE = 100, EPOCHS = 5, lr = 0.001):
    """Train on the global `X_train` / `y_train` in mini-batches.

    Every batch is handed to `fwd_pass(..., train=True)`, which performs the
    optimisation step. After each epoch the loss of the last batch is printed.

    Args:
        net: accepted for interface compatibility; not used in this function.
        loss_func: loss callable forwarded to `fwd_pass`.
        BATCH_SIZE: number of samples per batch.
        EPOCHS: number of passes over the training data.
        lr: accepted for interface compatibility; not used in this function.

    Returns:
        The first row of the last batch's network output, or None when no
        batch was processed (EPOCHS == 0 or empty training data).
    """
    # Pre-bind so that zero epochs / zero batches cannot leave these undefined.
    pred = None

    for epoch in range(EPOCHS):
        print('EPOCH :', epoch + 1)
        loss = None
        for i in tqdm(range(0, len(X_train), BATCH_SIZE)):
            batch_X = X_train[i: i+ BATCH_SIZE].view(-1,1,224,224)
            batch_y = y_train[i: i+ BATCH_SIZE]

            pred, loss = fwd_pass(batch_X, batch_y, loss_func, train = True)

        if loss is not None:
            print('Loss :', round(float(loss),4))

    if pred is None:
        return None
    return pred[0]

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


In [5]:
# Both paths need a dataset object and the same 80/20 split; only the source
# of the [image, targets] pairs differs.
dataset = make_dataset()

if BUILD_DATASET:
    # Rebuild from the raw image files.
    data = dataset.make_training_data()
else:
    # Reuse the cached array. allow_pickle is needed to load object arrays;
    # only load files you created yourself, since unpickling runs code.
    data = np.load("training_data.npy", allow_pickle = True)

X_train, y_train, X_val, y_val = dataset.split(data, split_ratio = 0.8)


In [13]:
# Timestamped run identifier.
MODEL_NAME = f"model - {int(time.time())}"

# Learning rate shared by the optimizer and the train() call.
lr = 0.001

# Model on the selected device, Adam over its parameters, and an unreduced
# (per-element) BCE-with-logits loss.
net = Net()
net = net.to(device)
optimizer = optim.Adam(params = net.parameters(), lr = lr)
loss_function = nn.BCEWithLogitsLoss(reduction = 'none')

print(MODEL_NAME)

train(net = net, loss_func = loss_function, BATCH_SIZE = 50, EPOCHS = 3, lr = lr)

model - 1644531643
EPOCH : 1


  0%|          | 0/261 [00:00<?, ?it/s]

Loss : 0.2762
EPOCH : 2


  0%|          | 0/261 [00:00<?, ?it/s]

Loss : 0.2662
EPOCH : 3


  0%|          | 0/261 [00:00<?, ?it/s]

Loss : 0.2665


tensor([-1.3858, -4.0105, -2.3543, -4.5914, -1.3574, -3.9877, -4.9500, -7.1542,
        -0.5098, -2.3872, -7.0086, -1.6561, -2.9756, -6.1631, -2.1401],
       device='cuda:0', grad_fn=<SelectBackward0>)

In [6]:
#metrics --> f1

torch.Size([13049, 15])

1

In [None]:
# Manual cleanup cell: run this if GPU memory runs out.
import gc
# Force a full garbage-collection pass so unreferenced Python objects are
# released (the notebook shows the return value, i.e. the number of
# unreachable objects found).
gc.collect()
# Optional follow-up, left disabled: release PyTorch's cached CUDA memory.
# torch.cuda.empty_cache()

400