# Import libraries

In [None]:
import numpy as np 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import time

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torchvision
import torchvision.models as models
from torchvision.transforms import Compose, RandomResizedCrop, CenterCrop, Normalize,ToTensor,RandomHorizontalFlip,RandomVerticalFlip, Resize
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold

from sklearn.metrics import accuracy_score, roc_auc_score

import os
import copy

from PIL import Image

import random
import datetime

# Set the random seed for reproducibility


In [None]:

def set_seed(seed = 1234):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every
    visible CUDA device), and switches cuDNN to deterministic kernel
    selection so repeated runs with the same seed are comparable.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every GPU, not only the current one; it is
    # silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # cuDNN autotuning (benchmark) can select different kernels between
    # runs, which defeats the purpose of seeding.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
# Seed all random number generators with the default seed
set_seed()

# Select the compute device: the GPU when CUDA is available, else the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device available now:', device)

In [None]:
# Load the train and test metadata tables from the working directory
my_train_df, my_test_df = [pd.read_csv(csv_path)
                           for csv_path in ("./train.csv", './test.csv')]

In [None]:
# Preview the first three rows of the training table
my_train_df.head(3)

In [None]:
# Replace the original column headers with short, uniform names
new_names = [
    'dcm_name', 'ID', 'sex', 'age', 'anatomy',
    'diagnosis', 'benign_malignant', 'target',
]

# The test table is given only the first five names
# (no diagnosis / benign_malignant / target).
my_train_df.columns, my_test_df.columns = new_names, new_names[:5]

In [None]:
# Keep copies of both tables as they are before any preprocessing
train_df = my_train_df.copy()
test_df = my_test_df.copy()

# Handle missing values and Categorical data encoding

In [None]:
# --- Missing values: TRAIN ---
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the latter is chained assignment, which warns on recent
# pandas and has no effect once Copy-on-Write is enabled.
my_train_df['sex'] = my_train_df['sex'].fillna("male")

# Missing ages are imputed with the median age of benign (target == 0) male
# cases located on the anatomy sites listed below.
anatomy = ['lower extremity', 'upper extremity', 'torso']
age_reference = (my_train_df['anatomy'].isin(anatomy)
                 & (my_train_df['target'] == 0)
                 & (my_train_df['sex'] == 'male'))
median = my_train_df.loc[age_reference, 'age'].median()
#print('Median is:', median)
my_train_df['age'] = my_train_df['age'].fillna(median)
my_train_df['anatomy'] = my_train_df['anatomy'].fillna('torso')


# --- Missing values: TEST ---
# The majority of the people with missing anatomy have age 70, so use the most
# frequent anatomy among 70-year-olds.
# value_counts() is sorted by frequency, so its first index label is the mode;
# this avoids reset_index()['index'], whose column name changed in pandas 2.0.
value = my_test_df.loc[my_test_df['age'] == 70, 'anatomy'].value_counts().index[0]
my_test_df['anatomy'] = my_test_df['anatomy'].fillna(value)

# --- Categorical column encoding (TRAIN and TEST together) ---
to_encode = ['sex', 'anatomy']
label_encoder = LabelEncoder()

for column in to_encode:
    train_values = my_train_df[column].astype(str)
    test_values = my_test_df[column].astype(str)
    # Fit once on the union of both tables so that a given category receives
    # the same integer code in train and test; separately fitted encoders
    # disagree whenever the two category sets differ.
    label_encoder.fit(pd.concat([train_values, test_values]))
    my_train_df[column] = label_encoder.transform(train_values)
    my_test_df[column] = label_encoder.transform(test_values)

# Data normalization

In [None]:
# Normalize the three tabular features of both tables
# NOTE(review): axis=1 rescales each ROW (sample) rather than each column,
# so the three features of one sample are scaled jointly -- confirm this is
# intended rather than per-feature scaling.
feature_cols = ['sex', 'age', 'anatomy']

normalized_train = preprocessing.normalize(my_train_df[feature_cols], axis=1)
normalized_test = preprocessing.normalize(my_test_df[feature_cols], axis=1)

# Write the normalized columns back, column position by column position
for col_pos, col_name in enumerate(feature_cols):
    my_train_df[col_name] = normalized_train[:, col_pos]
    my_test_df[col_name] = normalized_test[:, col_pos]


print('Len Train: {:,}'.format(len(my_train_df)),
      '\nLen Test: {:,}'.format(len(my_test_df)))



In [None]:
# Remove, in place, the train-only columns that the test table does not have
my_train_df.drop(columns=["diagnosis", "benign_malignant"], inplace=True)

In [None]:
# Store the location of every image next to its metadata row so that the
# dataset class can open images straight from the dataframe.
# Note: `directory` ends with '/' and each sub-folder literal starts with one,
# so the resulting paths contain a doubled slash (e.g. './/train/...').
directory = './'

# === DICOM ===
my_train_df['path_dicom'] = directory + '/train/' + my_train_df['dcm_name'] + '.dcm'
my_test_df['path_dicom'] = directory + '/test/' + my_test_df['dcm_name'] + '.dcm'

# === JPEG ===
path_train = directory + '/jpeg/train/' + my_train_df['dcm_name'] + '.jpg'
path_test = directory + '/jpeg/test/' + my_test_df['dcm_name'] + '.jpg'
my_train_df['path_jpeg'] = path_train
my_test_df['path_jpeg'] = path_test


# Data Loader

In [None]:
class MelanomaDataset(Dataset):
    """Dataset yielding a JPEG image together with its tabular features.

    Each item combines the transformed image found at the row's
    ``path_jpeg`` with the row's ``sex``, ``age`` and ``anatomy`` values
    as a float32 array. Train and validation items also carry the row's
    ``target``.

    Args:
        dataframe: Table with ``path_jpeg``, ``sex``, ``age`` and
            ``anatomy`` columns (plus ``target`` for train/valid use).
        is_train: Use random augmentation and return the target.
        is_valid: Use the deterministic resize/crop and return the target.
        is_test: Use random augmentation (for test-time augmentation)
            and return no target.
    """

    def __init__(self, dataframe, is_train=True, is_valid=False, is_test=False):

        self.dataframe, self.is_train, self.is_valid = dataframe, is_train, is_valid
        self.is_test = is_test

        # Data Augmentation: random crops/flips for training and for
        # test-time augmentation, a fixed resize + center crop otherwise.
        if is_train or is_test:
            self.transform = Compose([RandomResizedCrop((256, 256), scale=(0.4, 1.0)),
                                      RandomHorizontalFlip(),
                                      RandomVerticalFlip(),
                                      ToTensor(),
                                      Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
        else:
            self.transform = Compose([Resize(256),
                                      CenterCrop(256),
                                      ToTensor(),
                                      Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
                                      ])

    def __len__(self):
        """Return the number of rows (samples) in the dataframe."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return the sample at position ``index``.

        Returns:
            ``((image, csv_data), target)`` for train/valid datasets,
            ``(image, csv_data)`` otherwise.
        """
        # Positional (.iloc) access everywhere, so the dataset does not
        # depend on the dataframe having a 0..n-1 index.
        image_path = self.dataframe['path_jpeg'].iloc[index]
        # convert('RGB') guarantees the three channels that the 3-value
        # Normalize above expects, whatever mode the file is stored in.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        # Tabular features for this image (label-encoded, not one-hot)
        csv_data = np.array(self.dataframe.iloc[index][['sex', 'age', 'anatomy']].values,
                            dtype=np.float32)

        # Apply transforms
        image = self.transform(image)

        # If train/valid: image + class | If test: only image
        if self.is_train or self.is_valid:
            return (image, csv_data), self.dataframe['target'].iloc[index]
        return (image, csv_data)

# Image model - MobileNetV2

In [None]:

# Tabular (CSV) columns fed to the network alongside each image
csv_columns = ['sex', 'age', 'anatomy']
# Number of tabular input features (one per column above)
no_columns = len(csv_columns)
# Width of the hidden layer applied to the tabular features
csv_neurons = 500

class MobileNetV2Network(nn.Module):
    """Two-branch network: MobileNetV2 on the image, a small MLP on CSV data.

    The 1000 outputs of the image branch are concatenated with the
    ``csv_neurons`` outputs of the tabular branch and mapped to
    ``output_size`` values by a single linear layer.

    Args:
        output_size: Number of outputs of the final linear layer.
        no_columns: Number of tabular input features.
    """

    def __init__(self, output_size, no_columns):
        super().__init__()
        self.no_columns = no_columns
        self.output_size = output_size

        # Width of the tabular branch, taken from the module-level setting
        hidden_width = int(csv_neurons)

        # Image branch: pretrained MobileNetV2, 1000 neurons out
        self.features = models.mobilenet_v2(pretrained=True)

        # Tabular branch: Linear -> BatchNorm -> ReLU -> Dropout
        self.csv = nn.Sequential(
            nn.Linear(self.no_columns, hidden_width),
            nn.BatchNorm1d(hidden_width),
            nn.ReLU(),
            nn.Dropout(p=0.2),
        )

        # Classifier over the concatenated image + tabular features
        self.classification = nn.Linear(1000 + hidden_width, output_size)

    def forward(self, image, csv_data, prints=False):
        """Run both branches and classify their concatenated outputs.

        Args:
            image: Batch of images for the MobileNetV2 branch.
            csv_data: Batch of tabular features for the MLP branch.
            prints: When true, print the tensor shape after each stage.

        Returns:
            The output of the classification layer.
        """
        if prints:
            print('Input Image shape:', image.shape,
                  '\nInput csv_data shape:', csv_data.shape)

        # Image CNN
        image_features = self.features(image)
        if prints:
            print('Features Image shape:', image_features.shape)

        # CSV FNN
        csv_features = self.csv(csv_data)
        if prints:
            print('CSV Data:', csv_features.shape)

        # Join the two feature vectors along the feature dimension, then classify
        merged = torch.cat((image_features, csv_features), dim=1)
        out = self.classification(merged)
        if prints:
            print('Out shape:', out.shape)

        return out

# Training parameters

In [None]:


my_train_len = len(my_train_df)
my_test_len = len(my_test_df)
output_size=1

# Out of Fold Predictions
oof = np.zeros(shape = (my_train_len, 1))

# Predictions (accumulated on the compute device during test-time inference)
preds_submission = torch.zeros(size = (my_test_len, 1), dtype=torch.float32, device=device)

print('oof shape:', oof.shape, '\n' +
      'predictions shape:', preds_submission.shape)


# Number of cross-validation folds
k = 6

# Group by patient ID so that all rows of one ID end up in the same fold
group_fold = GroupKFold(n_splits = k)

# Generate indices to split data into training and validation sets.
# split() returns a one-shot generator; materialize it as a list so the folds
# can be iterated more than once (e.g. when the training cell is re-run)
# instead of silently yielding nothing the second time.
folds = list(group_fold.split(X = np.zeros(my_train_len),
                              y = my_train_df['target'],
                              groups = my_train_df['ID'].tolist()))

epochs = 15                # maximum number of epochs per fold
patience = 5               # epochs without ROC improvement before early stopping
TTA = 3                    # number of test-time augmentation passes
num_workers = 8            # DataLoader worker processes
learning_rate = 0.0005
weight_decay = 0.0
lr_patience = 1            # epochs without ROC improvement before the lr is decreased
lr_factor = 0.4            # factor the lr is multiplied by when it is decreased

batch_size1 = 64           # training batch size
batch_size2 = 64           # validation / test batch size

version = 'v1'             # to keep tabs on versions



# Train main function

In [None]:

def train_folds(model, version = 'v1'):
    """Train ``model`` once per cross-validation fold and log the results.

    For every (train, valid) index pair in the module-level ``folds`` the
    model is reset to the weights it had on entry, trained with Adam and a
    ReduceLROnPlateau scheduler for up to ``epochs`` epochs, and evaluated
    on the validation split after each epoch. The state dict is saved
    whenever the validation ROC AUC improves; a fold stops early after
    ``patience`` consecutive epochs without improvement.

    Reads the module-level settings ``folds``, ``my_train_df``, ``device``,
    ``epochs``, ``patience``, ``learning_rate``, ``weight_decay``,
    ``lr_patience``, ``lr_factor``, ``batch_size1``, ``batch_size2`` and
    ``num_workers``.

    Args:
        model: Network called as ``model(images, csv_data)`` that returns
            one logit per sample; it is modified in place.
        version: Suffix of the log file ``logs_{version}.txt``.

    Returns:
        None. Progress goes to the console and the log file; checkpoints
        are written to the working directory.
    """
    log_path = f"logs_{version}.txt"

    def _log(message):
        """Print ``message`` to the console and append it to the log file."""
        with open(log_path, 'a+') as log_file:
            print(message, file=log_file)
        print(message)

    # Create (or truncate) the log file without leaving a handle open
    with open(log_path, "w"):
        pass

    # Snapshot of the incoming weights. Every fold restarts from it so that a
    # fold never trains on top of weights that have already seen its
    # validation samples in an earlier fold.
    initial_state = copy.deepcopy(model.state_dict())

    for fold, (train_index, valid_index) in enumerate(folds):
        _log('{} Fold: {} {}'.format('-'*10, fold+1, '-'*10))

        # --- Create Instances ---
        # Best ROC score in this fold
        best_roc = None
        # Reset patience before every fold
        patience_f = patience

        # Re-initiate the model from the snapshot
        model.load_state_dict(initial_state)

        optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate, weight_decay=weight_decay)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='max',
                                      patience=lr_patience, verbose=True, factor=lr_factor)
        criterion = nn.BCEWithLogitsLoss()

        # --- Read in Data ---
        train_data = my_train_df.iloc[train_index].reset_index(drop=True)
        valid_data = my_train_df.iloc[valid_index].reset_index(drop=True)

        # Create Data instances
        train = MelanomaDataset(train_data,
                                is_train=True, is_valid=False, is_test=False)
        valid = MelanomaDataset(valid_data,
                                is_train=False, is_valid=True, is_test=False)

        # Dataloaders (validation must keep its order: shuffle=False)
        train_loader = DataLoader(train, batch_size=batch_size1, shuffle=True, num_workers=num_workers)
        valid_loader = DataLoader(valid, batch_size=batch_size2, shuffle=False, num_workers=num_workers)

        # === EPOCHS ===
        for epoch in range(epochs):
            start_time = time.time()
            total = 0
            correct = 0
            train_losses = 0

            # === TRAIN ===
            model.train()

            for batch_idx, ((images, csv_data), labels) in enumerate(train_loader):
                # Move the batch to the device; .to() avoids re-wrapping
                # tensors with torch.tensor(), which copies and warns.
                images = images.to(device, dtype=torch.float32)
                csv_data = csv_data.to(device, dtype=torch.float32)
                # Shape (batch, 1) to match the model output
                labels = labels.to(device, dtype=torch.float32).unsqueeze(1)

                optimizer.zero_grad()
                out = model(images, csv_data)
                loss = criterion(out, labels)
                loss.backward()
                optimizer.step()

                # --- Save information after this batch ---
                train_losses += loss.item()
                # Logits -> probabilities -> hard 0/1 predictions
                train_preds = torch.round(torch.sigmoid(out))
                correct += (train_preds == labels).sum().item()
                total += labels.size(0)

                if batch_idx % 50 == 0:
                    line = 'Loss:{}, correct:{}, total:{}/{}'.format(float(train_losses)/total, float(correct)/total, batch_idx, len(train_loader))
                    print(line)

            # Compute Train Accuracy
            train_acc = correct / len(train_index)

            # === EVAL ===
            model.eval()

            # Matrix to store the validation probabilities
            valid_preds = torch.zeros(size = (len(valid_index), 1), device=device, dtype=torch.float32)

            # Running write position. Multiplying the batch number by the
            # current batch length misplaces the (shorter) last batch.
            offset = 0
            with torch.no_grad():
                for k, ((images, csv_data), _) in enumerate(valid_loader):
                    images = images.to(device, dtype=torch.float32)
                    csv_data = csv_data.to(device, dtype=torch.float32)

                    pred = torch.sigmoid(model(images, csv_data))
                    batch_len = pred.shape[0]
                    valid_preds[offset : offset + batch_len] = pred
                    offset += batch_len

                    if k % 50 == 0:
                        line = "k:{}/{}".format(k,len(valid_loader))
                        print(line)

            valid_targets = valid_data['target'].values
            valid_scores = valid_preds.cpu().numpy().ravel()

            # Compute accuracy (threshold at 0.5 via rounding) and ROC AUC
            valid_acc = accuracy_score(valid_targets, np.round(valid_scores))
            valid_roc = roc_auc_score(valid_targets, valid_scores)

            # Compute time on Train + Eval
            duration = str(datetime.timedelta(seconds=time.time() - start_time))[:7]

            # PRINT INFO (log file + console)
            _log('{} | Epoch: {}/{} | Loss: {:.4} | Train Acc: {:.3} | Valid Acc: {:.3} | ROC: {:.3}'.\
                 format(duration, epoch+1, epochs, train_losses, train_acc, valid_acc, valid_roc))

            # === SAVE MODEL ===

            # Update scheduler (for learning_rate)
            scheduler.step(valid_roc)

            # `is None` rather than truthiness: a ROC of 0.0 is a valid score
            if best_roc is None or valid_roc > best_roc:
                best_roc = valid_roc
                # Reset patience (because we have improvement)
                patience_f = patience
                torch.save(model.state_dict(),
                           f"Fold{fold+1}_Epoch{epoch+1}_ValidAcc_{valid_acc:.3f}_ROC_{valid_roc:.3f}.pth")
            else:
                # Decrease patience (no improvement in ROC)
                patience_f = patience_f - 1
                if patience_f == 0:
                    _log('Early stopping (no improvement since {} models) | Best ROC: {}'.\
                         format(patience, best_roc))
                    break


In [None]:
# --- model ---
# Build the image + tabular network and move it to the selected device
model = MobileNetV2Network(output_size=output_size, no_columns=no_columns)
model = model.to(device)

# Run the cross-validated training loop
train_folds(model=model, version=version)

# Test

In [None]:
# --- TEST ---
best_model_path = './best_model.pth'

model = MobileNetV2Network(output_size=1, no_columns=no_columns).to(device)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine
model.load_state_dict(torch.load(best_model_path, map_location=device))
print("model loaded.")
# Set the model in evaluation mode
model.eval()

# Build the test loader here: the one created inside train_folds is local to
# that function and is not visible at module level.
test = MelanomaDataset(my_test_df,
                       is_train=False, is_valid=False, is_test=True)
test_loader = DataLoader(test, batch_size=batch_size2, shuffle=False, num_workers=num_workers)

# Start from zeros so that re-running this cell does not double-count
preds_submission.zero_()

# Inference only: no_grad avoids building autograd graphs for every batch
with torch.no_grad():
    for i in range(TTA):
        # Running write position. Multiplying the batch number by the current
        # batch length misplaces the (shorter) last batch.
        offset = 0
        for batch_no, (images, csv_data) in enumerate(test_loader):
            images = images.to(device, dtype=torch.float32)
            csv_data = csv_data.to(device, dtype=torch.float32)

            # Convert the logits to probabilities
            out = torch.sigmoid(model(images, csv_data))

            # ADDS! the prediction to the matrix we already created
            batch_len = out.shape[0]
            preds_submission[offset : offset + batch_len] += out
            offset += batch_len
            print(batch_no, len(test_loader))

# Divide Predictions by TTA (to average the results during TTA)
preds_submission /= TTA
pred = preds_submission.detach().cpu().numpy()
print(pred)
# One prediction row per line, written with its array formatting
with open('results.txt','w') as ff:
    for i in pred:
        ff.write(str(i))
        ff.write("\n")


