In [None]:
## Large domain gap experiment

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import random
from torch.utils.data import Dataset, DataLoader
import matplotlib.image as img
from sklearn.model_selection import train_test_split

import torchvision.models as models
import cv2



In [None]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and, when present, PyTorch CUDA) with the same value.
sd = 0
random.seed(sd)
np.random.seed(sd)
torch.manual_seed(sd)
if torch.cuda.is_available():
    # Seeds all CUDA devices, not just the current one.
    torch.cuda.manual_seed_all(sd)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Integer class index assigned to each gender label string.
dict_gender_to_number = dict(Male=0, Female=1)

In [None]:
# Labels for humans (UTKFace).
# Each file name is split on '_' and read as <age>_<gender>_<race>_<rest>.
data_path = r'./dataset/humans/'

# os.listdir returns entries in arbitrary order; sorting makes the sequence
# (and therefore the seeded shuffles/splits done later) the same on every run.
data_labels = sorted(os.listdir(data_path))

clean_labels = []

age = []
age2 = []
gender = []
race = []
for f in data_labels:
    temp = f.split('_')
    # Names with fewer than three '_'-separated fields (e.g. stray hidden
    # files) cannot be parsed; skip them instead of failing on temp[2].
    if len(temp) < 3:
        continue
    # The third field must be a single character to be accepted
    # (presumably a longer value means a field is missing from the name).
    if len(temp[2]) > 1:
        continue
    age.append(temp[0])
    age2.append(int(temp[0]))
    gender.append(temp[1])
    race.append(temp[2])
    clean_labels.append(data_path + f)



In [None]:
  
# Turn the raw gender codes ('0' / '1') into the label strings that
# dict_gender_to_number understands.
gender_class = []
for g in gender:
    if g not in ('0', '1'):
        # Unknown code: report it; nothing is appended for this entry.
        print('ErrorG')
        print(g)
        continue
    gender_class.append('Male' if g == '0' else 'Female')

# One 'Human' tag per parsed file (race has one entry per kept file).
species_class = ['Human'] * len(race)


In [None]:
# Build the per-image table for the human data ...
df = pd.DataFrame({'file': clean_labels, 'gender': gender_class, 'species': species_class})

# ... and shuffle it: frac=1 samples every row once, in random order.
train_labels_humans = df.sample(frac=1)


In [None]:
# Chicken data: one sub-directory per gender, so the gender label comes from
# the directory rather than from the file name.
clean_labels = []

# Start from empty lists again; age, age2 and race are not filled for chickens.
age = []
age2 = []
gender_class = []
species_class = []
race = []
for sub_dir, gender_name in (('male', 'Male'), ('female', 'Female')):
    data_path = './dataset/chicken/' + sub_dir + '/'
    # os.listdir returns entries in arbitrary order; sorting makes the sequence
    # (and therefore the seeded shuffles/splits done later) the same on every run.
    data_labels = sorted(os.listdir(data_path))
    for f in data_labels:
        gender_class.append(gender_name)
        species_class.append('Chicken')
        clean_labels.append(data_path + f)

# data_path is left pointing at './dataset/chicken/female/' on purpose:
# later cells pass this module-level name on to CombinedDataset.




In [None]:
# Build the per-image table for the chicken data ...
df = pd.DataFrame({'file': clean_labels, 'gender': gender_class, 'species': species_class})

# ... and shuffle it: frac=1 samples every row once, in random order.
train_labels_chicken = df.sample(frac=1)


In [None]:
class CombinedDataset(Dataset):
    """Image dataset backed by a table of (file path, gender label, ...) rows.

    Each item is the image found at the path in the row's first column,
    resized to 100x100 and converted from BGR to RGB, together with the
    integer class that ``dict_gender_to_number`` assigns to the row's second
    column.
    """

    def __init__(self, data, path , transform = None):
        """
        Args:
            data: table exposing ``.values`` (the notebook passes pandas
                DataFrames); column 0 is the image path, column 1 the gender
                label string.
            path: stored on the instance but not used for loading, because
                column 0 is already used as the complete image path.
            transform: optional callable applied to the RGB image array.
        """
        super().__init__()
        self.data = data.values
        self.path = path
        self.transform = transform

    def __len__(self):
        """Number of rows in the backing table."""
        return len(self.data)

    def __getitem__(self,index):
        """Return ``(image, label)`` for row ``index``.

        Raises:
            KeyError: if the row's gender label is not in
                ``dict_gender_to_number``.
            OSError: if the image file cannot be read.
        """
        row = self.data[index]
        img_path = row[0]
        label = torch.tensor(dict_gender_to_number[row[1]])

        image1 = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the file name rather than deep inside cv2.resize.
        if image1 is None:
            raise OSError(f"cv2.imread could not read image: {img_path!r}")
        image = cv2.resize(image1, (100,100))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            image = self.transform(image)
        return image, label


In [None]:
# Image transforms. The three splits currently use identical pipelines made of
# a single ToTensor step; each split still gets its own Compose object.
train_transform, test_transform, valid_transform = (
    transforms.Compose([transforms.ToTensor()]) for _ in range(3)
)

In [None]:
#functions for data
def resample_dataset_race(data,frac):
    """Build a shuffled table mixing Chicken and Human rows in a given ratio.

    Each species subset is first gender-balanced with
    ``equalize_dataset_gender``. With ``baseline`` being the smaller balanced
    subset size minus 2, about ``frac * baseline`` Chicken rows and
    ``(1 - frac) * baseline`` Human rows are kept. Surplus rows are removed in
    equal numbers from both ends of the gender-sorted subset, so what remains
    is the middle of the sorted frame.

    Args:
        data: DataFrame with 'species' and 'gender' columns.
        frac: target Chicken share.

    Returns:
        DataFrame with the kept rows of both species in random order.
    """
    def keep_middle(frame, surplus):
        # Drop int(surplus / 2) rows from each end of the frame.
        half = int(0.5*surplus)
        return frame[half:-half]

    chickens = equalize_dataset_gender(data[data['species']=='Chicken'],0.5)
    humans = equalize_dataset_gender(data[data['species']=='Human'],0.5)

    # NOTE(review): the -2 keeps both surpluses >= 2 for non-empty subsets,
    # presumably so `half` above is never 0 (frame[0:-0] would be empty).
    baseline = min(len(chickens),len(humans))-2

    chickens = chickens.sort_values('gender')
    humans = humans.sort_values('gender')

    remD = len(chickens)-int((frac)*baseline)
    remL = len(humans)-int((1-frac)*baseline)
    kept_chickens = keep_middle(chickens, remD)
    kept_humans = keep_middle(humans, remL)

    print(baseline,remD,remL)

    mixed = pd.concat([kept_chickens,kept_humans])
    return mixed.sample(frac=1)

def resample_dataset_race_old(data,frac):
    """Earlier variant of the species resampler, without gender balancing.

    Takes the first ``int(frac * baseline)`` Chicken rows and the first
    ``int((1 - frac) * baseline)`` Human rows, where ``baseline`` is the size
    of the smaller species subset, and returns them in random order.

    Args:
        data: DataFrame with a 'species' column.
        frac: target Chicken share.

    Returns:
        DataFrame with the selected rows, shuffled.
    """
    chicken_rows = data[data['species']=='Chicken']
    human_rows = data[data['species']=='Human']

    baseline = min(len(chicken_rows),len(human_rows))
    n_chicken = int(frac*baseline)
    n_human = int((1-frac)*baseline)

    mixed = pd.concat([chicken_rows[0:n_chicken], human_rows[0:n_human]])
    return mixed.sample(frac=1)

def equalize_dataset_gender(data,frac):
    """Return a shuffled subset of ``data`` with equally many Male and Female rows.

    The first ``n`` rows of each gender are kept, where ``n`` is the size of
    the smaller gender group.

    Args:
        data: DataFrame with a 'gender' column.
        frac: accepted for call compatibility; not used by the function.

    Returns:
        DataFrame with ``2 * n`` rows in random order.
    """
    males = data[data['gender']=='Male']
    females = data[data['gender']=='Female']

    per_gender = int(min(len(males),len(females)))

    balanced = pd.concat([males[0:per_gender], females[0:per_gender]])
    return balanced.sample(frac=1)


#functions for data
def resample_dataset_equal(data):
    """Split ``data`` into gender-balanced Chicken and Human subsets of similar size.

    Each species subset is gender-balanced with ``equalize_dataset_gender``
    and sorted by gender. About ``0.5 * baseline`` rows are then kept per
    species (``baseline`` = size of the smaller balanced subset) by removing
    the surplus in equal numbers from both ends of the sorted frame. When the
    surplus is odd, one row more than the target is kept.

    Args:
        data: DataFrame with 'species' and 'gender' columns.

    Returns:
        Tuple ``(chicken_rows, human_rows)`` of DataFrames, each sorted by
        gender (not shuffled).
    """
    def _keep_middle(frame, surplus):
        # Drop int(surplus / 2) rows from each end. The end index is written
        # out explicitly because frame[half:-half] with half == 0 is
        # frame[0:0], which would discard every row instead of none.
        half = int(0.5*surplus)
        return frame[half:len(frame)-half]

    flagsD = data['species']=='Chicken'
    flagsL = data['species']=='Human'
    data_D = data[flagsD]
    data_L = data[flagsL]

    data_D = equalize_dataset_gender(data_D,0.5)
    data_L = equalize_dataset_gender(data_L,0.5)

    baseline = min(len(data_D),len(data_L))

    data_D = data_D.sort_values('gender')
    data_L = data_L.sort_values('gender')

    remD = len(data_D)-int((0.5)*baseline)
    tempD = _keep_middle(data_D, remD)
    remL = len(data_L)-int((0.5)*baseline)
    tempL = _keep_middle(data_L, remL)

    return tempD,tempL

In [None]:
# Base train/test data: hold out 20% of each species separately, then pool
# the species together again.
train_split_t_h, test_labels_h = train_test_split(train_labels_humans, test_size=0.2)
train_split_t_c, test_labels_c = train_test_split(train_labels_chicken, test_size=0.2)

train_split_t = pd.concat([train_split_t_h, train_split_t_c])

frames = [test_labels_h, test_labels_c]
test_labels = pd.concat(frames)
print(test_labels['gender'].value_counts())

# Gender-balanced test subsets, one per species (D: Chicken, L: Human).
test_split_D, test_split_L = resample_dataset_equal(test_labels)

# Sanity check: composition of each test subset.
for split in (test_split_D, test_split_L):
    for column in ('gender', 'species'):
        print(split[column].value_counts())

In [None]:
# Evaluation loaders, one per species (D: Chicken, L: Human).
# No validation set is built because the dataset is small.
batch_size = 30

# Settings shared by both evaluation loaders; order is kept fixed (no shuffling).
loader_kwargs = dict(batch_size=batch_size, shuffle=False, num_workers=4)

test_data_D = CombinedDataset(test_split_D, data_path, test_transform)
test_loader_D = DataLoader(dataset=test_data_D, **loader_kwargs)

test_data_L = CombinedDataset(test_split_L, data_path, test_transform)
test_loader_L = DataLoader(dataset=test_data_L, **loader_kwargs)

In [None]:
# Use the first CUDA device when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

In [None]:
def test_performance(model,dataL,criterion):

    model.eval()
    model.to(device)

    test_loss = 0
    test_acc = 0
    temp_test_acc = []

    for data, target in dataL:

        data = data.to(device)
        target = target.to(device)

        output = model(data)

        loss = criterion(output, target)
        # update-average-validation-loss 
        test_loss += loss.item() * data.size(0)

        op_temp = output.detach().cpu().numpy()
        op_temp = np.argmax(op_temp,axis=1)

        test_acc += np.mean(op_temp==target.detach().cpu().numpy())*data.size(0)

    ttacc  = test_acc/len(dataL.sampler)
    test_loss_M = test_loss/len(dataL.sampler)
    
    test_print = 'Test Loss: {:.3f} \tTest Acc: {:.3f}'.format(
        test_loss_M, ttacc)

    print(test_print)
    return test_print, ttacc

In [None]:
def write_file(fname,string,act):
    """Write ``string`` plus a trailing newline to ``fname``.

    Args:
        fname: path of the file to write.
        string: text to write (without the newline).
        act: mode passed to ``open`` (e.g. 'a' to append, 'w' to overwrite).
    """
    with open(fname, act) as handle:
        line = string+'\n'
        handle.write(line)

###Combined iteration
def train_model(dark_frac,train_split_t,valid_loader,test_loader_D,test_loader_L):
    """Train a 2-class ResNet-34 on a chicken/human mix and log per-epoch metrics.

    The training table is resampled with ``resample_dataset_race`` so that
    ``dark_frac`` is the target chicken share. Every run starts from the same
    weights (``model_init_2class.pt``) and trains for 20 epochs. Everything is
    written below ``checkpoints/chicken<dark_frac>/``:

    * ``log.txt``: run header, dataset composition and per-epoch metrics
    * ``model_best.pt``: weights of the epoch with the best validation score
    * ``validation_accuracy.npy``, ``test_accuracy_C.npy``,
      ``test_accuracy_H.npy``: per-epoch accuracies

    NOTE(review): the notebook's driver cell passes the chicken test loader as
    ``valid_loader``, so "best" checkpoints are selected on test data.

    Args:
        dark_frac: target chicken share of the training set; also used
            verbatim (via ``str``) in the output directory name.
        train_split_t: DataFrame of candidate training rows
            ('file', 'gender', 'species').
        valid_loader: loader used for per-epoch validation and checkpoint
            selection.
        test_loader_D: loader whose accuracy is saved as ``test_accuracy_C``.
        test_loader_L: loader whose accuracy is saved as ``test_accuracy_H``.

    Returns:
        None. Results are written to disk and printed.
    """
    num_epochs = 20
    num_classes = 2  # gender classes (Male / Female)
    learning_rate = 0.0005

    check_point_dir = 'chicken'+str(dark_frac)
    out_dir = "checkpoints/"+check_point_dir

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
        print("Output directory is created")

    # Start every run with a fresh log file.
    text_path = out_dir+"/"+"log.txt"
    try:
        os.remove(text_path)
    except OSError:
        # Best effort: typically there is just no log from an earlier run.
        pass

    write_file(text_path,'********* Chicken fraction: {} *********'.format(dark_frac),'a')

    train_split = resample_dataset_race(train_split_t,dark_frac)

    # Record the composition of the resampled training set.
    write_file(text_path,str(train_split['species'].value_counts()),'a')
    write_file(text_path,str(train_split['gender'].value_counts()),'a')

    # Dataloaders
    train_data = CombinedDataset(train_split, data_path, train_transform )
    train_loader = DataLoader(dataset = train_data, batch_size = batch_size, shuffle=True, num_workers=4)

    model = models.resnet34(pretrained=False)
    model.fc = nn.Linear(512, num_classes)
    # map_location lets weights that were saved from a GPU load on a
    # CPU-only machine as well.
    model.load_state_dict(torch.load("model_init_2class.pt", map_location=device))
    model.to(device)
    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=learning_rate,
        betas=(0.5, 0.999),
        weight_decay=0.08
        )

    # NOTE(review): T_max (30) is larger than num_epochs (20), so the cosine
    # schedule is cut off before it reaches eta_min. Confirm this is intended.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=30,
        eta_min=0.01 * learning_rate, verbose=True
        )

    # Per-epoch accuracies, saved as .npy files at the end.
    valid_accuracy = []
    test_accuracy_D = []
    test_accuracy_L = []

    print("Training model...")

    # Best validation score so far, kept as the weighted count of correct
    # predictions; the validation set size is the same every epoch, so this
    # ranks epochs exactly like the accuracy does.
    best_val_acc = 0

    for epoch in range(1, num_epochs+1):
        # running sums; turned into per-sample averages after the loops
        train_loss = 0.0
        valid_loss = 0.0

        # ---- training pass ----
        model.train()
        temp_train_acc = 0.0
        for data, target in train_loader:

            data = data.to(device)
            target = target.to(device)

            optimizer.zero_grad()                   # init gradients to zeros
            output = model(data)
            loss = criterion(output, target)        # compute loss
            loss.backward()                         # loss backwards
            optimizer.step()                        # update model params

            train_loss += loss.item() * data.size(0)

            op_temp = output.detach().cpu().numpy()
            op_temp = np.argmax(op_temp,axis=1)

            temp_train_acc += np.mean(op_temp==target.detach().cpu().numpy())*data.size(0)

        # ---- validation pass ----
        model.eval()
        temp_val_acc = 0.0
        # Evaluation only: no_grad stops autograd from recording a graph for
        # each forward pass.
        with torch.no_grad():
            for data, target in valid_loader:

                data = data.to(device)
                target = target.to(device)

                output = model(data)

                loss = criterion(output, target)

                valid_loss += loss.item() * data.size(0)

                op_temp = output.detach().cpu().numpy()
                op_temp = np.argmax(op_temp,axis=1)

                temp_val_acc += np.mean(op_temp==target.detach().cpu().numpy())*data.size(0)

        if temp_val_acc>best_val_acc:
            best_val_acc = temp_val_acc
            torch.save(model.state_dict(), out_dir+"/model_best.pt")
            print('Model saved')
            write_file(text_path,'Model saved','a')

        # per-sample averages for this epoch
        train_loss = train_loss/len(train_loader.sampler)
        valid_loss = valid_loss/len(valid_loader.sampler)

        ttacc  = temp_train_acc/len(train_loader.sampler)
        tvacc  = temp_val_acc/len(valid_loader.sampler)

        scheduler.step()

        train_print = 'Epoch: {} \tTr Loss: {:.3f} \tTr Acc: {:.3f} \tVal Loss: {:.3f} \tVal Acc: {:.3f}'.format(
            epoch, train_loss, ttacc, valid_loss, tvacc)
        print(train_print)

        # test_performance leaves the model in eval mode; the next epoch
        # switches back with model.train().
        test_print_D, ttacc_D = test_performance(model,test_loader_D,criterion)
        test_print_L, ttacc_L = test_performance(model,test_loader_L,criterion)

        valid_accuracy.append(tvacc)
        test_accuracy_D.append(ttacc_D)
        test_accuracy_L.append(ttacc_L)

        write_file(text_path,train_print,'a')
        write_file(text_path,test_print_D,'a')
        write_file(text_path,test_print_L,'a')

    # np.save appends the '.npy' extension to these paths.
    path_val = out_dir+"/"+"validation_accuracy"
    path_D = out_dir+"/"+"test_accuracy_C"
    path_L = out_dir+"/"+"test_accuracy_H"
    np.save(path_val, np.array(valid_accuracy))
    np.save(path_D, np.array(test_accuracy_D))
    np.save(path_L, np.array(test_accuracy_L))
    


In [None]:
# Sweep the target chicken share of the training set over 11 evenly spaced
# values from 0.0 to 1.0, training one model per value.
dark_fracs = np.linspace(0.0,1.0,11)
for dark_frac in dark_fracs:
    banner = '********* Chicken fraction: {} *********'.format(dark_frac)
    print(banner)
    train_model(
        dark_frac=dark_frac,
        train_split_t=train_split_t,
        valid_loader=test_loader_D,   # the chicken test loader doubles as the validation loader
        test_loader_D=test_loader_D,
        test_loader_L=test_loader_L,
    )