In [None]:
## Adult dataset: how the male/female mix of the training set affects per-group test accuracy

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import random
from torch.utils.data import Dataset, DataLoader
import matplotlib.image as img
from sklearn.model_selection import train_test_split

import torchvision.models as models

from sklearn import preprocessing



In [None]:
# Seed every random number generator used by the notebook (Python, NumPy,
# PyTorch CPU and, when present, all CUDA devices) with the same value.
sd = 0
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(sd)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(sd)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Column names assigned, in order, to the columns of the CSV (which is read
# with header=None, i.e. without a header row).
header = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'salary',
]

# skipinitialspace drops the blank that follows each delimiter;
# index_col=False stops pandas from using the first column as the index.
df_temp = pd.read_csv(
    "data/adults.csv",
    names=header,
    header=None,
    skipinitialspace=True,
    index_col=False,
)
df = df_temp

In [None]:
## Treat the '?' placeholder as missing, then drop every row that has a missing value
df = df.replace('?', np.nan).dropna()

In [None]:
## 'education-num' is removed because the 'education' column is kept instead
df = df.drop(columns=['education-num'])

In [None]:
#Convert categoricals to numerical

def convert_to_int(columns, data=None):
  """Label-encode the given columns in place.

  Each distinct value of a column is replaced by its 0-based position in
  ``Series.unique()`` (order of first appearance), so the resulting codes
  depend on the row order of the frame.

  Args:
    columns: iterable of column names to encode.
    data: DataFrame to modify in place. When omitted, the notebook-level
      ``df`` is used, so existing ``convert_to_int(cols)`` calls behave
      exactly as before.

  Prints "<column> done!" after each column. Returns None.
  """
  if data is None:
    # Fall back to the notebook-level frame (looked up at call time).
    data = df
  for column in columns:
    codes = {val: indx for indx, val in enumerate(data[column].unique().tolist())}
    data[column] = data[column].map(codes).astype(int)
    print(column + " done!")

def convert_to_onehot(data,columns):
  """Return a new frame in which ``columns`` are replaced by 0/1 indicator columns.

  Args:
    data: source DataFrame (not modified).
    columns: list of column names to one-hot encode.

  Returns:
    DataFrame with the remaining original columns first, followed by the
    ``<column>_<value>`` indicator columns produced by ``pd.get_dummies``.
  """
  # Pin an integer dtype: pandas >= 2.0 emits bool indicators by default, and
  # a frame mixing bool with float columns turns into an object array under
  # ``.values``, which torch.tensor() cannot convert.
  dummies = pd.get_dummies(data[columns], dtype='uint8')
  remaining = data.drop(columns, axis=1)
  return pd.concat([remaining, dummies], axis=1)

# Categorical feature columns to one-hot encode; 'race' is encoded by its own
# call at the end of this cell.
categorical_columns = ['workclass','education','marital-status','occupation','relationship','sex','native-country']#,'race'
label_column = ['salary']

convert_to_int(label_column)
# The original call to show_unique_values() referenced a function that is not
# defined anywhere in this notebook (NameError on Restart & Run All); print the
# encoded label values directly instead.
for column in label_column:
  print(column, df[column].unique())

df = convert_to_onehot(df,categorical_columns)
df = convert_to_onehot(df,['race'])

In [None]:
#Normalize columns
# Numeric feature columns that get standardised.
normalize_columns = ['age', 'fnlwgt', 'capital-gain','capital-loss','hours-per-week']

def normalize(columns):
  """Standardise ``columns`` of the notebook-level ``df`` in place.

  A fresh ``StandardScaler`` is fitted on ``df[columns]`` and the scaled
  values are written back into the same columns. Returns None.
  """
  # NOTE(review): the scaler is fitted on the whole frame, i.e. before the
  # train/validation/test split performed later in the notebook - confirm
  # that this is intended.
  df[columns] = preprocessing.StandardScaler().fit_transform(df[columns])

normalize(normalize_columns)


In [None]:
class AdultsDataset(Dataset):
    """Map-style dataset pairing each feature row with its label.

    ``data`` and ``labels`` must expose ``.values`` (in this notebook a pandas
    DataFrame and Series); the underlying arrays are stored as ``self.data``
    and ``self.label``.
    """

    def __init__(self, data,labels):
        super().__init__()
        self.data, self.label = data.values, labels.values

    def __len__(self):
        """Number of rows in the feature array."""
        return len(self.data)

    def __getitem__(self,index):
        """Return ``(features, label)`` for ``index`` as freshly built tensors."""
        features = self.data[index]
        target = self.label[index]
        return torch.tensor(features), torch.tensor(target)


In [None]:
#functions for resampling data
def resample_dataset_race(data,frac):
    """Build a shuffled training frame with a chosen male/female mix.

    Despite the name, rows are grouped by the one-hot ``sex_Male`` /
    ``sex_Female`` columns.

    Steps:
      1. Split ``data`` by sex and salary-balance each group with
         ``equalize_dataset_gender``.
      2. ``baseline`` = size of the smaller balanced group, minus 2.
      3. Sort each group by ``salary`` and trim the same number of rows from
         both ends, leaving roughly ``frac * baseline`` male rows and
         ``(1 - frac) * baseline`` female rows.
      4. Concatenate both groups and shuffle.

    Prints ``baseline`` and the number of rows removed from each group.

    Args:
        data: DataFrame with ``salary``, ``sex_Male`` and ``sex_Female`` columns.
        frac: male share of the result.

    Returns:
        The resampled, shuffled DataFrame.
    """
    males = equalize_dataset_gender(data[data['sex_Male']==1],0.5)
    females = equalize_dataset_gender(data[data['sex_Female']==1],0.5)

    # With frac in [0, 1] and non-empty groups, the -2 means at least two rows
    # are trimmed from each group, so the half-widths below never reach 0
    # (a [0:-0] slice would be empty). Presumably that is why it is there.
    baseline = min(len(males),len(females))-2

    males = males.sort_values('salary')
    females = females.sort_values('salary')

    remD = len(males)-int((frac)*baseline)
    remL = len(females)-int((1-frac)*baseline)
    cutD = int(0.5*remD)
    cutL = int(0.5*remL)

    print(baseline,remD,remL)

    # Trimming both ends of a salary-sorted group removes rows from the low
    # and the high salary class alike.
    mixed = pd.concat([males[cutD:-cutD], females[cutL:-cutL]])
    return mixed.sample(frac=1)

def resample_dataset_race_old(data,frac):
    """Earlier resampler: take leading rows of each sex group, then shuffle.

    Rows are grouped by ``sex_Male`` / ``sex_Female`` (not by race, despite
    the name). With ``baseline`` the size of the smaller group, the first
    ``int(frac * baseline)`` male rows and the first
    ``int((1 - frac) * baseline)`` female rows are kept in their existing
    order, concatenated and shuffled. No salary balancing is done here.
    No call to this function appears in this notebook.

    Args:
        data: DataFrame with ``sex_Male`` and ``sex_Female`` columns.
        frac: male share of the result.

    Returns:
        The resampled, shuffled DataFrame.
    """
    males = data[data['sex_Male']==1]
    females = data[data['sex_Female']==1]

    baseline = min(len(males),len(females))
    n_male = int(frac*baseline)
    n_female = int((1-frac)*baseline)

    picked = pd.concat([males[0:n_male], females[0:n_female]])
    return picked.sample(frac=1)

def equalize_dataset_gender(data,frac):
    """Balance the two ``salary`` classes by truncating the larger one.

    Despite the name, the balancing is over ``salary`` (0 vs 1), not gender.
    The first ``baseline`` rows of each class are kept, where ``baseline`` is
    the size of the smaller class; the result is concatenated and shuffled.

    Args:
        data: DataFrame with a ``salary`` column holding 0/1.
        frac: accepted for signature compatibility but never used.

    Returns:
        The salary-balanced, shuffled DataFrame.
    """
    low_salary = data[data['salary']==0]
    high_salary = data[data['salary']==1]
    baseline = min(len(low_salary),len(high_salary))

    balanced = pd.concat([low_salary[0:int(baseline)], high_salary[0:int(baseline)]])
    return balanced.sample(frac=1)


#functions for building the salary-balanced male/female evaluation splits
def resample_dataset_equal(data):
    """Split ``data`` into equally sized, salary-balanced male and female frames.

    Each sex group (``sex_Male`` / ``sex_Female`` == 1) is salary-balanced with
    ``equalize_dataset_gender``, sorted by ``salary`` and then trimmed from
    both ends until roughly ``0.5 * baseline`` rows remain, where ``baseline``
    is the size of the smaller balanced group.

    Args:
        data: DataFrame with ``salary``, ``sex_Male`` and ``sex_Female`` columns.

    Returns:
        ``(male_frame, female_frame)``, both sorted by ``salary`` (not shuffled).
    """
    data_D = equalize_dataset_gender(data[data['sex_Male']==1],0.5)
    data_L = equalize_dataset_gender(data[data['sex_Female']==1],0.5)

    baseline = min(len(data_D),len(data_L))

    def _centre_slice(group):
        # Trim the same number of rows from both ends of the sorted group.
        rem = len(group)-int((0.5)*baseline)
        cut = int(0.5*rem)
        # Explicit stop index: group[cut:-cut] collapses to an empty frame
        # when cut == 0, because -0 == 0. Identical to the old slice otherwise.
        return group[cut:len(group)-cut]

    tempD = _centre_slice(data_D.sort_values('salary'))
    tempL = _centre_slice(data_L.sort_values('salary'))

    return tempD,tempL

In [None]:
# Alias, not a copy: train_labels is the same DataFrame object as df (the
# frame produced by the preprocessing cells above, features plus 'salary').
train_labels = df

In [None]:

batch_size = 30

# Base partition: 80% training pool / 20% hold-out; the hold-out is then
# divided into 4% validation pool and 96% test pool.
train_split_t, val_split_t_dash = train_test_split(train_labels, test_size=0.2)
val_split_t, test_labels = train_test_split(val_split_t_dash, test_size=0.96)

# Salary-balance each sex group of the validation pool. The female group is
# processed first; the call order is kept because every shuffle draws from the
# global random state.
flagsL = val_split_t['sex_Female']==1#'White'
flagsD = val_split_t['sex_Male']==1#'Black'
data_L = equalize_dataset_gender(val_split_t[flagsL],0.5)
data_D = equalize_dataset_gender(val_split_t[flagsD],0.5)

# NOTE(review): only the female group ends up in the validation set; data_D is
# computed but never used. Confirm whether a male+female concat was intended.
val_split = data_L.sample(frac=1)

# Equally sized, salary-balanced male (D) and female (L) test frames.
test_split_D,test_split_L = resample_dataset_equal(test_labels)

# Report the class / sex composition of every evaluation split ...
sex_columns = ['sex_Male', 'sex_Female']
for split in (val_split, test_split_D, test_split_L):
    for column in ['salary'] + sex_columns:
        print(split[column].value_counts())

# ... then remove the sex indicators so they are not fed to the model.
val_split = val_split.drop(columns=sex_columns)
test_split_D = test_split_D.drop(columns=sex_columns)
test_split_L = test_split_L.drop(columns=sex_columns)

for split in (val_split, test_split_D, test_split_L):
    print(split.shape)

#get the inputs and targets
val_x_data, val_y_labels = val_split.drop('salary',axis=1), val_split['salary']
test_split_D_x_data, test_split_D_y_labels = test_split_D.drop('salary',axis=1), test_split_D['salary']
test_split_L_x_data, test_split_L_y_labels = test_split_L.drop('salary',axis=1), test_split_L['salary']

#datasets and (unshuffled) dataloaders
valid_data = AdultsDataset(val_x_data,val_y_labels)
test_data_D = AdultsDataset(test_split_D_x_data,test_split_D_y_labels)
test_data_L = AdultsDataset(test_split_L_x_data,test_split_L_y_labels)

valid_loader = DataLoader(dataset=valid_data, batch_size=batch_size, shuffle=False, num_workers=0)
test_loader_D = DataLoader(dataset=test_data_D, batch_size=batch_size, shuffle=False, num_workers=0)
test_loader_L = DataLoader(dataset=test_data_L, batch_size=batch_size, shuffle=False, num_workers=0)

In [None]:
#Model

def act(x):
    """Hidden-layer activation used by ``Network`` (ReLU)."""
    return F.relu(x)

class Network(nn.Module):
    """Fully connected classifier with three hidden layers.

    Layer attribute names (``fc1``, ``fc2``, ``fc3``, ``fcLast``) are part of
    the checkpoint format and must not change.

    Args:
        in_features: number of input features (default 101, the value that
            was previously hard-coded).
        hidden: width of each hidden layer (default 50).
        num_classes: number of output units (default 2).
        apply_sigmoid: when True (default, the original behavior) the output
            layer is passed through a sigmoid. ``nn.CrossEntropyLoss`` expects
            unnormalised logits, so pass False to feed it raw logits instead.
            NOTE(review): the default is kept so existing runs stay
            reproducible; confirm whether the sigmoid is intended.
    """

    def __init__(self, in_features=101, hidden=50, num_classes=2, apply_sigmoid=True):
        super().__init__()

        # Plain attribute (not a parameter/buffer), so state_dict is unchanged.
        self.apply_sigmoid = apply_sigmoid

        # Creation order is kept so seeded initialisation matches the original.
        self.fc1 = nn.Linear(in_features, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, hidden)
        self.fcLast = nn.Linear(hidden, num_classes)

    def forward(self,x):
        """Map a ``(batch, in_features)`` float tensor to ``(batch, num_classes)`` scores."""
        for layer in (self.fc1, self.fc2, self.fc3):
            x = act(layer(x))
        x = self.fcLast(x)

        if self.apply_sigmoid:
            x = torch.sigmoid(x)
        return x

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

In [None]:
def test_performance(model,dataL,criterion):
    """Evaluate ``model`` on every batch of ``dataL`` and report loss and accuracy.

    The model is switched to eval mode and moved to the notebook-level
    ``device``; it is left in eval mode on return.

    Args:
        model: network whose output has one score per class along dim 1.
        dataL: DataLoader yielding ``(features, target)`` batches.
        criterion: loss called as ``criterion(output, target)``.

    Returns:
        ``(summary, accuracy)``: the formatted "Test Loss / Test Acc" line
        (also printed) and the sample-weighted accuracy as a float.

    Raises:
        ZeroDivisionError: if ``dataL.sampler`` is empty.
    """
    model.eval()
    model.to(device)

    test_loss = 0
    test_acc = 0

    # Evaluation only: skip autograd bookkeeping.
    with torch.no_grad():
        for data, target in dataL:

            data = data.to(device).float()
            target = target.to(device)

            output = model(data)

            loss = criterion(output, target)
            # Weight the batch loss by batch size so the final division by
            # the sample count yields a per-sample average.
            test_loss += loss.item() * data.size(0)

            op_temp = np.argmax(output.cpu().numpy(),axis=1)

            # mean * batch size == number of correct predictions in the batch
            test_acc += np.mean(op_temp==target.cpu().numpy())*data.size(0)

    n_samples = len(dataL.sampler)
    ttacc  = test_acc/n_samples
    test_loss_M = test_loss/n_samples

    test_print = 'Test Loss: {:.3f} \tTest Acc: {:.3f}'.format(
        test_loss_M, ttacc)

    print(test_print)
    return test_print, ttacc

In [None]:
def write_file(fname,string,act):
    """Write ``string`` followed by a newline to ``fname``.

    Args:
        fname: path of the file to open.
        string: text to write (a ``str``).
        act: mode passed to ``open`` (e.g. ``'a'`` to append, ``'w'`` to truncate).
    """
    with open(fname, act) as handle:
        handle.writelines((string, '\n'))

###Combined iteration
def train_model(dark_frac,train_split_t,valid_loader,test_loader_D,test_loader_L):
    """Train one ``Network`` on a training set with the given male fraction.

    The training frame is resampled with ``resample_dataset_race`` so that
    ``dark_frac`` is the male share, the sex indicator columns are dropped,
    and the model is trained for 250 epochs starting from the weights stored
    in ``model2_init_2class.pt``. After every epoch the model is evaluated on
    the validation loader and on both test loaders.

    Args:
        dark_frac: male fraction of the resampled training set; also used in
            the output directory name ``checkpoints/male<dark_frac>``.
        train_split_t: training DataFrame containing ``salary``, ``sex_Male``
            and ``sex_Female`` columns.
        valid_loader: DataLoader used for model selection.
        test_loader_D: test DataLoader whose accuracy curve is saved as
            ``test_accuracy_M``.
        test_loader_L: test DataLoader whose accuracy curve is saved as
            ``test_accuracy_F``.

    Side effects:
        Creates the output directory, rewrites ``log.txt`` in it, saves
        ``model_best.pt`` whenever the validation score improves, and saves
        three ``.npy`` accuracy curves. Reads the notebook-level
        ``batch_size`` and ``device``.

    Returns:
        None.
    """
    num_epochs = 250
    learning_rate = 0.0005

    check_point_dir = 'male'+str(dark_frac)
    out_dir = "checkpoints/"+check_point_dir

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
        print("Output directory is created")

    #make logger text file
    text_path = out_dir+"/"+"log.txt"
    try:
        os.remove(text_path)
    except OSError:
        # Best effort: there is nothing to remove on the first run.
        pass

    write_file(text_path,'********* Male fraction: {} *********'.format(dark_frac),'a')

    train_split = resample_dataset_race(train_split_t,dark_frac)

    # Log the composition of the resampled training set.
    write_file(text_path,str(train_split['sex_Male'].value_counts()),'a')
    write_file(text_path,str(train_split['sex_Female'].value_counts()),'a')
    write_file(text_path,str(train_split['salary'].value_counts()),'a')

    # The sex indicators are only used for resampling, not as model inputs.
    train_split = train_split.drop('sex_Male',axis=1)
    train_split = train_split.drop('sex_Female',axis=1)

    train_split_x_data = train_split.drop('salary',axis=1)
    train_split_y_labels = train_split['salary']

    #Dataloaders
    train_data = AdultsDataset(train_split_x_data,train_split_y_labels)

    train_loader = DataLoader(dataset = train_data, batch_size = batch_size, shuffle=True, num_workers=0)

    model = Network()
    # map_location lets a checkpoint saved from a GPU load on a CPU-only host.
    model.load_state_dict(torch.load("model2_init_2class.pt", map_location=device))
    model.to(device)
    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=learning_rate,
        betas=(0.5, 0.999),
        weight_decay=0.08
        )

    # NOTE(review): `verbose` is reported as deprecated by recent PyTorch
    # releases - confirm against the installed version.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=30,
        eta_min=0.01 * learning_rate, verbose=True
        )

    # Per-epoch accuracy curves written to disk at the end.
    valid_accuracy = []
    test_accuracy_D = []
    test_accuracy_L = []

    print("Training model...")

    # Best validation score so far, kept as a count of correct predictions.
    best_val_acc = 0

    for epoch in range(1, num_epochs+1):
        # keep track of train/val loss
        train_loss = 0.0
        valid_loss = 0.0

        # training the model
        model.train()
        temp_train_acc = 0.0
        for data, target in train_loader:

            data = data.to(device).float()
            target = target.to(device)

            optimizer.zero_grad()                   # init gradients to zeros
            output = model(data)
            loss = criterion(output, target)        # compute loss
            loss.backward()                         # loss backwards
            optimizer.step()                        # update model params

            train_loss += loss.item() * data.size(0)

            op_temp = output.detach().cpu().numpy()
            op_temp = np.argmax(op_temp,axis=1)

            # mean * batch size == number of correct predictions in the batch
            temp_train_acc += np.mean(op_temp==target.detach().cpu().numpy())*data.size(0)

        # validate-the-model (no gradients needed)
        model.eval()
        temp_val_acc = 0.0
        with torch.no_grad():
            for data, target in valid_loader:

                data = data.to(device).float()
                target = target.to(device)

                output = model(data)

                loss = criterion(output, target)

                # update-average-validation-loss
                valid_loss += loss.item() * data.size(0)

                op_temp = np.argmax(output.cpu().numpy(),axis=1)

                temp_val_acc += np.mean(op_temp==target.cpu().numpy())*data.size(0)

        # Model selection compares raw correct counts; the validation set size
        # is the same every epoch.
        if temp_val_acc>best_val_acc:
            best_val_acc = temp_val_acc
            torch.save(model.state_dict(), out_dir+"/model_best.pt")
            print('Model saved')
            write_file(text_path,'Model saved','a')

        # calculate-average-losses
        train_loss = train_loss/len(train_loader.sampler)
        valid_loss = valid_loss/len(valid_loader.sampler)

        ttacc  = temp_train_acc/len(train_loader.sampler)
        tvacc  = temp_val_acc/len(valid_loader.sampler)

        scheduler.step()

        # print-training/validation-statistics
        train_print = 'Epoch: {} \tTr Loss: {:.3f} \tTr Acc: {:.3f} \tVal Loss: {:.3f} \tVal Acc: {:.3f}'.format(
            epoch, train_loss, ttacc, valid_loss, tvacc)
        print(train_print)

        test_print_D, ttacc_D = test_performance(model,test_loader_D,criterion)
        test_print_L, ttacc_L = test_performance(model,test_loader_L,criterion)

        valid_accuracy.append(tvacc)
        test_accuracy_D.append(ttacc_D)
        test_accuracy_L.append(ttacc_L)

        write_file(text_path,train_print,'a')
        write_file(text_path,test_print_D,'a')
        write_file(text_path,test_print_L,'a')

    # np.save appends the ".npy" extension to these paths.
    path_val = out_dir+"/"+"validation_accuracy"
    path_D = out_dir+"/"+"test_accuracy_M"
    path_L = out_dir+"/"+"test_accuracy_F"
    np.save(path_val, np.array(valid_accuracy))
    np.save(path_D, np.array(test_accuracy_D))
    np.save(path_L, np.array(test_accuracy_L))
    


In [None]:
# Sweep the male share of the training set over 11 evenly spaced values from
# 0.0 to 1.0, training one model per value.
dark_fracs = np.linspace(0.0, 1.0, 11)
banner = '********* Male fraction: {} *********'
for dark_frac in dark_fracs:
    print(banner.format(dark_frac))
    train_model(dark_frac, train_split_t, valid_loader, test_loader_D, test_loader_L)