In [1]:
# Path to the MDS dataset
dataset_path='../datasets/MDS/processed_acl/' 

#### all_domains = ['books', 'dvd', 'electronics', 'kitchen']
#### src_domain and trg_domain can be any of the above domains.
s_domain = 'books' #source domain
t_domain = 'kitchen' #target domain

In [2]:
import sys,os
#### Change "0" to GPU device number you want to use if you have multiple GPU devices
# os.environ['CUDA_VISIBLE_DEVICES'] ="1" 
import time
import numpy as np
import torch
from aux_functions import get_all_docs, preproces_datasets, SimpleModel, DataHandler, Solver

%load_ext autoreload
%autoreload 2

# Reading and preprocessing data

In [3]:
documents = get_all_docs(dataset_path)
x_source,y_source,x_target,y_target,x_test,y_test = preproces_datasets(s_domain,t_domain,documents)

1000 documents read for domain books in file positive.review
1000 documents read for domain books in file negative.review
4465 documents read for domain books in file unlabeled.review
1000 documents read for domain dvd in file positive.review
1000 documents read for domain dvd in file negative.review
3586 documents read for domain dvd in file unlabeled.review
1000 documents read for domain electronics in file positive.review
1000 documents read for domain electronics in file negative.review
5681 documents read for domain electronics in file unlabeled.review
1000 documents read for domain kitchen in file positive.review
1000 documents read for domain kitchen in file negative.review
5945 documents read for domain kitchen in file unlabeled.review
source_docs.shape (2000,) source_labels.shape (2000,) target_docs.shape (7945,) target_labels.shape (7945,)
Shapes after unifying features:
source.X.shape (2000, 58402) target.X.shape (7945, 58402)
Shapes of the inputs that will be fed to the cla

# Training a model on the src domain

In [4]:
net = SimpleModel(input_size=x_source.shape[1])
if torch.cuda.is_available():
    net.cuda()

base_lr = 0.0001
optimizer = torch.optim.Adam(net.parameters(),lr=base_lr)
tsolver = Solver(optimizer,net,base_lr)

dhandler = DataHandler(x_source,y_source,None,batch_size=128,shuffle=True)

In [5]:
def train_model_on_src(nb_iters,dhandler,tsolver):
    st = time.time()
    loss = 0
    for i in range(nb_iters):
        x_batch,y_batch = dhandler.next_batch()
        x_batch,y_batch = x_batch.cuda(),y_batch.cuda()
        # update learning rate
        if i%5==0:
            tsolver.iters+=1
            tsolver.update_lr()

        tsolver.net.train()
        tsolver.net.zero_grad()

        source_logits, _ = tsolver.net(x_batch)
        y_onehot = torch.FloatTensor(len(y_batch), 2).cuda()
        y_onehot.zero_()
        y_onehot.scatter_(1, y_batch.view(-1,1), 1)
        mse_loss = torch.sum((source_logits - y_onehot)**2,dim=1)
        mse_loss = torch.mean(mse_loss)
        mse_loss.backward()
        loss += mse_loss

        # update the network
        tsolver.optimizer.step()
        if i % 50 == 49 or i == nb_iters - 1:

            remained_iters = nb_iters - i
            passed_time = time.time() - st
            ETA = int(passed_time * remained_iters / i)
            ETA_min, ETA_sec = ETA // 60, ETA % 60
            mean_loss = loss/i
            print ('\r' + \
                  ' iter: ' + str(i + 1) + '/' + str(nb_iters) + \
                  ' ETA: ' + str(ETA_min) + ':' + "{0:02d}".format(ETA_sec) + \
                  ' loss: ' + "{0:0.4f}".format(mean_loss),end=" ")
            sys.stdout.flush()
    print(' ')



In [6]:
def get_outputs(x,y,tsolver,ret_logits=False):
    with torch.no_grad():
        tsolver.net.eval()
        batch_size = 128
        test_handler = DataHandler(x,y,None,batch_size,shuffle=False)
        total_batch = test_handler.len//batch_size
        if test_handler.len%batch_size!=0:
            total_batch+=1

        Preds = np.zeros(total_batch*batch_size)
        Labels = np.zeros(total_batch*batch_size)
        Logits = torch.zeros((total_batch*batch_size,2))
        for i in range(total_batch):
            x_batch,y_batch = test_handler.next_batch()
            x_batch,y_batch = x_batch.cuda(),y_batch.cuda()
            pred_logits,pred_probs = tsolver.net(x_batch)
            Preds[i*batch_size:(i+1)*batch_size] = pred_probs.cpu().numpy().argmax(axis=1)
            if ret_logits:
                Logits[i*batch_size:(i+1)*batch_size] = pred_logits.cpu()
            Labels[i*batch_size:(i+1)*batch_size] = y_batch.cpu().numpy()
        if ret_logits:
            Logits = Logits[:len(x)]
            return Logits
        Preds = Preds[:len(x)]
        Labels = Labels[:len(x)]
        return Preds,Labels

In [7]:
def test(x,y,tsolver):
    Preds,Labels = get_outputs(x,y,tsolver)
    return np.sum(Preds==Labels)/len(Preds)


In [8]:
for i in range(10):
    train_model_on_src(100,dhandler,tsolver)
    
current_acc = test(x_source,y_source,tsolver)
print('accuracy on source: ',current_acc)
current_acc = test(x_target,y_target,tsolver)
print('accuracy on target: ',current_acc)
only_src_acc = test(x_test,y_test,tsolver)
print('accuracy on test: ',only_src_acc) ### This is the accuracy model gets on the reviews that are names "unlabaled" in target domain

 iter: 100/100 ETA: 0:00 loss: 0.3924  
 iter: 100/100 ETA: 0:00 loss: 0.0016  
 iter: 100/100 ETA: 0:00 loss: 0.0001  
 iter: 100/100 ETA: 0:00 loss: 0.0000  
 iter: 100/100 ETA: 0:00 loss: 0.0000  
 iter: 100/100 ETA: 0:00 loss: 0.0000  
 iter: 100/100 ETA: 0:00 loss: 0.0000  
 iter: 100/100 ETA: 0:00 loss: 0.0000  
 iter: 100/100 ETA: 0:00 loss: 0.0000  
 iter: 100/100 ETA: 0:00 loss: 0.0000  
accuracy on source:  1.0
accuracy on target:  0.7524229074889868
accuracy on test:  0.7530698065601346


# Domain Adaptation with PPPL

In [9]:
base_lr = 0.0001*0.25
optimizer = torch.optim.Adam(net.parameters(),lr=base_lr)
tsolver = Solver(optimizer,net,base_lr)

In [10]:
trg_gts = y_target
trg_data = x_target
src_gts = y_source
src_data = x_source
t_labels = np.array(trg_gts)

n_classes = 2
trg_cp = np.zeros(n_classes)
for i in range(n_classes):
    trg_cp[i] = np.sum(t_labels==i)/len(t_labels)


In [11]:
def adjust_pseudo_labels(trg_probs_np,trg_cp):
    n_classes = 2
    pseudo_labels = trg_probs_np.argmax(axis=1)
    current_cp = np.zeros(n_classes)
    for c in range(n_classes):
        current_cp[c] = np.sum(pseudo_labels==c)/len(trg_probs_np)

    diff_class_rates =  current_cp - trg_cp
    for i in range(len(diff_class_rates)):
        if diff_class_rates[i]<=0:
            continue
        predicted_as_c = pseudo_labels==i
        current_class = i
        current_diff = diff_class_rates[i]
        current_num = np.round(current_diff*len(trg_probs_np)).astype(np.int32)

        current_probs = trg_probs_np[pseudo_labels==current_class]
        current_probs_sorted = np.sort(current_probs,axis=1)
        current_certainty_scores = current_probs_sorted[:,-1] - current_probs_sorted[:,-2]
        
        current_certainty_scores_sorted_inds = np.argsort(current_certainty_scores)
        y_val = np.ones(len(current_certainty_scores))*current_class
        for i in range(current_num):
            y_val[i]=-1
        temp_pl = np.zeros(len(current_certainty_scores))
        temp_pl[current_certainty_scores_sorted_inds] = y_val
        pseudo_labels[predicted_as_c] = temp_pl
    
    return pseudo_labels

In [12]:
def train_model_with_weights(nb_epochs,dhandler,tsolver):
    total_batch = dhandler.len // dhandler.batch_size
    if dhandler.len % dhandler.batch_size != 0:
        total_batch += 1
    st = time.time()
    for ep in range(nb_epochs):
        loss = 0
        for i in range(total_batch):
            x_batch,y_batch,w_batch = dhandler.next_batch()
            x_batch,y_batch,w_batch = x_batch.cuda(),y_batch.cuda(),w_batch.cuda()
            if i%5==0:
                tsolver.iters+=1
                tsolver.update_lr()
            tsolver.net.train()
            tsolver.net.zero_grad()
            source_logits, _ = tsolver.net(x_batch)

            y_onehot = torch.FloatTensor(len(y_batch), 2).cuda()
            y_onehot.zero_()
            y_onehot.scatter_(1, y_batch.view(-1,1), 1)
            mse_loss = torch.sum((source_logits - y_onehot)**2,dim=1)*w_batch
            mse_loss = torch.mean(mse_loss)
            mse_loss.backward()
            loss += mse_loss

            # update the network
            tsolver.optimizer.step()
        passed_time = time.time() - st
        remained_epochs = nb_epochs - ep
        ETA = int(passed_time * remained_epochs)
        ETA_min, ETA_sec = ETA // 60, ETA % 60
        print ('\r' + 'epoch: ' + str(ep + 1) + '/' + str(nb_epochs) + \
                      ' ETA: ' + str(ETA_min) + ':' + "{0:02d}".format(ETA_sec) + \
                      ' loss: ' + "{0:0.4f}".format(loss/total_batch),end=" ")
        sys.stdout.flush()
    print(' ')



In [16]:
def DA(tsolver,src_data,src_gts,trg_data,trg_cp,x_test,y_test):
    begin_time = time.time()
    inner_loop_size = 1
    trg_gts_unreal = torch.zeros(len(trg_data))
    weights_src = torch.ones(len(src_gts))
    
    for nnn in range(0,90,2):
        if nnn>=88:
            inner_loop_size=10
        for j in range(inner_loop_size):
            print ('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ i:',nnn//2 + 1,'j:',j,
               'Elapsed Time(m): {0:0.2f}'.format((time.time()-begin_time)/60))
            

            #### Get scores on the target domain
            trg_scores = get_outputs(trg_data,trg_gts_unreal,tsolver,ret_logits=True)
            trg_scores_np = trg_scores.cpu().numpy()
            
            #### Calculate pseudo-labels of the target domain
            trg_pseudo_labels = trg_scores_np.argmax(axis=1)
            
            ### Calculate the certainty scores for target samples
            trg_scores_np_sorted = np.sort(trg_scores_np,axis=1)
            certainty_scores = trg_scores_np_sorted[:,-1] - trg_scores_np_sorted[:,-2]
            
            ### Calculate weight for the target samples
            weights_trg = np.zeros(len(certainty_scores))
            for c in range(n_classes):
                predicted_as_c = trg_pseudo_labels==c
                size_c = np.sum(predicted_as_c)
                if size_c>1:
                    left_size = int(np.ceil(((nnn+1)*0.01+0.1)*size_c))
                    x_val_left = 1+(10/2 - 1)/left_size*(np.arange(left_size))
                    right_size = size_c - left_size
                    x_val_right = 10000*(np.arange(1,right_size+1))
                    x_val = np.concatenate((x_val_left,x_val_right))
                    y_val = np.power(x_val,-1)
                    y_val = y_val[::-1]

                    cs_c = certainty_scores[predicted_as_c]
                    cs_c_sorted_inds = np.argsort(cs_c)
                    weights_trg2 = np.zeros(len(cs_c))
                    weights_trg2[cs_c_sorted_inds] = y_val
                    weights_trg[predicted_as_c] = weights_trg2
                    
                    
            ### Exclude
            trg_pseudo_labels_adjusted = adjust_pseudo_labels(np.copy(trg_scores_np),trg_cp)
            coef = (trg_pseudo_labels==trg_pseudo_labels_adjusted)*1
            weights_trg*=coef
            weights_trg = weights_trg.astype(np.float32)
            weights_trg = torch.tensor(weights_trg)


            trg_pseudo_labels = trg_pseudo_labels.astype(np.int32)
            trg_pseudo_labels = torch.tensor(trg_pseudo_labels,dtype=torch.long)
            
            #### Randomly select some samples from the source domain
            p = np.random.permutation(len(src_data))
            p = p[:len(trg_data)*2]
            x_temp = src_data[p]
            y_temp = src_gts[p]
            w_temp = weights_src[:len(trg_data)*2]
            
            #### Train Model
            m1 = torch.cat((x_temp,trg_data))
            m2 = torch.cat((y_temp,trg_pseudo_labels))
            m3 = torch.cat((w_temp,weights_trg))
            DA_dhandler = DataHandler(m1, m2, m3, 64,shuffle=True)
            ep = 1
            train_model_with_weights(ep,DA_dhandler,tsolver)

            
            with torch.no_grad():
                current_acc = test(x_test,y_test,tsolver)
                print('current accuracy on test set: ',current_acc)


In [14]:
DA(tsolver,src_data,src_gts,trg_data,trg_cp,x_test,y_test)

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ i: 1 j: 0 Elapsed Time(m): 0.00
epoch: 1/1 ETA: 0:02 loss: 0.0005  
current accuracy on test set:  0.7720773759461732
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ i: 2 j: 0 Elapsed Time(m): 0.09
epoch: 1/1 ETA: 0:02 loss: 0.0003  
current accuracy on test set:  0.7826745164003364
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ i: 3 j: 0 Elapsed Time(m): 0.18
epoch: 1/1 ETA: 0:02 loss: 0.0002  
current accuracy on test set:  0.7902439024390244
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ i: 4 j: 0 Elapsed Time(m): 0.27
epoch: 1/1 ETA: 0:02 loss: 0.0001  
current accuracy on test set:  0.7991589571068124
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ i: 5 j: 0 Elapsed Time(m): 0.37
epoch: 1/1 ETA: 0:02 loss: 0.0001  
current accuracy on test set:  0.8057190916736754
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

epoch: 1/1 ETA: 0:02 loss: 0.0002  
current accuracy on test set:  0.8593776282590412
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ i: 44 j: 0 Elapsed Time(m): 4.00
epoch: 1/1 ETA: 0:02 loss: 0.0003  
current accuracy on test set:  0.8612279226240538
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ i: 45 j: 0 Elapsed Time(m): 4.09
epoch: 1/1 ETA: 0:02 loss: 0.0003  
current accuracy on test set:  0.8612279226240538
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ i: 45 j: 1 Elapsed Time(m): 4.18
epoch: 1/1 ETA: 0:02 loss: 0.0000  
current accuracy on test set:  0.8610597140454164
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ i: 45 j: 2 Elapsed Time(m): 4.27
epoch: 1/1 ETA: 0:02 loss: 0.0000  
current accuracy on test set:  0.8603868797308662
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ i: 45 j: 3 Elapsed Time(m): 4.36
epoch: 1/1 ETA: 0:02 loss

In [15]:
print('Accuracy on test set before domain adaptation: ',only_src_acc)
DA_acc = test(x_test,y_test,tsolver)
print('Accuracy on test set after domain adaptation: ',DA_acc)

Accuracy on test set before domain adaptation:  0.7530698065601346
Accuracy on test set after domain adaptation:  0.8603868797308662
