# Imports

## Packages

In [1]:
import torch.optim
from torch.utils import data
import argparse
import json
from tqdm import tqdm
import sys
import numpy
import clip
sys.path.append('/home/guest/Documents/Siraj TM/RSCaMa')
from model.model_encoder_attMamba import Encoder, AttentiveEncoder
from model.model_decoder import DecoderTransformer
from utils_tool.utils import *
from data.LEVIR_CC.LEVIRCC import LEVIRCCDataset
from torch import nn, einsum

from PIL import Image
from imageio import imread
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt

import random

  from .autonotebook import tqdm as notebook_tqdm


## Functions

In [3]:
def load_json(path):
    """Read the JSON document stored at `path` and return the parsed object."""
    # JSON text is UTF-8, so decode it explicitly instead of relying on the
    # platform's default encoding. The `with` block closes the file on exit,
    # so no separate close() call is needed.
    with open(path, encoding='utf-8') as f:
        return json.load(f)
def save_json(file,path):
    """Serialise `file` as JSON into `path` (overwriting it) and print a confirmation."""
    # The `with` block already closes the handle; UTF-8 is set explicitly so the
    # file is written the same way load_json-style readers expect to decode it.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(file, f)
    print("Saved Successfully")
def rem_print(word):
    """Print `word` padded with spaces to 100 characters, ending with a carriage return.

    The carriage return (instead of a newline) lets the next call overwrite
    the same console line; the padding wipes leftovers of a longer earlier message.
    """
    missing = 100 - len(word)
    if missing > 0:
        word = word + ' ' * missing
    print(word, end='\r')

In [3]:
# Load the LEVIR-CC caption annotations and look at the top-level keys.
a = load_json('data/Levir-CC-dataset/LevirCCcaptions.json')
a.keys()

dict_keys(['images'])

## Data

In [14]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

In [5]:
# Full word -> id vocabulary as stored on disk.
word_vocab = load_json('assets/vocab.json')

# Keep only the first 468 entries, in file order.
# NOTE(review): 468 presumably matches the vocabulary size the checkpoint was
# trained with — confirm and consider naming the constant.
Vocab = dict(list(word_vocab.items())[:468])


# Analysis

In [15]:
# Load CLIP ViT-B/32; `preprocess` is the image transform that get_tokens and
# get_inference apply to every image before encoding.
model, preprocess = clip.load("ViT-B/32", device=device)

In [16]:
# Build the image encoder, the attentive (change) encoder and the caption
# decoder, then restore their weights from the pre-trained RSCaMa checkpoint.
encoder = Encoder('CLIP-ViT-B/32')
encoder_trans = AttentiveEncoder(n_layers=3,
                                 feature_size=[7, 7, 768],
                                 heads=8, dropout=0.1)
decoder = DecoderTransformer(decoder_type='transformer_decoder', embed_dim=768,
                             vocab_size=len(Vocab), max_lengths=42,
                             word_vocab=Vocab, n_head=8,
                             n_layers=1, dropout=0.1)

# map_location lets the checkpoint load on a machine without a GPU.
checkpoint = torch.load('data/Pre-Trained Models/RSCaMa.pth', map_location=device)

encoder.load_state_dict(checkpoint['encoder_dict'])
encoder_trans.load_state_dict(checkpoint['encoder_trans_dict'])
decoder.load_state_dict(checkpoint['decoder_dict'])

# Inference only: switch to eval mode and place every module on `device`.
# Using .to(device) instead of .cuda() honours the CPU fallback chosen when
# `device` was defined, so this cell no longer crashes without CUDA.
encoder = encoder.eval().to(device)
encoder_trans = encoder_trans.eval().to(device)
decoder = decoder.eval().to(device)
print('load model success!')

decoder_n_layers= 1
decoder_type= transformer_decoder
load model success!


# LEVIR-CC Dataset

In [None]:
# One-off extraction of the LEVIR-CC archive. It is off by default so that a
# full re-run of the notebook does not unpack the archive again; set
# EXTRACT_DATASET = True to run it.
# NOTE(review): the archive lives under "DATA/" while the loading cells read
# from "data/" — confirm both directories are intended.
EXTRACT_DATASET = False

if EXTRACT_DATASET:
    from zipfile import ZipFile

    with ZipFile("DATA/Levir-CC-dataset.zip", 'r') as zObject:
        # Extract all members of the archive into the dataset folder.
        zObject.extractall(path="DATA/Levir-CC-dataset")

In [17]:
# LEVIR-CC annotation file and the folder holding the test images.
test_path = 'data/Levir-CC-dataset/images/test'
Captions = load_json("data/Levir-CC-dataset/LevirCCcaptions.json")

# Reverse lookup (token id -> word) over the full vocabulary.
invert = dict(zip(word_vocab.values(), word_vocab.keys()))

# Ids of the special tokens that are stripped from a decoded caption.
except_tokens = {Vocab[special] for special in ('<START>', '<END>', '<NULL>')}
except_tokens

{0, 2, 3}

In [18]:
def get_tokens(index, show=False):
    """Encode the image pair of one LEVIR-CC annotation into change features.

    Parameters
    ----------
    index : int
        Position of the sample inside ``Captions['images']``.
    show : bool, optional
        When true, also decode a caption from the features, print it next to
        the first reference sentence and plot the two images. Defaults to
        False, in which case only the features are computed.

    Returns
    -------
    The output of ``encoder_trans`` for the pair; it can be passed directly to
    ``decoder.sample``.
    """
    entry = Captions['images'][index]
    image_name = entry['filename']
    # The root is written without a trailing slash so the joined path no
    # longer contains a doubled separator ("images//train").
    dir_path = f"data/Levir-CC-dataset/images/{entry['filepath']}"

    ImA = f"{dir_path}/A/{image_name}"
    ImB = f"{dir_path}/B/{image_name}"

    # Add a leading batch dimension of 1 and move each image to `device`.
    IMA = preprocess(Image.fromarray(io.imread(ImA))).unsqueeze(0).to(device)
    IMB = preprocess(Image.fromarray(io.imread(ImB))).unsqueeze(0).to(device)

    with torch.no_grad():
        feat1, feat2 = encoder(IMA, IMB)
        feat = encoder_trans(feat1, feat2)
        # Decoding is only needed for the optional display below.
        seq = decoder.sample(feat, k=1) if show else None

    if show:
        Ground_truth = entry['sentences'][0]['raw']
        # Drop the special tokens and map the remaining ids back to words.
        caption = [invert[token] for token in seq if token not in except_tokens]
        output = ' '.join(caption)

        print(f"Predicted_Caption : {output}\tGround Truth : {Ground_truth} ")

        fig, axes = plt.subplots(1, 2)
        for ax, label, image_path in zip(axes, ("Before", "After"), (ImA, ImB)):
            ax.set_title(f"{label} - Index:{index}")
            ax.imshow(np.asarray(Image.open(image_path)))
            ax.axis('off')
        plt.show()

    return feat

In [19]:
# Pick one random sample index in [0, 1000] and compute its change features.
index = random.randint(0,1000)
s = get_tokens(index)



In [21]:
# Decode a token-id sequence from the features returned by get_tokens.
# (k=1 is forwarded to decoder.sample; presumably the beam size — confirm.)
seq = decoder.sample(s,k=1)



In [None]:
# Show the decoded token ids.
seq

[2, 399, 346, 206, 399, 344, 28, 40, 3]

: 

# SYSU-CD Dataset

In [37]:
def get_inference(ImA,ImB):
    """Caption the change between two image files.

    Parameters
    ----------
    ImA, ImB : str
        Paths of the two images of a pair, read with ``io.imread``.

    Returns
    -------
    list
        The predicted caption as a list of words looked up in ``invert``,
        with the ids in ``except_tokens`` removed.
    """
    # Add a leading batch dimension of 1 and move each image to `device`.
    IMA = preprocess(Image.fromarray(io.imread(ImA))).unsqueeze(0).to(device)
    IMB = preprocess(Image.fromarray(io.imread(ImB))).unsqueeze(0).to(device)

    with torch.no_grad():
        feat1, feat2 = encoder(IMA, IMB)
        feat = encoder_trans(feat1, feat2)
        seq = decoder.sample(feat, k=1)

    # Drop the special tokens and map the remaining ids back to words.
    # The print/plot code that used to follow the return was unreachable and
    # referenced names (`Ground_truth`, `index`) that do not exist in this function.
    return [invert[token] for token in seq if token not in except_tokens]

In [40]:
# Inspect the first annotation record to see the schema of the caption file.
a['images'][0]

{'filepath': 'train',
 'filename': 'train_000001.png',
 'imgid': 0,
 'sentences': [{'tokens': ['there', 'is', 'no', 'difference'],
   'raw': ' there is no difference .',
   'imgid': 0,
   'sentid': 0},
  {'tokens': ['the', 'two', 'scenes', 'seem', 'identical'],
   'raw': ' the two scenes seem identical .',
   'imgid': 0,
   'sentid': 1},
  {'tokens': ['the', 'scene', 'is', 'the', 'same', 'as', 'before'],
   'raw': ' the scene is the same as before .',
   'imgid': 0,
   'sentid': 2},
  {'tokens': ['no', 'change', 'has', 'occurred'],
   'raw': ' no change has occurred .',
   'imgid': 0,
   'sentid': 3},
  {'tokens': ['almost', 'nothing', 'has', 'changed'],
   'raw': ' almost nothing has changed .',
   'imgid': 0,
   'sentid': 4}],
 'split': 'train',
 'changeflag': 0,
 'sentids': [0, 1, 2, 3, 4]}

In [41]:
# NOTE(review): `res` is only assigned further down (res = get_inference(A,B)),
# so this cell raises NameError on a fresh top-to-bottom run.
res

['squares', 'replacing', 'have', 'squares', 'replaced', 'as', 'before']

In [7]:
# Root folder of the image subset captioned below (one sub-folder per split).
# NOTE(review): absolute, machine-specific path.
directory = '/home/guest/Documents/Siraj TM/DATA/subset'

In [None]:
# Caption the A/B image pairs under `directory` and collect one
# LEVIR-CC-style record per pair in `image_infos`.
image_infos = []
for folder in os.listdir(directory):

    for filename in os.listdir(f'{directory}/{folder}/A'):
        # Both images of a pair share the same file name in the A and B folders.
        file_A = f'{directory}/{folder}/A/{filename}'
        file_B = f'{directory}/{folder}/B/{filename}'

        # Pass the paths built in this iteration instead of relying on `A`/`B`
        # left over in the kernel from another cell.
        tokens = get_inference(file_A, file_B)
        # Words separated by single spaces and terminated by " .".
        raw = ' '.join(tokens) + ' .'

        # One generated sentence per image, so a running counter serves as both ids.
        # NOTE(review): confirm the id scheme expected by downstream consumers.
        imgid = len(image_infos)
        sentid = imgid
        image_infos.append({
            'filepath': folder,
            'filename': filename,
            # A list of sentence dicts, matching how the LEVIR-CC records are
            # indexed elsewhere in this notebook (['sentences'][0]).
            'sentences': [{
                'tokens': tokens,
                'raw': raw,
                'imgid': imgid,
                'sentid': sentid,
            }],
            'split': folder,
        })

    # NOTE(review): only the first folder returned by os.listdir is processed.
    # Kept because later cells reuse the `folder`/`filename` values left here;
    # remove the break to caption every split.
    break

In [None]:
# Name of the last image visited by the directory loop (left in the kernel namespace).
filename

'00802.jpg'

In [32]:
# Paths of one A/B image pair, built from the `folder` and `filename` values
# left behind by the directory loop.
A = f'{directory}/{folder}/A/{filename}'
B = f'{directory}/{folder}/B/{filename}'

In [38]:
# Caption that single pair; `res` receives whatever get_inference returns.
res = get_inference(A,B)



In [39]:
# Show the predicted caption.
res

['squares', 'replacing', 'have', 'squares', 'replaced', 'as', 'before']

In [19]:
# List the contents of the `val` split folder.
# NOTE(review): `os` is not imported explicitly in the import cell; it
# presumably arrives through `from utils_tool.utils import *` — confirm or
# add `import os`.
os.listdir(f'{directory}/val')

['B', 'OUT', 'A']