In [None]:
import chardet
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
!python -m spacy download en_core_web_sm
import numpy as np
from scipy.io import loadmat

In [None]:
def detect_encoding(file_path):
    """Guess the text encoding of a file from its leading bytes.

    Args:
        file_path: Path of the file to inspect; it is opened in binary mode.

    Returns:
        The ``"encoding"`` entry of the dict returned by ``chardet.detect``
        for the first 10000 bytes of the file, passed through unchanged.
    """
    sample_size = 10000  # only a prefix is sampled, not the whole file
    with open(file_path, "rb") as handle:
        sample = handle.read(sample_size)
    guess = chardet.detect(sample)
    return guess["encoding"]

def extract_description(file_path):
    """Return the text between ``<DESCRIPTION>`` tags of an annotation file.

    The file is decoded with the encoding reported by ``detect_encoding`` and
    undecodable bytes are replaced rather than raising.

    Args:
        file_path: Path of the annotation file to read.

    Returns:
        The stripped contents of the first ``<DESCRIPTION>...</DESCRIPTION>``
        element (it may span several lines), or ``None`` when the tag is
        absent or when reading the file fails for any reason. Failures are
        reported with ``print`` instead of being raised.
    """
    pattern = r"<DESCRIPTION>(.*?)</DESCRIPTION>"
    try:
        # Encoding detection stays inside the try so a missing file is
        # reported and mapped to None like any other read error.
        with open(file_path, "r", encoding=detect_encoding(file_path), errors="replace") as handle:
            text = handle.read()

        found = re.search(pattern, text, re.DOTALL)
        if found is None:
            return None
        return found.group(1).strip()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

def load_glove_model(glove_file):
    """Load a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace; the first field is taken as
    the word and the remaining fields are parsed as its float32 components.

    Args:
        glove_file: Path of the UTF-8 encoded embeddings text file.

    Returns:
        dict mapping each word to a 1-D ``np.float32`` array. If a word occurs
        more than once, the last occurrence wins.
    """
    glove_dict = {}
    with open(glove_file, 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank / whitespace-only line (e.g. trailing newline at end
                # of file): there is no word to index, so skip it instead of
                # failing on values[0].
                continue
            glove_dict[values[0]] = np.array(values[1:], dtype=np.float32)
    return glove_dict


def get_glove_embeddings(words, glove_dict, embedding_dim):
    """Look up the embedding vector of every word, in order.

    Args:
        words: Iterable of tokens to look up.
        glove_dict: Mapping from token to its embedding vector.
        embedding_dim: Length of the zero vector used for unknown tokens.

    Returns:
        A list with one entry per input word: the vector stored in
        ``glove_dict`` or, for a word that is not a key, a freshly created
        zero vector of length ``embedding_dim``.
    """
    return [glove_dict.get(token, np.zeros(embedding_dim)) for token in words]



# Loaded spaCy pipelines, keyed by model name, so each model is loaded once
# per kernel session instead of once per sentence.
_SPACY_PIPELINES = {}


def _load_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def tokenize_sentence(sentence):
    """Split a sentence into lowercase alphabetic tokens using spaCy.

    Args:
        sentence: Text to tokenize.

    Returns:
        List of lowercased token texts, keeping only tokens whose
        ``is_alpha`` flag is set (numbers and punctuation are dropped).
    """
    doc = _load_pipeline()(sentence)
    return [token.text.lower() for token in doc if token.is_alpha]  # Keep only words

def sentences_to_glove_embedding(descs, glove_dict, embedding_dim=300):
    """Embed each description as the sum of its words' GloVe vectors.

    Each description is lowercased, tokenized with ``word_tokenize``, stripped
    of English stopwords, re-tokenized with ``tokenize_sentence`` and mapped
    to vectors with ``get_glove_embeddings``; the vectors are then summed.

    Args:
        descs: Iterable of description strings.
        glove_dict: Mapping from word to embedding vector.
        embedding_dim: Length of the embedding vectors. Defaults to 300, the
            value that was previously hard-coded.

    Returns:
        Array of shape ``(len(descs), embedding_dim)``. A description that
        yields no tokens contributes an all-zero row, and an empty ``descs``
        gives an array of shape ``(0, embedding_dim)``.
    """
    # Build the stopword set once; looking it up per word re-evaluated the
    # full stopword list for every token.
    stop_words = set(stopwords.words('english'))

    summed_rows = []
    for sentence in descs:
        words = word_tokenize(sentence.lower())
        filtered_sentence = ' '.join(word for word in words if word not in stop_words)
        tokens = tokenize_sentence(filtered_sentence)

        if tokens:
            word_embeddings = get_glove_embeddings(tokens, glove_dict, embedding_dim)
            summed_rows.append(np.sum(word_embeddings, axis=0))
        else:
            # np.sum([], axis=0) is a scalar 0.0, which would make the stacked
            # result ragged; use an explicit zero vector for empty sentences.
            summed_rows.append(np.zeros(embedding_dim))

    if not summed_rows:
        return np.zeros((0, embedding_dim))
    return np.array(summed_rows)



def process_text_file_to_get_embedding(base_dir, source_file, glove_file="glove.6B/glove.6B.300d.txt"):
    """Embed the descriptions of every annotation file listed in ``source_file``.

    Each non-blank line of ``source_file`` names one annotation; its path is
    built as ``f"{base_dir}{line}.eng"`` and its description is read with
    ``extract_description``.

    Args:
        base_dir: Prefix prepended to every listed name (include the trailing
            path separator).
        source_file: Text file with one annotation name per line.
        glove_file: Path of the GloVe text file to load. Defaults to the
            previously hard-coded ``glove.6B/glove.6B.300d.txt``.

    Returns:
        Tuple ``(embeddings, files_not_found)``: the result of
        ``sentences_to_glove_embedding`` for the descriptions that could be
        read, and the list of 0-based entry positions whose description could
        not be extracted. Positions count non-blank lines only.
        NOTE(review): callers use these positions to drop label rows, which
        assumes one label row per non-blank entry - confirm.
    """
    sentences = []
    files_not_found = []
    with open(source_file, "r") as file:
        # Blank lines carry no file name. Skipping them avoids processing an
        # unbound path (first line) or re-processing the previous entry's path.
        entries = (raw_line.strip() for raw_line in file)
        for index, entry in enumerate(name for name in entries if name):
            desc = extract_description(f"{base_dir}{entry}.eng")
            if desc is not None:
                sentences.append(desc)
            else:
                files_not_found.append(index)

    glove_dict = load_glove_model(glove_file)

    return sentences_to_glove_embedding(sentences, glove_dict), files_not_found

In [None]:
def convert_to_svm_format(features, labels, output_file):
    """Write feature rows and their label lists as sparse text lines.

    Every output line has the form ``l1,l2,... i:v i:v ...``: the row's
    labels joined by commas, a single space, then the non-zero features as
    ``index:value`` pairs with 0-based indices and six decimal places.

    Args:
        features: Iterable of numeric feature vectors.
        labels: Iterable of label collections, paired with ``features`` by
            position (pairing stops at the shorter of the two).
        output_file: Path of the text file to create or overwrite.
    """
    with open(output_file, 'w') as out:
        for row_labels, row_features in zip(labels, features):
            label_part = ','.join(str(label) for label in row_labels)

            # Only non-zero components are written.
            nonzero_terms = []
            for index, value in enumerate(row_features):
                if value != 0:
                    nonzero_terms.append(f"{index}:{value:.6f}")
            feature_part = ' '.join(nonzero_terms)

            out.write(f"{label_part} {feature_part}\n")

def one_hot_to_indices(one_hot_labels):
    """Convert one-hot encoded rows into lists of active label indices.

    Args:
        one_hot_labels: Iterable of rows; each row is compared element-wise
            against 1.

    Returns:
        One list per row holding the positions where the row equals 1
        (an empty list for a row without any 1).
    """
    index_lists = []
    for row in one_hot_labels:
        active_positions = np.where(row == 1)[0]
        index_lists.append(list(active_positions))
    return index_lists

In [None]:
# Build the summed GloVe embeddings for the training split and record which
# listed annotation files could not be read.
base_dir = "iaprtc12/annotations_complete_eng/"
train_file="IAPRTC/iapr_train_list.txt"


train_summed_array,files_not_found=process_text_file_to_get_embedding(base_dir,train_file)
print(train_summed_array.shape)

# Force an integer dtype: np.array([]) is float64, and a float array (even an
# empty one) cannot be used as an index when the label mask is built later.
files_not_found = np.array(files_not_found, dtype=int)
print(len(files_not_found))

In [None]:
# Load the training labels, drop the rows of entries whose description could
# not be read, and write features + labels to "train.svm".
# Relies on `train_summed_array` and `files_not_found` from the previous cell.
train_label_data = loadmat("IAPRTC/IAPRTC-12_TrainLabels.mat")
I_z_tr=train_label_data['I_z_tr']
# NOTE(review): I_z_tr is indexed as a 2-D container holding two entries;
# presumably a MATLAB cell array of label matrices - confirm what each holds.
label_column_1 = I_z_tr[0,0]  
label_column_2 = I_z_tr[0, 1]  # not used further in this cell

print("Column 1 Shape:", label_column_1.shape) 

# Boolean keep-mask over the label rows: start with every row kept, then
# switch off the rows whose positions are listed in files_not_found.
mask = np.ones(label_column_1.shape[0], dtype=bool)
mask[files_not_found] = False

filtered_train_label_column = label_column_1[mask]

print("Original shape:", label_column_1.shape)
print("Filtered shape:", filtered_train_label_column.shape)  # label rows left after dropping unread entries

# NOTE(review): convert_to_svm_format pairs rows with zip(), so a mismatch
# between the number of label rows and embedding rows is truncated silently.
train_label_indices = one_hot_to_indices(filtered_train_label_column)
convert_to_svm_format(train_summed_array, train_label_indices, "train.svm") 

In [None]:
# Test split: build embeddings, filter the label rows the same way as for the
# training split, and write the result to "test.svm".
# Note: this cell rebinds `base_dir`, `files_not_found` and `mask`, which the
# training cells also use; re-running the training label cell after this one
# would pick up the test split's `files_not_found`.
base_dir = "iaprtc12/annotations_complete_eng/"
test_file="IAPRTC/iapr_test_list.txt"


test_summed_array,files_not_found=process_text_file_to_get_embedding(base_dir,test_file)


test_label_data = loadmat("IAPRTC/IAPRTC-12_TestLabels.mat")
I_z_te=test_label_data['I_z_te']
# NOTE(review): I_z_te is indexed as a 2-D container holding two entries;
# presumably a MATLAB cell array of label matrices - confirm what each holds.
label_test_column_1 = I_z_te[0,0]  
label_test_column_2 = I_z_te[0, 1]  # not used further in this cell

print("Column 1 Shape:", label_test_column_1.shape)

# Boolean keep-mask over the label rows: start with every row kept, then
# switch off the rows whose positions are listed in files_not_found.
mask = np.ones(label_test_column_1.shape[0], dtype=bool)
mask[files_not_found] = False

filtered_test_label_column = label_test_column_1[mask]

print("Original shape:", label_test_column_1.shape)
print("Filtered shape:", filtered_test_label_column.shape)  # label rows left after dropping unread entries

# NOTE(review): convert_to_svm_format pairs rows with zip(), so a mismatch
# between the number of label rows and embedding rows is truncated silently.
test_label_indices = one_hot_to_indices(filtered_test_label_column)
convert_to_svm_format(test_summed_array, test_label_indices, "test.svm")