In [1]:
import gc
import os
import pickle
import random
import re
import warnings
from collections import Counter

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
from PIL import Image
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from tensorflow.keras.datasets import cifar10
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel

warnings.filterwarnings('ignore')

seed = 2025
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)


2024-04-07 18:50:25.621402: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-07 18:50:25.621547: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-07 18:50:25.799826: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import os
import pandas as pd

# Path to the folder containing the keyframe images
folder_path = '/kaggle/input/mintrec/speaker_annotations/keyframes'
train_df_path = '/kaggle/input/mintrec/train.tsv'
dev_df_path = '/kaggle/input/mintrec/dev.tsv'
test_df_path = '/kaggle/input/mintrec/test.tsv'

train_df_raw = pd.read_csv(train_df_path, sep='\t')
dev_df_raw = pd.read_csv(dev_df_path, sep='\t')
test_df_raw = pd.read_csv(test_df_path, sep='\t')
# Create a list to store the results
train_data = []

# Iterate through each row of the existing dataframe
for index, row in train_df_raw.iterrows():
    # Construct the filename pattern
    filename_pattern = f"{row['season']}_{row['episode']}_{row['clip']}"
    # Find the first image that matches the pattern in the folder
    matching_files = [file for file in os.listdir(folder_path) if file.startswith(filename_pattern)]
    if matching_files:
        # If a matching image is found, add it to the result list
        train_data.append({
            'season': row['season'],
            'episode': row['episode'],
            'clip': row['clip'],
            'text': row['text'],
            'label': row['label'],
            'image_path': os.path.join(folder_path, matching_files[0])
        })

# Convert the list of dictionaries to a pandas DataFrame
train_df = pd.DataFrame(train_data)

text_data = list(train_df['text'])
image_data = list(train_df['image_path'])
labels_data = list(train_df['label'])

from sklearn.preprocessing import LabelEncoder

# Encode labels to integers
label_encoder = LabelEncoder()
labels_data = label_encoder.fit_transform(labels_data)

len(train_df_raw), len(dev_df_raw), len(test_df_raw)

(1334, 445, 445)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Text preprocessing and feature extraction
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

def get_text_features(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
    outputs = bert_model(**inputs)
    text_features = outputs.last_hidden_state.mean(dim=1)
    return text_features.detach().cpu().numpy()

text_features = [get_text_features(text) for text in text_data]

# Image preprocessing and feature extraction
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

model_name = 'resnet101'
resnet_model = timm.create_model(model_name, pretrained=True)
resnet_model = resnet_model.to(device)
resnet_model.eval()

def get_image_features(image_path):
    img = Image.open(image_path).convert('RGB')
    img_tensor = transform(img).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = resnet_model(img_tensor)
    image_features = image_features.squeeze().cpu().numpy()
    return np.expand_dims(image_features, axis=0)  # Reshape to match text features

image_features = [get_image_features(image_path) for image_path in image_data]




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/179M [00:00<?, ?B/s]

In [4]:
# Combine text and image features

combined_features = [np.concatenate((text_feat, image_feat), axis=1) for text_feat, image_feat in zip(text_features, image_features)]
combined_features = np.array(combined_features)
combined_features = combined_features.reshape(-1, combined_features.shape[-1])  # Reshape to (n_samples, n_features)

In [5]:
def view_clusters(groups):
# Function to view a cluster based on identifier
  def view_cluster(cluster):
    plt.figure(figsize=(25, 25))
    files = groups[cluster]
    if len(files) > 30:
      print(f"Clipping cluster size from {len(files)} to 30")
      files = files[:30]
    for index, file in enumerate(files):
      plt.subplot(10, 10, index + 1)
      img = cv2.imread(file)  # Load image using OpenCV
      img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
      plt.imshow(img)
      plt.axis('off')

  # Visualize each cluster with a gap between them
  for cl_index, cl in enumerate(groups):
    view_cluster(cl)
    if cl_index != len(groups) - 1:  # Add a gap between clusters
      plt.figure()  # Create a new figure for the next cluster

def novel_class_detection(features, filenames, labels, x=2):
  K = 1
  Nnew = 0
  all_clusters = []
  tot_anotations_kmeans = 0
  annotated_images = []
  while Nnew >= K // 2:
    # Perform K-Means Clustering on features
    kmeans = KMeans(n_clusters=K,  n_init='auto', random_state=42)
    kmeans.fit(features)

    # Annotate x points from each cluster
    #choose x*K indices randomly from labels
    N_set = set()
    clusters = {}
    for file, cluster in zip(filenames, kmeans.labels_):
      if cluster not in clusters.keys():
        clusters[cluster] = []
      clusters[cluster].append(file)

    for ci in clusters:
      image_path_list = clusters[ci]
      images = random.sample(image_path_list, x)
      annotated_images+=images
    #   image_id = [int(re.search(r'\d+', filename).group()) for filename in images]
      #image_id is the index of the image in the filenames list
      image_id = [filenames.index(filename) for filename in images]
      print(image_id)
#       for j in image_id:
#         print(j)
#         print(labels)
#         print(labels[j])
#       actual_labels.append(labels[j])
      actual_labels = [labels[i] for i in image_id]
      N_set.update(set(actual_labels))
      tot_anotations_kmeans+=x
    #check the acutal label -> get the number of distinct actual labels(N)
    N = len(N_set)
    #increment nnew by N-K
    Nnew += N-K

    # Double the value of K for the next iteration
    K *= 2

    #debug printing
    print("K:", K)
    print("Nnew:", Nnew)
  # Organize images into clusters
  clusters = {}
  for file, cluster in zip(filenames, kmeans.labels_):
    if cluster not in clusters.keys():
      clusters[cluster] = []
    clusters[cluster].append(file)

  return clusters, tot_anotations_kmeans, K, annotated_images

def process_clusters(clusters, tot_anotations, K, annotated_images, n_bad_clusters, labels):
    cluster_errs = {}
    cluster_labels = {}

    # for each cluster ci
    for idx, ci in enumerate(clusters):
        # for each image in ci: images are values in the dictionary
        image_path_list = clusters[ci]
        # pick p random images from image_path_list
        p_annotated_images = random.sample(image_path_list, p)
        annotated_images += p_annotated_images

        # get the actual labels for the images stored in a list
        image_id = [int(re.search(r'\d+', filename).group()) for filename in p_annotated_images]
        
        actual_labels = [labels[i][0] for i in image_id]

        # count the actual labels and get the max count
        counter = Counter(actual_labels)
        most_common_elements = counter.most_common()
        max_count = most_common_elements[0][1]
        cluster_label = most_common_elements[0][0]
        cluster_labels[idx] = cluster_label
        cluster_error = p - max_count
        cluster_errs[idx] = cluster_error
        tot_anotations += p

    # sort the cluster_errs dictionary by the values
    sorted_cluster_errs = sorted(cluster_errs.items(), key=lambda x: x[1], reverse=True)

    n_bad_clusters = min(n_bad_clusters, len(clusters))
    # pick the n_bad_clusters clusters with the highest error
    bad_clusters = sorted_cluster_errs[:n_bad_clusters]
    good_clusters = sorted_cluster_errs[n_bad_clusters:]

    # for each bad_cluster, pick q random images and annotate them
    for i, err in bad_clusters:
        image_path_list = clusters[i]
        q_annotated_images = random.sample(image_path_list, q)
        tot_anotations += q
        annotated_images += q_annotated_images

    return tot_anotations, annotated_images, cluster_errs, cluster_labels, bad_clusters, good_clusters

#train a model on the initial labelled data
#get confidence score(CS) for each point
#for a point if CS>threshold1 and point in Good Cluster and avg cosine similarity w. already annotated data points in cluster > threshold2
  #label the point as the cluster label <- Silver Labelling Strategy

def silver_annotate(threshold1=-2, threshold2=0.5):
    silver_annotation = {}
    # Use tqdm.notebook.tqdm to iterate over the items with a progress bar
    for k, v in tqdm(good_clusters, desc="Processing clusters", total=len(good_clusters)):
        # Your existing code goes here
        cluster_label = cluster_labels[k]
        clusteri = clusters[k]
        for curr_img_path in tqdm(clusteri, desc="silver annotating cluster", total=len(clusteri)):
            if curr_img_path in annotated_images:
                continue
            img = Image.open(curr_img_path).convert('RGB')
            img = transform(img)
            img = img.unsqueeze(0)
            img = img.to(device)
            output = model(img)
            confidences = np.squeeze(output.detach().cpu().numpy())
            predicted_class = np.argmax(confidences)
            top_confidence = confidences[predicted_class]

            #get the image paths of annotated_imgs_w_labels having label cluster_label
            annotated_img_paths = [img_path for img_path in annotated_imgs_w_labels if annotated_imgs_w_labels[img_path] == cluster_label]
            #get the index of the current image and the annotated images using the categories_subset
            curr_img_index = categories_subset.index(curr_img_path)
            annotated_img_indices = [categories_subset.index(img_path) for img_path in annotated_img_paths]
            # get the features of the current image and the annotated images
            curr_img_features = feat[curr_img_index]
            annotated_img_features = [feat[i] for i in annotated_img_indices]
            # get the cosine similarity of the current image with the annotated images
            similarities = []

            # Convert NumPy arrays to PyTorch tensors
            curr_img_features_tensor = torch.tensor(curr_img_features)
            # Reshape the tensor to be 2D with a single row
            curr_img_features_tensor = curr_img_features_tensor.reshape(1, -1)
            for features in annotated_img_features:
                features_tensor = torch.tensor(features)
                # Reshape the tensor to be 2D with a single row
                features_tensor = features_tensor.reshape(1, -1)
                similarity = F.cosine_similarity(curr_img_features_tensor, features_tensor).item()
                similarities.append(similarity)
            avg_similarity = np.mean(similarities)
#             print(top_confidence, avg_similarity)
            if top_confidence > threshold1 and avg_similarity > threshold2:
                silver_annotation[curr_img_path] = cluster_label
    return silver_annotation

In [6]:
# x = ___
p = 40
q = 30
n_bad_clusters = 3

threshold1 = -2   # confidence threshold: -2
threshold2 = 0.1  # avg coisne similarity threshold

In [7]:
combined_features.shape


(1316, 1768)

In [8]:
clusters, tot_anotations, K, annotated_images = novel_class_detection(combined_features, image_data, labels_data)

[1142, 169]
K: 2
Nnew: 1
[1128, 837]
[408, 1314]
K: 4
Nnew: 3
[0, 914]
[607, 934]
[364, 123]
[639, 668]
K: 8
Nnew: 4
[297, 109]
[253, 414]
[508, 148]
[718, 74]
[818, 54]
[823, 746]
[138, 1051]
[424, 108]
K: 16
Nnew: 7


In [9]:
# clusters, 
tot_anotations, K, annotated_images

(30,
 16,
 ['/kaggle/input/mintrec/speaker_annotations/keyframes/S05_E21_272_50_4.jpg',
  '/kaggle/input/mintrec/speaker_annotations/keyframes/S04_E04_220_20_4.jpg',
  '/kaggle/input/mintrec/speaker_annotations/keyframes/S05_E12_416_30_2.jpg',
  '/kaggle/input/mintrec/speaker_annotations/keyframes/S04_E03_201_60_3.jpg',
  '/kaggle/input/mintrec/speaker_annotations/keyframes/S06_E04_604_20_3.jpg',
  '/kaggle/input/mintrec/speaker_annotations/keyframes/S06_E03_458_20_2.jpg',
  '/kaggle/input/mintrec/speaker_annotations/keyframes/S05_E16_329_60_5.jpg',
  '/kaggle/input/mintrec/speaker_annotations/keyframes/S04_E10_266_0_7.jpg',
  '/kaggle/input/mintrec/speaker_annotations/keyframes/S04_E16_331_70_3.jpg',
  '/kaggle/input/mintrec/speaker_annotations/keyframes/S04_E12_363_30_3.jpg',
  '/kaggle/input/mintrec/speaker_annotations/keyframes/S05_E07_262_20_3.jpg',
  '/kaggle/input/mintrec/speaker_annotations/keyframes/S05_E16_538_30_2.jpg',
  '/kaggle/input/mintrec/speaker_annotations/keyframes/

In [10]:
# list(train_df['image_path'])
# train_df

In [11]:

# Load your dataset containing text and images
# Assuming you have a list of text data, image data, and corresponding labels

# from sklearn.preprocessing import LabelEncoder

# # Encode labels to integers
# label_encoder = LabelEncoder()
# labels = label_encoder.fit_transform(labels)
# len(labels)

In [12]:
# annotated_images_labels

In [13]:
# annotated_images_labels

In [14]:
# labels_data
# max(annotated_images_labels)

In [15]:
# len(annotated_images_labels)
# annotated_images_labels


In [16]:

#get the indices of the annotated images
annotated_images_indices = [image_data.index(filename) for filename in annotated_images]

# get the features of the annotated images
annotated_images_features = [combined_features[i] for i in annotated_images_indices]
#get the labels of the annotated images
annotated_images_labels = [labels_data[i] for i in annotated_images_indices]


# annotated_images_labels = label_encoder.transform(annotated_images_labels)



annotated_images_features = np.array(annotated_images_features)
annotated_images_labels = np.array(annotated_images_labels)



# # One-hot encode the labels
# one_hot_encoder = OneHotEncoder()
# labels_ohe = one_hot_encoder.fit_transform(annotated_images_labels.reshape(-1, 1)).toarray()

# Create custom dataset and dataloader
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx]
        label = self.labels[idx]
        return feature, label

# dataset = CustomDataset(annotated_images_features, labels_ohe)
dataset = CustomDataset(annotated_images_features, annotated_images_labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define the model architecture
input_size = annotated_images_features.shape[1]
num_classes = max(annotated_images_labels)+1

class TextImageClassifier(nn.Module):
    def __init__(self, num_classes):
        super(TextImageClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
#         return x.squeeze(1)
        return x


model = TextImageClassifier(num_classes).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 25
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for features, labels in dataloader:
#         print(labels)
        features, labels = features.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(features)
#         print(outputs, labels)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * features.size(0)

    epoch_loss = running_loss / len(dataset)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

print('Finished Training')

Epoch [1/25], Loss: 4.6132
Epoch [2/25], Loss: 12.8716
Epoch [3/25], Loss: 20.3941
Epoch [4/25], Loss: 15.7703
Epoch [5/25], Loss: 17.5046
Epoch [6/25], Loss: 14.8750
Epoch [7/25], Loss: 11.0267
Epoch [8/25], Loss: 7.1641
Epoch [9/25], Loss: 7.7792
Epoch [10/25], Loss: 6.7243
Epoch [11/25], Loss: 6.5387
Epoch [12/25], Loss: 5.2609
Epoch [13/25], Loss: 4.5020
Epoch [14/25], Loss: 4.3812
Epoch [15/25], Loss: 4.3208
Epoch [16/25], Loss: 3.9359
Epoch [17/25], Loss: 3.4283
Epoch [18/25], Loss: 2.5404
Epoch [19/25], Loss: 2.8722
Epoch [20/25], Loss: 2.7201
Epoch [21/25], Loss: 2.5545
Epoch [22/25], Loss: 2.6450
Epoch [23/25], Loss: 2.4748
Epoch [24/25], Loss: 2.3623
Epoch [25/25], Loss: 2.3399
Finished Training


In [17]:
# labels_ohe

In [18]:
# len(combined_features)
len(labels)

30

In [19]:
print("hi")

hi


In [20]:
# Evaluate the model on a test set
# Assume you have a separate test set with text, images, and labels
# test_text_data = [...]
# test_image_data = [...]
# test_labels = [...]

test_df_raw = pd.read_csv(test_df_path, sep='\t')
# Create a list to store the results
test_data = []

# Iterate through each row of the existing dataframe
for index, row in train_df_raw.iterrows():
    # Construct the filename pattern
    filename_pattern = f"{row['season']}_{row['episode']}_{row['clip']}"
    # Find the first image that matches the pattern in the folder
    matching_files = [file for file in os.listdir(folder_path) if file.startswith(filename_pattern)]
    if matching_files:
        # If a matching image is found, add it to the result list
        test_data.append({
            'season': row['season'],
            'episode': row['episode'],
            'clip': row['clip'],
            'text': row['text'],
            'label': row['label'],
            'image_path': os.path.join(folder_path, matching_files[0])
        })

# Convert the list of dictionaries to a pandas DataFrame
test_df = pd.DataFrame(test_data)

test_text_data = list(test_df['text'])
test_image_data = list(test_df['image_path'])
test_labels_data = list(test_df['label'])

test_labels_data = label_encoder.fit_transform(test_labels_data)
# len(train_df_raw), len(dev_df_raw), len(test_df_raw)

print("Extracting features")
test_text_features = [get_text_features(text) for text in test_text_data]
test_image_features = [get_image_features(image_path) for image_path in test_image_data]


Extracting features


In [21]:
test_combined_features = [np.concatenate((text_feat, image_feat), axis=1) for text_feat, image_feat in zip(test_text_features, test_image_features)]

test_dataset = CustomDataset(test_combined_features, test_labels_data)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)


Test Accuracy: 2.8579


In [23]:
from sklearn.metrics import f1_score
model.eval()
correct = 0
total = 0


all_preds = []
all_labels = []
with torch.no_grad():
    for images, labels in test_dataloader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = correct / total
print(f'Test Accuracy: {accuracy:.4f}')

# Calculate F1 score
f1 = f1_score(all_labels, all_preds, average='macro')
print(f'F1 Score: {f1:.4f}')


Test Accuracy: 0.2679
F1 Score: 0.2532
