# Setup

In [None]:
import numpy as np
import os
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, \
                    classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, ReLU, \
                                        Lambda, GlobalAveragePooling2D, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import ResNet50,ResNet101,ResNet152
from tensorflow.keras.applications.resnet50 import preprocess_input
from keras.preprocessing import image
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras import layers, Model, optimizers


from tqdm.auto import tqdm

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

import pickle

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# Loading Features

In [None]:
# Load the saved feature arrays for the labeled and unlabeled image sets.
labeled_features_file = 'data/processed/train/features/labeled_data_features.npy'
unlabeled_features_file = 'data/processed/unlabeled/features/unlabeled_data_features.npy'

labeled_features = np.load(labeled_features_file)
unlabeled_features = np.load(unlabeled_features_file)

# Print both shapes as a quick sanity check on what was loaded.
labeled_shape = labeled_features.shape
unlabeled_shape = unlabeled_features.shape
print("Shapes of labeled features:", labeled_shape)
print("Shapes of unlabeled features:", unlabeled_shape)

Shapes of labeled features: (35122, 51200)
Shapes of unlabeled features: (53570, 51200)


# Generating Cosine Similarities

In [None]:
# Make sure the output directory exists *before* the expensive computation, so
# a missing or unwritable directory fails fast instead of after the full
# similarity matrix has been built.
os.makedirs('data/sampled', exist_ok=True)

# Calculate cosine similarity between each unlabeled feature and labeled features.
# The result has one row per unlabeled sample and one column per labeled sample.
similarities = cosine_similarity(unlabeled_features, labeled_features)

print("Shape of similarities:", similarities.shape)
print("Minimum similarity value:", np.min(similarities))
print("Maximum similarity value:", np.max(similarities))

# Persist the matrix so later cells can reload it instead of recomputing.
np.save('data/sampled/similarities.npy', similarities)
print("Similarities saved at data/sampled/similarities.npy")

Shape of similarities: (53570, 35122)
Minimum similarity value: 0.052696016
Maximum similarity value: 0.9736681
Similarities saved at data/sampled/similarities.npy


# Separating Reliable and Unreliable Samples

In [None]:
# Uncomment to reload the previously saved similarity matrix instead of recomputing it:
# similarities = np.load('data/sampled/similarities.npy')

In [None]:
# Set a similarity threshold: cosine-similarity values strictly greater than
# this cutoff are what the cells below use to mark an unlabeled sample as reliable.
similarity_threshold = 0.80

In [None]:
# Categorize unlabeled samples into reliable and unreliable based on the threshold.
# An unlabeled sample (a row of `similarities`) is reliable when its similarity to
# at least one labeled sample (any column) exceeds the threshold.
#
# Reducing along the labeled axis first yields a 1-D boolean mask; this avoids
# np.where on the full 2-D matrix, which would build one index entry per
# (unlabeled, labeled) match before de-duplicating.  The result is the same
# sorted array of unique row indices.
reliable_mask = (similarities > similarity_threshold).any(axis=1)
reliable_indices = np.flatnonzero(reliable_mask)

print('Number of Reliable:', len(reliable_indices))

Number of Reliable: 41425


In [None]:
# Every unlabeled row index that was not marked reliable is unreliable.
all_unlabeled_indices = np.arange(len(unlabeled_features))
unreliable_indices = np.setdiff1d(all_unlabeled_indices, reliable_indices)

unreliable_count = len(unreliable_indices)
print('Number of Unreliable:', unreliable_count)

Number of Unreliable: 12145


In [None]:
# Select the feature rows of the reliable unlabeled samples.
reliable_samples = unlabeled_features[reliable_indices]

reliable_shape = reliable_samples.shape
print("Shape of reliable samples:", reliable_shape)

# Create the output directory if needed, then persist the selected rows.
reliable_dir = 'data/sampled/reliable/'
reliable_samples_file = 'data/sampled/reliable/reliable_samples.npy'
os.makedirs(reliable_dir, exist_ok=True)
np.save(reliable_samples_file, reliable_samples)
print('Reliable Samples saved at data/sampled/reliable/reliable_samples.npy')

Shape of reliable samples: (41425, 51200)
Reliable Samples saved at data/sampled/reliable/reliable_samples.npy


In [None]:
# Build the list of unlabeled image paths from the directory listing.
unlabeled_data_path = "data/ext/test"
# NOTE(review): os.listdir returns entries in arbitrary order. Indexing this
# list with unreliable_indices presumes it lines up row-for-row with
# unlabeled_features -- confirm the features were extracted in this same order.
unlabeled_data_images = [
    unlabeled_data_path + "/" + file_name
    for file_name in os.listdir(unlabeled_data_path)
]

print("Unlabeled data samples", len(unlabeled_data_images))

# Extract paths of unreliable images based on their indices
unreliable_image_paths = [unlabeled_data_images[index] for index in unreliable_indices]

print("Number of unreliable images:", len(unreliable_image_paths))

# Save the paths; the log message is derived from the actual output path so it
# cannot drift from where the file is really written.
unreliable_dir = 'data/sampled/unreliable/'
unreliable_paths_file = unreliable_dir + 'unreliable_image_paths.npy'
os.makedirs(unreliable_dir, exist_ok=True)
np.save(unreliable_paths_file, unreliable_image_paths)
print("Unreliable image paths saved at", unreliable_paths_file)