# **Preprocessing Unlabelled Training Data**

In [1]:
import os
import sys

# Put the sibling "Scripts" directory at the front of the module search path
# (presumably where ImageDatastore and NeuralFeatureExtractor live - confirm).
# os.path.join uses the platform separator: on Windows this is the same string
# as the previous literal "..\\Scripts", and on POSIX it is a valid
# parent-relative path instead of a single oddly named directory.
sys.path.insert(0, os.path.join("..", "Scripts"))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from ImageDatastore import ImageDatastore
from torch.utils.data import DataLoader
from torchvision import transforms

In [3]:
# Number of images the DataLoader groups into one batch.
batch_size = 1024

# Target size in pixels, shared by the resize and the crop steps.
image_size = 232

# Preprocessing steps, applied to each image in the order listed.
preprocessing_steps = [
    # Given a single int, Resize scales the shorter edge to that length.
    transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BILINEAR),
    # All four padding widths are 0, so no border pixels are added here.
    transforms.Pad(padding=(0, 0, 0, 0), fill=0),
    # Cut the central image_size x image_size window.
    transforms.CenterCrop((image_size, image_size)),
    # Per-channel normalisation for three channels; the values look like the
    # commonly used ImageNet statistics - confirm against the model's docs.
    # NOTE(review): Normalize works on tensors; presumably ImageDatastore
    # yields float tensors before calling this pipeline - confirm.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]

transform = transforms.Compose(preprocessing_steps)

## **Splitting the data into smaller chunks**

In [4]:
# Read the list of unlabelled training images and name its two columns.
# NOTE(review): header=None makes pandas treat the file's first line as data.
# The preview below shows an "Image,Label" row at index 0, so the CSV appears
# to contain a header line that ends up as a data row - confirm whether
# downstream code relies on that before changing it.
unlabelled_csv_path = "../Dataset/train_unlabeled.csv"
train_unlabelled = pd.read_csv(unlabelled_csv_path, header=None)
train_unlabelled.columns = ["Image", "Label"]

# Preview the first rows.
train_unlabelled.head()

Unnamed: 0,Image,Label
0,Image,Label
1,train_059329.jpg,-1
2,train_059330.jpg,-1
3,train_059331.jpg,-1
4,train_059332.jpg,-1


In [5]:
# Split the image list into consecutive chunks of at most `chunk_size` rows and
# write each chunk to its own header-less CSV (train_unlabeled_<i>.csv).
chunk_size = 20_000
n_rows = train_unlabelled.shape[0]

# Walking over the start offsets visits every row exactly once. Unlike
# `range(n_rows // chunk_size + 1)`, it never emits an empty trailing chunk
# when n_rows is an exact multiple of chunk_size (and writes nothing for an
# empty frame).
for i, start in enumerate(range(0, n_rows, chunk_size)):
    chunk = train_unlabelled.iloc[start : start + chunk_size]
    chunk_path = f"../Dataset/train_unlabeled_{i}.csv"
    # index=False: the row index is not written, so it does not need resetting.
    chunk.to_csv(chunk_path, index=False, header=False)
    print(chunk_path)

../Dataset/train_unlabeled_0.csv
../Dataset/train_unlabeled_1.csv
../Dataset/train_unlabeled_2.csv
../Dataset/train_unlabeled_3.csv
../Dataset/train_unlabeled_4.csv
../Dataset/train_unlabeled_5.csv


## **Extracting Features**

In [6]:
from tqdm import tqdm
from NeuralFeatureExtractor import MobileNetFeatureExtractor

In [7]:
# Chunk 0: build the dataset for train_unlabeled_0, run it through the
# MobileNet feature extractor and hand the results to `_save_features`
# (presumably stored under `result_file` - confirm in NeuralFeatureExtractor).
i = 0
chunk_name = f"train_unlabeled_{i}"
result_file = f"mobilenet_v3_classifier_unlabelled_{i}.npy"

feature_extractor = MobileNetFeatureExtractor(result_file=result_file)
image_datastore = ImageDatastore(chunk_name, transform=transform)
# shuffle=False: batches are drawn in dataset order.
dataloader = DataLoader(image_datastore, shuffle=False, batch_size=batch_size)

features, labels = feature_extractor.compute_features(dataloader)
feature_extractor._save_features(features, labels)

100%|██████████| 20/20 [11:15<00:00, 33.77s/it]


In [8]:
# Chunk 1: build the dataset for train_unlabeled_1, run it through the
# MobileNet feature extractor and hand the results to `_save_features`
# (presumably stored under `result_file` - confirm in NeuralFeatureExtractor).
i = 1
chunk_name = f"train_unlabeled_{i}"
result_file = f"mobilenet_v3_classifier_unlabelled_{i}.npy"

feature_extractor = MobileNetFeatureExtractor(result_file=result_file)
image_datastore = ImageDatastore(chunk_name, transform=transform)
# shuffle=False: batches are drawn in dataset order.
dataloader = DataLoader(image_datastore, shuffle=False, batch_size=batch_size)

features, labels = feature_extractor.compute_features(dataloader)
feature_extractor._save_features(features, labels)

100%|██████████| 20/20 [11:14<00:00, 33.73s/it]


In [9]:
# Chunk 2: build the dataset for train_unlabeled_2, run it through the
# MobileNet feature extractor and hand the results to `_save_features`
# (presumably stored under `result_file` - confirm in NeuralFeatureExtractor).
i = 2
chunk_name = f"train_unlabeled_{i}"
result_file = f"mobilenet_v3_classifier_unlabelled_{i}.npy"

feature_extractor = MobileNetFeatureExtractor(result_file=result_file)
image_datastore = ImageDatastore(chunk_name, transform=transform)
# shuffle=False: batches are drawn in dataset order.
dataloader = DataLoader(image_datastore, shuffle=False, batch_size=batch_size)

features, labels = feature_extractor.compute_features(dataloader)
feature_extractor._save_features(features, labels)

100%|██████████| 20/20 [11:09<00:00, 33.47s/it]


In [10]:
# Chunk 3: build the dataset for train_unlabeled_3, run it through the
# MobileNet feature extractor and hand the results to `_save_features`
# (presumably stored under `result_file` - confirm in NeuralFeatureExtractor).
i = 3
chunk_name = f"train_unlabeled_{i}"
result_file = f"mobilenet_v3_classifier_unlabelled_{i}.npy"

feature_extractor = MobileNetFeatureExtractor(result_file=result_file)
image_datastore = ImageDatastore(chunk_name, transform=transform)
# shuffle=False: batches are drawn in dataset order.
dataloader = DataLoader(image_datastore, shuffle=False, batch_size=batch_size)

features, labels = feature_extractor.compute_features(dataloader)
feature_extractor._save_features(features, labels)

100%|██████████| 20/20 [13:08<00:00, 39.42s/it]


In [11]:
# Chunk 4: build the dataset for train_unlabeled_4, run it through the
# MobileNet feature extractor and hand the results to `_save_features`
# (presumably stored under `result_file` - confirm in NeuralFeatureExtractor).
i = 4
chunk_name = f"train_unlabeled_{i}"
result_file = f"mobilenet_v3_classifier_unlabelled_{i}.npy"

feature_extractor = MobileNetFeatureExtractor(result_file=result_file)
image_datastore = ImageDatastore(chunk_name, transform=transform)
# shuffle=False: batches are drawn in dataset order.
dataloader = DataLoader(image_datastore, shuffle=False, batch_size=batch_size)

features, labels = feature_extractor.compute_features(dataloader)
feature_extractor._save_features(features, labels)

100%|██████████| 20/20 [12:12<00:00, 36.62s/it]


In [12]:
# Chunk 5: build the dataset for train_unlabeled_5, run it through the
# MobileNet feature extractor and hand the results to `_save_features`
# (presumably stored under `result_file` - confirm in NeuralFeatureExtractor).
i = 5
chunk_name = f"train_unlabeled_{i}"
result_file = f"mobilenet_v3_classifier_unlabelled_{i}.npy"

feature_extractor = MobileNetFeatureExtractor(result_file=result_file)
image_datastore = ImageDatastore(chunk_name, transform=transform)
# shuffle=False: batches are drawn in dataset order.
dataloader = DataLoader(image_datastore, shuffle=False, batch_size=batch_size)

features, labels = feature_extractor.compute_features(dataloader)
feature_extractor._save_features(features, labels)

100%|██████████| 14/14 [08:15<00:00, 35.37s/it]


In [13]:
# Directories the per-chunk feature and label arrays are loaded from below,
# and where the merged arrays are written.
base_feature_path = "../Features/features/"
base_labels_path = "../Features/labels/"

In [14]:
# Load the feature array of each of the six chunks (0-5), in chunk order.
un0, un1, un2, un3, un4, un5 = (
    np.load(
        os.path.join(base_feature_path, f"mobilenet_v3_classifier_unlabelled_{chunk}.npy")
    )
    for chunk in range(6)
)

In [15]:
# Load the label array of each of the six chunks (0-5), in chunk order.
labels0, labels1, labels2, labels3, labels4, labels5 = (
    np.load(
        os.path.join(base_labels_path, f"mobilenet_v3_classifier_unlabelled_{chunk}.npy")
    )
    for chunk in range(6)
)

In [16]:
# Stack the per-chunk feature matrices row-wise, in chunk order, into one array.
chunk_features = [un0, un1, un2, un3, un4, un5]
un = np.concatenate(chunk_features, axis=0)

# Show the shape of the merged array.
un.shape

(113450, 1280)

In [17]:
# Write the merged feature array; allow_pickle=False makes np.save refuse
# object arrays instead of pickling them.
merged_features_path = os.path.join(base_feature_path, "mobilenet_v3_classifier_unlabelled.npy")
np.save(merged_features_path, un, allow_pickle=False)

In [18]:
# Join the per-chunk label vectors end to end, in the same chunk order as the
# features.
chunk_labels = [labels0, labels1, labels2, labels3, labels4, labels5]
labels = np.concatenate(chunk_labels, axis=0)

# Show the shape of the merged vector.
labels.shape

(113450,)

In [19]:
# Write the merged label array; allow_pickle=False makes np.save refuse
# object arrays instead of pickling them.
merged_labels_path = os.path.join(base_labels_path, "mobilenet_v3_classifier_unlabelled.npy")
np.save(merged_labels_path, labels, allow_pickle=False)