In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd

from preprocessing.preprocess_text import clean_text
from preprocessing.gen_text_embeddings import generate_text_embeddings
import torch
from pytorch_datasets.mvae_dataset import MVAEDataset
from torch.utils.data import DataLoader, random_split
from models.mvae import MVAE
from torch.optim import Adam

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mehta.vats/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
cuda_device = 0 # set the index of the GPU you want to use
torch.cuda.set_device(cuda_device)
torch.backends.cudnn.benchmark = True

# set the maximum GPU memory usage
max_memory_usage = 0.95 # set the maximum memory usage as a fraction of the available memory
torch.cuda.set_per_process_memory_fraction(max_memory_usage, cuda_device)

In [3]:
print(torch.cuda.max_memory_allocated(cuda_device)/1024/1024/1024)
torch.cuda.empty_cache()

print(torch.cuda.memory_summary(device=None, abbreviated=False))

0.0
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|-----------------------------------------------------------

In [4]:
# Load and preprocess data

instagram_data = pd.read_csv("data/instagram_data.csv")
data = instagram_data.dropna(subset=["description"]).reset_index(drop=True)

print(f"Removed {len(instagram_data) - len(data)} rows due to N/A descriptions.")

data = data.dropna(subset=["id"]).reset_index(drop=True)

print(f"Removed {len(instagram_data) - len(data)} rows due to N/A images.")

# data = data.sample(frac=0.1)

post_descriptions = (
    data["description"]
    .apply(lambda text: clean_text(text) if type(text) == str else text)
    .tolist()
)

post_classes = data["Party"].tolist()

image_usernames = data["username"].tolist()
image_file_ids = data["id"].tolist()
image_encoding_folder_path = "data/VGG/"

  instagram_data = pd.read_csv("data/instagram_data.csv")


Removed 7662 rows due to N/A descriptions.
Removed 7662 rows due to N/A images.


In [5]:
# Generate Text Embeddings
text_embeddings, word_index_mapping = generate_text_embeddings(post_descriptions, 32)

In [6]:
# Create datasets
dataset = MVAEDataset(post_descriptions, word_index_mapping, image_usernames, image_file_ids, image_encoding_folder_path, post_classes)

train_dataset, test_dataset = random_split(dataset, [0.8, 0.2])
print("Complete Dataset Size:", len(dataset))
print("Training Dataset Size:", len(train_dataset))
print("Test Dataset Size:", len(test_dataset))

11531 of data removed because matching VGG image embeddings not found.
Complete Dataset Size: 361794
Training Dataset Size: 289436
Test Dataset Size: 72358


In [7]:
# Set hyperparameters

num_epochs = 10
batch_size = 15
learning_rate = 0.001

In [9]:
# Train model

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

print(f"Using {device} device")

model = MVAE(device, 32, 32, text_embeddings, dataset.padding_index, 2048, dataset.num_post_classes).to(device)
model = model.half()
optimizer = Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0

    for batch_idx, (text, img, party) in enumerate(train_dataloader):
        text = text.to(device)
        img = img.to(device)
        img = img.half()
        
        optimizer.zero_grad()

        decoded_text, decoded_img, mu, logvar, classifier_result = model(text, img)

        loss = model.loss_function(
            text, img, party, decoded_text, decoded_img, mu, logvar, classifier_result
        )
        loss.backward()

        train_loss += loss.item()
        
        optimizer.step()
        
        correct += (party == torch.argmax(classifier_result, dim=-1).detach().cpu()).sum().item()

    print(f"Train Epoch: {epoch+1} \tAccuracy: {(correct * 100) / len(train_dataset):.2f}% \tLoss: {train_loss / len(train_dataloader):.6f}")


Using cuda:0 device
208232
208231


OutOfMemoryError: CUDA out of memory. Tried to allocate 7.24 GiB (GPU 0; 39.45 GiB total capacity; 3.74 GiB already allocated; 184.81 MiB free; 37.48 GiB allowed; 3.77 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [21]:
for param in model.parameters():
    print(param.half().dtype)

torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16
torch.float16


In [None]:
# Test Model

model.eval()
test_loss = 0
test_correct = 0

for batch_idx, (image_embedding, party) in enumerate(test_dataloader):
    image_embedding = image_embedding.to(device)

    decoded_image, mu, logvar, classifier_result = model(image_embedding)

    loss = model.loss_function(
        image_embedding, party, decoded_image, mu, logvar, classifier_result
    )

    test_loss += loss.item()

    test_correct += (party == torch.argmax(classifier_result, dim=-1).detach().cpu()).sum().item()

print(f"Test Metrics \tAccuracy: {(test_correct * 100) / len(test_dataset):.2f}% \tLoss: {test_loss / len(test_dataloader):.6f}")
