<a href="https://colab.research.google.com/github/ShamaSharma/SVD/blob/main/fusevulwithembeddingfinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install torch torchvision torchaudio
!pip install transformers
!pip install pandas scikit-learn tqdm
!pip install datasets

# Report which accelerator (if any) PyTorch can see in this runtime
import torch

cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if not cuda_ready:
    print("Using CPU")
else:
    print(f"GPU: {torch.cuda.get_device_name()}")
    # total_memory is reported in bytes; convert to GiB for display
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU Memory: {total_gib:.1f} GB")

# Set up runtime type - you should manually set GPU in Colab settings

CUDA available: True
GPU: Tesla T4
GPU Memory: 14.7 GB


In [None]:
from google.colab import files
import zipfile
import os

# Tell the user which files the training script looks for.
print("Upload your CSV files or ZIP containing the dataset folder:")
print("Expected files:")
for expected_name in (
    "devign_train_normalized.csv",
    "devign_val_normalized.csv",
    "ss_train.csv",
    "ss_val.csv",
):
    print(f"- {expected_name}")

uploaded = files.upload()

# If you upload a ZIP file, extract it
for filename in uploaded.keys():
    if filename.endswith('.zip'):
        print(f"Extracting {filename}...")
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall('.')
        print(f"Extracted {filename}")

# Check current directory structure.
# The loop variables deliberately avoid the name `files`, which would rebind
# the google.colab `files` module imported at the top of this cell.
print("\nCurrent directory structure:")
for dir_path, _subdirs, file_names in os.walk('.'):
    # Paths yielded by os.walk('.') look like '.', './a', './a/b', so the
    # number of separators is the nesting depth.
    level = dir_path.count(os.sep)
    indent = ' ' * 2 * level
    print(f"{indent}{os.path.basename(dir_path)}/")
    subindent = ' ' * 2 * (level + 1)
    for file_name in file_names:
        if file_name.endswith('.csv'):
            print(f"{subindent}{file_name}")

Upload your CSV files or ZIP containing the dataset folder:
Expected files:
- devign_train_normalized.csv
- devign_val_normalized.csv
- ss_train.csv
- ss_val.csv


Saving devign_train_normalized.csv to devign_train_normalized.csv
Saving devign_val_normalized.csv to devign_val_normalized.csv
Saving ss_train.csv to ss_train.csv
Saving ss_val.csv to ss_val.csv

Current directory structure:
./
  devign_val_normalized.csv
  ss_val.csv
  ss_train.csv
  devign_train_normalized.csv
  .config/
    configurations/
    logs/
      2025.09.25/
  sample_data/
    california_housing_train.csv
    mnist_test.csv
    california_housing_test.csv
    mnist_train_small.csv


In [None]:
%%writefile selfattention.py
import torch
import torch.nn as nn

class SelfAttention(torch.nn.Module):
    """Attention over code features whose scores are mixed with text features.

    Queries, keys and values are bias-free linear projections of the code
    features; the query-key score matrix is multiplied by a projection of the
    text features before scaling and softmax.

    Shape requirements implied by the matrix products in ``forward``:
      * ``code_output``: (batch, seq_code, embed_size)
      * ``text_output``: (batch, seq_text, embed_size) with seq_text == seq_code
      * ``dimen_size`` must equal seq_code, because the softmax output of shape
        (batch, seq_code, dimen_size) is multiplied by the values of shape
        (batch, seq_code, embed_size).

    No padding mask is taken, so every position takes part in the softmax.

    Args:
        embed_size: size of the last dimension of both inputs.
        dimen_size: output width of the text projection (see constraint above).
    """

    def __init__(self, embed_size, dimen_size):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.dimen_size = dimen_size

        # Initialize weight matrices
        self.values = torch.nn.Linear(embed_size, embed_size, bias=False)
        self.keys = torch.nn.Linear(embed_size, embed_size, bias=False)
        self.queries = torch.nn.Linear(embed_size, embed_size, bias=False)
        self.assist = torch.nn.Linear(embed_size, dimen_size, bias=False)

    def forward(self, code_output, text_output):
        """Return attended code features of shape (batch, seq_code, embed_size).

        The whole batch is processed with batched matrix products instead of a
        per-sample Python loop; results match the per-sample computation up to
        floating-point rounding. An empty batch yields an empty tensor.

        Raises:
            IndexError: if ``text_output`` has fewer samples than ``code_output``.
        """
        batch_size = code_output.shape[0]
        if text_output.shape[0] < batch_size:
            raise IndexError(
                f"text_output has {text_output.shape[0]} samples but code_output has {batch_size}"
            )
        # Only the first `batch_size` text samples are paired with code samples.
        text_output = text_output[:batch_size]

        values = self.values(code_output)
        keys = self.keys(code_output)
        queries = self.queries(code_output)
        assist = self.assist(text_output)

        # Calculate attention scores: (Q K^T) A, scaled by sqrt(embed_size)
        attention = torch.matmul(queries, keys.transpose(1, 2))
        attention = torch.matmul(attention, assist)
        attention = attention / (self.embed_size ** 0.5)

        # Use softmax function to calculate attention weights
        attention = nn.functional.softmax(attention, dim=-1)
        # Use weights for weighted average of values
        return torch.matmul(attention, values)

# NOTE: this line is part of the file written by %%writefile, so it runs whenever
# selfattention.py is imported or executed, not at the moment the file is written.
print("✅ selfattention.py created successfully!")

Writing selfattention.py


In [None]:
%%writefile model.py
import torch
import torch.nn as nn
from selfattention import SelfAttention

class Code_Note(nn.Module):
    """Two-encoder classifier: code and text encoders, attention fusion, 3-layer MLP.

    Args:
        code_encoder: object exposing ``.encoder(ids, attention_mask=...)`` whose
            result has a ``last_hidden_state`` attribute.
        text_encoder: callable ``(ids, attention_mask=...)`` whose result has a
            ``last_hidden_state`` attribute.
        input_size: width of the vector fed to the first linear layer.
        hidden_size: width of the first hidden layer.
        output_size: width of the second hidden layer; the final layer maps it
            to 2 logits.
    """

    def __init__(self, code_encoder, text_encoder, input_size, hidden_size, output_size):
        super().__init__()
        self.code_encoder = code_encoder
        self.text_encoder = text_encoder
        # Fusion module; its two sizes are fixed here rather than derived from the encoders.
        self.attention = SelfAttention(768, 512)
        # Classification head (attribute names are kept so checkpoints stay loadable)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(output_size, 2)

        # Slots filled by forward(..., save_embeddings=True)
        self.last_code_embeddings = None
        self.last_text_embeddings = None
        self.last_attention_embeddings = None
        self.last_pooled_embeddings = None

    def forward(self, inputs_code_id, inputs_code_mask, inputs_text_id, inputs_text_mask, save_embeddings=False):
        """Return the 2-way logits for a batch.

        When ``save_embeddings`` is true, detached CPU copies of the code hidden
        states, text hidden states, attention output and the pooled vector are
        stored on the ``last_*`` attributes, replacing any previous values.
        """
        code_hidden = self.code_encoder.encoder(
            inputs_code_id, attention_mask=inputs_code_mask
        ).last_hidden_state
        text_hidden = self.text_encoder(
            inputs_text_id, attention_mask=inputs_text_mask
        ).last_hidden_state

        fused = self.attention(code_hidden, text_hidden)
        # Position 0 of the sequence axis (the original comment calls it the CLS token)
        first_position = fused[:, 0, :]

        if save_embeddings:
            self.last_code_embeddings = code_hidden.detach().cpu()
            self.last_text_embeddings = text_hidden.detach().cpu()
            self.last_attention_embeddings = fused.detach().cpu()
            self.last_pooled_embeddings = first_position.detach().cpu()

        hidden = self.relu(self.fc1(first_position))
        hidden = self.relu2(self.fc2(hidden))
        return self.fc3(hidden)

# NOTE: this line is part of the file written by %%writefile, so it runs whenever
# model.py is imported or executed, not at the moment the file is written.
print("✅ model.py created successfully!")

Writing model.py


In [None]:
%%writefile run.py
import logging
import pandas as pd
import torch
import time
import numpy as np
import random
import os
from model import Code_Note
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel, RobertaTokenizer, RobertaModel
import warnings
import sklearn.exceptions

warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

class Example(object):
    """One paired sample: a code string, a text string, a label and an optional row index."""

    def __init__(self, code, text, label, idx=None):
        self.code, self.text = code, text
        self.label, self.idx = label, idx

class InputFeatures(object):
    """Token ids and masks for the code and text sides of one sample, plus label and index."""

    def __init__(self, inputs_code_ids, inputs_code_masks, inputs_text_ids, inputs_text_masks, label, idx=None):
        # Code side
        self.inputs_code_ids, self.inputs_code_masks = inputs_code_ids, inputs_code_masks
        # Text side
        self.inputs_text_ids, self.inputs_text_masks = inputs_text_ids, inputs_text_masks
        # Target and provenance
        self.label, self.idx = label, idx

def read_file(codefile, textfile):
    """Load two row-aligned CSV files and pair their rows into Example objects.

    Row i of ``codefile`` (columns 'code' and 'label') is paired with row i of
    ``textfile`` (columns 'text' and 'label'). Pairing stops at the first row
    whose code or text is empty or whose two labels differ; that row and every
    row after it are dropped. The stop, and any difference in row counts, is
    reported on stdout so the truncation is never silent.

    Args:
        codefile: path to the CSV holding the code samples.
        textfile: path to the CSV holding the text samples.

    Returns:
        list of Example(code, text, int(label), row_index) for the accepted rows.
    """
    print(f"Reading code file: {codefile}")
    print(f"Reading text file: {textfile}")

    # na_filter=False keeps empty cells as '' so they can be detected below.
    code_data = pd.read_csv(codefile, na_filter=False, encoding_errors='ignore')
    text_data = pd.read_csv(textfile, na_filter=False, encoding_errors='ignore')

    print(f"Code data shape: {code_data.shape}")
    print(f"Text data shape: {text_data.shape}")

    code = code_data['code'].values.tolist()
    code_label = code_data['label'].values.tolist()
    text = text_data['text'].values.tolist()
    text_label = text_data['label'].values.tolist()

    # zip() below stops at the shorter file; make that visible.
    n_pairs = min(len(code), len(text))
    if len(code) != len(text):
        print(f"WARNING: row counts differ ({len(code)} code rows vs {len(text)} text rows); "
              f"only the first {n_pairs} rows can be paired")

    examples = []
    for idx, (c, cl, t, tl) in enumerate(zip(code, code_label, text, text_label)):
        if c == '':
            reason = "empty code"
        elif t == '':
            reason = "empty text"
        elif int(cl) != int(tl):
            reason = f"label mismatch: {cl} != {tl}"
        else:
            reason = None

        if reason is not None:
            print(f"WARNING: stopping at row {idx} ({reason}); "
                  f"dropping {n_pairs - idx} remaining row(s)")
            break
        examples.append(Example(c, t, int(cl), idx))

    print(f"Created {len(examples)} examples")
    return examples

def mini_sample(examples, num):
    """Return ``num`` examples drawn without replacement, in draw order.

    Args:
        examples: list of examples to draw from.
        num: number of examples wanted.

    Returns:
        The same ``examples`` object when ``num`` is at least ``len(examples)``;
        otherwise a new list holding the examples at the drawn positions.
    """
    if num >= len(examples):
        print(f"Sample size ({num}) >= total examples ({len(examples)}), using all examples")
        return examples
    unique_numbers = random.sample(range(0, len(examples)), num)
    # Direct indexing replaces a full scan of `examples` per drawn position.
    example1 = [examples[n] for n in unique_numbers]
    print(f"Sampled {len(example1)} examples from {len(examples)} total")
    return example1

def _encode_fixed_length(tokenizer, text, max_length):
    """Tokenize ``text`` and return (ids, mask), each exactly ``max_length`` long."""
    # Reserve two positions for the CLS and SEP tokens added below.
    tokens = tokenizer.tokenize(text)[:max_length - 2]
    tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
    ids = tokenizer.convert_tokens_to_ids(tokens)
    padding_length = max_length - len(ids)
    mask = [1] * len(tokens) + [0] * padding_length
    ids = ids + [tokenizer.pad_token_id] * padding_length
    return ids, mask


def text_to_feature(examples, code_tokenizer, text_tokenizer, stage=None, max_length=512):
    """Convert examples into fixed-length InputFeatures.

    Each example's ``code`` is encoded with ``code_tokenizer`` and its ``text``
    with ``text_tokenizer``: tokenize, truncate, wrap in CLS/SEP, convert to
    ids, and right-pad with the tokenizer's pad id. The mask is 1 for real
    tokens and 0 for padding.

    Args:
        examples: iterable of objects with ``code``, ``text``, ``label`` and ``idx``.
        code_tokenizer: tokenizer used for the code strings.
        text_tokenizer: tokenizer used for the text strings.
        stage: accepted for call compatibility; not used.
        max_length: length of every id/mask list, including CLS and SEP
            (default 512, the value that was previously hard-coded).

    Returns:
        list of InputFeatures, one per example, in input order.

    Raises:
        ValueError: if ``max_length`` is smaller than 2.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to hold the CLS and SEP tokens")

    features = []
    print(f"Converting {len(examples)} examples to features...")
    for example in examples:
        inputs_code_ids, inputs_code_masks = _encode_fixed_length(code_tokenizer, example.code, max_length)
        inputs_text_ids, inputs_text_masks = _encode_fixed_length(text_tokenizer, example.text, max_length)
        features.append(InputFeatures(
            inputs_code_ids,
            inputs_code_masks,
            inputs_text_ids,
            inputs_text_masks,
            example.label,
            example.idx
        ))
    print(f"Created {len(features)} features")
    return features

def extract_and_save_embeddings(model, dataloader, device, save_dir, split_name):
    """Run the model over a dataloader and save the embeddings it records.

    For every batch the model is called with ``save_embeddings=True`` and its
    ``last_code_embeddings``, ``last_text_embeddings``,
    ``last_attention_embeddings`` and ``last_pooled_embeddings`` attributes are
    collected. The concatenated results, several reductions of them, the labels
    and the sample indices are written to
    ``<save_dir>/<split_name>_embeddings_final.npz``.

    NOTE(review): the full per-token tensors of every sample are held in memory
    until the file is written; confirm this fits the target machine.

    Args:
        model: module exposing ``eval()``, the call signature used below and the
            four ``last_*`` attributes.
        dataloader: iterable of 6-tuples (code ids, code mask, text ids,
            text mask, labels, indices).
        device: device the batch tensors are moved to.
        save_dir: output directory; created if missing.
        split_name: prefix of the output file name.

    Returns:
        dict mapping array names to numpy arrays (the same content that is saved).
    """
    os.makedirs(save_dir, exist_ok=True)
    model.eval()

    collected = {'code': [], 'text': [], 'attention': [], 'pooled': []}
    label_values, index_values = [], []

    print(f"\nExtracting embeddings for {split_name}...")
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f'Extracting {split_name} embeddings'):
            code_ids, code_mask, text_ids, text_mask, labels, indices = (t.to(device) for t in batch)
            index_values.extend(indices.cpu().numpy())
            # The forward pass is run only for the embeddings it stores on the model.
            model(code_ids, code_mask, text_ids, text_mask, save_embeddings=True)
            collected['code'].append(model.last_code_embeddings)
            collected['text'].append(model.last_text_embeddings)
            collected['attention'].append(model.last_attention_embeddings)
            collected['pooled'].append(model.last_pooled_embeddings)
            label_values.extend(labels.cpu().numpy())

    code_all = torch.cat(collected['code'], dim=0)
    text_all = torch.cat(collected['text'], dim=0)
    attention_all = torch.cat(collected['attention'], dim=0)
    pooled_all = torch.cat(collected['pooled'], dim=0)

    # Position-0 slices, reused for the concatenated entry below
    code_first = code_all[:, 0, :].numpy()
    text_first = text_all[:, 0, :].numpy()

    embeddings_dict = {
        'code_embeddings_full': code_all.numpy(),
        'text_embeddings_full': text_all.numpy(),
        'attention_embeddings_full': attention_all.numpy(),
        'code_embeddings_cls': code_first,
        'text_embeddings_cls': text_first,
        'attention_embeddings_cls': attention_all[:, 0, :].numpy(),
        'code_embeddings_mean': code_all.mean(dim=1).numpy(),
        'text_embeddings_mean': text_all.mean(dim=1).numpy(),
        'attention_embeddings_mean': attention_all.mean(dim=1).numpy(),
        'pooled_embeddings': pooled_all.numpy(),
        'code_text_cls_concat': np.concatenate([code_first, text_first], axis=1),
        'labels': np.array(label_values),
        'indices': np.array(index_values),
    }

    save_path = os.path.join(save_dir, f'{split_name}_embeddings_final.npz')
    np.savez_compressed(save_path, **embeddings_dict)
    print(f"Saved {split_name} embeddings to {save_path}")
    return embeddings_dict

def evaluate(eval_dataloader, model, device):
    """Score the model on a dataloader and return accuracy, F1, recall and precision.

    The model is switched to eval mode and all forward passes run under
    ``torch.no_grad()`` so no autograd graph is built during evaluation.

    Args:
        eval_dataloader: iterable of 5-tuples (code ids, code mask, text ids,
            text mask, labels).
        model: callable returning per-class scores of shape (batch, classes).
        device: device the batch tensors are moved to.

    Returns:
        dict with keys 'acc', 'f1', 'rec', 'prec' and 'execution_time' (seconds).

    Raises:
        ZeroDivisionError: if the dataloader yields no samples.
    """
    start_time = time.time()
    total_correct, total_examples = 0.0, 0.0
    all_pre, all_labels = [], []
    model.eval()
    with torch.no_grad():
        for batch in eval_dataloader:
            inputs_code_id, inputs_code_mask, inputs_text_id, inputs_text_mask, inputs_label = [x.to(device) for x in batch]
            mlp_output = model(inputs_code_id, inputs_code_mask, inputs_text_id, inputs_text_mask, save_embeddings=False)
            pred = torch.argmax(mlp_output, dim=1)
            all_labels += inputs_label.tolist()
            all_pre += pred.tolist()
            correct = torch.sum(pred == inputs_label)
            total_correct += correct.item()
            total_examples += int(mlp_output.size(0))
    acc = total_correct / total_examples
    f1 = f1_score(y_true=all_labels, y_pred=all_pre)
    rec = recall_score(y_true=all_labels, y_pred=all_pre)
    prec = precision_score(y_true=all_labels, y_pred=all_pre)
    return {'acc': acc, 'f1': f1, 'rec': rec, 'prec': prec, 'execution_time': time.time() - start_time}

def _features_to_tensors(features):
    """Stack InputFeatures into (code ids, code masks, text ids, text masks, labels, indices)."""
    return (
        torch.tensor([f.inputs_code_ids for f in features]),
        torch.tensor([f.inputs_code_masks for f in features]),
        torch.tensor([f.inputs_text_ids for f in features]),
        torch.tensor([f.inputs_text_masks for f in features]),
        torch.tensor([f.label for f in features]),
        torch.tensor([f.idx for f in features]),
    )


def main():
    """Fine-tune Code_Note on sampled training pairs, then export embeddings.

    Steps: verify the four input CSV files, load the two pre-trained encoders,
    train for a fixed number of epochs while checkpointing the epoch with the
    highest validation accuracy to 'best_model.pth', and finally write train
    and validation embeddings under 'embeddings/'.

    Returns:
        (train_embeddings, val_embeddings): the dicts returned by
        extract_and_save_embeddings for the two splits.

    Raises:
        FileNotFoundError: if any input CSV file is missing. The check runs
            before the model download so a missing file fails fast.
    """
    print("Starting training with optimized embedding extraction...")
    epochs, batchsize = 10, 4
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    train_codefile, train_textfile = 'devign_train_normalized.csv', 'ss_train.csv'
    eval_codefile, eval_textfile = 'devign_val_normalized.csv', 'ss_val.csv'
    print("Checking if files exist...")
    missing_files = []
    for file in [train_codefile, train_textfile, eval_codefile, eval_textfile]:
        if os.path.exists(file):
            print(f"✓ Found: {file}")
        else:
            print(f"✗ Missing: {file}")
            missing_files.append(file)
    if missing_files:
        raise FileNotFoundError("Missing required input file(s): " + ", ".join(missing_files))

    print("Loading pre-trained models...")
    code_tokenizer = AutoTokenizer.from_pretrained('Salesforce/codet5p-110m-embedding', trust_remote_code=True)
    code_model = AutoModel.from_pretrained('Salesforce/codet5p-110m-embedding', trust_remote_code=True)
    text_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    text_model = RobertaModel.from_pretrained('roberta-base')
    model = Code_Note(code_model, text_model, 768, 1536, 384)
    model.to(device)
    print("Models loaded successfully!")

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-6)

    print("\nLoading training data...")
    examples = read_file(train_codefile, train_textfile)
    examples = mini_sample(examples, 2000)
    print(f"Using {len(examples)} training samples")
    train_examples = text_to_feature(examples, code_tokenizer, text_tokenizer, 'train')
    train_tensors = _features_to_tensors(train_examples)
    # Training batches carry the first five tensors; the sample indices are
    # only needed for the embedding export at the end.
    train_data = TensorDataset(*train_tensors[:5])
    train_dataloader = DataLoader(train_data, batch_size=batchsize, shuffle=True)

    print("\nLoading validation data...")
    eva_examples = read_file(eval_codefile, eval_textfile)
    print(f"Using {len(eva_examples)} validation samples")
    eval_examples = text_to_feature(eva_examples, code_tokenizer, text_tokenizer, 'eval')
    eval_tensors = _features_to_tensors(eval_examples)
    eval_data = TensorDataset(*eval_tensors[:5])
    eval_dataloader = DataLoader(eval_data, batch_size=batchsize, shuffle=False)

    os.makedirs('embeddings', exist_ok=True)
    best_metrics, best_epoch = {}, 0
    print(f"\nStarting training for {epochs} epochs...")
    for epoch in range(epochs):
        train_total_loss, train_total_correct, train_total_examples = 0.0, 0.0, 0.0
        model.train()
        loop = tqdm(train_dataloader, total=len(train_dataloader))
        for bidx, batch in enumerate(loop):
            inputs_code_id, inputs_code_mask, inputs_text_id, inputs_text_mask, inputs_label = [x.to(device) for x in batch]
            optimizer.zero_grad()
            mlp_output = model(inputs_code_id, inputs_code_mask, inputs_text_id, inputs_text_mask, save_embeddings=False)
            loss = criterion(mlp_output, inputs_label)
            loss.backward()
            optimizer.step()
            pred = torch.argmax(mlp_output, dim=1)
            train_total_loss += loss.item()
            correct = torch.sum(pred == inputs_label)
            train_total_correct += correct.item()
            train_total_examples += int(mlp_output.size(0))
            loop.set_description(f'Epoch [{epoch+1}/{epochs}]')
            loop.set_postfix({'Train Loss': f'{train_total_loss/(bidx+1):.4f}', 'Train ACC': f'{train_total_correct/train_total_examples:.4f}'})
        metrics = evaluate(eval_dataloader, model, device)
        eval_acc, eval_f1, eval_rec, eval_prec, eval_time = metrics['acc'], metrics['f1'], metrics['rec'], metrics['prec'], metrics['execution_time']
        print(f'Epoch [{epoch + 1}/{epochs}] val_time: {eval_time:.2f}s val_acc={eval_acc:.4f}, val_f1={eval_f1:.4f}, val_recall={eval_rec:.4f}, val_precision={eval_prec:.4f}')
        # Ties go to the later epoch because the comparison is >=.
        if epoch == 0 or eval_acc >= best_metrics.get('acc', 0):
            best_metrics, best_epoch = metrics, epoch
            torch.save({'epoch': epoch,'model_state_dict': model.state_dict(),'optimizer_state_dict': optimizer.state_dict(),'best_metrics': best_metrics}, 'best_model.pth')
        print(f'Best epoch: {best_epoch+1} acc={best_metrics["acc"]:.4f}, f1={best_metrics["f1"]:.4f}\n')

    print("\nTRAINING COMPLETED! Now extracting embeddings...")
    # NOTE(review): the export below uses the weights held in `model` after the
    # last epoch; the checkpoint in best_model.pth is not reloaded first.
    # Confirm that this is intended.
    train_data_indices = TensorDataset(*train_tensors)
    train_dataloader_indices = DataLoader(train_data_indices, batch_size=batchsize, shuffle=False)
    eval_data_indices = TensorDataset(*eval_tensors)
    eval_dataloader_indices = DataLoader(eval_data_indices, batch_size=batchsize, shuffle=False)
    train_embeddings = extract_and_save_embeddings(model, train_dataloader_indices, device, 'embeddings', 'train')
    val_embeddings = extract_and_save_embeddings(model, eval_dataloader_indices, device, 'embeddings', 'val')
    print("\nAll done! Best model saved: 'best_model.pth', embeddings saved in 'embeddings/'")
    return train_embeddings, val_embeddings

if __name__ == "__main__":
    # main() is expected to return (train_embeddings, val_embeddings). Fall back
    # to (None, None) if it returns None so the unpacking cannot raise TypeError.
    _result = main()
    train_emb, val_emb = _result if _result is not None else (None, None)


Overwriting run.py


In [None]:
# Execute the training script in this notebook's namespace.
# The file is opened in a `with` block so the handle is closed, and the source
# is compiled with its file name so tracebacks point at run.py.
with open('run.py', encoding='utf-8') as script:
    exec(compile(script.read(), 'run.py', 'exec'))

✅ selfattention.py created successfully!
✅ model.py created successfully!
Starting training with optimized embedding extraction...
Using device: cuda
Loading pre-trained models...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_codet5p_embedding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/codet5p-110m-embedding:
- configuration_codet5p_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_codet5p_embedding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/codet5p-110m-embedding:
- modeling_codet5p_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Models loaded successfully!
Checking if files exist...
✓ Found: devign_train_normalized.csv
✓ Found: ss_train.csv
✓ Found: devign_val_normalized.csv
✓ Found: ss_val.csv

Loading training data...
Reading code file: devign_train_normalized.csv
Reading text file: ss_train.csv
Code data shape: (21854, 2)
Text data shape: (21854, 2)
Created 21854 examples


Token indices sequence length is longer than the specified maximum sequence length for this model (1754 > 512). Running this sequence through the model will result in indexing errors


Sampled 2000 examples from 21854 total
Using 2000 training samples
Converting 2000 examples to features...
Created 2000 features

Loading validation data...
Reading code file: devign_val_normalized.csv
Reading text file: ss_val.csv
Code data shape: (2732, 2)
Text data shape: (2732, 2)
Created 2732 examples
Using 2732 validation samples
Converting 2732 examples to features...
Created 2732 features

Starting training for 10 epochs...


Epoch [1/10]: 100%|██████████| 500/500 [07:03<00:00,  1.18it/s, Train Loss=0.6907, Train ACC=0.5235]


Epoch [1/10] val_time: 184.76s val_acc=0.5582, val_f1=0.1421, val_recall=0.0842, val_precision=0.4545
Best epoch: 1 acc=0.5582, f1=0.1421



Epoch [2/10]: 100%|██████████| 500/500 [07:07<00:00,  1.17it/s, Train Loss=0.6866, Train ACC=0.5510]


Epoch [2/10] val_time: 184.44s val_acc=0.5545, val_f1=0.0992, val_recall=0.0564, val_precision=0.4085
Best epoch: 1 acc=0.5582, f1=0.1421



Epoch [3/10]: 100%|██████████| 500/500 [07:07<00:00,  1.17it/s, Train Loss=0.6849, Train ACC=0.5625]


Epoch [3/10] val_time: 184.60s val_acc=0.5564, val_f1=0.1217, val_recall=0.0708, val_precision=0.4352
Best epoch: 1 acc=0.5582, f1=0.1421



Epoch [4/10]: 100%|██████████| 500/500 [07:07<00:00,  1.17it/s, Train Loss=0.6840, Train ACC=0.5700]


Epoch [4/10] val_time: 184.68s val_acc=0.5564, val_f1=0.1429, val_recall=0.0851, val_precision=0.4449
Best epoch: 1 acc=0.5582, f1=0.1421



Epoch [5/10]: 100%|██████████| 500/500 [07:07<00:00,  1.17it/s, Train Loss=0.6835, Train ACC=0.5735]


Epoch [5/10] val_time: 184.78s val_acc=0.5545, val_f1=0.1411, val_recall=0.0842, val_precision=0.4348
Best epoch: 1 acc=0.5582, f1=0.1421



Epoch [6/10]: 100%|██████████| 500/500 [07:07<00:00,  1.17it/s, Train Loss=0.6801, Train ACC=0.5800]


Epoch [6/10] val_time: 184.47s val_acc=0.5589, val_f1=0.1707, val_recall=0.1045, val_precision=0.4662
Best epoch: 6 acc=0.5589, f1=0.1707



Epoch [7/10]: 100%|██████████| 500/500 [07:07<00:00,  1.17it/s, Train Loss=0.6762, Train ACC=0.5775]


Epoch [7/10] val_time: 184.23s val_acc=0.5545, val_f1=0.1838, val_recall=0.1154, val_precision=0.4507
Best epoch: 6 acc=0.5589, f1=0.1707



Epoch [8/10]: 100%|██████████| 500/500 [07:07<00:00,  1.17it/s, Train Loss=0.6724, Train ACC=0.5945]


Epoch [8/10] val_time: 184.76s val_acc=0.5560, val_f1=0.2298, val_recall=0.1525, val_precision=0.4665
Best epoch: 6 acc=0.5589, f1=0.1707



Epoch [9/10]: 100%|██████████| 500/500 [07:07<00:00,  1.17it/s, Train Loss=0.6664, Train ACC=0.6015]


Epoch [9/10] val_time: 184.67s val_acc=0.5567, val_f1=0.2427, val_recall=0.1634, val_precision=0.4709
Best epoch: 6 acc=0.5589, f1=0.1707



Epoch [10/10]: 100%|██████████| 500/500 [07:07<00:00,  1.17it/s, Train Loss=0.6580, Train ACC=0.6190]


Epoch [10/10] val_time: 184.52s val_acc=0.5421, val_f1=0.4688, val_recall=0.4650, val_precision=0.4726
Best epoch: 6 acc=0.5589, f1=0.1707


TRAINING COMPLETED! Now extracting embeddings...

Extracting embeddings for train...


Extracting train embeddings: 100%|██████████| 500/500 [02:19<00:00,  3.59it/s]


In [None]:
# Check what files were created
import os
from google.colab import files

BYTES_PER_MB = 1024 * 1024

print("Checking for saved files...")

# Trained model checkpoint: report its size and offer it for download
model_path = 'best_model.pth'
if not os.path.exists(model_path):
    print("✗ No trained model found")
else:
    model_bytes = os.path.getsize(model_path)
    print(f"✓ Found trained model: best_model.pth ({model_bytes / BYTES_PER_MB:.1f} MB)")
    files.download(model_path)
    print("Model downloaded!")

# Embedding archives: list every entry of the folder with its size and download it
embeddings_dir = 'embeddings'
if not os.path.exists(embeddings_dir):
    print("No embeddings folder found")
else:
    print("\nEmbeddings folder contents:")
    for entry_name in os.listdir(embeddings_dir):
        entry_path = os.path.join(embeddings_dir, entry_name)
        entry_bytes = os.path.getsize(entry_path)
        print(f"  {entry_name}: {entry_bytes / BYTES_PER_MB:.1f} MB")
        files.download(entry_path)

# Remaining artefacts: download whichever of these exist
for extra_name in ('result.txt', 'run.py', 'model.py', 'selfattention.py'):
    if not os.path.exists(extra_name):
        continue
    print(f"✓ Found: {extra_name}")
    files.download(extra_name)

Checking for saved files...
✓ Found trained model: best_model.pth (2722.5 MB)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model downloaded!

Embeddings folder contents:
✓ Found: run.py


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✓ Found: model.py


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✓ Found: selfattention.py


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
# Report the checkpoint size, tolerating a missing file (for example after a
# runtime reset) instead of raising FileNotFoundError.
if os.path.exists("best_model.pth"):
    size = os.path.getsize("best_model.pth") / (1024*1024)
    print(f"{size:.1f} MB")
else:
    print("best_model.pth not found")


FileNotFoundError: [Errno 2] No such file or directory: 'best_model.pth'

In [None]:
import os

# Report which training artefacts are present on disk
artifact_checks = [
    ("best_model.pth exists:", "best_model.pth"),
    ("train embeddings exist:", "embeddings/train_embeddings_final.npz"),
    ("val embeddings exist:", "embeddings/val_embeddings_final.npz"),
]
for message, artifact_path in artifact_checks:
    print(message, os.path.exists(artifact_path))


best_model.pth exists: False
train embeddings exist: False
val embeddings exist: False
