In [1]:
import torch
# Query CUDA availability once and reuse the answer for both report lines.
cuda_ready = torch.cuda.is_available()
print("GPU Available:", cuda_ready)
# Only ask for the device name when a GPU is actually present.
gpu_label = torch.cuda.get_device_name(0) if cuda_ready else "No GPU"
print("GPU Name:", gpu_label)

GPU Available: True
GPU Name: Tesla T4


In [2]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm
import os
import gc
import numpy as np
from google.colab import drive

In [3]:
# --- 1. Setup Google Drive and Define Paths ---
print("Mounting Google Drive...")
drive.mount('/content/drive')

# IMPORTANT: Update this path to your project folder on Google Drive
DRIVE_PROJECT_PATH = "/content/drive/My Drive/Github_data_collection/Data processing notebooks"

# Define all file paths relative to your project folder
CLEAN_SOURCE_CSV = os.path.join(DRIVE_PROJECT_PATH, "final_dataset_with_full_diffs_CLEANED.csv")
# CLEAN_SOURCE_CSV = os.path.join(DRIVE_PROJECT_PATH, "filtered_output.csv")

# Output / checkpoint file. The checkpointing and main-loop cells count the rows in this
# file and append new rows to it.
# NOTE: an earlier assignment to "final_dataset_with_embeddings.csv" was immediately
# overwritten by this one and therefore never took effect; it has been removed so the
# file that is really used is unambiguous. Change the name below to target a different file.
FINAL_OUTPUT_CSV = os.path.join(DRIVE_PROJECT_PATH, "final_dataset_with_embeddings_old.csv")

# Per-row embedding failures are appended to this log by the main loop.
ERROR_LOG_PATH = os.path.join(DRIVE_PROJECT_PATH, "embedding_errors.log")

print(f"All files will be read from and saved to: {DRIVE_PROJECT_PATH}")

Mounting Google Drive...
Mounted at /content/drive
All files will be read from and saved to: /content/drive/My Drive/Github_data_collection/Data processing notebooks


In [4]:
# --- 2. Configuration ---
# Hugging Face checkpoint used for both the tokenizer and the encoder.
MODEL_NAME = "microsoft/codebert-base"
# Number of source-CSV rows read (and appended to the output) per iteration of the main loop.
CHUNK_SIZE = 16

# Sliding Window Configuration
# MAX_LENGTH is the model input length including the two special tokens; STRIDE is the
# distance between consecutive window starts.
# NOTE(review): the main loop slices windows of MAX_LENGTH - 2 tokens, so consecutive
# windows share (MAX_LENGTH - 2) - STRIDE = 48 tokens rather than exactly OVERLAP.
MAX_LENGTH, OVERLAP = 512, 50
STRIDE = MAX_LENGTH - OVERLAP

In [5]:
# --- 3. Setup Model and Device ---
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Tokenizer and encoder are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
# Evaluation mode: the model is only used for inference in this notebook.
model.eval()


# --- 4. Checkpointing ---
# Count the data rows already present in the output CSV so the main loop can skip the
# same number of rows in the source CSV.
# NOTE(review): if the count fails, rows_to_skip falls back to 0 while the existing file
# stays in place; the main loop appends to that file - confirm this is the intended recovery.
rows_to_skip = 0
checkpoint_exists = os.path.exists(FINAL_OUTPUT_CSV)
if checkpoint_exists:
    print("Checkpoint file found. Correctly counting processed rows...")
    try:
        # Stream a single column in chunks of 1000 rows; only the row count is needed.
        checkpoint_chunks = pd.read_csv(FINAL_OUTPUT_CSV, chunksize=1000, usecols=['commit_hash'])
        rows_to_skip = sum(len(part) for part in checkpoint_chunks)
        print(f"Resuming: Found {rows_to_skip} rows already processed.")
    except Exception as e:
        print(f"Could not read checkpoint file, starting from scratch. Error: {e}")
        rows_to_skip = 0

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Checkpoint file found. Correctly counting processed rows...
Resuming: Found 8049 rows already processed.


In [None]:
# --- 5. Main Resumable Loop ---
def _window_starts(n_tokens, window, stride):
    """Yield the start offset of every sliding window needed to cover `n_tokens` tokens.

    Always yields offset 0 (so an empty sequence still gets one window) and stops as soon
    as a window reaches the end of the sequence. A plain ``range(0, n_tokens, stride)``
    would emit one extra, fully redundant tail window whenever ``window > stride``
    (e.g. 970 tokens, window 510, stride 462 -> starts 0, 462 and a redundant 924).

    Raises:
        ValueError: if `stride` is not positive (the loop could never advance).
    """
    if stride <= 0:
        raise ValueError(f"stride must be positive, got {stride}")
    start = 0
    while True:
        yield start
        if start + window >= n_tokens:
            return
        start += stride


def _embed_diff(diff_text):
    """Return one embedding vector for `diff_text` as a 1-D NumPy array.

    The text is tokenized once without special tokens and split into windows of
    MAX_LENGTH - 2 tokens whose starts are STRIDE apart. Each window is wrapped in the
    tokenizer's CLS/SEP ids and run through the model, and the first-position hidden
    states of all windows are averaged. A text that fits in a single window is simply
    the one-window case, so it is no longer tokenized a second time or padded to
    MAX_LENGTH.

    Uses the notebook globals `tokenizer`, `model`, `device`, `MAX_LENGTH` and `STRIDE`.
    """
    token_ids = tokenizer.encode(diff_text, add_special_tokens=False)
    window = MAX_LENGTH - 2  # leave room for the CLS and SEP tokens

    cls_vectors = []
    for start in _window_starts(len(token_ids), window, STRIDE):
        window_ids = [tokenizer.cls_token_id] + token_ids[start:start + window] + [tokenizer.sep_token_id]
        input_tensor = torch.tensor(window_ids).unsqueeze(0).to(device)
        with torch.no_grad():
            outputs = model(input_tensor)
        cls_vectors.append(outputs.last_hidden_state[:, 0, :])

    # _window_starts always yields at least once, so cls_vectors is never empty here.
    return torch.mean(torch.stack(cls_vectors), dim=0).cpu().numpy().flatten()


try:
    # Skip the data rows already present in the output file (line 0 is the header).
    csv_reader = pd.read_csv(CLEAN_SOURCE_CSV, chunksize=CHUNK_SIZE, skiprows=range(1, rows_to_skip + 1))

    # Loop-invariant values, computed once.
    hidden_size = model.config.hidden_size
    embedding_columns = [f'emb_{j}' for j in range(hidden_size)]

    # The header is written only when nothing has been processed before.
    write_header = (rows_to_skip == 0)

    for chunk_df in tqdm(csv_reader, desc="Generating Embeddings"):
        batch_embeddings = []

        for _, row in chunk_df.iterrows():
            try:
                diff_value = row['diff']
                # pandas reads an empty field as NaN; embed that as an empty diff instead
                # of as the literal text "nan".
                diff_text = "" if pd.isna(diff_value) else str(diff_value)
                batch_embeddings.append(_embed_diff(diff_text))
            except Exception as e:
                # Best effort: keep row alignment with a zero vector and record the failure.
                batch_embeddings.append(np.zeros(hidden_size))
                with open(ERROR_LOG_PATH, 'a', encoding='utf-8') as f:
                    f.write(f"Error on commit {row.get('commit_hash', 'N/A')}: {e}\n")

        # --- Combine original data with new embeddings ---
        embedding_df = pd.DataFrame(batch_embeddings, columns=embedding_columns)
        # Reset the index so the positional concat below lines rows up, and drop the raw diff text.
        metadata_df = chunk_df.reset_index(drop=True).drop(columns=['diff'])
        combined_chunk_df = pd.concat([metadata_df, embedding_df], axis=1)

        # --- Append the combined chunk to the final CSV on Google Drive ---
        combined_chunk_df.to_csv(FINAL_OUTPUT_CSV, mode='a', header=write_header, index=False)
        write_header = False

        # --- Clean up memory ---
        del batch_embeddings, embedding_df, metadata_df, chunk_df, combined_chunk_df
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print("\n--- Final Dataset Generation Complete --- ✅")
    print(f"Dataset with all original metadata and embeddings saved to '{FINAL_OUTPUT_CSV}'.")

except FileNotFoundError:
    print(f"ERROR: Clean file not found at '{CLEAN_SOURCE_CSV}'. Please place it in your Google Drive project folder.")

Generating Embeddings: 0it [00:00, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (5019511 > 512). Running this sequence through the model will result in indexing errors


In [None]:
import csv
import os
import pandas as pd
from tqdm.notebook import tqdm
from google.colab import drive
import sys


# --- THE KEY FIX IS HERE: Increase the CSV field size limit ---
# Start from sys.maxsize and shrink the candidate tenfold every time the csv module
# rejects it with OverflowError, until a value is accepted.
max_int = sys.maxsize
limit_applied = False
while not limit_applied:
    try:
        csv.field_size_limit(max_int)
    except OverflowError:
        max_int = int(max_int / 10)
    else:
        limit_applied = True
print("CSV field size limit increased to handle large fields.")


# --- 2. Checkpointing ---
# Count the data rows already written to the output CSV so the scan below can skip the
# same number of rows in the source CSV.
rows_to_skip = 0
if os.path.exists(FINAL_OUTPUT_CSV):
    print("Checkpoint file found. Correctly counting processed rows...")
    try:
        row_count = 0
        # Only the first column is loaded; the row count is all that is needed.
        for chunk in pd.read_csv(FINAL_OUTPUT_CSV, chunksize=1000, usecols=[0]):
            row_count += len(chunk)
        rows_to_skip = row_count
        print(f"Resuming search after {rows_to_skip} successfully processed rows.")
    except Exception as e:
        # Report the failure instead of silently falling back to row 0, so it is clear
        # why the scan starts at the beginning of the source file.
        print(f"Could not read checkpoint file, scanning from the first row. Error: {e}")
        rows_to_skip = 0

# --- 3. Main Debugging Loop ---
# Walk the source CSV with the standard-library csv reader, one row at a time, to locate
# the row on which reading fails. Rows already counted in the checkpoint are skipped first.
rows_read = 0  # data rows (header excluded) that were read successfully so far
try:
    # newline='' leaves line-ending handling to the csv module, which the csv documentation
    # requires so that newlines embedded in quoted fields are interpreted correctly.
    with open(CLEAN_SOURCE_CSV, 'r', encoding='utf-8', newline='') as infile:
        reader = csv.reader(infile)
        header = next(reader, None)  # None for an empty file instead of raising StopIteration

        print(f"Skipping {rows_to_skip} rows...")
        for _ in range(rows_to_skip):
            if next(reader, None) is None:
                break  # the source has fewer rows than the checkpoint; nothing left to scan
            rows_read += 1

        # A row number is printed only after that row has been parsed, so a failure
        # belongs to the row that follows the last number shown.
        print("\nStarting scan. If the scan crashes, the problematic row is the one after the last number printed.")

        for i, row in enumerate(reader):
            current_row_num = rows_to_skip + i + 1
            rows_read += 1
            print(f"Attempting to process row: {current_row_num}", end='\r')

    print("\nScan complete. No crash occurred.")

except FileNotFoundError:
    print(f"ERROR: Input file not found at: {CLEAN_SOURCE_CSV}")
except Exception as e:
    print(f"\nAn unexpected error occurred during the scan: {e}")
    print(f"The error occurred after {rows_read} data rows were read successfully.")