In [5]:
import json
import csv
import sys
import time
from sentence_transformers import SentenceTransformer
import numpy as np
import itertools

# Load the fine-tuned SentenceTransformer model once, at module level.
# process_issues() below reads this global `model` to compute embeddings,
# so it must be created before that function is called.
model_path = "sbert_finetuned_issues"  # Update with the correct model path
model = SentenceTransformer(model_path)
print("Model loaded successfully")

# Function to display a terminal progress line with a spinner and an ETA
def loading_animation_with_eta(message, current, total, start_time):
    """Redraw a one-line progress indicator on stdout.

    The line starts with a carriage return and has no trailing newline, so
    successive calls overwrite each other in a terminal.

    Args:
        message: Label printed at the start of the line.
        current: Number of items handled so far; also selects the spinner
            frame, so the spinner advances as progress advances.
        total: Total number of items to handle.
        start_time: ``time.time()`` timestamp taken when processing began.
    """
    spinner_frames = ("|", "/", "-", "\\")
    # Pick the frame from the progress counter. Building a new
    # itertools.cycle on every call would always yield the first frame.
    frame = spinner_frames[current % len(spinner_frames)]

    elapsed_time = time.time() - start_time
    average_time_per_item = elapsed_time / current if current > 0 else 0
    # Clamp at zero so an overshoot (current > total) never yields a
    # negative duration.
    remaining_seconds = max(0, int(average_time_per_item * (total - current)))

    # Format by hand: strftime("%H:%M:%S") on gmtime() wraps after 24 hours.
    hours, remainder = divmod(remaining_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    eta = f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    sys.stdout.write(f"\r{message} {frame} | Progress: {current}/{total} | ETA: {eta}")
    sys.stdout.flush()
    # Deliberately no sleep: this runs once per item, and pausing here would
    # throttle the caller's loop by that amount on every iteration.

# Function to process the input data and calculate embeddings
def _text_field(record, processed_key, raw_key):
    """Return record[processed_key], else record[raw_key], with missing/null mapped to ''."""
    value = record.get(processed_key, record.get(raw_key, ''))
    # JSON null arrives as None; formatting it would inject the word "None".
    return '' if value is None else value


def process_issues(json_file, csv_output):
    """Embed every issue in a JSON file and write the results to a CSV file.

    Each issue's title, body and comment bodies are joined into one text
    (the ``processed_*`` variant of a field is used when the key exists,
    otherwise the raw field), encoded with the module-level ``model``, and
    written as one CSV row with columns ``repo``, ``text``, ``embedding``
    and ``date``. The embedding is stored as the comma-separated string
    produced by ``np.array2string``.

    Args:
        json_file: Path to a UTF-8 JSON file containing a list of issue
            dicts. Each issue must have ``repo`` and ``createdAt`` keys.
        csv_output: Path of the CSV file to create (overwritten if present).

    Raises:
        KeyError: If an issue lacks ``repo`` or ``createdAt``.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding (the CSV below is written as UTF-8 too).
    with open(json_file, 'r', encoding='utf-8') as file:
        issues = json.load(file)
    total_issues = len(issues)
    print(f"Loaded {total_issues} issues from {json_file}")

    # Open CSV file for writing
    with open(csv_output, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['repo', 'text', 'embedding', 'date']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        print(f"Writing embeddings to {csv_output}")

        start_time = time.time()
        for count, issue in enumerate(issues, start=1):
            loading_animation_with_eta("Processing issue", count, total_issues, start_time)

            # Extract information
            repo = issue['repo']
            date = issue['createdAt']
            title = _text_field(issue, 'processed_title', 'title')
            body = _text_field(issue, 'processed_body', 'body')
            # `or []` also covers an explicit null comments list.
            comments = ' '.join(
                str(_text_field(comment, 'processed_comment_body', 'comment_body'))
                for comment in issue.get('comments') or []
            )

            # Combine text fields
            combined_text = f"{title} {body} {comments}".strip()

            # Calculate embedding
            embedding = model.encode(combined_text)
            # threshold=sys.maxsize disables NumPy's "..." summarisation so
            # every component is written regardless of embedding size.
            embedding_str = np.array2string(
                embedding,
                separator=',',
                max_line_width=np.inf,
                threshold=sys.maxsize,
            )

            # Write to CSV
            writer.writerow({
                'repo': repo,
                'text': combined_text,
                'embedding': embedding_str,
                'date': date
            })
        print()  # Move to the next line after processing

# Locations used by this run
json_file = 'issues.json'  # JSON file holding the issue records to embed
csv_output = 'issues_with_embeddings.csv'  # CSV file that receives the rows

# Run the embedding pipeline, then report where the result was written
process_issues(json_file=json_file, csv_output=csv_output)

print(f"Embeddings saved to {csv_output}")


Model loaded successfully
Loaded 121171 issues from issues.json
Writing embeddings to issues_with_embeddings.csv
Processing issue | | Progress: 121171/121171 | ETA: 00:00:00
Embeddings saved to issues_with_embeddings.csv
