In [None]:
import requests
import time
import json
import os

# Root of the Apache Jira REST API (v2); endpoint paths are appended to it.
JIRA_API_BASE = "https://issues.apache.org/jira/rest/api/2"

# Jira project keys to scrape, processed in the order listed.
PROJECTS = ["HADOOP", "SPARK", "KAFKA"]

# Number of issues requested per search call (sent as `maxResults`).
PAGE_SIZE = 50

# Upper bound on attempts made for a single HTTP request before giving up.
MAX_RETRIES = 5

# JSON file holding the per-project paging offsets used to resume a run.
CHECKPOINT_FILE = "jira_scraper_checkpoint.json"

# Default headers for every request: ask the server for JSON responses.
HEADERS = {"Accept": "application/json"}

class JiraDataScraper:
    """Scrape issues and their comments from the Jira REST API, resumably.

    Progress is tracked per project as a ``startAt`` offset in
    ``self.checkpoint`` and persisted to ``CHECKPOINT_FILE`` so that an
    interrupted run can continue where it stopped.
    """

    def __init__(self, projects):
        """Create a session with the JSON headers and load any saved offsets.

        Args:
            projects: Iterable of Jira project keys to scrape.
        """
        self.projects = projects
        self.session = requests.Session()
        self.session.headers.update(HEADERS)
        self.checkpoint = self.load_checkpoint()

    def load_checkpoint(self):
        """Return the per-project offsets, resuming from the checkpoint file.

        Every project in ``self.projects`` starts at offset 0; offsets found
        in ``CHECKPOINT_FILE`` (if it exists) override those defaults, so a
        project added after the checkpoint was written still gets an entry.

        Raises:
            ValueError: If the checkpoint file exists but is not valid JSON.
        """
        offsets = {project: 0 for project in self.projects}
        if os.path.exists(CHECKPOINT_FILE):
            with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
                offsets.update(json.load(f))
        return offsets

    def save_checkpoint(self):
        """Persist ``self.checkpoint`` to ``CHECKPOINT_FILE`` atomically."""
        # Write to a sibling temp file and rename it over the real one so an
        # interruption mid-write cannot leave a truncated checkpoint behind.
        tmp_path = f"{CHECKPOINT_FILE}.tmp"
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(self.checkpoint, f)
        os.replace(tmp_path, CHECKPOINT_FILE)

    @staticmethod
    def _retry_after_seconds(response, default=10):
        """Return the ``Retry-After`` delay in whole seconds.

        Falls back to ``default`` when the header is absent or is not an
        integer (the header may also carry an HTTP-date).
        """
        try:
            return max(0, int(response.headers.get('Retry-After', default)))
        except (TypeError, ValueError):
            return default

    def get_json_with_retries(self, url, params=None):
        """GET ``url`` and return the decoded JSON body, retrying on failure.

        Up to ``MAX_RETRIES`` attempts are made. HTTP 429 waits for the
        ``Retry-After`` delay; 5xx responses, request exceptions and
        undecodable bodies back off exponentially (``2 ** attempt`` seconds).
        Any other status code aborts immediately.

        Args:
            url: Absolute URL to request.
            params: Optional query parameters.

        Returns:
            The parsed JSON payload, or ``None`` if the request was aborted
            or every attempt failed.
        """
        for attempt in range(MAX_RETRIES):
            try:
                response = self.session.get(url, params=params, timeout=15)
                status = response.status_code
                if status == 200:
                    return response.json()
                if status == 429:
                    delay = self._retry_after_seconds(response)
                    print(f"Rate limit hit. Sleeping for {delay} seconds.")
                elif 500 <= status < 600:
                    delay = 2 ** attempt
                    print(f"Server error {status}. Retrying in {delay} seconds.")
                else:
                    print(f"Unexpected status code: {status}. Aborting request.")
                    return None
            except (requests.RequestException, ValueError) as e:
                # ValueError covers an invalid JSON body on requests versions
                # whose decode error is not a RequestException subclass.
                delay = 2 ** attempt
                print(f"Request failed with {e}. Retrying after {delay} seconds.")
            # No point in waiting once there are no attempts left.
            if attempt < MAX_RETRIES - 1:
                time.sleep(delay)
        return None

    def fetch_issues(self, project, start_at):
        """Fetch one page of issues for ``project`` beginning at ``start_at``.

        Issues are requested oldest-first (``order by created asc``) with at
        most ``PAGE_SIZE`` results.

        Returns:
            The search response as a dict, or ``None`` on failure.
        """
        url = f"{JIRA_API_BASE}/search"
        jql_query = f"project={project} order by created asc"
        params = {
            'jql': jql_query,
            'startAt': start_at,
            'maxResults': PAGE_SIZE,
            'fields': 'summary,status,assignee,reporter,priority,labels,created,updated,description',
        }
        return self.get_json_with_retries(url, params)

    def fetch_comments(self, issue_key):
        """Return the raw comment dicts for ``issue_key``.

        An empty list is returned when the request fails or the response has
        no ``comments`` key, so a failed comment fetch is indistinguishable
        from an issue without comments.
        """
        url = f"{JIRA_API_BASE}/issue/{issue_key}/comment"
        data = self.get_json_with_retries(url)
        if data and 'comments' in data:
            return data['comments']
        return []

    def _build_issue_record(self, project, issue):
        """Convert one raw search result into the flat output record.

        Fetches the issue's comments as part of building the record. Fields
        that the API returns as ``null`` are treated like missing fields.
        """
        issue_key = issue.get('key')
        fields = issue.get('fields') or {}

        comments = [
            {
                'author': (c.get('author') or {}).get('displayName'),
                'created': c.get('created'),
                'body': c.get('body', ''),
            }
            for c in self.fetch_comments(issue_key)
        ]

        return {
            'issue_id': issue_key,
            'project': project,
            'title': fields.get('summary'),
            # `or {}` guards against an explicit null, same as the fields below.
            'status': (fields.get('status') or {}).get('name'),
            'reporter': (fields.get('reporter') or {}).get('displayName'),
            'assignee': (fields.get('assignee') or {}).get('displayName'),
            'priority': (fields.get('priority') or {}).get('name'),
            'labels': fields.get('labels') or [],
            'created': fields.get('created'),
            'updated': fields.get('updated'),
            'description': fields.get('description', '') or '',
            'comments': comments,
            # Placeholder for LLM tasks (to fill in later)
            'derived_tasks': {
                'summarization': '',
                'classification': '',
                'qa': [],
            },
        }

    def iter_project_pages(self, project):
        """Yield the issues of ``project`` one page (list of records) at a time.

        The checkpoint for a page is advanced and saved only when the caller
        asks for the *next* page, i.e. after the caller has finished handling
        the current one. A caller that persists each page before continuing
        therefore never has a checkpoint pointing past unsaved data; after a
        crash the last page may be fetched again on resume. The generator
        must be exhausted for the final page's checkpoint to be recorded.

        Iteration stops when a fetch fails, when an empty page is returned,
        or when the offset reaches the reported ``total``.
        """
        start_at = self.checkpoint.get(project, 0)

        while True:
            data = self.fetch_issues(project, start_at)
            if not data:
                print(f"Failed to fetch issues for project {project} at offset {start_at}. Stopping.")
                return

            issues = data.get('issues', [])
            if not issues:
                print(f"No more issues found for project {project}. Completed.")
                return

            yield [self._build_issue_record(project, issue) for issue in issues]

            # Advance by what was actually returned: the server may cap a
            # page below PAGE_SIZE, and stepping by PAGE_SIZE would skip issues.
            start_at += len(issues)
            self.checkpoint[project] = start_at
            self.save_checkpoint()

            # A missing total is treated as unknown; the empty-page check
            # above then ends the loop.
            total = data.get('total')
            if total is not None and start_at >= total:
                print(f"Finished all pages for project {project}.")
                return

    def scrape_project_issues(self, project):
        """Scrape all remaining issues and comments for a single project.

        Returns:
            A list of issue records, starting from the project's checkpoint.

        Note:
            The checkpoint is saved after each page while the records are
            only held in memory, so a crash before the caller stores the
            returned list loses those pages. Callers that need crash-safe
            resumption should consume ``iter_project_pages`` and persist
            each page as it arrives.
        """
        collected_issues = []
        for page in self.iter_project_pages(project):
            collected_issues.extend(page)
        return collected_issues


def main():
    """Scrape every project in ``PROJECTS`` into ``<PROJECT>_issues.jsonl``.

    Each page is appended and flushed to the output file before the scraper
    is asked for the next page, which is when it records the checkpoint.
    """
    scraper = JiraDataScraper(PROJECTS)
    for proj in PROJECTS:
        print(f"Scraping project: {proj}")

        # Append issues as JSONL to file, one page at a time
        filename = f"{proj}_issues.jsonl"
        with open(filename, 'a', encoding='utf-8') as f:
            for page in scraper.iter_project_pages(proj):
                f.writelines(json.dumps(issue) + '\n' for issue in page)
                f.flush()


if __name__ == '__main__':
    main()


Scraping project: HADOOP
