In [4]:
import os
import requests
import json
import pandas as pd
import time # For pausing between requests to avoid hitting rate limits

# --- Configuration ---
# Root of the JIRA instance; must end with a slash because the endpoint path
# below is appended by plain string concatenation.
JIRA_BASE_URL = "https://issues.apache.org/jira/"
API_ENDPOINT = JIRA_BASE_URL + "rest/api/2/search"
JIRA_PROJECT_KEY = "KAFKA"
ISSUE_TYPE = "Bug"
MAX_RESULTS_PER_PAGE = 100 # Maximum allowed by JIRA API usually

# JQL query to fetch all bug issues for Kafka.
# The values are quoted so that changing the configuration to a multi-word
# value (e.g. ISSUE_TYPE = "New Feature") still yields valid JQL.
JQL_QUERY = (
    f'project = "{JIRA_PROJECT_KEY}" AND issuetype = "{ISSUE_TYPE}" '
    "ORDER BY created DESC"
)

# --- Authentication (optional) ---
# Public JIRA instances such as Apache's usually allow anonymous read-only access,
# so authentication is often not required for basic reads.
# If you hit rate limits or need more data, consider using a session or basic auth:
# AUTH = ('your_username', 'your_password_or_token')
# headers = {'Content-Type': 'application/json'} # Usually not needed for GET, but good practice
# If using a token: headers = {"Authorization": "Bearer YOUR_TOKEN"}

# --- Data Collection ---
# Page through the JIRA search endpoint, accumulating the raw issue dicts in
# `all_issues` until the server reports no more results or an error occurs.

# Seconds to wait for the server before giving up on a request; without a
# timeout, requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30
# Pause between page requests to be respectful of the API's rate limits.
# For public JIRA, 1-2 seconds between requests is a good starting point.
PAGE_DELAY_SECONDS = 1.5

all_issues = []
start_at = 0
total_issues = None # Will be populated by the first API call

print(f"Starting data collection for {JIRA_PROJECT_KEY} {ISSUE_TYPE} issues...")

while True:
    params = {
        'jql': JQL_QUERY,
        'startAt': start_at,
        'maxResults': MAX_RESULTS_PER_PAGE,
        'fields': 'summary,description,status,resolution,issuetype,priority,creator,assignee,created,updated,resolutiondate,labels,components,fixVersions,comment' # Request relevant fields
    }

    # Transport/HTTP failures and JSON decoding failures are handled in two
    # separate try blocks: recent versions of requests raise a JSON error that
    # is also a RequestException, so a shared try block would never reach the
    # JSON handler (and its response dump).
    try:
        response = requests.get(
            API_ENDPOINT,
            params=params,
            timeout=REQUEST_TIMEOUT_SECONDS,
        ) # Add auth=AUTH if using authentication
        response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error during API request: {e}")
        break

    try:
        data = response.json()
    except ValueError as e: # JSON decode errors are ValueError subclasses
        print(f"Error decoding JSON response: {e}")
        print(f"Response content: {response.text}")
        break

    if total_issues is None:
        total_issues = data.get('total')
        print(f"Total {total_issues} issues found.")

    issues_on_page = data.get('issues', [])
    if not issues_on_page:
        print("No more issues found or end of pagination.")
        break # No more issues

    all_issues.extend(issues_on_page)
    print(f"Collected {len(all_issues)}/{total_issues} issues so far...")

    # Advance by the number actually returned; the server may cap the page
    # size below the requested maxResults.
    start_at += len(issues_on_page)

    # `total` may be absent from the response; in that case keep paging until
    # an empty page is returned instead of comparing against None.
    if total_issues is not None and start_at >= total_issues:
        print("All issues collected.")
        break

    # Only pause when another request is actually going to be made.
    time.sleep(PAGE_DELAY_SECONDS)

print(f"\nFinished collecting {len(all_issues)} issues.")

# --- Process Collected Data ---
# Flatten each raw issue dict from `all_issues` into one row of scalar values.
# JIRA returns unset fields as explicit nulls (e.g. "priority": null), so every
# nested lookup below must tolerate None as well as a missing key.


def _nested_value(container, field, attribute='name'):
    """Return container[field][attribute], or None if either level is missing or null."""
    value = container.get(field)
    return value.get(attribute) if isinstance(value, dict) else None


def _join_names(entries):
    """Join the 'name' of each dict in entries with ', '; null/empty input gives ''."""
    return ', '.join(
        str(entry['name'])
        for entry in (entries or [])
        if isinstance(entry, dict) and entry.get('name')
    )


processed_issue_data = []
for issue in all_issues:
    fields = issue.get('fields') or {}

    # Labels normally arrive as a list of plain strings; anything else is
    # stringified as-is, and a null value becomes the empty string.
    labels_value = fields.get('labels', [])
    if isinstance(labels_value, list):
        labels = ', '.join(str(label) for label in labels_value)
    else:
        labels = str(labels_value) if labels_value is not None else ''

    processed_issue_data.append({
        'key': issue.get('key'),
        'summary': fields.get('summary'),
        'description': fields.get('description'),
        'status': _nested_value(fields, 'status'),
        'resolution': _nested_value(fields, 'resolution'),
        'issue_type': _nested_value(fields, 'issuetype'),
        'priority': _nested_value(fields, 'priority'),
        'creator': _nested_value(fields, 'creator', 'displayName'),
        'assignee': _nested_value(fields, 'assignee', 'displayName'),
        'created_at': fields.get('created'),
        'updated_at': fields.get('updated'),
        'resolved_at': fields.get('resolutiondate'),
        'labels': labels,
        # Components and fix versions are lists of dicts carrying a 'name' key.
        'components': _join_names(fields.get('components')),
        'fix_versions': _join_names(fields.get('fixVersions')),
        'comments_count': _nested_value(fields, 'comment', 'total'),
    })

# --- Save to Raw Data Folder ---
# Ensure the output directory exists before saving. exist_ok=True makes the
# creation safe even if the directory appears between the check and the call;
# the prior existence check is kept only to decide whether to print the notice.
output_dir = "data/raw/"
directory_was_missing = not os.path.exists(output_dir)
os.makedirs(output_dir, exist_ok=True) # Creates 'data' and 'data/raw' as needed
if directory_was_missing:
    print(f"Created directory: {output_dir}")

# Build the DataFrame from the flattened rows and write it out without the index column.
df_raw = pd.DataFrame(processed_issue_data)
output_path = os.path.join(output_dir, "kafka_bug_reports_raw.csv") # Use os.path.join for better path handling
df_raw.to_csv(output_path, index=False)
print(f"Raw data saved to {output_path}")

# --- Initial EDA (Quick Check) ---
# Print a quick overview of the collected data: first rows, dtypes,
# per-column null counts, and value distributions for two categorical columns.
print("\n--- Initial Data Exploration ---")
print(df_raw.head())
print("\n--- Data Info ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df_raw.info()
print("\n--- Missing Values ---")
print(df_raw.isnull().sum())

# When no issues were collected the frame has no columns at all; guard the
# column lookups so the summary does not end in a KeyError.
for heading, column in (
    ("\n--- Unique Issue Types (should be mostly 'Bug') ---", 'issue_type'),
    ("\n--- Unique Statuses ---", 'status'),
):
    print(heading)
    if column in df_raw.columns:
        print(df_raw[column].value_counts())
    else:
        print(f"Column '{column}' not present (no issues collected).")

Starting data collection for KAFKA Bug issues...
Total 8500 issues found.
Collected 100/8500 issues so far...
Collected 200/8500 issues so far...
Collected 300/8500 issues so far...
Collected 400/8500 issues so far...
Collected 500/8500 issues so far...
Collected 600/8500 issues so far...
Collected 700/8500 issues so far...
Collected 800/8500 issues so far...
Collected 900/8500 issues so far...
Collected 1000/8500 issues so far...
Collected 1100/8500 issues so far...
Collected 1200/8500 issues so far...
Collected 1300/8500 issues so far...
Collected 1400/8500 issues so far...
Collected 1500/8500 issues so far...
Collected 1600/8500 issues so far...
Collected 1700/8500 issues so far...
Collected 1800/8500 issues so far...
Collected 1900/8500 issues so far...
Collected 2000/8500 issues so far...
Collected 2100/8500 issues so far...
Collected 2200/8500 issues so far...
Collected 2300/8500 issues so far...
Collected 2400/8500 issues so far...
Collected 2500/8500 issues so far...
Collected 