In [None]:
"""
Efficient GitHub analysis to identify issue solvers by starting with merged PRs
"""
import json
from datetime import datetime
from github import Github
import os



def map_prs_to_issues(repo, max_prs=500):
    """
    Map merged pull requests to the issue numbers mentioned in their bodies.

    Closed PRs are requested sorted by update time (newest first). PRs that
    were closed without being merged are skipped and do not count towards
    ``max_prs``. Each merged PR body is scanned for closing keywords
    ("fixes #12", "closes: #3", ...) and for plain "issue #N" mentions.

    Args:
        repo: Object exposing ``get_pulls(state=, sort=, direction=)``
            (presumably a PyGithub ``Repository`` -- verify against caller).
        max_prs: Number of merged PRs to inspect before stopping.

    Returns:
        Dict keyed by PR number. Each value holds ``number``, ``title``,
        ``author``, ``merged_at`` (ISO-8601 string or None), ``merged_by``
        (login or None) and ``referenced_issues`` (unique issue numbers in
        first-seen order). PRs that reference no issue are left out.

    Side effects:
        Writes the mapping to ``pr_issue_mapping.json`` in the current
        working directory and prints progress to stdout.
    """
    # Function-scope import: the module's import header does not provide re.
    import re

    # Compiled once instead of per PR. The leading \b stops words such as
    # "disclose" or "prefix" from being read as the keywords "close" / "fix".
    reference_patterns = [
        # Closing keywords: "fixes #123", "closes: #456", ...
        re.compile(
            r'\b(?:close|closes|closed|fix|fixes|fixed|resolve|resolves|resolved)[\s:]+#(\d+)',
            re.IGNORECASE,
        ),
        # General issue mentions: "issue #123", "issues: #456"
        re.compile(r'\b(?:issue|issues)[\s:]+#(\d+)', re.IGNORECASE),
    ]

    closed_prs = repo.get_pulls(state='closed', sort='updated', direction='desc')

    pr_issue_mapping = {}
    processed_count = 0

    for pr in closed_prs:
        try:
            # merged_at is None for PRs closed without merging.
            # NOTE(review): merged_at is used instead of pr.merged because the
            # list endpoint is believed to omit `merged`, which would cost one
            # extra request per PR -- confirm against the PyGithub docs.
            if pr.merged_at is None:
                continue

            processed_count += 1

            pr_info = {
                "number": pr.number,
                "title": pr.title,
                "author": pr.user.login,
                "merged_at": pr.merged_at.isoformat(),
                "merged_by": pr.merged_by.login if pr.merged_by else None,
                "referenced_issues": []
            }

            # Collect each referenced issue once, in first-seen order, so a
            # body that says "fixes #12 ... see issue #12" yields [12].
            seen_issues = set()
            for pattern in reference_patterns:
                for match in pattern.findall(pr.body or ""):
                    issue_number = int(match)
                    if issue_number not in seen_issues:
                        seen_issues.add(issue_number)
                        pr_info["referenced_issues"].append(issue_number)

            # Only PRs that reference at least one issue are kept
            if pr_info["referenced_issues"]:
                pr_issue_mapping[pr.number] = pr_info
                print(f"PR #{pr.number} references issues: {pr_info['referenced_issues']}")

            # Stop after processing max_prs merged PRs
            if processed_count >= max_prs:
                break

        except Exception as e:
            # Best effort: one malformed PR must not abort the whole scan.
            print(f"Error processing PR #{pr.number}: {e}")

    # Persist the mapping so later steps can run without re-querying
    with open('pr_issue_mapping.json', 'w') as f:
        json.dump(pr_issue_mapping, f, indent=2)

    print(f"Mapped {len(pr_issue_mapping)} PRs to their referenced issues")
    return pr_issue_mapping

def analyze_issue_solvers(repo, pr_issue_mapping=None):
    """
    Build an issue-centric view: which PRs touched each issue, who authored
    the commits of those PRs, and which files they changed.

    Args:
        repo: Object exposing ``get_issue(number)`` and ``get_pull(number)``
            (presumably a PyGithub ``Repository`` -- verify against caller).
        pr_issue_mapping: Mapping of PR number (int or str) to a dict with
            ``title``, ``merged_at``, ``merged_by`` and ``referenced_issues``.
            When None it is loaded from ``pr_issue_mapping.json``.

    Returns:
        Dict keyed by issue number. Each value holds ``issue_number``,
        ``title``, ``linked_prs``, ``solvers`` (sorted logins),
        ``file_changes``, ``labels``, ``state``, ``created_at``,
        ``closed_at`` and ``opened_by``. Issues whose details could not be
        retrieved are dropped.

    Side effects:
        Writes the result to ``issue_solver_data.json`` and prints progress.
    """
    # Load mapping from file if not provided
    if pr_issue_mapping is None:
        with open('pr_issue_mapping.json', 'r') as f:
            pr_issue_mapping = json.load(f)

    issue_solver_data = {}

    # First, organize PRs by issue
    for pr_number, pr_data in pr_issue_mapping.items():
        pr_number = int(pr_number)  # JSON round-trips turn int keys into strings

        # dict.fromkeys drops repeated issue numbers (keeping order) so a PR
        # is linked to an issue once and its files are not double-counted.
        for issue_number in dict.fromkeys(pr_data["referenced_issues"]):
            if issue_number not in issue_solver_data:
                issue_solver_data[issue_number] = {
                    "issue_number": issue_number,
                    "title": None,  # Filled in below from the issue itself
                    "linked_prs": [],
                    "solvers": [],
                    "file_changes": []
                }

            issue_solver_data[issue_number]["linked_prs"].append({
                "number": pr_number,
                "title": pr_data["title"],
                "merged_at": pr_data["merged_at"],
                "merged_by": pr_data["merged_by"]
            })

    # A PR may be linked to several issues; remember its contributors and
    # file changes so each PR is fetched only once.
    pr_details_cache = {}

    # Iterate over a snapshot of the keys because failed issues are removed
    for issue_number in list(issue_solver_data):
        try:
            issue = repo.get_issue(issue_number)
            entry = issue_solver_data[issue_number]

            entry.update({
                "title": issue.title,
                "labels": [label.name for label in issue.labels],
                "state": issue.state,
                "created_at": issue.created_at.isoformat(),
                "closed_at": issue.closed_at.isoformat() if issue.closed_at else None,
                "opened_by": issue.user.login
            })

            solvers = set()
            file_changes = []

            for pr_info in entry["linked_prs"]:
                pr_number = pr_info["number"]
                try:
                    if pr_number not in pr_details_cache:
                        pr = repo.get_pull(pr_number)
                        pr_details_cache[pr_number] = (
                            get_pr_contributors(pr),
                            get_pr_file_changes(pr),
                        )
                    pr_contributors, pr_file_changes = pr_details_cache[pr_number]

                    solvers.update(pr_contributors)
                    # Copy each record so issues never share mutable dicts
                    file_changes.extend(dict(change) for change in pr_file_changes)

                except Exception as e:
                    # Best effort: a failing PR leaves the issue in place
                    print(f"Error analyzing PR #{pr_number}: {e}")

            # Sorted so the saved JSON and the log line are stable across runs
            solver_names = sorted(solvers)
            entry["solvers"] = solver_names
            entry["file_changes"] = file_changes

            print(f"Issue #{issue_number} ({issue.title}) was solved by: {', '.join(solver_names) if solver_names else 'Unknown'}")

        except Exception as e:
            print(f"Error retrieving details for issue #{issue_number}: {e}")
            # Remove issues we couldn't get details for
            issue_solver_data.pop(issue_number, None)

    # Save solver data to file
    with open('issue_solver_data.json', 'w') as f:
        json.dump(issue_solver_data, f, indent=2)

    return issue_solver_data

def get_pr_contributors(pull_request):
    """
    Return the logins of the commit authors of a pull request.

    Args:
        pull_request: Object exposing ``get_commits()`` and ``number``
            (presumably a PyGithub ``PullRequest`` -- verify against caller).

    Returns:
        Alphabetically sorted list of unique author logins. Commits whose
        ``author`` is falsy are ignored. If fetching commits fails, the
        error is printed and the logins collected so far are returned.
    """
    contributors = set()

    try:
        # Commit authors are treated as the people who wrote the code
        for commit in pull_request.get_commits():
            if commit.author:
                contributors.add(commit.author.login)
    except Exception as e:
        print(f"Error fetching commit authors for PR #{pull_request.number}: {e}")

    # Sorted so callers and saved reports see a stable order; plain set
    # iteration order for strings varies between interpreter runs.
    return sorted(contributors)

def get_pr_file_changes(pull_request):
    """
    Summarize every file touched by a pull request.

    Returns one dict per file with ``filename``, ``additions``,
    ``deletions``, ``changes`` and ``status``. If anything fails while the
    files are being read, the error is printed and an empty list is returned.
    """
    try:
        return [
            {
                "filename": changed.filename,
                "additions": changed.additions,
                "deletions": changed.deletions,
                "changes": changed.changes,
                "status": changed.status,  # 'added', 'removed', 'modified', etc.
            }
            for changed in pull_request.get_files()
        ]
    except Exception as err:
        print(f"Error fetching file changes for PR #{pull_request.number}: {err}")
        return []

def generate_solver_stats(issue_solver_data=None):
    """
    Aggregate per-contributor statistics from issue solver data.

    Every solver listed on an issue is credited with that issue and with all
    of the issue's file changes, so issues with several solvers count fully
    for each of them.

    Args:
        issue_solver_data: Mapping of issue number to a dict with ``solvers``
            (list of logins) and ``file_changes`` (list of dicts carrying
            ``filename``, ``additions``, ``deletions``, ``changes``). When
            None it is loaded from ``issue_solver_data.json``.

    Returns:
        Dict with ``total_issues``, ``total_solvers``, ``solvers_by_issues``
        (ranking, most issues first, ties ordered by login) and
        ``detailed_stats`` (per-solver totals with a sorted ``files_changed``
        list).

    Side effects:
        Writes the report to ``solver_stats.json`` and prints a summary line.
    """
    # Load data from file if not provided
    if issue_solver_data is None:
        with open('issue_solver_data.json', 'r') as f:
            issue_solver_data = json.load(f)

    solver_stats = {}

    for issue_data in issue_solver_data.values():
        for solver in issue_data["solvers"]:
            if solver not in solver_stats:
                solver_stats[solver] = {
                    "issues_solved": 0,
                    "files_changed": set(),  # set: a file counts once per solver
                    "additions": 0,
                    "deletions": 0,
                    "total_changes": 0
                }
            stats = solver_stats[solver]

            stats["issues_solved"] += 1

            for file_change in issue_data["file_changes"]:
                stats["files_changed"].add(file_change["filename"])
                stats["additions"] += file_change["additions"]
                stats["deletions"] += file_change["deletions"]
                stats["total_changes"] += file_change["changes"]

    # Sets are not JSON serializable; sorting also keeps the saved report
    # identical between runs instead of following set iteration order.
    for stats in solver_stats.values():
        stats["files_changed"] = sorted(stats["files_changed"])

    # Most issues first; the login breaks ties so the ranking is deterministic
    sorted_solvers = sorted(
        solver_stats.items(),
        key=lambda item: (-item[1]["issues_solved"], item[0])
    )

    solver_report = {
        "total_issues": len(issue_solver_data),
        "total_solvers": len(solver_stats),
        "solvers_by_issues": [
            {
                "solver": solver,
                "issues_solved": stats["issues_solved"],
                "files_changed": len(stats["files_changed"]),
                "total_changes": stats["total_changes"]
            }
            for solver, stats in sorted_solvers
        ],
        "detailed_stats": solver_stats
    }

    # Save report to file
    with open('solver_stats.json', 'w') as f:
        json.dump(solver_report, f, indent=2)

    print(f"Generated solver statistics for {len(solver_stats)} contributors")
    return solver_report


def retrieve_pr_file_contents(repo, pr_number, output_dir=None):
    """
    Retrieve the full content of all files changed in a specific PR and save them.

    Files are written under ``<output_dir>/<pr_number>_<timestamp>/`` using
    their repository-relative paths, together with a ``_metadata.json``
    describing what was saved and what was skipped.

    Args:
        repo: Object exposing ``get_pull(number)`` and
            ``get_contents(path, ref=)`` (presumably a PyGithub
            ``Repository`` -- verify against caller).
        pr_number: Pull request number
        output_dir: Directory to save files (defaults to pr_{number}_files)

    Returns:
        Dictionary with ``pr_number``, ``pr_title``, ``author``,
        ``saved_files`` and ``skipped_files``, or None when the PR itself
        could not be retrieved or the snapshot directory could not be created.
    """
    try:
        pr = repo.get_pull(pr_number)

        if output_dir is None:
            output_dir = f"pr_{pr_number}_files"

        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(output_dir, exist_ok=True)

        # Timestamped subdirectory keeps repeated retrievals of one PR apart
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        pr_dir = os.path.join(output_dir, f"{pr_number}_{timestamp}")
        os.makedirs(pr_dir)

        files = pr.get_files()

        file_info = {
            "pr_number": pr_number,
            "pr_title": pr.title,
            "author": pr.user.login,
            "saved_files": [],
            "skipped_files": []
        }

        for changed_file in files:
            try:
                # A removed file has no content at the PR head
                if changed_file.status == "removed":
                    file_info["skipped_files"].append({
                        "filename": changed_file.filename,
                        "reason": "file was deleted in PR"
                    })
                    continue

                # Get the content of the file at the head of the PR
                content = repo.get_contents(changed_file.filename, ref=pr.head.sha)

                # A list result means the path is a directory. Checked before
                # any local directory is created so skipped entries leave no
                # empty folders behind.
                if isinstance(content, list):
                    file_info["skipped_files"].append({
                        "filename": changed_file.filename,
                        "reason": "is a directory"
                    })
                    continue

                # Mirror the repository layout below the snapshot directory
                file_path = os.path.join(pr_dir, changed_file.filename)
                os.makedirs(os.path.dirname(file_path), exist_ok=True)

                # Binary mode: decoded_content is written as raw bytes
                with open(file_path, 'wb') as f:
                    f.write(content.decoded_content)

                file_info["saved_files"].append({
                    "filename": changed_file.filename,
                    "path": file_path,
                    "additions": changed_file.additions,
                    "deletions": changed_file.deletions,
                    "changes": changed_file.changes,
                    "status": changed_file.status
                })

                print(f"Saved: {changed_file.filename}")

            except Exception as e:
                # Best effort: record the failure and continue with other files
                print(f"Error saving file {changed_file.filename}: {e}")
                file_info["skipped_files"].append({
                    "filename": changed_file.filename,
                    "reason": str(e)
                })

        # Save metadata about this PR's files
        metadata_path = os.path.join(pr_dir, "_metadata.json")
        with open(metadata_path, 'w') as f:
            json.dump(file_info, f, indent=2)

        print(f"Saved {len(file_info['saved_files'])} files for PR #{pr_number}")
        print(f"Skipped {len(file_info['skipped_files'])} files")
        print(f"Files saved to: {pr_dir}")

        return file_info

    except Exception as e:
        print(f"Error retrieving PR #{pr_number}: {e}")
        return None


def retrieve_files_for_all_prs(repo, pr_issue_mapping=None, output_base_dir="pr_files"):
    """
    Download the changed files of every PR listed in the mapping.

    Args:
        repo: GitHub repository object
        pr_issue_mapping: Dictionary keyed by PR number; read from
            pr_issue_mapping.json when omitted
        output_base_dir: Directory that receives all PR snapshots and the
            retrieval_summary.json overview

    Returns:
        Dictionary of PR number -> file info for each PR that was retrieved
    """
    # Fall back to the mapping persisted on disk
    if pr_issue_mapping is None:
        with open('pr_issue_mapping.json', 'r') as mapping_file:
            pr_issue_mapping = json.load(mapping_file)

    # Make sure the target directory is there
    if not os.path.exists(output_base_dir):
        os.makedirs(output_base_dir)

    retrieval_results = {}

    for raw_key in pr_issue_mapping:
        # Keys arrive as strings when the mapping was loaded from JSON
        pr_number = int(raw_key)

        print(f"\nRetrieving files for PR #{pr_number}...")
        outcome = retrieve_pr_file_contents(repo, pr_number, output_base_dir)

        # Empty/None outcomes (failed PRs) are left out of the results
        if outcome:
            retrieval_results[pr_number] = outcome

    # Write the overview of everything that was retrieved
    summary_path = os.path.join(output_base_dir, "retrieval_summary.json")
    with open(summary_path, 'w') as summary_file:
        saved_total = 0
        skipped_total = 0
        per_pr = {}
        for number, details in retrieval_results.items():
            saved = len(details["saved_files"])
            skipped = len(details["skipped_files"])
            saved_total += saved
            skipped_total += skipped
            per_pr[number] = {"files_saved": saved, "files_skipped": skipped}

        summary = {
            "total_prs_processed": len(retrieval_results),
            "total_files_saved": saved_total,
            "total_files_skipped": skipped_total,
            "pr_summary": per_pr,
        }
        json.dump(summary, summary_file, indent=2)

    print(f"\nProcessed {len(retrieval_results)} PRs")
    print(f"Results saved to {output_base_dir}")

    return retrieval_results


def analyze_issue_solvers_with_file_content(repo, pr_issue_mapping=None, retrieve_files=False):
    """
    Run the regular issue-solver analysis and, when ``retrieve_files`` is
    truthy, additionally download the file contents of every mapped PR.

    The return value is always the result of the solver analysis; the file
    retrieval only produces files on disk.
    """
    solver_data = analyze_issue_solvers(repo, pr_issue_mapping)

    # Nothing more to do unless file retrieval was requested
    if not retrieve_files:
        return solver_data

    print("\nRetrieving file contents for all PRs...")
    retrieve_files_for_all_prs(repo, pr_issue_mapping)
    return solver_data

# Example usage
if __name__ == "__main__":
    # Read the token from the environment so that no credential ever has to
    # be pasted into (and later scrubbed from) this file.
    GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
    REPO_NAME = "tensorflow/tensorflow"

    # Without a token, fall back to an anonymous client instead of sending
    # an empty credential.
    g = Github(GITHUB_TOKEN) if GITHUB_TOKEN else Github()

    # Get repository
    repo = g.get_repo(REPO_NAME)

    # Step 1: Create PR to issue mapping (start with PRs)
    pr_issue_mapping = map_prs_to_issues(repo, max_prs=1000)

    # Step 2: Analyze PR details to determine issue solvers and download the
    # changed files (pass retrieve_files=False to skip the download)
    issue_solver_data = analyze_issue_solvers_with_file_content(repo, pr_issue_mapping, retrieve_files=True)

    # Step 3: Generate statistics about solvers
    solver_stats = generate_solver_stats(issue_solver_data)

    # Print top solvers
    print("\nTop Issue Solvers:")
    for rank, solver_data in enumerate(solver_stats["solvers_by_issues"][:10], start=1):
        print(f"{rank}. {solver_data['solver']}: {solver_data['issues_solved']} issues, {solver_data['total_changes']} changes")

PR #88868 references issues: [23427]
PR #88844 references issues: [23485]
PR #88813 references issues: [23414]
PR #88818 references issues: [23349]
PR #88849 references issues: [23340]
PR #88836 references issues: [23477]
PR #88792 references issues: [23423]
PR #88768 references issues: [21638]
PR #88592 references issues: [88221]
PR #88765 references issues: [23395]
PR #88718 references issues: [23307]
PR #88593 references issues: [23060]
PR #88693 references issues: [22968]
PR #88749 references issues: [22808]
PR #88748 references issues: [22799]
PR #88745 references issues: [23433]
PR #88561 references issues: [23232]
PR #88690 references issues: [22383]
PR #88628 references issues: [20399]
PR #88178 references issues: [23018]
PR #88596 references issues: [23355]
PR #88568 references issues: [23351]
PR #88556 references issues: [22897]
PR #88180 references issues: [23020]
PR #88523 references issues: [23271]
PR #88269 references issues: [23181]
PR #88522 references issues: [23051]
P