### Load the Model on GPU

Here I am trying to load the model only on the second GPU of my server.

In [1]:
import os
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set CUDA device to 0
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
if torch.cuda.device_count() > 1:
    torch.cuda.set_device(0)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load Qwen model
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": device},  # Place on CUDA:0
    trust_remote_code=True
).to(device)


  from .autonotebook import tqdm as notebook_tqdm
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.12it/s]


### Building contributor_skills.csv

This dataset contains primary skills integrating for the same contributor. So, we have dataset for unique contributors.

In [1]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import defaultdict

model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": "cuda" if torch.cuda.is_available() else "cpu"},
    trust_remote_code=True
)

df = pd.read_csv("dataset.csv")
contributor_skills = defaultdict(set)

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting Skills"):
    contributor_id = row["contributor_id"]
    modified_source_files = str(row["modified_source_files"])
    commit_messages = str(row["commit_messages"])

    existing_skills_list = sorted(contributor_skills[contributor_id])
    existing_skills_str = "\n".join(f"- {s}" for s in existing_skills_list) if existing_skills_list else "None"

    system_prompt = "You are an AI assistant that analyzes code changes and commit messages to identify technical skills."
    user_prompt = f"""Contributor's existing skills:
    
{existing_skills_str}

New code changes:
{modified_source_files}

Commit message:
{commit_messages}

List only the new specific technical skills (not listed above) demonstrated in this change and commit message in bullet points.
No extra descriptions or explanations."""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    chat_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    input_ids = tokenizer(chat_prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **input_ids,
            max_new_tokens=2048,
            do_sample=False
        )

    response_text = tokenizer.decode(output[0], skip_special_tokens=True)
    skills_output = response_text[len(chat_prompt):].strip()

    new_skills = set()
    for line in skills_output.splitlines():
        line = line.strip()
        if line.startswith("-"):
            skill = line[1:].strip()
            if skill:
                new_skills.add(skill)

    contributor_skills[contributor_id].update(new_skills)

final_data = []
for seq, (contributor_id, skills_set) in enumerate(contributor_skills.items(), start=1):
    final_data.append({
        "sequence": seq,
        "contributor_id": contributor_id,
        "skills": ", ".join(sorted(skills_set))
    })

output_df = pd.DataFrame(final_data)
output_df.to_csv("contributor_skills.csv", index=False)

print("All rows processed. Final dataset saved as 'contributor_skills.csv'")


  from .autonotebook import tqdm as notebook_tqdm
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.10it/s]
Extracting Skills:  16%|███████████████▋                                                                                   | 21/133 [00:39<03:29,  1.87s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (39474 > 32768). Running this sequence through the model will result in indexing errors
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (32768). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Extracting Skills: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 13

All rows processed. Final dataset saved as 'contributor_skills_simple.csv'





### Apply only this stage was not good for accuracy, keeping for reference

### Merging Skills based on Few Shots

Tried merging repeating skills using few shots but the result was not good.

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

df = pd.read_csv("contributor_skills.csv")

few_shot_prompt = """You are a helpful assistant that cleans and merges technical skills in bullet points.

Here are some examples:
Example 1:
Input Skills:
- Used Bazel build system configuration
- Demonstrated knowledge of build automation tools (Bazel)
- Updated build templates for CUDA components

Merged Skills:
- Bazel
- Build System Configuration

Example 2:
Input Skills:
- Confirmed ability to manage project timelines and deliverables
- Proven skill in contributing to successful and impactful software products
- Proven skill in contributing to the development of high-quality software products

Merged Skills:
- Software Project Management

Example 3:
Input Skills:
- Added detailed HLO operation profiles for NVIDIA B200 GPU
- Added GPU spec for B200
- Added autotuning results to fix gpu_compiler_test for Blackwell

Merged Skills:
- GPU Architecture (B200, Blackwell)
- HLO Operation Profiling

Example 4:
Input Skills:
- Highlighted experience with continuous integration and delivery pipelines
- Showcased proficiency in automated testing and integration

Merged Skills:
- CI/CD Pipelines
- Automated Testing
"""

merged_data = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Merging skills"):
    contributor_id = row["contributor_id"]
    sequence = row["sequence"]
    raw_skills = row["skills"]

    prompt = f"""{few_shot_prompt}

Now merge the following skills for a contributor. Only merge contextually similar or identical skills. 
Do NOT remove distinct skills. Return the final cleaned and merged skill list in bullet points.

Input Skills:
{raw_skills}

Merged Skills:"""

    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        output = model.generate(
            **input_ids,
            max_new_tokens=10000,
            do_sample=False
        )

    response_text = tokenizer.decode(output[0], skip_special_tokens=True)
    cleaned_skills_block = response_text[len(prompt):].strip()

    merged_skills = []
    for line in cleaned_skills_block.splitlines():
        if line.strip().startswith("-"):
            skill = line.strip().lstrip("-•* ").strip()
            if skill:
                merged_skills.append(skill)

    merged_data.append({
        "sequence": sequence,
        "contributor_id": contributor_id,
        "skills": ", ".join(sorted(set(merged_skills)))
    })

merged_df = pd.DataFrame(merged_data)
merged_df.to_csv("contributor_skills_merged.csv", index=False)

print("Merged skill set saved to 'contributor_skills_merged.csv'")


### Merging Skills based on zero shots

This time the result was better. So, it is the final dataset for contributors's skills dataset.

In [8]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

df = pd.read_csv("contributor_skills.csv")

merged_data = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Merging skills"):
    contributor_id = row["contributor_id"]
    sequence = row["sequence"]
    raw_skills = row["skills"]


    prompt = f"""Here is a list of technical skills for a contributor. Some of them are repeated in different wording or semantically similar.

Please merge the contextually similar or identical skills into a cleaner form. Do NOT remove distinct skills.
Return a cleaned, concise list of unique skills in bullet points (one skill per line).
No full sentences, no duplicates.

Input Skills:
{raw_skills}

Merged Skills:"""

    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        output = model.generate(
            **input_ids,
            max_new_tokens=2048,
            do_sample=False
        )

    response_text = tokenizer.decode(output[0], skip_special_tokens=True)
    cleaned_skills_block = response_text[len(prompt):].strip()

    merged_skills = []
    for line in cleaned_skills_block.splitlines():
        if line.strip().startswith("-"):
            skill = line.strip().lstrip("-•* ").strip()
            if skill:
                merged_skills.append(skill)

    merged_data.append({
        "sequence": sequence,
        "contributor_id": contributor_id,
        "skills": ", ".join(sorted(set(merged_skills)))
    })

merged_df = pd.DataFrame(merged_data)
merged_df.to_csv("contributor_skills_merged.csv", index=False)

print("Merged skill set saved to 'contributor_skills_primary_merged.csv'")


Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.08it/s]
Merging skills: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 61/61 [16:30<00:00, 16.24s/it]

✅ Merged skill set saved to 'contributor_skills_primary_merged.csv'





### Important Note for Future Research: Please be aware that while numerous contributor_skills.csv files may have been observed previously, they are not available in the repository. The final contributor skills dataset, which achieved the optimal accuracy and is currently designated as contributor_skills.csv, was referred to as contributor_skills_primary_merged.csv in the preceding cell.

### Creating Required Skills from Issue Title and Issue Description

The dataset is okay but contains texts like assistant and so on.

In [19]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer


df = pd.read_csv("dataset.csv")

issue_skills_data = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting required skills"):
    issue_id = row["issue_id"]
    issue_title = str(row.get("issue_title", ""))
    issue_body = str(row["issue_body"])

    system_prompt = "You are a software architect helping identify skills needed to resolve GitHub issues."
    user_prompt = f"""Here is an issue's title and issue description.

Issue Title:
{issue_title}

Issue Description:
{issue_body}

What are the technical skills required to solve this issue?

### Response:"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    chat_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(chat_text, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=5000,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

    full_response = tokenizer.decode(output[0], skip_special_tokens=True)
    extracted_response = full_response.split("### Response:")[-1].strip()

    issue_skills_data.append({
        "issue_id": issue_id,
        "required_skills": extracted_response
    })
    
output_df = pd.DataFrame(issue_skills_data)
output_df.to_csv("issue_skills.csv", index=False)

print("Role-based skill extraction complete. Saved to 'issue_skills.csv'")


Extracting required skills: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 133/133 [13:09<00:00,  5.94s/it]

Role-based skill extraction complete. Saved to 'issue_skills.csv'





### Cleaning the Datasets

We clean the data and extracts only skills related texts.

In [21]:
import re
import pandas as pd

def extract_numbered_skills(text):
    # Extract from first "1." to end or before conclusion phrases
    skill_block_match = re.search(r"1\..*?(?:(?:By possessing these skills)|(?:If you have any specific details)|$)", text, re.DOTALL)
    if not skill_block_match:
        return ""
    
    skill_block = skill_block_match.group(0).strip()

    # Keep only lines that start with numbers or bullet indicators
    lines = skill_block.splitlines()
    cleaned_lines = []
    for line in lines:
        line = line.strip()
        if re.match(r"^\d+\.", line) or re.match(r"^[-•*]", line) or re.match(r"^[-\s]{2,}", line):
            cleaned_lines.append(line)

    return "\n".join(cleaned_lines)


df = pd.read_csv("issue_skills.csv")
df["required_skills"] = df["required_skills"].apply(extract_numbered_skills)

df.to_csv("issue_skills.csv", index=False)

print("Cleaned skills saved to 'issue_skills.csv'")


Cleaned skills saved to 'issue_skills.csv'


In [7]:
import pandas as pd

df = pd.read_csv("issue_skills.csv")

# Drop NaN first, then drop rows where string is empty or only whitespace
df_cleaned = df.dropna(subset=["required_skills"])
df_cleaned = df_cleaned[df_cleaned["required_skills"].astype(str).str.strip() != ""]

# Save cleaned file
df_cleaned.to_csv("issue_skills.csv", index=False)
print(f"Cleaned file saved as 'issue_skills.csv' with {len(df_cleaned)} rows.")


Cleaned file saved as 'issue_skills.csv' with 130 rows.


### Using LLM to find out Top Contributors

We find the top_k contributors using LLM and as there is limit for input tokens, we use batch based technique.

In [3]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

issues_df = pd.read_csv("issue_skills.csv")
contributors_df = pd.read_csv("contributor_skills.csv")

max_token_limit = 30000
top_k = 10

issue_row = issues_df.iloc[0]
issue_id = issue_row["issue_id"]
required_skills = str(issue_row["required_skills"])

batches = []
current_batch = []
current_token_count = 0

for idx, row in contributors_df.iterrows():
    cid = row["contributor_id"]
    skills = str(row["skills"])
    entry = f"Contributor ID: {cid}\nSkills:\n{skills}\n\n"
    tokens = len(tokenizer(entry)["input_ids"])
    
    # Start a new batch if adding the current entry exceeds token limit
    if current_token_count + tokens > max_token_limit:
        batches.append(current_batch)
        current_batch = []
        current_token_count = 0
    
    current_batch.append((cid, skills))
    current_token_count += tokens

if current_batch:
    batches.append(current_batch)

valid_ids = set(contributors_df["contributor_id"].astype(str))
contributor_scores = {}

for batch in tqdm(batches, desc="Scoring contributors"):
    # Create a single input text block with all contributors in the batch
    contributor_block = "\n".join(
        f"Contributor ID: {cid}\nSkills:\n{skills}" for cid, skills in batch
    )

    prompt_text = f"""You are a helpful assistant that recommends contributors for GitHub issues.

Given the following required skills for an issue:

{required_skills}

Below is a list of contributors and their skills:

{contributor_block}

Please rate each contributor from 0 to 100 based on how suitable they are for solving the issue.

Return the output in this format:
contributor_id: score

Only return one contributor per line. Do not include explanations.

### Response:
"""

    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    
    # Run the model in inference mode without gradient computation
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=1000,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    full_output = tokenizer.decode(output[0], skip_special_tokens=True)
    response = full_output.split("### Response:")[-1].strip()

    for line in response.splitlines():
        if ":" in line:
            parts = line.split(":", 1)
            cid = parts[0].strip()
            try:
                score = int(parts[1].strip())
                # Only record valid contributor IDs with scores in the correct range
                if cid in valid_ids and 0 <= score <= 100:
                    contributor_scores[cid] = score
            except ValueError:
                continue

top_contributors = sorted(contributor_scores.items(), key=lambda x: -x[1])[:top_k]

print(f"\nTop {top_k} Contributors for Issue ID {issue_id}:\n")
for cid, score in top_contributors:
    print(f"{cid} — Score: {score}/100")


Scoring contributors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.91s/it]


Top 10 Contributors for Issue ID 24456:

kasper0406 — Score: 85/100
larsoner — Score: 85/100
jreiffers — Score: 80/100
dimvar — Score: 0/100
sergachev — Score: 0/100
jaro-sevcik — Score: 0/100
acxz — Score: 0/100
chaserileyroberts — Score: 0/100
tyb0807 — Score: 0/100
terryysun — Score: 0/100





### Finding top_k Accuracy for LLM based Approach

In [42]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": "cuda" if torch.cuda.is_available() else "cpu"},
    trust_remote_code=True
)

# Load datasets
dataset_df = pd.read_csv("dataset.csv")
issues_df = pd.read_csv("issue_skills.csv")
contributors_df = pd.read_csv("contributor_skills.csv")

issue_skill_map = dict(zip(issues_df["issue_id"], issues_df["required_skills"].astype(str)))
contributor_skill_map = dict(zip(contributors_df["contributor_id"], contributors_df["skills"].astype(str)))

top_k = 15
correct = 0
total = 0

for _, row in tqdm(dataset_df.iterrows(), total=len(dataset_df), desc="Evaluating Issues"):
    issue_id = row["issue_id"]
    true_contributor = row["contributor_id"]

    if issue_id not in issue_skill_map:
        continue

    required_skills = issue_skill_map[issue_id]

    # Prepare batches (same logic as before)
    max_token_limit = 30000
    batches = []
    current_batch = []
    current_token_count = 0

    for cid, skills in contributor_skill_map.items():
        entry = f"Contributor ID: {cid}\nSkills:\n{skills}\n\n"
        tokens = len(tokenizer(entry)["input_ids"])
        if current_token_count + tokens > max_token_limit:
            batches.append(current_batch)
            current_batch = []
            current_token_count = 0
        current_batch.append((cid, skills))
        current_token_count += tokens
    if current_batch:
        batches.append(current_batch)

    contributor_scores = {}

    for batch in batches:
        contributor_block = "\n".join(
            f"Contributor ID: {cid}\nSkills:\n{skills}" for cid, skills in batch
        )

        prompt_text = f"""You are a helpful assistant that recommends contributors for GitHub issues.

Given the following required skills for an issue:

{required_skills}

Below is a list of contributors and their skills:

{contributor_block}

Please rate each contributor from 0 to 100 based on how suitable they are for solving the issue.

Return the output in this format:
contributor_id: score

Only return one contributor per line. Do not include explanations.

### Response:
"""

        inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=1500,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        full_output = tokenizer.decode(output[0], skip_special_tokens=True)
        response = full_output.split("### Response:")[-1].strip()

        for line in response.splitlines():
            if ":" in line:
                parts = line.split(":", 1)
                cid = parts[0].strip()
                try:
                    score = int(parts[1].strip())
                    if 0 <= score <= 100:
                        contributor_scores[cid] = score
                except ValueError:
                    continue

    # Evaluate top_k
    total += 1
    top_contributors = sorted(contributor_scores.items(), key=lambda x: -x[1])[:top_k]
    top_ids = [cid for cid, _ in top_contributors]
    if true_contributor in top_ids:
        correct += 1

accuracy = correct / total if total > 0 else 0.0
print(f"\nTop-{top_k} Accuracy: {accuracy:.4f} ({correct}/{total})")


Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.24s/it]
Evaluating Issues: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 133/133 [27:58<00:00, 12.62s/it]


Top-15 Accuracy: 0.2331 (31/133)





Top-10 Accuracy: 0.1955 (26/133)

In [None]:
!pip install sentence-transformers


### Using s-BERT to find out top contributors

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")


issue_df = pd.read_csv("issue_skills.csv")
contributor_df = pd.read_csv("contributor_skills.csv")

issue_row = issue_df.iloc[0]
issue_id = issue_row["issue_id"]
required_skills_text = str(issue_row["required_skills"])

contributor_ids = contributor_df["contributor_id"].tolist()
contributor_skills = contributor_df["skills"].astype(str).tolist()

issue_embedding = model.encode(required_skills_text, convert_to_tensor=True)
contributor_embeddings = model.encode(contributor_skills, convert_to_tensor=True)

#cosine similarity on s-BERT embeddings
cosine_scores = util.cos_sim(issue_embedding, contributor_embeddings)[0]

top_k = 15
top_results = torch.topk(cosine_scores, k=top_k)

print(f"\nTop {top_k} contributors for Issue ID {issue_id}:\n")
for score, idx in zip(top_results.values, top_results.indices):
    print(f"{contributor_ids[idx]} — Score: {score.item():.4f}")


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.

Top 15 contributors for Issue ID 24456:

sergachev — Score: 0.6174
agriyakhetarpal — Score: 0.6109
kasper0406 — Score: 0.6051
ngoldbaum — Score: 0.6002
yakovdan — Score: 0.5892
terryysun — Score: 0.5863
chunhsue — Score: 0.5806
charris — Score: 0.5745
tensorflower-gardener — Score: 0.5719
baskargopinath — Score: 0.5441
yliu120 — Score: 0.5261
unknown — Score: 0.5249
HaoZeke — Score: 0.5172
wheeleha — Score: 0.4947
mayeut — Score: 0.4944


### Finding top_k Accuracy for s-BERT

In [9]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

dataset_df = pd.read_csv("dataset.csv")
issue_df = pd.read_csv("issue_skills.csv")
contributor_df = pd.read_csv("contributor_skills.csv")

issue_skill_map = dict(zip(issue_df["issue_id"], issue_df["required_skills"].astype(str)))
contributor_skill_map = dict(zip(contributor_df["contributor_id"], contributor_df["skills"].astype(str)))

top_k = 15
total = 0
correct = 0

for _, row in dataset_df.iterrows():
    issue_id = row["issue_id"]
    true_contributor = row["contributor_id"]

    if issue_id not in issue_skill_map or true_contributor not in contributor_skill_map:
        continue

    required_skills_text = issue_skill_map[issue_id]

    contributor_ids = list(contributor_skill_map.keys())
    contributor_skills = list(contributor_skill_map.values())

    issue_embedding = model.encode(required_skills_text, convert_to_tensor=True)
    contributor_embeddings = model.encode(contributor_skills, convert_to_tensor=True)

    cosine_scores = util.cos_sim(issue_embedding, contributor_embeddings)[0]
    top_results = torch.topk(cosine_scores, k=top_k)

    top_contributor_ids = [contributor_ids[idx] for idx in top_results.indices.tolist()]
    total += 1
    if true_contributor in top_contributor_ids:
        correct += 1

accuracy = correct / total if total > 0 else 0.0
accuracy = accuracy*100
print(f"\nTop-{top_k} s-BERT Accuracy: {accuracy:.2f} % ({correct}/{total})")



Top-15 s-BERT Accuracy: 60.00 % (78/130)


### Using TF-IDF for Checking Top Contributors

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

issues_df = pd.read_csv("issue_skills.csv")
contributors_df = pd.read_csv("contributor_skills.csv")

issue_row = issues_df.iloc[0]
issue_id = issue_row["issue_id"]
required_skills = str(issue_row["required_skills"])

contributor_ids = contributors_df["contributor_id"].tolist()
contributor_skills = contributors_df["skills"].astype(str).tolist()

corpus = [required_skills] + contributor_skills

# using TF-IDF to generate embeddings
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

#cosine similarity on generated embeddings
cosine_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

top_k = 10
top_indices = cosine_scores.argsort()[::-1][:top_k]

print(f"\nTop {top_k} TF-IDF Contributors for Issue ID {issue_id}:\n")
for idx in top_indices:
    cid = contributor_ids[idx]
    score = cosine_scores[idx]
    print(f"{cid} — Score: {score:.4f}")



Top 10 TF-IDF Contributors for Issue ID 24456:

apivovarov — Score: 0.3445
charris — Score: 0.3336
ngoldbaum — Score: 0.3137
kasper0406 — Score: 0.2856
mhvk — Score: 0.2648
yakovdan — Score: 0.2635
mayeut — Score: 0.2439
terryysun — Score: 0.2326
jreiffers — Score: 0.2303
ArvidJB — Score: 0.2273


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

issues_df = pd.read_csv("issue_skills.csv")
contributors_df = pd.read_csv("contributor_skills.csv")

issue_row = issues_df.iloc[0]
issue_id = issue_row["issue_id"]
required_skills = str(issue_row["required_skills"])

contributor_ids = contributors_df["contributor_id"].tolist()
contributor_skills = contributors_df["skills"].astype(str).tolist()

corpus = [required_skills] + contributor_skills

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

cosine_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

bottom_k = 10
bottom_indices = cosine_scores.argsort()[:bottom_k]  # No [::-1] = lowest scores

print(f"\nBottom {bottom_k} TF-IDF Contributors for Issue ID {issue_id}:\n")
for idx in bottom_indices:
    cid = contributor_ids[idx]
    score = cosine_scores[idx]
    print(f"{cid} — Score: {score:.4f}")



Bottom 10 TF-IDF Contributors for Issue ID 24456:

SaraInCode — Score: 0.0000
StanFromIreland — Score: 0.0000
sterrettm2 — Score: 0.0000
setbit123 — Score: 0.0000
DWesl — Score: 0.0067
acxz — Score: 0.0152
jiunkaiy — Score: 0.0222
hauntsaninja — Score: 0.0238
philipphack — Score: 0.0242
Tixxx — Score: 0.0344


### Finding Accuracy of TF-IDF based top_k Accuracy

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

dataset_df = pd.read_csv("dataset.csv")
issues_df = pd.read_csv("issue_skills.csv")
contributors_df = pd.read_csv("contributor_skills.csv")

# Convert maps for fast lookup
issue_skill_map = dict(zip(issues_df["issue_id"], issues_df["required_skills"].astype(str)))
contributor_skill_map = dict(zip(contributors_df["contributor_id"], contributors_df["skills"].astype(str)))

top_k = 15
total = 0
correct = 0

for _, row in dataset_df.iterrows():
    issue_id = row["issue_id"]
    true_contributor = row["contributor_id"]

    if issue_id not in issue_skill_map or true_contributor not in contributor_skill_map:
        continue

    required_skills = issue_skill_map[issue_id]
    if pd.isna(required_skills) or required_skills.strip() == "":
        continue

    # Build valid contributor pool (skip 'No significant skills found.')
    contributor_ids = []
    contributor_texts = []
    for cid, skills in contributor_skill_map.items():
        if pd.isna(skills) or skills.strip() == "No significant skills found.":
            continue
        contributor_ids.append(cid)
        contributor_texts.append(skills)

    # Skip if contributor pool is empty or true_contributor was filtered out
    if not contributor_ids or true_contributor not in contributor_ids:
        continue

    # TF-IDF vectorization and cosine similarity
    corpus = [required_skills.strip()] + contributor_texts
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    cosine_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    top_indices = cosine_scores.argsort()[::-1][:top_k]
    top_contributors = [contributor_ids[i] for i in top_indices]

    total += 1
    if true_contributor in top_contributors:
        correct += 1

accuracy = (correct / total) * 100 if total > 0 else 0.0
print(f"\nTop-{top_k} TF-IDF Accuracy: {accuracy:.2f} % ({correct}/{total})")



Top-15 TF-IDF Accuracy: 70.00 % (91/130)
