### First we load the model

In [None]:
import os
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": "cuda" if torch.cuda.is_available() else "cpu"},
    trust_remote_code=True
)


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.06s/it]


### We are trying to build a superset of skills that will contain all the skills in our dataset of the repository

In [2]:
df = pd.read_csv("dataset.csv")
all_skills = set()

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting skills"):
    issue_title = str(row.get("issue_title", ""))
    issue_body = str(row.get("issue_body", ""))
    modified_source = str(row.get("modified_source_files", ""))
    commit_message = str(row.get("commit_messages", ""))

    prompt = f"""You are an AI assistant that extracts technical skills from GitHub-related development activity.

Analyze the following information and list the specific technical skills involved:

Issue Title:
{issue_title}

Issue Description:
{issue_body}

Modified Source Files:
{modified_source}

Commit Message:
{commit_message}

Return only the technical skills in bullet points. Avoid soft skills and generalities.

### Response:
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(output[0], skip_special_tokens=True)
    skills_block = response.split("### Response:")[-1].strip()

    for line in skills_block.splitlines():
        if line.strip().startswith("-"):
            skill = line.strip().lstrip("-•* ").strip()
            if skill:
                all_skills.add(skill)

# Save to a .txt file with semicolon separation
with open("skill_superset.txt", "w", encoding="utf-8") as f:
    f.write("; ".join(sorted(all_skills)))

print(f"\nSkill superset saved to 'skill_superset.txt' with {len(all_skills)} unique skills.")

Extracting skills:  16%|███████████████▋                                                                                   | 21/133 [02:10<10:58,  5.88s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (39456 > 32768). Running this sequence through the model will result in indexing errors
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (32768). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Extracting skills: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 133/133 [14:08<00:00,  6.38s/it]


Skill superset saved to 'skill_superset.txt' with 348 unique skills.





### Now we will use that list as baseline and try to build the two dataset like the main approach.

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load main dataset
df = pd.read_csv("dataset.csv")

# Load canonical skill list
with open("skill_superset.txt", "r", encoding="utf-8") as f:
    canonical_skills = [s.strip() for s in f.read().split(";") if s.strip()]
baseline_block = "\n".join(f"- {s}" for s in sorted(set(canonical_skills)))

# Dict that accumulates skills across multiple rows per contributor
contributor_skills = defaultdict(set)

# Process each row
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting contributor skills"):
    contributor_id = row["contributor_id"]
    modified_source_files = str(row.get("modified_source_files", ""))
    commit_messages = str(row.get("commit_messages", ""))

    system_prompt = "You are an expert assistant that extracts technical skills from GitHub commits based on a canonical skill list."

    user_prompt = f"""Here is a list of allowed canonical technical skills:
{baseline_block}

Now analyze the following contribution:

Modified Source Files:
{modified_source_files}

Commit Message:
{commit_messages}

Return only the relevant skills from the above list in bullet points. No explanation.
"""

    # Format as chat template
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    chat_input = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(chat_input, return_tensors="pt").to(model.device)

    # Generate model response
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=4096,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    response_text = tokenizer.decode(output[0], skip_special_tokens=True)
    skills_block = response_text[len(chat_input):].strip()

    for line in skills_block.splitlines():
        if line.strip().startswith("-"):
            skill = line.strip().lstrip("-•* ").strip()
            if skill:
                contributor_skills[contributor_id].add(skill)

# Build final DataFrame
final_data = []
all_contributors = df["contributor_id"].unique()

for seq, cid in enumerate(all_contributors, start=1):
    skills = contributor_skills.get(cid)
    if not skills:
        skill_text = "No significant skills found."
    else:
        skill_text = ", ".join(sorted(skills))

    final_data.append({
        "sequence": seq,
        "contributor_id": cid,
        "skills": skill_text
    })

output_df = pd.DataFrame(final_data)
output_df.to_csv("selective_contributor_skills.csv", index=False)

print(f"\nSaved to 'selective_contributor_skill.csv' with {len(output_df)} contributors.")


Extracting contributor skills:  16%|█████████████▋                                                                         | 21/133 [01:04<05:08,  2.75s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (41361 > 32768). Running this sequence through the model will result in indexing errors
Extracting contributor skills: 100%|██████████████████████████████████████████████████████████████████████████████████████| 133/133 [33:23<00:00, 15.06s/it]


Saved to 'selective_contributor_skill.csv' with 61 contributors.





### Now we filter out those contributors who have no significant skills in terms of this repository

In [24]:
import pandas as pd

# Load dataset
df = pd.read_csv("selective_contributor_skills.csv")

# Filter out rows with "No significant skills found."
df_filtered = df[df["skills"].str.strip() != "No significant skills found."]

df_filtered.to_csv("selective_contributor_skills_filtered.csv", index=False)

print(f"Saved filtered dataset to 'selective_contributor_skills_filtered.csv' with {len(df_filtered)} contributors.")


Saved filtered dataset to 'selective_contributor_skills_filtered.csv' with 56 contributors.


### Now it's time to extract required skills for issues like the main approach but using canonical superset skills

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load dataset
df = pd.read_csv("dataset.csv")

# Load canonical skill list
with open("skill_superset.txt", "r", encoding="utf-8") as f:
    canonical_skills = [s.strip() for s in f.read().split(";") if s.strip()]
baseline_block = "\n".join(f"- {s}" for s in sorted(set(canonical_skills)))

# Load model/tokenizer although it is not required as loaded earlier
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": "cuda" if torch.cuda.is_available() else "cpu"},
    trust_remote_code=True
)

# Store results
issue_skill_data = []

#Here the prompts are self explanatory
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting issue skills"):
    issue_id = row["issue_id"]
    issue_title = str(row.get("issue_title", ""))
    issue_body = str(row.get("issue_body", ""))

    system_prompt = "You are an AI assistant that identifies technical skills needed to solve GitHub issues using a predefined skill list."

    user_prompt = f"""Here is a list of allowed canonical technical skills:
{baseline_block}

Analyze the following GitHub issue and return only the relevant skills from the list above.

Issue Title:
{issue_title}

Issue Description:
{issue_body}

Return only the skills in bullet points format. No extra explanations.
"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    chat_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(chat_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=4096,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    skill_lines = response_text[len(chat_prompt):].strip().splitlines()

    extracted_skills = []
    for line in skill_lines:
        if line.strip().startswith("-"):
            skill = line.strip().lstrip("-•* ").strip()
            if skill:
                extracted_skills.append(skill)

    if extracted_skills:  # Only include rows with extracted skills
        issue_skill_data.append({
            "issue_id": issue_id,
            "required_skills": ", ".join(sorted(set(extracted_skills)))
        })

# Save final output
pd.DataFrame(issue_skill_data).to_csv("selective_issue_skills.csv", index=False)
print(f"\nSaved filtered issue skill dataset to 'selective_issue_skills.csv' with {len(issue_skill_data)} issues.")


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.10it/s]
Extracting issue skills: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 133/133 [18:39<00:00,  8.42s/it]


Saved filtered issue skill dataset to 'selective_issue_skills.csv' with 127 issues.





### Now for skill matching in the main approach the LLM performed very poor. So, here we are using TF-IDF and s-BERT only.

In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from tqdm import tqdm

# Load files
issue_df = pd.read_csv("selective_issue_skills.csv")
contrib_df = pd.read_csv("selective_contributor_skills_filtered.csv")
dataset_df = pd.read_csv("dataset.csv")

# Build ground-truth: issue_id → contributor_id
issue_to_true_contrib = dataset_df.dropna(subset=["issue_id", "contributor_id"]) \
    .drop_duplicates("issue_id")[["issue_id", "contributor_id"]] \
    .set_index("issue_id")["contributor_id"].to_dict()

# TF-IDF model
all_docs = list(issue_df["required_skills"].astype(str)) + list(contrib_df["skills"].astype(str))
vectorizer = TfidfVectorizer(tokenizer=lambda x: [s.strip().lower() for s in x.split(",")])
tfidf_matrix = vectorizer.fit_transform(all_docs)

# Split vectors
issue_vectors = tfidf_matrix[:len(issue_df)]
contrib_vectors = tfidf_matrix[len(issue_df):]

# Accuracy tracking
top_k = 15
correct = 0
total = 0

for i, issue_row in tqdm(issue_df.iterrows(), total=len(issue_df), desc="Evaluating top-k"):
    issue_id = issue_row["issue_id"]
    if issue_id not in issue_to_true_contrib:
        continue

    true_contributor = issue_to_true_contrib[issue_id]
    issue_vec = issue_vectors[i]
    scores = cosine_similarity(issue_vec, contrib_vectors)[0]
    top_k_indices = scores.argsort()[::-1][:top_k]
    top_contrib_ids = [contrib_df.iloc[j]["contributor_id"] for j in top_k_indices]

    if true_contributor in top_contrib_ids:
        correct += 1
    total += 1

accuracy = correct / total if total > 0 else 0

print(f"\nTop-{top_k} Accuracy: {accuracy:.2%} ({correct}/{total} correct)")


Evaluating top-k: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 127/127 [00:00<00:00, 1302.41it/s]


Top-15 Accuracy: 61.42% (78/127 correct)





### Now s-BERT

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

# Load data
issue_df = pd.read_csv("selective_issue_skills.csv")
contrib_df = pd.read_csv("selective_contributor_skills.csv")
dataset_df = pd.read_csv("dataset.csv")

# Build ground truth: issue_id → contributor_id
issue_to_true_contrib = dataset_df.dropna(subset=["issue_id", "contributor_id"]) \
    .drop_duplicates("issue_id")[["issue_id", "contributor_id"]] \
    .set_index("issue_id")["contributor_id"].to_dict()

# s-BERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Prepare texts
issue_texts = issue_df["required_skills"].astype(str).tolist()
contrib_texts = contrib_df["skills"].astype(str).tolist()
contrib_ids = contrib_df["contributor_id"].tolist()

# Encode embeddings
issue_embeddings = model.encode(issue_texts, convert_to_tensor=True, show_progress_bar=True)
contrib_embeddings = model.encode(contrib_texts, convert_to_tensor=True, show_progress_bar=True)

# Evaluate top-k accuracy
top_k = 15
correct = 0
total = 0

for i, issue_row in tqdm(issue_df.iterrows(), total=len(issue_df), desc="Evaluating Top-k"):
    issue_id = issue_row["issue_id"]
    if issue_id not in issue_to_true_contrib:
        continue

    true_contributor = issue_to_true_contrib[issue_id]
    scores = util.cos_sim(issue_embeddings[i], contrib_embeddings)[0]
    top_k_indices = torch.topk(scores, k=top_k).indices.tolist()
    top_k_ids = [contrib_ids[j] for j in top_k_indices]

    if true_contributor in top_k_ids:
        correct += 1
    total += 1

accuracy = correct / total if total > 0 else 0

print(f"\nTop-{top_k} Accuracy using s-BERT: {accuracy:.2%} ({correct}/{total} correct)")


Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 65.54it/s]
Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 45.73it/s]
Evaluating Top-k: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 127/127 [00:00<00:00, 4149.64it/s]


Top-15 Accuracy using s-BERT: 56.69% (72/127 correct)



