# Load cleaned issues data

In [1]:
import os
import pandas as pd

# Directory holding the cleaned per-project issue CSV files
input_dir = "issues-cleanedData"

# Columns a file must contain to be used; also the only columns retained
required_columns = ['issue_state', 'days_elapsed', 'status']

# Maps project key -> DataFrame restricted to `required_columns`
issues_cleaned_data = {}

for filename in os.listdir(input_dir):
    # Ignore anything that is not a CSV file
    if not filename.endswith(".csv"):
        continue

    file_path = os.path.join(input_dir, filename)
    df = pd.read_csv(file_path)

    # Skip files that are missing any of the required columns
    if not set(required_columns).issubset(df.columns):
        continue

    # Discard rows whose 'issue_state' is missing, then narrow to the required columns
    df = df.dropna(subset=['issue_state'])[required_columns]

    # The key is the filename with its last 11 characters removed
    # (presumably a fixed suffix plus ".csv" -- TODO confirm the naming scheme).
    issues_cleaned_data[filename[:-11]] = df


issues_cleaned_data['age']


Unnamed: 0,issue_state,days_elapsed,status
0,CLOSED,4,Graduated
3,CLOSED,21,Graduated
9,CLOSED,1,Graduated
12,CLOSED,7,Graduated
14,CLOSED,287,Graduated
...,...,...,...
4945,CLOSED,9,Graduated
4946,OPEN,24,Graduated
4952,OPEN,12,Graduated
4954,OPEN,12,Graduated


# Import scraper data

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


def clean_csv_files(folder_path):
  """Load every CSV file in a folder and strip the unused metric columns.

  Args:
    folder_path: Directory scanned (non-recursively) for files ending in ".csv".

  Returns:
    dict mapping each file's name without its extension to the loaded
    DataFrame, with every column named in ``columns_to_drop`` removed where
    present. Entries are inserted in sorted filename order.
  """
  cleaned_dataframes = {}

  # List of columns to drop
  columns_to_drop = [
      "status", "start_date", "end_date", "window_start_date", "window_end_date",
      "emails", "devs", "emails_thread_starter", "emails_thread_starter_word_count",
      "emails_thread_starter_characters", "emails_threads", "emails_threads_word_count",
      "emails_threads_characters", "emails_no_replies", "emails_no_replies_word_count",
      "emails_no_replies_characters", "emails_jira", "most_complex_unit_loc",
      "most_complex_unit_mcabe_index", "total_number_of_files", "number_of_files_main",
      "lines_of_code_main", "number_of_files_test", "lines_of_code_test",
      "test_vs_main_lines_of_code_percentage", "number_of_files_generated",
      "lines_of_code_generated", "number_of_files_build_and_deployment",
      "lines_of_code_build_and_deployment", "negligible_risk_file_size_count",
      "low_risk_file_size_count", "medium_risk_file_size_count", "high_risk_file_size_count",
      "very_high_risk_file_size_count", "negligible_risk_file_size_loc", "low_risk_file_size_loc",
      "medium_risk_file_size_loc", "high_risk_file_size_loc", "very_high_risk_file_size_loc",
      "number_of_units", "lines_of_code_in_units", "lines_of_code_outside_units",
      "unit_size_negligible_risk_loc", "unit_size_negligible_risk_count", "unit_size_low_risk_loc",
      "unit_size_low_risk_count", "unit_size_medium_risk_loc", "unit_size_medium_risk_count",
      "unit_size_high_risk_loc", "unit_size_high_risk_count", "unit_size_very_high_risk_loc",
      "unit_size_very_high_risk_count", "conditional_complexity_negligible_risk_loc",
      "conditional_complexity_negligible_risk_count", "conditional_complexity_low_risk_loc",
      "conditional_complexity_low_risk_count", "conditional_complexity_medium_risk_loc",
      "conditional_complexity_medium_risk_count", "conditional_complexity_high_risk_loc",
      "conditional_complexity_high_risk_count", "conditional_complexity_very_high_risk_loc",
      "conditional_complexity_very_high_risk_count", "conditional_complexity_high_plus_risk_count",
      "conditional_complexity_high_plus_risk_loc", "number_of_contributors",
      "duplication_number_of_duplicates", "duplication_number_of_files_with_duplicates",
      "duplication_number_of_duplicated_lines", "duplication_percentage", "unit_duplicates_count", "releases"
  ]

  # os.listdir() returns entries in arbitrary, platform-dependent order. The
  # dict built here fixes the order in which projects are later fed to the
  # seeded train/test split, so iterate in sorted order to keep runs reproducible.
  for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".csv"):
      continue

    # Load CSV file
    df = pd.read_csv(os.path.join(folder_path, filename))

    # Drop the listed columns; errors='ignore' skips any that this file lacks
    df = df.drop(columns=columns_to_drop, errors='ignore')

    key = os.path.splitext(filename)[0]
    cleaned_dataframes[key] = df

  return cleaned_dataframes

folder_path = "scraper-output"
cleaned_data = clean_csv_files(folder_path)

for key, df in cleaned_data.items():
    # Replace NaN values in numerical columns with 0
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(0)

    # Fill missing / blank 'programming_lang' entries with the column's mode
    if 'programming_lang' in df.columns:
        # Turn blank or whitespace-only entries into NaN *before* taking the
        # mode. Otherwise an empty string can itself be the most frequent value
        # and would be used as the "fill" value, leaving the blanks in place.
        lang = df['programming_lang'].replace(r'^\s*$', np.nan, regex=True)

        # mode() ignores NaN; fall back to 'Unknown' when no real value exists
        modes = lang.mode()
        mode_value = modes.iloc[0] if not modes.empty else 'Unknown'

        df['programming_lang'] = lang.fillna(mode_value)

status_data = pd.read_csv("project-status.csv")

# Filter out projects with fewer than 10 data points
cleaned_data = {project: df for project,
                df in cleaned_data.items() if len(df) >= 10}

def merge_status(cleaned_data, status_data):
  """Attach each project's status as a constant 'status' column.

  Args:
    cleaned_data: dict mapping project name to its DataFrame. The frames are
      modified in place.
    status_data: DataFrame with 'project' and 'status' columns.

  Returns:
    The same ``cleaned_data`` dict. Projects that do not appear in
    ``status_data`` get the status 'Unknown'.
  """
  project_to_status = status_data.set_index('project')['status'].to_dict()
  for name in cleaned_data:
    frame = cleaned_data[name]
    frame['status'] = project_to_status.get(name, 'Unknown')
  return cleaned_data

# Add the 'status' column to every project frame
cleaned_data = merge_status(cleaned_data, status_data)

# Narrow every project frame to the commit-activity columns plus the status
selected_columns = ["commits", "authors", "committers", "status"]
cleaned_data = {
    project_key: project_df[selected_columns]
    for project_key, project_df in cleaned_data.items()
}


# Keep scraper output only for projects for which we have issue data

In [3]:
# Remove 'kiso' from the issue data *before* filtering the scraper data.
# Removing it afterwards could leave 'kiso' in cleaned_data while it is gone
# from issues_cleaned_data, which makes the later issues_cleaned_data[project]
# lookup raise KeyError.
# NOTE(review): the reason 'kiso' is excluded is not recorded here -- confirm.
issues_cleaned_data.pop('kiso', None)

# Get the keys from issues_cleaned_data
issues_cleaned_data_keys = set(issues_cleaned_data.keys())

# Filter cleaned_data to keep only the keys present in issues_cleaned_data
cleaned_data = {key: value for key, value in cleaned_data.items() if key in issues_cleaned_data_keys}

# Deepcopy cleaned_data to create cleaned_data_transformer
import copy
cleaned_data_transformer = copy.deepcopy(cleaned_data)

# Print the number of projects kept in each dictionary
print("Filtered cleaned_data:")
print(len(cleaned_data_transformer), len(issues_cleaned_data))


Filtered cleaned_data:
87 87


# Transformer model on combined issues and scraper data

In [4]:
import torch
import random
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset, Subset, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight

# Seed torch (CPU and all CUDA devices), NumPy and Python's random module
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
np.random.seed(42)
random.seed(42)

# Device selection: prefer CUDA, then Apple MPS, otherwise fall back to CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the pretrained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Data Preprocessing
max_length = 512

data = []
labels = []

# Use the tokenizer's real separator token ("[SEP]" for BERT). A bare "SEP"
# string is tokenized as an ordinary word and marks no segment boundary.
separator_token = tokenizer.sep_token

for project, df in cleaned_data_transformer.items():
    # One space-joined string per row of scraper metrics (status excluded)
    text = df.drop(columns=['status']).astype(str).agg(' '.join, axis=1).tolist()
    # One space-joined string per issue row (status excluded)
    issue_text = issues_cleaned_data[project].drop(columns=['status']).astype(str).agg(' '.join, axis=1).tolist()
    text = text + [separator_token] + issue_text

    # Cap the number of row strings. This bounds the size of the joined text;
    # the actual token-level truncation happens in the tokenizer.
    # NOTE(review): each row expands to several tokens, so for projects with
    # many scraper rows the issue rows may be truncated away entirely -- confirm
    # this is acceptable.
    text = text[:max_length]

    # No manual padding: ProjectDataset tokenizes with padding='max_length',
    # which pads with the tokenizer's pad token and masks it out. Appending
    # literal "PAD" words would instead be encoded as real, attended tokens.

    # All rows of a project share one status, so the first row is the label
    label = df['status'].iloc[0]
    labels.append(label)
    data.append(" ".join(text))


# Labels are Retired=0, Graduated=1
# NOTE(review): every status other than 'Retired' maps to 1, including the
# 'Unknown' default that merge_status assigns to unmatched projects.
labels = [0 if label == 'Retired' else 1 for label in labels]

# Compute class weights
class_weights = compute_class_weight(class_weight="balanced",
                                     classes=np.unique(labels),
                                     y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Custom Dataset Class


class ProjectDataset(Dataset):
  """Dataset that tokenizes one text per item at access time.

  Each item is a dict with 1-D 'input_ids' and 'attention_mask' tensors
  produced by ``tokenizer.encode_plus`` and a scalar long 'label' tensor.
  """

  def __init__(self, data, labels, tokenizer, max_length):
    """Store the texts, their labels, the tokenizer and the encoding length."""
    self.data, self.labels = data, labels
    self.tokenizer, self.max_length = tokenizer, max_length

  def __len__(self):
    """Return the number of texts in the dataset."""
    return len(self.data)

  def __getitem__(self, item):
    """Encode the text at index ``item`` and return it with its label."""
    text, label = self.data[item], self.labels[item]

    # Pad / truncate every text to exactly `max_length` and return tensors
    encode_options = dict(
        add_special_tokens=True,
        max_length=self.max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded = self.tokenizer.encode_plus(text, **encode_options)

    # encode_plus returns a leading batch dimension; flatten it away
    sample = {
        field: encoded[field].flatten()
        for field in ('input_ids', 'attention_mask')
    }
    sample['label'] = torch.tensor(label, dtype=torch.long)
    return sample


# Dataset over all projects (before any split)
dataset = ProjectDataset(data, labels, tokenizer, max_length)

# Stratified train/test/validation split (70/20/10):
# hold out 30% first, then divide that hold-out 2:1 into test and validation.
split_options = dict(random_state=42, shuffle=True)

train_data, test_val_data, train_labels, test_val_labels = train_test_split(
    data, labels, test_size=0.3, stratify=labels, **split_options)

test_data, val_data, test_labels, val_labels = train_test_split(
    test_val_data, test_val_labels, test_size=1/3, stratify=test_val_labels, **split_options)

# Wrap each partition in its own tokenizing dataset
train_dataset, test_dataset, val_dataset = (
    ProjectDataset(texts, targets, tokenizer, max_length)
    for texts, targets in (
        (train_data, train_labels),
        (test_data, test_labels),
        (val_data, val_labels),
    )
)


def train_model(model, train_loader, val_loader, optimizer, epochs=30, patience=5):
  """Train ``model`` with class-weighted cross-entropy and early stopping.

  Reads the module-level ``class_weights`` (loss weights) and ``device``
  (where batches are moved). Per-epoch train and validation loss/accuracy
  are printed.

  Args:
    model: Model called as ``model(input_ids=..., attention_mask=...)`` whose
      output exposes ``.logits``.
    train_loader: Iterable of batches with 'input_ids', 'attention_mask' and
      'label' entries, used for optimisation.
    val_loader: Same batch format, used for the early-stopping criterion.
    optimizer: Optimizer over ``model``'s parameters.
    epochs: Maximum number of epochs.
    patience: Number of consecutive epochs without a new best validation loss
      after which training stops.

  Returns:
    None. ``model`` is updated in place; on exit it holds the weights of the
    epoch with the lowest validation loss rather than those of the last epoch.
  """
  best_val_loss = float('inf')
  best_state = None  # CPU snapshot of the best weights seen so far
  patience_counter = 0
  loss_fn = torch.nn.CrossEntropyLoss(
      weight=class_weights)  # Apply class weights

  for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_train_loss = 0
    correct_train = 0
    total_train = 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}"):
      optimizer.zero_grad()
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)

      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      logits = outputs.logits
      loss = loss_fn(logits, labels)  # Compute weighted loss

      total_train_loss += loss.item()
      _, predicted = torch.max(logits, dim=1)
      correct_train += (predicted == labels).sum().item()
      total_train += labels.size(0)

      loss.backward()
      optimizer.step()

    # ---- validation pass ----
    model.eval()
    correct_val = 0
    total_val = 0
    val_loss = 0

    with torch.no_grad():
      for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)  # Compute weighted loss

        val_loss += loss.item()
        _, predicted = torch.max(logits, dim=1)
        correct_val += (predicted == labels).sum().item()
        total_val += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_acc = correct_val / total_val
    print(f"Epoch {epoch+1} - Train Loss: {total_train_loss/len(train_loader)} | Train Acc: {correct_train/total_train}")
    print(f"Epoch {epoch+1} - Val Loss: {avg_val_loss} | Val Acc: {val_acc}")

    if avg_val_loss < best_val_loss:
      best_val_loss = avg_val_loss
      patience_counter = 0
      # Snapshot the weights on the CPU so the copy does not consume
      # accelerator memory; clone() guards against aliasing when the model
      # already lives on the CPU.
      best_state = {name: tensor.detach().cpu().clone()
                    for name, tensor in model.state_dict().items()}
    else:
      patience_counter += 1
      if patience_counter >= patience:
        print("Early stopping triggered.")
        break

  # Without this, the caller would evaluate the weights of the final epoch,
  # which early stopping has just judged to be worse than the best epoch.
  if best_state is not None:
    model.load_state_dict(best_state)


# Model training and evaluation
# Module-level trackers, updated by execute_hyperparameter_combination via `global`.
# best_hyperparams: (dropout, weight_decay, lr, batch_size) of the best run so
# far; None until a run exceeds best_test_acc.
best_hyperparams = None
# best_test_acc: highest test-set accuracy seen across runs.
# NOTE(review): the "best" configuration is chosen by test accuracy, so the
# reported best test score is optimistically biased -- consider selecting on
# validation performance instead.
best_test_acc = 0.0


def execute_hyperparameter_combination(dropout, weight_decay, lr, batch_size):
  """Train and test one BERT classifier for a single hyperparameter setting.

  Builds loaders over the module-level train/val/test datasets, fine-tunes a
  fresh 'bert-base-uncased' sequence classifier via ``train_model``, prints
  the test accuracy and plots the confusion matrix. Updates the module-level
  ``best_hyperparams`` / ``best_test_acc`` when this run's test accuracy is
  the highest seen so far.

  Args:
    dropout: Used for both hidden and attention dropout probabilities.
    weight_decay: Weight decay passed to the AdamW optimizer.
    lr: Learning rate passed to the AdamW optimizer.
    batch_size: Batch size for all three data loaders.

  Returns:
    None.
  """
  global best_hyperparams, best_test_acc
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  val_loader = DataLoader(val_dataset, batch_size=batch_size)
  test_loader = DataLoader(test_dataset, batch_size=batch_size)

  model = BertForSequenceClassification.from_pretrained(
      'bert-base-uncased',
      num_labels=2,
      hidden_dropout_prob=dropout,
      attention_probs_dropout_prob=dropout
  ).to(device)

  optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

  train_model(model, train_loader, val_loader, optimizer, epochs=30)

  # Final testing
  print("Final evaluation on test set...")
  model.eval()
  all_preds = []
  all_labels = []

  with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)

      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      logits = outputs.logits
      _, predicted = torch.max(logits, dim=1)
      all_preds.extend(predicted.cpu().numpy())
      all_labels.extend(labels.cpu().numpy())

  test_acc = accuracy_score(all_labels, all_preds)
  print(f"\nTest Accuracy: {test_acc:.4f}")

  # Pin the class order to [0, 1] (Retired, Graduated). Without `labels`, the
  # matrix only covers classes that actually occur in the targets/predictions,
  # so it could shrink to 1x1 and no longer match the fixed tick labels below.
  conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])
  sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=[
              "Retired", "Graduated"], yticklabels=["Retired", "Graduated"])
  plt.xlabel("Predicted")
  plt.ylabel("Actual")
  plt.title("Confusion Matrix")
  plt.show()

  # Track the best configuration across calls
  if test_acc > best_test_acc:
    best_test_acc = test_acc
    best_hyperparams = (dropout, weight_decay, lr, batch_size)


# Full grid of hyperparameter settings. Dropout varies slowest and batch size
# fastest, so consecutive entries differ in batch size first.
hyperparameter_combinations = [
    {
        "dropout": dropout_value,
        "weight_decay": decay_value,
        "learning_rate": lr_value,
        "batch_size": batch_value,
    }
    for dropout_value in [0.1, 0.2]
    for decay_value in [0, 0.01, 0.1]
    for lr_value in [1e-5, 5e-5, 2e-4]
    for batch_value in [16, 32, 64]
]

Using device: mps
512


In [5]:
# Run hyperparameter combinations with indices 0-9
for combo_index in range(0, 10):
  combo = hyperparameter_combinations[combo_index]
  execute_hyperparameter_combination(
      dropout=combo['dropout'],
      weight_decay=combo['weight_decay'],
      lr=combo['learning_rate'],
      batch_size=combo['batch_size'],
  )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/30:   0%|          | 0/4 [00:09<?, ?it/s]


RuntimeError: MPS backend out of memory (MPS allocated: 8.84 GB, other allocations: 194.67 MB, max allowed: 9.07 GB). Tried to allocate 192.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# Run hyperparameter combinations with indices 10-19
for combo_index in range(10, 20):
  combo = hyperparameter_combinations[combo_index]
  execute_hyperparameter_combination(
      dropout=combo['dropout'],
      weight_decay=combo['weight_decay'],
      lr=combo['learning_rate'],
      batch_size=combo['batch_size'],
  )

In [None]:
# Run hyperparameter combinations with indices 20-29
for combo_index in range(20, 30):
  combo = hyperparameter_combinations[combo_index]
  execute_hyperparameter_combination(
      dropout=combo['dropout'],
      weight_decay=combo['weight_decay'],
      lr=combo['learning_rate'],
      batch_size=combo['batch_size'],
  )

In [None]:
# Run hyperparameter combinations with indices 30-39
for combo_index in range(30, 40):
  combo = hyperparameter_combinations[combo_index]
  execute_hyperparameter_combination(
      dropout=combo['dropout'],
      weight_decay=combo['weight_decay'],
      lr=combo['learning_rate'],
      batch_size=combo['batch_size'],
  )

In [None]:
# Run the remaining hyperparameter combinations (index 40 through the last)
for combo_index in range(40, len(hyperparameter_combinations)):
  combo = hyperparameter_combinations[combo_index]
  execute_hyperparameter_combination(
      dropout=combo['dropout'],
      weight_decay=combo['weight_decay'],
      lr=combo['learning_rate'],
      batch_size=combo['batch_size'],
  )