# Import Dataset, drop 0 columns

In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


def clean_csv_files(folder_path):
  cleaned_dataframes = {}

  # List of columns to drop
  columns_to_drop = [
      "status", "start_date", "end_date", "window_start_date", "window_end_date",
      "emails", "devs", "emails_thread_starter", "emails_thread_starter_word_count",
      "emails_thread_starter_characters", "emails_threads", "emails_threads_word_count",
      "emails_threads_characters", "emails_no_replies", "emails_no_replies_word_count",
      "emails_no_replies_characters", "emails_jira", "most_complex_unit_loc",
      "most_complex_unit_mcabe_index", "total_number_of_files", "number_of_files_main",
      "lines_of_code_main", "number_of_files_test", "lines_of_code_test",
      "test_vs_main_lines_of_code_percentage", "number_of_files_generated",
      "lines_of_code_generated", "number_of_files_build_and_deployment",
      "lines_of_code_build_and_deployment", "negligible_risk_file_size_count",
      "low_risk_file_size_count", "medium_risk_file_size_count", "high_risk_file_size_count",
      "very_high_risk_file_size_count", "negligible_risk_file_size_loc", "low_risk_file_size_loc",
      "medium_risk_file_size_loc", "high_risk_file_size_loc", "very_high_risk_file_size_loc",
      "number_of_units", "lines_of_code_in_units", "lines_of_code_outside_units",
      "unit_size_negligible_risk_loc", "unit_size_negligible_risk_count", "unit_size_low_risk_loc",
      "unit_size_low_risk_count", "unit_size_medium_risk_loc", "unit_size_medium_risk_count",
      "unit_size_high_risk_loc", "unit_size_high_risk_count", "unit_size_very_high_risk_loc",
      "unit_size_very_high_risk_count", "conditional_complexity_negligible_risk_loc",
      "conditional_complexity_negligible_risk_count", "conditional_complexity_low_risk_loc",
      "conditional_complexity_low_risk_count", "conditional_complexity_medium_risk_loc",
      "conditional_complexity_medium_risk_count", "conditional_complexity_high_risk_loc",
      "conditional_complexity_high_risk_count", "conditional_complexity_very_high_risk_loc",
      "conditional_complexity_very_high_risk_count", "conditional_complexity_high_plus_risk_count",
      "conditional_complexity_high_plus_risk_loc", "number_of_contributors",
      "duplication_number_of_duplicates", "duplication_number_of_files_with_duplicates",
      "duplication_number_of_duplicated_lines", "duplication_percentage", "unit_duplicates_count", "releases"
  ]

  for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
      file_path = os.path.join(folder_path, filename)

      # Load CSV file
      df = pd.read_csv(file_path)

      # Drop specified columns
      df = df.drop(
          columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')

      key = os.path.splitext(filename)[0]
      cleaned_dataframes[key] = df

  return cleaned_dataframes

folder_path = "scraper-output"
cleaned_data = clean_csv_files(folder_path)


# Clean data

In [5]:
import pandas as pd
import numpy as np

for key, df in cleaned_data.items():
    # Replace NaN values in numerical columns with 0
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(0)

    # Replace NaN and blank/empty values in 'programming_lang' column with the mode
    if 'programming_lang' in df.columns:
        # Calculate mode value
        mode_value = df['programming_lang'].mode()[0] if not df['programming_lang'].mode().empty else 'Unknown'
        
        # Replace NaN values with the mode
        df['programming_lang'] = df['programming_lang'].fillna(mode_value)
        
        # Replace blank or whitespace-only values with the mode
        df['programming_lang'] = df['programming_lang'].replace(r'^\s*$', mode_value, regex=True)

# Compute PCA to rank relevance of features

In [6]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def perform_pca_on_each(cleaned_data):
  feature_importance_list = []

  for key, df in cleaned_data.items():
    # Exclude 'project', 'measurement_month', and 'programming_lang' columns
    features = df.drop(
        columns=['project', 'measurement_month', 'programming_lang'], errors='ignore')

    # Handle missing values - fill or drop NaNs
    features = features.fillna(0)

    # Drop columns with zero variance
    features = features.loc[:, features.var() > 0]

    # Check if there are any numeric features left
    numeric_features = features.select_dtypes(include=[np.number])
    if numeric_features.empty:
      print(
          f"Warning: No numeric features left for PCA in {key}. Skipping PCA.")
      continue

    # Standardize the data
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(numeric_features)

    # Perform PCA
    pca = PCA()
    pca.fit(scaled_features)

    # Collect feature importance
    feature_importance = dict(
        zip(numeric_features.columns, pca.explained_variance_ratio_))
    feature_importance_list.append(feature_importance)

  # Compute average importance across all DataFrames
  avg_feature_importance = {}
  for feature_dict in feature_importance_list:
    for feature, importance in feature_dict.items():
      if feature not in avg_feature_importance:
        avg_feature_importance[feature] = []
      avg_feature_importance[feature].append(importance)

  # Compute final average
  avg_feature_importance = {feature: sum(
      values) / len(values) for feature, values in avg_feature_importance.items()}

  # Rank features by average importance
  ranked_features = sorted(avg_feature_importance.items(),
                           key=lambda x: x[1], reverse=True)

  # Display ranked features
  print("Final Ranked Features by Average Importance:")
  for feature, importance in ranked_features:
    print(f"{feature}: {importance:.4f}")


# Perform PCA on each DataFrame and compute overall importance
perform_pca_on_each(cleaned_data)

Final Ranked Features by Average Importance:
commits: 0.3743
authors: 0.1941
committers: 0.1070
minor_contributors: 0.0689
major_contributors: 0.0564
directories: 0.0438
top_level_dirs: 0.0367
active_days: 0.0298
files_modified: 0.0242
files_added: 0.0195
files_deleted: 0.0156
files_renamed: 0.0124
added_lines: 0.0094
deleted_lines: 0.0072
new_contributors: 0.0054
avg_files_modified_commit: 0.0040
code: 0.0028
blanks: 0.0020
files: 0.0014
comments: 0.0009
lines: 0.0005
stars: 0.0003
forks: 0.0002
open_prs: 0.0002
closed_prs: 0.0001
merged_prs: 0.0000
stale_prs: 0.0000
deploys: 0.0000


# Kernel PCA

In [15]:
import pandas as pd
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
import numpy as np


def perform_kernel_pca_on_each(cleaned_data):
  # List to hold feature importance for each kernel
  feature_importance_dict = {
      'linear': [],
      'poly': [],
      'rbf': [],
  }

  # You can add more kernels if needed
  kernels = ['linear', 'poly', 'rbf']

  for key, df in cleaned_data.items():
    # Exclude 'project', 'measurement_month', and 'programming_lang' columns
    features = df.drop(
        columns=['project', 'measurement_month', 'programming_lang'], errors='ignore')

    # Handle missing values - fill or drop NaNs
    features = features.fillna(0)

    # Drop columns with zero variance
    features = features.loc[:, features.var() > 0]

    # Check if there are any numeric features left
    numeric_features = features.select_dtypes(include=[np.number])
    if numeric_features.empty:
      print(
          f"Warning: No numeric features left for Kernel PCA in {key}. Skipping Kernel PCA.")
      continue

    # Standardize the data
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(numeric_features)

    # Perform Kernel PCA for each kernel and collect feature importance
    for kernel in kernels:
      kpca = KernelPCA(kernel=kernel, n_components=min(
          5, len(numeric_features.columns)))
      kpca.fit(scaled_features)

      # Collect feature importance based on explained variance ratio
      explained_variance_ratio = np.var(
          kpca.transform(scaled_features), axis=0)
      feature_importance = dict(
          zip(numeric_features.columns, explained_variance_ratio))

      feature_importance_dict[kernel].append(feature_importance)

  # For each kernel, compute average feature importance across all dataframes
  avg_feature_importance_dict = {}
  for kernel in kernels:
    avg_feature_importance = {}

    # Aggregate feature importance for each kernel
    for feature_dict in feature_importance_dict[kernel]:
      for feature, importance in feature_dict.items():
        if feature not in avg_feature_importance:
          avg_feature_importance[feature] = []
        avg_feature_importance[feature].append(importance)

    # Compute the average importance for each feature
    avg_feature_importance = {feature: sum(
        values) / len(values) for feature, values in avg_feature_importance.items()}

    # Sort features by average importance
    ranked_features = sorted(avg_feature_importance.items(),
                             key=lambda x: x[1], reverse=True)

    avg_feature_importance_dict[kernel] = ranked_features

  # Display top 5 ranked features for each kernel
  for kernel, ranked_features in avg_feature_importance_dict.items():
    print(f"\nTop 5 Features for Kernel: {kernel.capitalize()}")
    for feature, importance in ranked_features[:5]:
      print(f"{feature}: {importance:.4f}")


# Perform Kernel PCA on each DataFrame and compute overall importance
perform_kernel_pca_on_each(cleaned_data)


Top 5 Features for Kernel: Linear
commits: 9.4270
authors: 4.9040
committers: 2.7017
minor_contributors: 1.7820
major_contributors: 1.4192

Top 5 Features for Kernel: Poly
commits: 47.8842
authors: 9.1173
committers: 4.0361
directories: 2.1914
minor_contributors: 2.0498

Top 5 Features for Kernel: Rbf
commits: 0.1688
authors: 0.0838
committers: 0.0533
active_days: 0.0395
minor_contributors: 0.0370
