# Import Dataset, drop 0 columns

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


def clean_csv_files(folder_path):
  cleaned_dataframes = {}

  # List of columns to drop
  columns_to_drop = [
      "status", "start_date", "end_date", "window_start_date", "window_end_date",
      "emails", "devs", "emails_thread_starter", "emails_thread_starter_word_count",
      "emails_thread_starter_characters", "emails_threads", "emails_threads_word_count",
      "emails_threads_characters", "emails_no_replies", "emails_no_replies_word_count",
      "emails_no_replies_characters", "emails_jira", "most_complex_unit_loc",
      "most_complex_unit_mcabe_index", "total_number_of_files", "number_of_files_main",
      "lines_of_code_main", "number_of_files_test", "lines_of_code_test",
      "test_vs_main_lines_of_code_percentage", "number_of_files_generated",
      "lines_of_code_generated", "number_of_files_build_and_deployment",
      "lines_of_code_build_and_deployment", "negligible_risk_file_size_count",
      "low_risk_file_size_count", "medium_risk_file_size_count", "high_risk_file_size_count",
      "very_high_risk_file_size_count", "negligible_risk_file_size_loc", "low_risk_file_size_loc",
      "medium_risk_file_size_loc", "high_risk_file_size_loc", "very_high_risk_file_size_loc",
      "number_of_units", "lines_of_code_in_units", "lines_of_code_outside_units",
      "unit_size_negligible_risk_loc", "unit_size_negligible_risk_count", "unit_size_low_risk_loc",
      "unit_size_low_risk_count", "unit_size_medium_risk_loc", "unit_size_medium_risk_count",
      "unit_size_high_risk_loc", "unit_size_high_risk_count", "unit_size_very_high_risk_loc",
      "unit_size_very_high_risk_count", "conditional_complexity_negligible_risk_loc",
      "conditional_complexity_negligible_risk_count", "conditional_complexity_low_risk_loc",
      "conditional_complexity_low_risk_count", "conditional_complexity_medium_risk_loc",
      "conditional_complexity_medium_risk_count", "conditional_complexity_high_risk_loc",
      "conditional_complexity_high_risk_count", "conditional_complexity_very_high_risk_loc",
      "conditional_complexity_very_high_risk_count", "conditional_complexity_high_plus_risk_count",
      "conditional_complexity_high_plus_risk_loc", "number_of_contributors",
      "duplication_number_of_duplicates", "duplication_number_of_files_with_duplicates",
      "duplication_number_of_duplicated_lines", "duplication_percentage", "unit_duplicates_count", "releases"
  ]

  for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
      file_path = os.path.join(folder_path, filename)

      # Load CSV file
      df = pd.read_csv(file_path)

      # Drop specified columns
      df = df.drop(
          columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')

      key = os.path.splitext(filename)[0]
      cleaned_dataframes[key] = df

  return cleaned_dataframes

folder_path = "./scraper-output"
cleaned_data = clean_csv_files(folder_path)


# Clean data

In [2]:
import pandas as pd
import numpy as np

for key, df in cleaned_data.items():
    # Replace NaN values in numerical columns with 0
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(0)

    # Replace NaN and blank/empty values in 'programming_lang' column with the mode
    if 'programming_lang' in df.columns:
        # Calculate mode value
        mode_value = df['programming_lang'].mode()[0] if not df['programming_lang'].mode().empty else 'Unknown'
        
        # Replace NaN values with the mode
        df['programming_lang'] = df['programming_lang'].fillna(mode_value)
        
        # Replace blank or whitespace-only values with the mode
        df['programming_lang'] = df['programming_lang'].replace(r'^\s*$', mode_value, regex=True)

# Compute PCA to rank relevance of features

In [3]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def perform_pca_on_each(cleaned_data):
  feature_importance_list = []

  for key, df in cleaned_data.items():
    # Exclude 'project', 'measurement_month', and 'programming_lang' columns
    features = df.drop(
        columns=['project', 'measurement_month', 'programming_lang'], errors='ignore')

    # Handle missing values - fill or drop NaNs
    features = features.fillna(0)

    # Drop columns with zero variance
    features = features.loc[:, features.var() > 0]

    # Check if there are any numeric features left
    numeric_features = features.select_dtypes(include=[np.number])
    if numeric_features.empty:
      print(
          f"Warning: No numeric features left for PCA in {key}. Skipping PCA.")
      continue

    # Standardize the data
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(numeric_features)

    # Perform PCA
    pca = PCA()
    pca.fit(scaled_features)

    # Collect feature importance
    feature_importance = dict(
        zip(numeric_features.columns, pca.explained_variance_ratio_))
    feature_importance_list.append(feature_importance)

  # Compute average importance across all DataFrames
  avg_feature_importance = {}
  for feature_dict in feature_importance_list:
    for feature, importance in feature_dict.items():
      if feature not in avg_feature_importance:
        avg_feature_importance[feature] = []
      avg_feature_importance[feature].append(importance)

  # Compute final average
  avg_feature_importance = {feature: sum(
      values) / len(values) for feature, values in avg_feature_importance.items()}

  # Rank features by average importance
  ranked_features = sorted(avg_feature_importance.items(),
                           key=lambda x: x[1], reverse=True)

  # Display ranked features
  print("Final Ranked Features by Average Importance:")
  for feature, importance in ranked_features:
    print(f"{feature}: {importance:.4f}")


# Perform PCA on each DataFrame and compute overall importance
perform_pca_on_each(cleaned_data)

Final Ranked Features by Average Importance:
commits: 0.3743
authors: 0.1941
committers: 0.1070
minor_contributors: 0.0689
major_contributors: 0.0564
directories: 0.0438
top_level_dirs: 0.0367
active_days: 0.0298
files_modified: 0.0242
files_added: 0.0195
files_deleted: 0.0156
files_renamed: 0.0124
added_lines: 0.0094
deleted_lines: 0.0072
new_contributors: 0.0054
avg_files_modified_commit: 0.0040
code: 0.0028
blanks: 0.0020
files: 0.0014
comments: 0.0009
lines: 0.0005
stars: 0.0003
forks: 0.0002
open_prs: 0.0002
closed_prs: 0.0001
merged_prs: 0.0000
stale_prs: 0.0000
deploys: 0.0000


In [4]:
status_data = pd.read_csv("./project-status.csv")
status_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   project  154 non-null    object
 1   status   154 non-null    object
dtypes: object(2)
memory usage: 2.5+ KB


# Interaction analysis on most important features

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Filter out projects with fewer than 10 data points
cleaned_data = {project: df for project,
                df in cleaned_data.items() if len(df) >= 10}

# Filter status_data to only include projects present in cleaned_data
status_data_filtered = status_data[status_data['project'].isin(
    cleaned_data.keys())]


# Merge project status into each dataframe
def merge_status(cleaned_data, status_data):
  status_dict = status_data.set_index('project')['status'].to_dict()
  for project, df in cleaned_data.items():
    df['status'] = status_dict.get(project, 'Unknown')
  return cleaned_data

# Function to extract 1/10th segments and compute averages
def extract_average_feature(df, feature, num_bins=10):
  df = df.sort_values(by='measurement_month')  # Ensure time is sorted
  bin_size = max(1, len(df) // num_bins)  # Determine bin size
  averages = [df[feature].iloc[i *
                               bin_size: (i + 1) * bin_size].mean() for i in range(num_bins)]
  return averages

# Function to plot feature trends
def plot_feature(cleaned_data, status_data, feature, num_bins=10):
  cleaned_data = merge_status(cleaned_data, status_data)

  # Collect averaged data for each status
  grouped_data = {status: [[] for _ in range(
      num_bins)] for status in status_data['status'].unique()}
  for project, df in cleaned_data.items():
    if feature in df.columns:
      status = df['status'].iloc[0]
      averages = extract_average_feature(df, feature, num_bins)
      for i, avg in enumerate(averages):
        grouped_data[status][i].append(avg)
  # Compute overall average per bin for each status group
  vals[feature] = {}
  for status, bins in grouped_data.items():
    avg_series = [
        np.mean(bin_values) if bin_values else 0 for bin_values in bins]
    vals[feature][status] = avg_series

vals = {}
# Function to plot all features
def plot_all_features(cleaned_data, status_data, features, num_bins=10):
  for feature in features:
    plot_feature(cleaned_data, status_data, feature, num_bins)

plot_all_features(cleaned_data, status_data, [
    'commits', 'authors', 'committers', 'minor_contributors', 'major_contributors',
    'directories', 'top_level_dirs', 'active_days', 'files_modified', 'files_added',
    'files_deleted', 'files_renamed', 'added_lines', 'deleted_lines', 'new_contributors',
    'avg_files_modified_commit'
])
cols = [
    'commits', 'authors', 'committers', 'minor_contributors', 'major_contributors',
    'directories', 'top_level_dirs', 'active_days', 'files_modified', 'files_added',
    'files_deleted', 'files_renamed', 'added_lines', 'deleted_lines', 'new_contributors',
    'avg_files_modified_commit']


In [6]:
print(vals['authors'])

{'Graduated': [np.float64(3.6469620540321324), np.float64(4.834731567723854), np.float64(5.7134331558558475), np.float64(6.789886546881157), np.float64(7.83981690976823), np.float64(8.287940487997902), np.float64(9.38176788002519), np.float64(9.205150983126684), np.float64(7.821308593205243), np.float64(7.032237664218397)], 'Retired': [np.float64(2.4073766498488065), np.float64(2.9339578059550218), np.float64(2.6154146665428524), np.float64(2.1636765491713015), np.float64(2.24237218292048), np.float64(2.0806747808841406), np.float64(1.8183319551631825), np.float64(1.773077465477337), np.float64(1.4839097504432692), np.float64(1.1302161710520033)]}


In [7]:
import ipywidgets as widgets

# Dropdown widgets
y_axis_1 = widgets.Dropdown(options=cols, value='authors', description='Y-axis:1')
y_axis_2 = widgets.Dropdown(options=cols, value='commits', description='Y-axis:2')
print(y_axis_1.value)
# Function to update the plot
def update_plot(x_col, y_col):
    fig, ax1 = plt.subplots(figsize=(8, 6))
    print(y_axis_1.value)
    ax1.plot(range(10), vals[y_axis_1.value]['Graduated'], color="red", marker='o', label=y_axis_1.value+'-Graduated')
    ax1.plot(range(10), vals[y_axis_1.value]['Retired'], color="blue", marker='o', label=y_axis_1.value+'-Retired')
    ax1.set_xlabel('Normalized Time (Bins)')
    ax1.set_ylabel(y_axis_1.value, color='black')
    ax1.tick_params(axis='y', labelcolor='black')
    
    ax2 = ax1.twinx()
    ax2.plot(range(10), vals[y_axis_2.value]['Graduated'], color="green", marker='o', label=y_axis_2.value+'-Graduated')
    ax2.plot(range(10), vals[y_axis_2.value]['Retired'], color="purple", marker='o', label=y_axis_2.value+'-Retired')
    ax2.tick_params(axis='y', labelcolor='black')    
    ax2.set_ylabel(y_axis_2.value, color='black')
    fig.legend()
    plt.show()

# Interactive widget
ui = widgets.VBox([y_axis_1, y_axis_2])
out = widgets.interactive_output(update_plot, {'x_col': y_axis_1, 'y_col': y_axis_2})

# Display widgets and plot
display(ui, out)

authors


VBox(children=(Dropdown(description='Y-axis:1', index=1, options=('commits', 'authors', 'committers', 'minor_c…

Output()

In [8]:
# Dropdown widgets
import pandas as pd
y_axis_1 = widgets.Dropdown(options=cols, value='authors', description='Y-axis:1')
y_axis_2 = widgets.Dropdown(options=cols, value='commits', description='Y-axis:2')
print(y_axis_1.value)
# Function to update the plot
def update_plot(x_col, y_col):
    df = pd.DataFrame({
        y_axis_1.value+'-Graduated': vals[y_axis_1.value]['Graduated'],
        y_axis_1.value+'-Retired': vals[y_axis_1.value]['Retired'],
        y_axis_2.value+'-Graduated': vals[y_axis_2.value]['Graduated'],
        y_axis_2.value+'-Retired': vals[y_axis_2.value]['Retired'],
    })
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(6, 5))
    cax = ax.imshow(corr, cmap='coolwarm', interpolation='nearest')
    
    # Add colorbar
    fig.colorbar(cax)
    
    # Set axis labels
    ax.set_xticks(np.arange(len(corr.columns)))
    ax.set_yticks(np.arange(len(corr.columns)))
    ax.set_xticklabels(corr.columns, rotation=45)
    ax.set_yticklabels(corr.columns)
    
    # Display values in cells
    for i in range(len(corr.columns)):
        for j in range(len(corr.columns)):
            ax.text(j, i, f"{corr.iloc[i, j]:.2f}", ha='center', va='center', color='black')
    
    plt.title("Correlation Heatmap")
    plt.show()

# Interactive widget
ui = widgets.VBox([y_axis_1, y_axis_2])
out = widgets.interactive_output(update_plot, {'x_col': y_axis_1, 'y_col': y_axis_2})

# Display widgets and plot
display(ui, out)


authors


VBox(children=(Dropdown(description='Y-axis:1', index=1, options=('commits', 'authors', 'committers', 'minor_c…

Output()

In [9]:

import ipywidgets as widgets
import matplotlib.pyplot as plt

# Dropdown widgets
y_axis = widgets.Dropdown(options=cols, value='authors', description='Y-axis:')
x_axis = widgets.Dropdown(options=cols, value='commits', description='X-axis:')

# Function to update the scatter plot
def update_plot(x_col, y_col):
    fig, ax = plt.subplots(figsize=(8, 6))
    
    # Extract numerical values for x and y axes
    x_values = vals[x_col] if isinstance(vals[x_col], list) else list(vals[x_col].values())
    y_values = vals[y_col] if isinstance(vals[y_col], list) else list(vals[y_col].values())
    
    # Scatter plot with selected x and y dimensions
    ax.scatter(x_values, y_values, color="blue", label=y_axis.value, alpha=0.7)
    
    ax.set_xlabel(x_axis.value)
    ax.set_ylabel(y_axis.value, color='black')
    ax.tick_params(axis='y', labelcolor='black')
    
    fig.legend()
    plt.show()

# Interactive widget
ui = widgets.VBox([x_axis, y_axis])
out = widgets.interactive_output(update_plot, {'x_col': x_axis, 'y_col': y_axis})

# Display widgets and plot
display(ui, out)

VBox(children=(Dropdown(description='X-axis:', options=('commits', 'authors', 'committers', 'minor_contributor…

Output()