In [1]:
import os
import pandas as pd
import re
import string
import ast
import json

import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the notebook configuration from config.json.
# The encoding is given explicitly: without it, open() falls back to the
# platform locale encoding (e.g. a Windows code page), which can mis-decode
# non-ASCII characters in a UTF-8 JSON file.
with open('config.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Configuration values used by the rest of the notebook
base_dir = config["base_dir"]          # root directory walked as <language>/<repo>/<files>
patterns = config["patterns"]          # substrings matched against lowercased file names
last_mod_coc = config["last_mod_coc"]  # path of the CSV read for Figure 1
flags = config["flags"]                # mapping: flag identifier -> list of keywords

In [3]:
# NLTK resources required by the text-processing cells:
# - 'punkt_tab': tokenizer models used to split text into sentences and words.
# - 'averaged_perceptron_tagger_eng': pretrained English part-of-speech tagger.
# - 'wordnet': English lexical database used for lemmatization.
for nltk_resource in ('punkt_tab', 'averaged_perceptron_tagger_eng', 'wordnet'):
    nltk.download(nltk_resource)

# Shared lemmatizer instance used by lemmatize_text
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\scobosga\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\scobosga\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\scobosga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# TABLE I

## Presence

In [4]:
presence_rows = []

# Walk base_dir/<language>/<repo> and tally, per language, the top-level
# repository entries whose lowercased name contains any configured pattern.
for language in os.listdir(base_dir):
    language_dir = os.path.join(base_dir, language)
    if not os.path.isdir(language_dir):
        continue  # only directories are treated as languages

    coc_files_count = 0
    for repo in os.listdir(language_dir):
        repo_dir = os.path.join(language_dir, repo)
        if not os.path.isdir(repo_dir):
            continue  # only directories are treated as repositories

        for entry in os.listdir(repo_dir):
            entry_name = entry.lower()
            if any(pattern in entry_name for pattern in patterns):
                coc_files_count += 1

    presence_rows.append({"Language": language, "Presence(#)": coc_files_count})

# One row per language
df_presence = pd.DataFrame(presence_rows)

# Share of matches relative to a fixed total of 1000.
# NOTE(review): 1000 is presumably the number of repositories sampled per
# language - confirm, and consider moving it to the configuration.
df_presence["Presence(%)"] = (df_presence["Presence(#)"] / 1000) * 100

## Content

In [5]:
content_rows = []

# Walk base_dir/<language>/<repo> and count, per language, the matching
# code-of-conduct files that contain more than 5 lines.
for language in os.listdir(base_dir):
    language_dir = os.path.join(base_dir, language)
    if not os.path.isdir(language_dir):
        continue  # only directories are treated as languages

    with_corpus_count = 0  # matching files with more than 5 lines

    for repo in os.listdir(language_dir):
        repo_dir = os.path.join(language_dir, repo)
        if not os.path.isdir(repo_dir):
            continue  # only directories are treated as repositories

        for file in os.listdir(repo_dir):
            # Keep only entries whose lowercased name contains a configured pattern
            if not any(pattern in file.lower() for pattern in patterns):
                continue

            coc_file_path = os.path.join(repo_dir, file)
            try:
                with open(coc_file_path, 'r', encoding='utf-8') as f:
                    # Count lines lazily instead of loading the whole file
                    line_count = sum(1 for _ in f)
            except (OSError, UnicodeError) as e:
                # Unreadable or non-UTF-8 entries (including matching
                # directories) are reported and skipped, not counted.
                print(f"Error reading the file {coc_file_path}: {e}")
                continue

            if line_count > 5:
                with_corpus_count += 1

    content_rows.append({
        "Language": language,
        "Content(#)": with_corpus_count
    })

# One row per language
df_content = pd.DataFrame(content_rows)

# Share of files with content relative to a fixed total of 1000.
# NOTE(review): 1000 is presumably the number of repositories sampled per
# language - confirm, and consider moving it to the configuration.
df_content["Content(%)"] = (df_content["Content(#)"] / 1000) * 100


## References to Contributor Covenant (CC)

In [6]:
def clean_text(text):
    """
    Normalizes a text for keyword matching.

    The text is lowercased, every character in ``string.punctuation`` is
    deleted (no space is inserted in its place), runs of whitespace are
    collapsed to a single space, and leading/trailing spaces are stripped.

    Parameters:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned and normalized text.
    """
    lowered = text.lower()

    # Delete ASCII punctuation with a translation table
    punctuation_free = lowered.translate(str.maketrans('', '', string.punctuation))

    # Collapse whitespace runs and trim both ends
    return re.sub(r'\s+', ' ', punctuation_free).strip()

In [7]:
# Collect the content of every Code of Conduct file found under
# base_dir/<language>/<repo>.
coc_records = []

for language in os.listdir(base_dir):
    language_dir = os.path.join(base_dir, language)
    if not os.path.isdir(language_dir):
        continue  # only directories are treated as languages

    for repo in os.listdir(language_dir):
        repo_dir = os.path.join(language_dir, repo)
        if not os.path.isdir(repo_dir):
            continue  # only directories are treated as repositories

        for file in os.listdir(repo_dir):
            # Keep only entries whose lowercased name contains a configured pattern
            if not any(pattern in file.lower() for pattern in patterns):
                continue

            file_path = os.path.join(repo_dir, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except (OSError, UnicodeError) as e:
                # Report and skip unreadable or non-UTF-8 entries, as the
                # Content section does, instead of aborting the whole cell.
                print(f"Error reading the file {file_path}: {e}")
                continue

            coc_records.append({"Language": language, "Repo_Name": repo, "CoC_Content": content})

# One row per collected CoC file
df_cc = pd.DataFrame(coc_records)

# Keep CoC files whose content splits into at least 5 newline-separated parts.
# NOTE(review): the Content section counts files with MORE than 5 lines, so
# the two sections do not select exactly the same files - confirm the intended threshold.
has_enough_lines = df_cc['CoC_Content'].apply(lambda x: len(x.split('\n'))) >= 5
# .copy() makes df_cc an independent frame so the column added below is not
# written into a slice of the unfiltered frame.
df_cc = df_cc[has_enough_lines].copy()

# Normalized content (lowercase, punctuation removed, whitespace collapsed).
# NOTE(review): clean_text deletes hyphens and dots without inserting a space,
# so forms such as "contributor-covenant" do not match the phrase searched below.
df_cc['CoC_Content_Clean'] = df_cc['CoC_Content'].apply(clean_text)

# Per language: percentage of the retained CoC files mentioning "contributor covenant"
r_cc_rows = []

languages = df_cc['Language'].unique()  # languages with at least one retained CoC file

for language in languages:
    df_language = df_cc[df_cc['Language'] == language]
    total_rows = len(df_language)  # never zero: languages come from df_cc itself
    rows_with_cc = df_language['CoC_Content_Clean'].str.contains("contributor covenant", case=False, na=False).sum()
    r_cc_rows.append({
        'Language': language,
        'r_cc(%)': (rows_with_cc / total_rows) * 100
    })

# One row per language
df_r_cc = pd.DataFrame(r_cc_rows)

In [8]:
# Assemble Table I: outer-join the presence, content and Contributor Covenant
# reference frames on "Language", so a language missing from one frame is
# still kept (with NaN in that frame's columns).
df_combined = (
    df_presence
    .merge(df_content, on="Language", how="outer")
    .merge(df_r_cc, on="Language", how="outer")
)

# Display the assembled table
df_combined


Unnamed: 0,Language,Presence(#),Presence(%),Content(#),Content(%),r_cc(%)
0,C,101,10.1,87,8.7,80.21978
1,C#,213,21.3,178,17.8,86.934673
2,C++,171,17.1,148,14.8,74.193548
3,Go,290,29.0,208,20.8,86.363636
4,Java,107,10.7,94,9.4,80.19802
5,JavaScript,242,24.2,215,21.5,87.782805
6,PHP,117,11.7,110,11.0,86.607143
7,Python,223,22.3,193,19.3,78.325123
8,Ruby,190,19.0,176,17.6,86.592179
9,Rust,230,23.0,202,20.2,83.732057


# FIGURE 1

In [9]:
# Load the per-repository dates from the CSV named in the configuration
df_f_coc = pd.read_csv(last_mod_coc)

# Parse the three date columns; values that cannot be parsed become NaT
date_columns = ['Code_of_Conduct_Last_Modified', 'Last_Commit_Date', 'Creation_Date']
for date_column in date_columns:
    df_f_coc[date_column] = pd.to_datetime(df_f_coc[date_column], errors='coerce')

# Work on an independent copy restricted to rows where all three dates are present
df_filtered = df_f_coc.dropna(subset=date_columns).copy()

# A repository is flagged active when its last commit falls within the six
# months before the moment this cell runs
active_threshold = pd.Timestamp.now() - pd.DateOffset(months=6)
df_filtered['Is_Active'] = df_filtered['Last_Commit_Date'] >= active_threshold

# Repository lifetime: whole days from creation to last commit, in 365-day years
lifetime = df_filtered['Last_Commit_Date'] - df_filtered['Creation_Date']
df_filtered['Repo_Lifetime_Years'] = lifetime.dt.days / 365

# Whole days from the last CoC modification to the last commit, in 365-day years
since_coc_change = df_filtered['Last_Commit_Date'] - df_filtered['Code_of_Conduct_Last_Modified']
df_filtered['Years_Difference'] = since_coc_change.dt.days / 365

# Freshness of the code of conduct (f_coc): Years_Difference relative to the repository lifetime
df_filtered['f_coc'] = df_filtered['Years_Difference'] / df_filtered['Repo_Lifetime_Years']

# Mean f_coc per language, as a two-column frame
mean_ratio_by_language = (
    df_filtered
    .groupby('Language')['f_coc']
    .mean()
    .reset_index(name='f_coc_average')
)

# Display the resulting DataFrame
mean_ratio_by_language


Unnamed: 0,Language,f_coc_average
0,C,0.52855
1,C#,0.578754
2,C++,0.505745
3,Go,0.565003
4,Java,0.500426
5,JavaScript,0.458285
6,PHP,0.466806
7,Python,0.554655
8,Ruby,0.511256
9,Rust,0.647693


# TABLE II

In [10]:
def get_wordnet_pos(word):
    """
    Returns the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the WordNet category.

    Parameters:
        word (str): The word to analyze.

    Returns:
        str: wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV;
             wordnet.NOUN when the tag's first letter is none of J/N/V/R.
    """
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    # pos_tag returns one (word, tag) pair for the single-word input
    tagged = nltk.pos_tag([word])
    pos_tag = tagged[0][1]
    initial = pos_tag[0].upper()
    return pos_by_initial.get(initial, wordnet.NOUN)

def lemmatize_text(text):
    """
    Replaces every token of a text with its lemma.

    The text is split with ``word_tokenize``; each token is lemmatized with
    the part of speech reported by ``get_wordnet_pos`` and the lemmas are
    joined back with single spaces.

    Parameters:
        text (str): The text to lemmatize.

    Returns:
        str: The space-joined lemmas of the input text.
    """
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return ' '.join(lemmas)

def find_flags(text, flags):
    """
    Finds which flag keywords occur in a text.

    Both the text and each keyword are lemmatized first; a keyword matches
    when its lemmatized form appears in the lemmatized text between word
    boundaries, ignoring case.

    Parameters:
        text (str): The text to analyze.
        flags (dict): Maps each flag identifier to a list of keywords.

    Returns:
        dict: Maps each flag that matched to the list of its original
              (non-lemmatized) keywords found; flags without a match are absent.
    """
    haystack = lemmatize_text(text)
    matches = {}
    for flag, keywords in flags.items():
        for keyword in keywords:
            # Escape the lemmatized keyword so it is matched literally
            needle = re.escape(lemmatize_text(keyword))
            if re.search(r'\b' + needle + r'\b', haystack, re.IGNORECASE):
                matches.setdefault(flag, []).append(keyword)
    return matches

def safe_eval(val):
    """
    Converts the string form of a Python literal into the object it describes.

    Strings are parsed with ``ast.literal_eval`` (literals only, no arbitrary
    code); any non-string value is passed through untouched.

    Parameters:
        val (object): The value to convert.

    Returns:
        object: The parsed literal for string input, otherwise ``val`` itself.
    """
    if not isinstance(val, str):
        return val
    return ast.literal_eval(val)


In [11]:
# Tag every cleaned CoC with the flags whose keywords occur in it.
# `flags` is the flag -> keyword-list mapping loaded from the configuration.
df_cc['Flags'] = df_cc['CoC_Content_Clean'].apply(lambda text: find_flags(text, flags))

# find_flags already returns dicts, which safe_eval passes through unchanged;
# the call only has an effect if 'Flags' holds string representations.
df_cc['Flags'] = df_cc['Flags'].apply(safe_eval)

# Keep only the CoCs in which at least one flag was identified
df_cc = df_cc[df_cc['Flags'].apply(bool)]

# Number of retained CoCs per language: the denominator of the percentages below
total_records_by_language = df_cc['Language'].value_counts().to_dict()

# Count, per language, how many CoCs contain each flag.
# The per-row mapping is deliberately NOT bound to the name `flags`: doing so
# would overwrite the configuration mapping used above and silently change
# the result of re-running this cell.
flag_counts_by_language = {}
for _, row in df_cc.iterrows():
    language_counts = flag_counts_by_language.setdefault(row['Language'], {})
    for flag in row['Flags']:
        language_counts[flag] = language_counts.get(flag, 0) + 1

# Convert the counts into percentages of each language's retained CoCs
flag_percentages_by_language = {}
for language, flag_counts in flag_counts_by_language.items():
    total_records = total_records_by_language[language]
    flag_percentages_by_language[language] = {
        flag: (count / total_records) * 100 for flag, count in flag_counts.items()
    }

# Flatten to one record per (language, flag), rounded to two decimals
flag_percentages_list = [
    {'Language': language, 'Flag': flag, 'Percentage': round(percentage, 2)}
    for language, flag_percentages in flag_percentages_by_language.items()
    for flag, percentage in flag_percentages.items()
]

# Create a DataFrame to hold the flag percentages
flag_percentages_df = pd.DataFrame(flag_percentages_list)

# One row per language and one column per flag (NaN where a flag never occurs for a language)
pivot_df = flag_percentages_df.pivot(index='Language', columns='Flag', values='Percentage').reset_index()

# Display the final pivot table
pivot_df


Flag,Language,F1,F10,F2,F3,F4,F5,F6,F7,F8,F9
0,C,89.16,87.95,96.39,87.95,32.53,89.16,95.18,98.8,96.39,93.98
1,C#,91.41,91.41,93.75,91.41,23.44,91.41,98.44,100.0,100.0,99.22
2,C++,86.61,83.46,93.7,83.46,28.35,81.89,93.7,96.85,99.21,91.34
3,Go,88.78,84.88,89.27,84.88,32.2,84.88,93.66,94.63,98.05,92.2
4,Java,85.23,84.09,89.77,85.23,20.45,85.23,96.59,97.73,98.86,95.45
5,JavaScript,90.24,86.83,90.73,86.34,22.44,86.83,96.59,98.05,99.51,92.68
6,PHP,81.31,81.31,89.72,83.18,19.63,83.18,89.72,96.26,94.39,89.72
7,Python,93.1,88.51,93.1,87.93,36.21,85.63,93.1,94.83,98.28,91.38
8,Ruby,69.64,68.45,72.02,69.64,15.48,68.45,95.24,97.02,99.4,85.12
9,Rust,96.3,83.07,84.66,84.13,51.32,83.6,84.13,96.83,98.94,84.66
