In [None]:
from parser import *
from snapshoter import *
import pandas as pd
import os
from ruamel.yaml import YAML

## Loading the dataset and filtering out the deleted workflows

In [19]:
# Read every workflow record from the CSV export and report the raw row count.
df = pd.read_csv('../dataset/200_workflowsonly.csv')
print(len(df))

# Records without a file_hash are the ones reported as deleted workflows;
# they are kept aside in their own frame.
missing_hash = df['file_hash'].isna()
deleted_workflows = df[missing_hash]
print(f"Number of deleted workflows records : {deleted_workflows.shape[0]}")

# From here on, `df` only holds the records that have a file_hash.
df = df.dropna(subset=['file_hash'])
print(f"Number of workflow records after filtering : {len(df)}")

2595399
Number of deleted workflows records : 61009
Number of workflow records after filtering : 2534390


## Extracting information from a single workflow file


In [None]:
# Build the path of the workflow file that belongs to the first record:
# the file name inside the workflows folder is the record's file_hash.
folder_path = "../dataset/workflows"

firstWorkflow = df.iloc[0]
file_hash = firstWorkflow.loc['file_hash']
file_path = os.path.join(folder_path, file_hash)

In [None]:
# Parse the selected workflow file and print a summary of it, job by job.
if os.path.isfile(file_path):
    parsed_data = parse_workflow(file_path)

    # Nothing is printed when the parser returns a falsy result.
    if parsed_data:
        print(f"Workflow analysé : {parsed_data['file_path']}")
        print(f"- Nombre de lignes : {parsed_data['lines_count']}")
        print(f"- Déclencheurs : {parsed_data['events']}")
        print(f"- Nombre de jobs : {parsed_data['jobs_count']}")

        # One summary per job, followed by one line per step of that job.
        for job in parsed_data["jobs"]:
            details = parsed_data["jobs"][job]
            print(f"  * Job: {job}")
            print(f"    - Nombre de steps: {details['steps_count']}")
            print(f"    - Utilise GitHub Actions: {details['uses_github_actions']}")
            print(f"    - Utilise des commandes shell: {details['uses_commands']}")
            print("    - Détails des steps :")
            for step in details["step_details"]:
                step_name = step['name']
                step_uses = step['uses']
                step_run = step['run']
                print(f"      - {step_name}: uses={step_uses}, run={step_run}")
else:
    print(f"File {file_path} not found.")

## Filter the invalid workflows
1. First idea is to keep only the valid workflows.
2. The second idea is to delete all records of every uid that has at least one invalid workflow (valid_yaml = False).

In [20]:
# Method 1: report how many records are flagged valid_yaml == False, then
# build filtered_df1 with delete_invalid_yaml_records.
records_before = df.shape[0]

invalid_yaml = df[df['valid_yaml'].eq(False)]
print(f"Number of invalid records : {invalid_yaml.shape[0]}\n")

print(f"Number of records in the dataframe before : {records_before}\n")
filtered_df1 = delete_invalid_yaml_records(df)

print(f"Number of records in the dataframe once filtered {len(filtered_df1)}")

Number of invalid records : 15417

Number of records in the dataframe before : 2534390

Number of records in the dataframe once filtered 2518973


In [21]:
# Method 2: report the uids that own at least one record flagged
# valid_yaml == False, then build filtered_df2 with delete_uid_with_invalid_yaml.
# .eq(False) is the same element-wise comparison as `== False`.
is_invalid = df['valid_yaml'].eq(False)
invalid_uids = df.loc[is_invalid, 'uid'].unique()
print(f"Number of invalid uids : {len(invalid_uids)}")
print(f"Number of records that corresponds to these invalids uids : {df['uid'].isin(invalid_uids).sum()}\n")

print(f"Number of records in the dataframe before : {df.shape[0]}")
print(f"Total number of uids before filtering : {df['uid'].nunique()}\n")

filtered_df2 = delete_uid_with_invalid_yaml(df)
# Label completed ("once filtered") so it matches the one printed for method 1.
print(f"Number of records in the dataframe once filtered {filtered_df2.shape[0]}")
print(f"Total number of uids after filtering : {filtered_df2['uid'].nunique()}")

Number of invalid uids : 8384
Number of records that corresponds to these invalids uids : 255643

Number of records in the dataframe before : 2534390
Total number of uids before filtering : 219460

Number of records in the dataframe once 2278747
Total number of uids after filtering : 211076


## Take a snapshot of the dataset
- Filter the dataset to keep only the repositories that were committed before the snapshot date with a limit of 1 month before the snapshot date.
- For each uid in the snapshot, get the most recent workflow.

In [22]:
# Build one snapshot per year for both filtering methods and print their sizes.
# Iterating over range() instead of a hand-incremented counter leaves `year`
# equal to the last processed year after the loop, i.e. the same year as the
# snapshot variables that remain in scope.
FIRST_SNAPSHOT_YEAR = 2019
LAST_SNAPSHOT_YEAR = 2024  # inclusive

for year in range(FIRST_SNAPSHOT_YEAR, LAST_SNAPSHOT_YEAR + 1):
    # Snapshot with the first method of filtering
    snapshot1 = extract_snapshot(filtered_df1, year)
    # Snapshot with the second method of filtering
    snapshot2 = extract_snapshot(filtered_df2, year)
    # Get the most recent workflows for each uid
    most_recent_workflows1 = get_most_recent_workflows(snapshot1, year)
    most_recent_workflows2 = get_most_recent_workflows(snapshot2, year)
    print(f"Méthode 1 {year}:")
    print(f"Nombre de workflows dans le snapshot: {snapshot1.shape[0]}")
    print(f"Nombre de workflows les plus récents: {most_recent_workflows1.shape[0]}\n")
    print(f"Méthode 2 {year}:")
    print(f"Nombre de workflows dans le snapshot: {snapshot2.shape[0]}")
    print(f"Nombre de workflows les plus récents: {most_recent_workflows2.shape[0]}\n")

Méthode 1 2019:
Nombre de workflows dans le snapshot: 3965
Nombre de workflows les plus récents: 1377

Méthode 2 2019:
Nombre de workflows dans le snapshot: 3249
Nombre de workflows les plus récents: 1252

Méthode 1 2020:
Nombre de workflows dans le snapshot: 18902
Nombre de workflows les plus récents: 8657

Méthode 2 2020:
Nombre de workflows dans le snapshot: 16372
Nombre de workflows les plus récents: 7990

Méthode 1 2021:
Nombre de workflows dans le snapshot: 32199
Nombre de workflows les plus récents: 15920

Méthode 2 2021:
Nombre de workflows dans le snapshot: 28453
Nombre de workflows les plus récents: 14762

Méthode 1 2022:
Nombre de workflows dans le snapshot: 44983
Nombre de workflows les plus récents: 21713

Méthode 2 2022:
Nombre de workflows dans le snapshot: 40612
Nombre de workflows les plus récents: 20342

Méthode 1 2023:
Nombre de workflows dans le snapshot: 74622
Nombre de workflows les plus récents: 35557

Méthode 2 2023:
Nombre de workflows dans le snapshot: 69312
N