In [25]:
from parser import *
from snapshoter import *
import pandas as pd
import os
from ruamel.yaml import YAML

## Loading the dataset and filtering the deleted workflows

In [15]:
# Load every workflow record, report how many have no file hash (counted here
# as deleted workflows), then keep only the records that do have one.
df = pd.read_csv('../dataset/200_workflowsonly.csv')
print(len(df))

deleted_workflows = df[df['file_hash'].isna()]
print(f"Number of deleted workflows records : {deleted_workflows.shape[0]}")

# Same selection as dropping the rows whose `file_hash` is missing.
df = df[df['file_hash'].notna()]
print(f"Number of workflow records after filtering : {len(df)}")

2595399
Number of deleted workflows : 61009
Number of workflows after filtering : 2534390


## Extracting information from a single workflow file


In [17]:
# Take the first record of the dataset and build the path of its workflow
# file: the file sits in the workflows folder and is named by the file hash.
firstWorkflow = df.iloc[0]
file_hash = firstWorkflow['file_hash']

# Plain literal: there is nothing to interpolate, so no f-string prefix.
folder_path = "../dataset/workflows"
file_path = os.path.join(folder_path, file_hash)

In [None]:
# Parse the selected workflow file and print a summary of the parsed result:
# file path, line count, trigger events, job count, and for each job its step
# count, whether it uses GitHub Actions / shell commands, and its step details.

# Reset first so that a missing file cannot leave `parsed_data` undefined, or
# holding the value from a previous run of this cell.
parsed_data = None

if os.path.isfile(file_path):
    parsed_data = parse_workflow(file_path)
    if not parsed_data:
        # Make an empty/falsy parse result visible instead of ending silently.
        print(f"Parsing {file_path} returned no data.")
else:
    print(f"File {file_path} not found.")

if parsed_data:
    print(f"Workflow analysé : {parsed_data['file_path']}")
    print(f"- Nombre de lignes : {parsed_data['lines_count']}")
    print(f"- Déclencheurs : {parsed_data['events']}")
    print(f"- Nombre de jobs : {parsed_data['jobs_count']}")

    # `jobs` maps each job name to a dict of per-job details.
    for job, details in parsed_data["jobs"].items():
        print(f"  * Job: {job}")
        print(f"    - Nombre de steps: {details['steps_count']}")
        print(f"    - Utilise GitHub Actions: {details['uses_github_actions']}")
        print(f"    - Utilise des commandes shell: {details['uses_commands']}")
        print("    - Détails des steps :")
        for step in details["step_details"]:
            print(f"      - {step['name']}: uses={step['uses']}, run={step['run']}")

## Filter the invalid workflows
1. First idea: delete all records of each uid that has at least one invalid workflow (`valid_yaml = False`).
2. Second idea: keep only the valid workflows.

In [33]:
# First idea: remove every record of each uid that has at least one record
# flagged as invalid YAML, and report the counts before and after.

# `== False` is an element-wise comparison on the column; it yields the boolean
# mask used to select the uids of the invalid records.
invalid_uids = df.loc[df['valid_yaml'] == False, 'uid'].unique()
print(f"Number of invalid uids : {len(invalid_uids)}")
print(f"Number of records that corresponds to these invalids uids : {df['uid'].isin(invalid_uids).sum()}\n")

print(f"Number of records in the dataframe before : {df.shape[0]}")
print(f"Total number of uids before filtering : {df['uid'].nunique()}\n")

# NOTE(review): delete_uid_with_invalid_yaml comes from a star import; it
# presumably returns a new frame without the uids above — confirm.
filtered_df = delete_uid_with_invalid_yaml(df)
print(f"Number of records in the dataframe once filtered : {filtered_df.shape[0]}")
print(f"Total number of uids after filtering : {filtered_df['uid'].nunique()}")

Number of invalid uids : 8384
Number of records that corresponds to these invalids uids : 255643

Number of records in the dataframe before : 2534390
Total number of uids before filtering : 219460

Number of records in the dataframe once 2278747
Total number of uids after filtering : 211076


In [34]:
# Second idea: look only at the records flagged as invalid YAML, and report
# the record counts before and after filtering.
invalid_yaml = df[df['valid_yaml'].eq(False)]
print(f"Number of invalid records : {invalid_yaml.shape[0]}\n")

print(f"Number of records in the dataframe before : {len(df)}\n")
# NOTE(review): delete_invalid_yaml_records comes from a star import; it
# presumably returns the frame without the invalid records — confirm.
filtered_df = delete_invalid_yaml_records(df)

print(f"Number of records in the dataframe once filtered {filtered_df.shape[0]}")


Number of invalid records : 15417

Number of records in the dataframe before : 2534390

Number of records in the dataframe once filtered 2518973


## Take a snapshot of the dataset
- Filter the dataset to keep only the repositories that were committed before the snapshot date, going back at most 1 month before that date.
- For each uid in the snapshot, get the most recent workflow.

In [32]:
# Build the 2019 snapshot, then reduce it to the most recent workflows.
# The same year is passed to both helpers, so it is named once.
snapshot_year = 2019
snapshot = extract_snapshot(df, snapshot_year)
most_recent_workflows = get_most_recent_workflows(snapshot, snapshot_year)

snapshot_count = snapshot.shape[0]
most_recent_count = most_recent_workflows.shape[0]
print(f"Nombre de workflows dans le snapshot : {snapshot_count}")
print(f"Nombre de workflows les plus récents : {most_recent_count}")

Nombre de workflows dans le snapshot : 4067
Nombre de workflows les plus récents : 1380
