In [95]:
import parser
import info_extractor
import snapshoter
import pickle
import gzip
import pandas as pd
import altair as alt

from tabulate import tabulate
from plotter import *
from importlib import reload


In [96]:
import parser
import info_extractor
import snapshoter
reload(parser)
reload(info_extractor)
reload(snapshoter)

<module 'snapshoter' from 'C:\\Users\\Thomas\\PycharmProjects\\memoire\\customcode\\snapshoter.py'>

## Loading the dataset

In [50]:
df = pd.read_csv("../dataset/workflows.csv")
print(f"Total number of records: {df.shape[0]}")

# Drop the records that were deleted from the dataset
df = df.dropna(subset=["file_hash"])


# Force the type of the valid_yaml column to be boolean
# (After a check it appears that the column is boolean but better to be sure)
df["valid_yaml"] = df["valid_yaml"].astype(bool)

Total number of records: 2595399


## Counting the number of invalid YAML files per year

In [126]:
# Total number of invalid records in the dataset
nbr_invalid_yaml = df[df['valid_yaml'] == False].shape[0]
print(f"Number of invalid records: {nbr_invalid_yaml}")

df['committed_date'] = pd.to_datetime(df['committed_date'], unit='s')
df['committed_year'] = df['committed_date'].dt.year

# Only keep the records that are invalid and then sort them by year
invalid_df = df[df['valid_yaml'] == False]
invalids_per_year = invalid_df['committed_year'].value_counts().sort_index()

print("Number of invalid records per year: ")
print(invalids_per_year)

Number of invalid records: 15417
Number of invalid records per year: 
committed_year
2019     386
2020    2455
2021    3201
2022    3780
2023    3466
2024    2129
Name: count, dtype: int64


## Snapshot without filtering

In [3]:
unfiltered_snapshots = []
year = 2019
while year <= 2024:
    snapshot = snapshoter.extract_snapshot(df,year)
    unfiltered_snapshots.append((year,snapshot))
    print(f"Year : {year}")
    print(f"Number of workflows in the snapshot : {snapshot.shape[0]}\n")
    year += 1

Year : 2019
Number of workflows in the snapshot : 1905

Year : 2020
Number of workflows in the snapshot : 31145

Year : 2021
Number of workflows in the snapshot : 79553

Year : 2022
Number of workflows in the snapshot : 127814

Year : 2023
Number of workflows in the snapshot : 179073

Year : 2024
Number of workflows in the snapshot : 219401



## Snapshot with filtering the invalid workflows

- Here we only consider the workflows that are valid (valid_yaml = True).
- The other workflows will be dropped from the resulting snapshot dataframe


In [51]:
# This block of code is only used to gather information
# on the number of invalid records

invalid_yaml = df.loc[df['valid_yaml'] == False]
print(f"Number of invalid records : {len(invalid_yaml)}\n")

Number of invalid records : 15417



In [52]:
filtered_snapshots = []

filtered_df = snapshoter.delete_invalid_yaml_records(df)

year = 2019
while year <= 2024:
    snapshot_filtered = snapshoter.extract_snapshot(filtered_df, year)
    filtered_snapshots.append((year,snapshot_filtered))
    print(f"Year: {year}")
    print(f"Number of workflows in the snapshot: {snapshot_filtered.shape[0]}\n")
    year += 1





Year: 2019
Number of workflows in the snapshot: 1902

Year: 2020
Number of workflows in the snapshot: 31092

Year: 2021
Number of workflows in the snapshot: 79476

Year: 2022
Number of workflows in the snapshot: 127675

Year: 2023
Number of workflows in the snapshot: 178886

Year: 2024
Number of workflows in the snapshot: 219186



## Get language repartition

In [26]:
language_repartitions = []

for year, snapshot in filtered_snapshots:
    repartition, nbr_repositories = info_extractor.extract_languages_by_repository(snapshot)
    language_repartitions.append((year, nbr_repositories, repartition))

print("Languages counts: ")
for year, _, language_count in language_repartitions:
    print(f"Year {year}:")
    print(language_count)
    print()


Languages counts: 
Year 2019:
{'TypeScript': 231, 'Go': 139, 'JavaScript': 128, 'Python': 126, 'Java': 96, 'Rust': 93, 'C++': 65, 'Ruby': 62, 'PHP': 55, 'C': 46, 'C#': 43, 'Kotlin': 28, 'Shell': 27, 'Swift': 22, 'Elixir': 12, 'Scala': 9, 'Haskell': 7, 'Jupyter Notebook': 6, 'Lua': 6, 'Dart': 6, 'Dockerfile': 5, 'Vue': 5, 'Perl': 4, 'Objective-C': 3, 'Common Lisp': 3, 'Julia': 3, 'Clojure': 2, 'Erlang': 2, 'Fortran': 2, 'Groovy': 2, 'R': 2, 'TeX': 2, 'Racket': 1, 'Nix': 1, 'OCaml': 1, 'F#': 1, 'Zig': 1, 'Elm': 1}

Year 2020:
{'Python': 1998, 'TypeScript': 1666, 'Go': 1147, 'JavaScript': 1134, 'Java': 987, 'C++': 892, 'Rust': 878, 'PHP': 610, 'C#': 535, 'C': 523, 'Ruby': 370, 'Julia': 314, 'Kotlin': 291, 'Shell': 286, 'R': 221, 'Scala': 152, 'Swift': 145, 'Dart': 104, 'Dockerfile': 97, 'Jupyter Notebook': 97, 'Haskell': 93, 'Elixir': 74, 'Vue': 61, 'Lua': 47, 'OCaml': 38, 'Clojure': 33, 'Groovy': 28, 'Perl': 26, 'F#': 24, 'Nix': 23, 'Objective-C': 21, 'Makefile': 20, 'TeX': 20, 'Erlang':

## Extract percentage of languages for each year

- Make a table for each year with the number of repositories using the corresponding language and the percentage
- If the percentage of a row is less than 1%, then group them as Other

In [27]:
resulting_repartition = []

for year, nbr_repositories, repartition in language_repartitions:
    print(nbr_repositories)
    # Create a dataframe with the languages and their counts
    languages_df = pd.DataFrame(repartition.items(), columns=['Language', 'Count'])
    # Compute the percentage for each language
    languages_df['Percentage'] = round((languages_df['Count'] / nbr_repositories) * 100, 2)
    # Sort the dataframe by count
    languages_df = languages_df.sort_values(by='Count', ascending=False)
    languages_df['Count'] = languages_df['Count'].astype(int)

    '''
    Now we need to group the languages with less than 1% of total occurrences, group tem as Other and delete the rows.
    '''
    # Create a new Dataframe with the grouped languages
    less_represented = languages_df[languages_df['Percentage'] < 1].sum(numeric_only=True)
    less_represented['Language'] = "Other"

    # Deleting the rows with less than 1% in the original df by sorting it before concatenating with the above df
    languages_df = languages_df[languages_df['Percentage'] >= 1]
    languages_df = pd.concat([languages_df, pd.DataFrame([less_represented])], ignore_index=True)

    resulting_repartition.append((year,languages_df))

    table = tabulate(languages_df, headers='keys', tablefmt='psql')

    print(f"Year {year}:")
    print(table)
    print()

1248
Year 2019:
+----+------------+---------+--------------+
|    | Language   |   Count |   Percentage |
|----+------------+---------+--------------|
|  0 | TypeScript |     231 |        18.51 |
|  1 | Go         |     139 |        11.14 |
|  2 | JavaScript |     128 |        10.26 |
|  3 | Python     |     126 |        10.1  |
|  4 | Java       |      96 |         7.69 |
|  5 | Rust       |      93 |         7.45 |
|  6 | C++        |      65 |         5.21 |
|  7 | Ruby       |      62 |         4.97 |
|  8 | PHP        |      55 |         4.41 |
|  9 | C          |      46 |         3.69 |
| 10 | C#         |      43 |         3.45 |
| 11 | Kotlin     |      28 |         2.24 |
| 12 | Shell      |      27 |         2.16 |
| 13 | Swift      |      22 |         1.76 |
| 14 | Other      |      87 |         6.96 |
+----+------------+---------+--------------+

13026
Year 2020:
+----+------------+---------+--------------+
|    | Language   |   Count |   Percentage |
|----+------------+--

## Draw a line chart representing the evolution of the languages over the years

- This plot will represent the evolution of the use of the different languages over the years.
- The languages that are less than 1% of the total will be grouped as Others.
- The percentage of each language will be represented on the y-axis and the year on the x-axis.

In [53]:
'''
The idea is to create a line chart using Altair.
To do so it is needed to regroup the different dataframes into a single one with a structure of year,language,percentage
'''

data = []

# The loop below will create a dictionary with the year, language and percentage for each year
# And then append it to the data list to later create a dataframe
for year, df in resulting_repartition:
    top_10 = df.head(10)
    others = df.iloc[10:]

    for _, row in top_10.iterrows():
        data.append({
            'Year': year,
            'Language': row['Language'],
            'Percentage': row['Percentage']
        })

    other_pct = others['Percentage'].sum()
    data.append({
        'Year': year,
        'Language': 'Other',
        'Percentage': other_pct
    })

resulting_df = pd.DataFrame(data)
resulting_df2= resulting_df[resulting_df['Language'] != 'Other'].copy()

order_2024 = resulting_df2[resulting_df2['Year'] == 2024].sort_values(by='Percentage', ascending=False)['Language'].tolist()
order_2024.append('Autres')

# Random color palette generated thanks to AI.
color_palette = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
    '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
    '#aec7e8', '#ffbb78', '#98df8a', '#ff9896', '#c5b0d5',
    '#c49c94', '#f7b6d2', '#c7c7c7', '#dbdb8d', '#9edae5'
]

'''
This part is used to create the line chart with Altair
'''
chart = alt.Chart(resulting_df).mark_line(point=True).encode(
    x=alt.X('Year:O', title="Année"),
    y=alt.Y('Percentage:Q', title="Pourcentage", scale=alt.Scale(domain=[0,22])),
    color=alt.Color('Language:N', title="Langages", sort=order_2024,  scale=alt.Scale(range=color_palette)),
    tooltip=["Year", "Language", "Percentage"]
).properties(
    width=800,
    height=700,
).interactive()

chart.show()
chart.save('../out/charts/langages.png')

## Draw a pie chart for each year

In [13]:
years = resulting_df['Year'].unique()
years.sort()
pie_charts = []
for year in years:
    pie_charts.append(pie_chart_by_year(resulting_df, year))

[chart.show() for chart in pie_charts]

[None, None, None, None, None, None]

## Extract the median of the repositories characteristics for each snapshot

In [54]:
repositories_characteristics = []

for year, snapshot in filtered_snapshots:
    median_characteristics = info_extractor.extract_repos_characteristics(snapshot)
    repositories_characteristics.append((year, median_characteristics))

print("Median characteristics of the repositories: ")
for year, median_characteristics in repositories_characteristics:
    print(f"Year {year}:")
    print("Median characteristics:")
    print(median_characteristics)
    print()


Median characteristics of the repositories: 
Year 2019:
Median characteristics:
commits          1442.5
branches            9.0
releases           34.0
contributors       41.0
stars             844.0
issues            218.5
pullrequests      430.5
codelines       29543.0
size            13453.0
dtype: float64

Year 2020:
Median characteristics:
commits          1258.0
branches           10.0
releases           29.0
contributors       36.0
stars             575.5
issues            205.0
pullrequests      363.5
codelines       30792.5
size            12784.5
dtype: float64

Year 2021:
Median characteristics:
commits          1148.0
branches            9.0
releases           24.0
contributors       33.0
stars             524.0
issues            187.0
pullrequests      305.0
codelines       29103.0
size            11083.0
dtype: float64

Year 2022:
Median characteristics:
commits          1071.0
branches            9.0
releases           22.0
contributors       30.0
stars             493.0

## Draw a line chart representing the evolution of the repositories characteristics over the years

In [55]:
data = []

for year, median_values in repositories_characteristics:
    for characteristic, value in median_values.items():
        data.append({
            'Year': year,
            'Characteristic': characteristic,
            'Value': value
        })

df_medians = pd.DataFrame(data)

chart = alt.Chart(df_medians).mark_line(point=True).encode(
    x=alt.X('Year:O', title="Année"),
    y=alt.Y('Value:Q', title="Valeur médiane", scale=alt.Scale(type='log')),
    color=alt.Color('Characteristic:N', title="Caractéristiques"),
    tooltip=['Year', 'Characteristic', 'Value']
).properties(
    width=800,
    height=400
).interactive()

chart.show()
chart.save("../out/charts/median_repo_carac.png")

## Snapshot parsing
- Global snapshot parser to be used for every kind of data extraction after

In [56]:
repositories = []

for year, snapshot in filtered_snapshots:
    repositories.append((year, snapshot['repository'].unique()))

In [57]:
PARSE_DATA = False
parsed_snapshots = []

if PARSE_DATA:
    for year, snapshot in filtered_snapshots:
        parsed_workflows = parser.parse_snapshot(snapshot)
        # Using .txt is too heavy as there are 350MB+ files,
        # So I use pickle to save the data as a bytestream and gzip to compress it
        with gzip.open(f"../out/parsed_snapshot_{year}.pkl.gz", "wb") as f:
            pickle.dump(parsed_workflows, f, protocol=pickle.HIGHEST_PROTOCOL)
        parsed_snapshots.append((year, parsed_workflows))

else:
    for year in range(2019,2025):
        with gzip.open(f"../out/parsed_snapshot_{year}.pkl.gz", "rb") as f:
            parsed_workflows = pickle.load(f)
        parsed_snapshots.append((year, parsed_workflows))

In [58]:
for year, parsed_workflows in parsed_snapshots:
    print(f"Année: {year}")
    print(f"Nombre de workflows parsés: {len(parsed_workflows)}")
    print()

Année: 2019
Nombre de workflows parsés: 1898

Année: 2020
Nombre de workflows parsés: 31044

Année: 2021
Nombre de workflows parsés: 79332

Année: 2022
Nombre de workflows parsés: 127414

Année: 2023
Nombre de workflows parsés: 178450

Année: 2024
Nombre de workflows parsés: 218625



## Extract events from snapshots
- Gather for each workflow of a snapshot its different trigger events

In [59]:
snapshot_events = []

for year, parsed_workflows in parsed_snapshots:
    events = info_extractor.extract_events_from_parsed(parsed_workflows)
    snapshot_events.append((year, events))

- Then we need to extract a count of the different events for each snapshot.
- The idea is to have, on one hand, the proportion of workflow using each event and on the other hand, the proportion of repositories using each event.

In [60]:
events_dataframes, yearly_events_medians, pairs_dataframes = info_extractor.count_events(snapshot_events, repositories)

for year, event_dataframe in events_dataframes:
    print(f"Year {year}:")
    # head(20) will display the top 20
    print(event_dataframe)
    print()

Year 2019:
                  event  workflow_proportion  repo_proportion
0                  push                79.03            87.98
1          pull_request                41.20            48.72
2              schedule                 6.80             8.81
3               release                 3.16             4.17
4                issues                 1.63             2.32
5         issue_comment                 0.95             1.20
6                create                 0.37             0.56
7                status                 0.37             0.56
8   repository_dispatch                 0.32             0.32
9   pull_request_review                 0.21             0.32
10                watch                 0.16             0.16
11            milestone                 0.11             0.16
12                label                 0.05             0.08
13                 fork                 0.05             0.08
14          check_suite                 0.05             0.

## Draw a line chart of the evolution of Events proportion through the snapshots

In [61]:
data_workflow = []
data_repo = []

for year, event_df in events_dataframes:
    top_10_events = event_df.head(10)

    for _, row in top_10_events.iterrows():
        data_workflow.append({
            'Year': year,
            'Event': row['event'],
            'Percentage': row['workflow_proportion'],
        })

        data_repo.append({
            'Year': year,
            'Event': row['event'],
            'Percentage': row['repo_proportion'],
        })

workflow_df = pd.DataFrame(data_workflow)
repo_df = pd.DataFrame(data_repo)

order_event_2024 = workflow_df[workflow_df['Year'] == 2024].sort_values(by='Percentage', ascending=False)['Event'].tolist()
top5_events = order_event_2024[:5]
rest_events = order_event_2024[5:]

# Random color palette generated thanks to AI.
color_palette = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
    '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
    '#aec7e8', '#ffbb78', '#98df8a', '#ff9896', '#c5b0d5',
    '#c49c94', '#f7b6d2', '#c7c7c7', '#dbdb8d', '#9edae5'
]

workflow_top_df = workflow_df[workflow_df['Event'].isin(top5_events)].copy()
workflow_rest_df = workflow_df[workflow_df['Event'].isin(rest_events)].copy()

repo_top_df = repo_df[repo_df['Event'].isin(top5_events)].copy()
repo_rest_df = repo_df[repo_df['Event'].isin(rest_events)].copy()

workflow_chart_top5 = alt.Chart(workflow_top_df).mark_line(point=True).encode(
    x=alt.X('Year:O', title="Année"),
    y=alt.Y('Percentage:Q', title="Pourcentage", scale=alt.Scale(domain=[0,80])),
    color=alt.Color('Event:N', title="Événements", sort=order_event_2024,  scale=alt.Scale(range=color_palette)),
).properties(
    width=800,
    height=400
).interactive()

workflow_chart_rest = alt.Chart(workflow_rest_df).mark_line(point=True).encode(
    x=alt.X('Year:O', title="Année"),
    y=alt.Y('Percentage:Q', title="Pourcentage", scale=alt.Scale(domain=[0,10])),
    color=alt.Color('Event:N', title="Événements", sort=order_event_2024,  scale=alt.Scale(range=color_palette)),
).properties(
    width=800,
    height=400
).interactive()

repo_chart_top5 = alt.Chart(repo_top_df).mark_line(point=True).encode(
    x=alt.X('Year:O', title="Année"),
    y=alt.Y('Percentage:Q', title="Pourcentage", scale=alt.Scale(domain=[0,100])),
    color=alt.Color('Event:N', title="Événements", sort=order_event_2024,  scale=alt.Scale(range=color_palette)),
).properties(
    width=800,
    height=400
).interactive()

repo_chart_rest = alt.Chart(repo_rest_df).mark_line(point=True).encode(
    x=alt.X('Year:O', title="Année"),
    y=alt.Y('Percentage:Q', title="Pourcentage", scale=alt.Scale(domain=[0,15])),
    color=alt.Color('Event:N', title="Événements", sort=order_event_2024,  scale=alt.Scale(range=color_palette)),
).properties(
    width=800,
    height=400
).interactive()

workflow_chart_top5.show()
workflow_chart_top5.save("../out/charts/top5_events_workflow.png")
workflow_chart_rest.show()
workflow_chart_rest.save("../out/charts/rest_events_workflow.png")
repo_chart_top5.show()
repo_chart_top5.save("../out/charts/top5_events_repo.png")
repo_chart_rest.show()
repo_chart_rest.save("../out/charts/rest_events_repo.png")

### Median number of triggers events per workflow through the snapshots

In [62]:
for year, median in yearly_events_medians:
    print(f"Year {year}:")
    print(f"Median number of events per workflow: {median}")
    print()

Year 2019:
Median number of events per workflow: 1.0

Year 2020:
Median number of events per workflow: 2.0

Year 2021:
Median number of events per workflow: 2.0

Year 2022:
Median number of events per workflow: 2.0

Year 2023:
Median number of events per workflow: 2.0

Year 2024:
Median number of events per workflow: 2.0



### Most used pairs of trigger events through the snapshots

In [63]:
for year, pairs_df in pairs_dataframes:
    print(f"Year {year}:")
    print(f"Most used pairs of events per workflow: {pairs_df.head(20)}")
    print()

Year 2019:
Most used pairs of events per workflow:                                    pair  total  workflow_proportion
0                  (pull_request, push)    584                30.77
1                      (push, schedule)     40                 2.11
2              (pull_request, schedule)     31                 1.63
3                (issues, pull_request)     16                 0.84
4                       (push, release)      5                 0.26
5         (pull_request_review, status)      4                 0.21
6   (pull_request, pull_request_review)      4                 0.21
7                (pull_request, status)      4                 0.21
8           (push, repository_dispatch)      2                 0.11
9   (pull_request, repository_dispatch)      2                 0.11
10            (issue_comment, schedule)      1                 0.05
11                       (issues, push)      1                 0.05
12              (pull_request, release)      1                 0.

## Extract different the different steps from the workflows (run and uses)
- The first thing to do is to extract all the steps present in the different workflows.

In [94]:
parsed_steps = []

for year, parsed_workflows in parsed_snapshots:
    steps = info_extractor.extract_step_type_from_parsed(parsed_workflows)
    parsed_steps.append((year, steps))

- The idea is to get the total number of steps to compute the proportion of 'uses' and 'run' steps among them.
- It is also required to get the proportion of repositories that are using 'uses' and 'run' steps among the repositories of the snapshots.

#### Part 2 Extract the top 10 of Actions used in the workflows along the different snapshots

- For this we will add a bit of code to extract the different actions used in the workflows and their counts in a dictionary.
- actions are formatted as follows: actions/action_name@version. So to get only the action name it is needed to split on '@' and keep the first part.
- I do not generate another code block here because to rank these actions I need to compute the proportion of steps and repositories using them.

In [97]:
steps_dataframes, top10_actions_dataframes, yearly_steps_medians = info_extractor.count_steps(parsed_steps, repositories)

for year, steps_df in steps_dataframes:
    print(f"Year {year}:")
    print(steps_df)
    print()

Year 2019:
   total_steps  total_repositories  uses_proportion  run_proportion  \
0        11510                1248            44.05           55.94   

   repo_uses  repo_run  
0      99.52     86.78  

Year 2020:
   total_steps  total_repositories  uses_proportion  run_proportion  \
0       241069               13026            49.04           50.95   

   repo_uses  repo_run  
0      99.65     90.62  

Year 2021:
   total_steps  total_repositories  uses_proportion  run_proportion  \
0       647850               26621            50.65           49.34   

   repo_uses  repo_run  
0      99.82      94.2  

Year 2022:
   total_steps  total_repositories  uses_proportion  run_proportion  \
0      1030282               34532            52.14           47.86   

   repo_uses  repo_run  
0      99.44     94.24  

Year 2023:
   total_steps  total_repositories  uses_proportion  run_proportion  \
0      1427343               40409            52.64           47.36   

   repo_uses  repo_run  
0

#### Draw line chart representing the evolution of the different proportion over the years

In [83]:
data = []

for year, steps_df in steps_dataframes:

    for _, row in steps_df.iterrows():
        data.append({
            'Year': year,
            '% steps étant des run' : row['run_proportion'],
            '% steps étant des uses': row['uses_proportion'],
            '% dépôts utilisant run' : row['repo_run'],
            '% dépôts utilisant uses': row['repo_uses'],
        })
prop_df = pd.DataFrame(data)

prop_long_df = prop_df.melt(
    id_vars='Year',
    value_vars=['% steps étant des run', '% steps étant des uses', '% dépôts utilisant run', '% dépôts utilisant uses'],
    var_name='ProportionType',
    value_name='Percentage'
)

chart = alt.Chart(prop_long_df).mark_line(point=True).encode(
    x=alt.X('Year:O', title="Année"),
    y=alt.Y('Percentage:Q', title="Pourcentage", scale=alt.Scale(domain=[40,100])),
    color=alt.Color('ProportionType', title="Type de proportion"),
).properties(
    width=800,
    height=400
).interactive()

chart.show()
chart.save("../out/charts/steps_prop.png")

In [98]:
for year, actions_df in top10_actions_dataframes:
    print(f"Year {year}:")
    print(actions_df)
    print()

Year 2019:
                                 action  step_proportion  repo_proportion
0                      actions/checkout            20.81            94.15
1                    actions/setup-node             4.42            24.04
2                  actions/setup-python             2.09            10.10
3                    actions/setup-java             1.43             9.29
4                      actions-rs/cargo             1.40             1.76
5                      actions/setup-go             1.38             7.61
6               actions/upload-artifact             1.21             5.21
7                  actions-rs/toolchain             0.90             1.84
8                    actions/setup-ruby             0.86             4.17
9               hecrj/setup-rust-action             0.41             1.36
10                        actions/stale             0.34             3.04
11                   actions/docker/cli             0.33             0.80
12            actions/downl

## Extract other information on the workflow dimensions
- The idea is to replicate the median characteristics extraction but for the workflows.
- Metrics that could be extracted are :
    - Number of jobs
    - Number of steps
    - Number of events
    - Use of matrix strategy
    - Use of permissions

- Some of these metrics were partially or totally extracted in the previous steps.
- Some like permissions and matrix strategy are not extracted yet.
- The difference here is that we need to extract the number of these metrics for each workflow to have a median value for each snapshot.

### Extract the proportion of strategies through the different snapshots

In [115]:
parsed_strategies = []

for year, parsed_workflows in parsed_snapshots:
    strategies = info_extractor.extract_strategies_from_parsed(parsed_workflows)
    parsed_strategies.append((year, strategies))


In [116]:
strategies_dataframes, yearly_strategies_median = info_extractor.count_strategies(parsed_strategies, repositories)

for year, strategies_df in strategies_dataframes:
    print(f"Year {year}:")
    print(strategies_df)
    print()

Year 2019:
   workflow_proportion  repo_proportion
0                35.62            46.23

Year 2020:
   workflow_proportion  repo_proportion
0                29.97            50.33

Year 2021:
   workflow_proportion  repo_proportion
0                32.35             61.3

Year 2022:
   workflow_proportion  repo_proportion
0                29.77            62.63

Year 2023:
   workflow_proportion  repo_proportion
0                27.54            63.04

Year 2024:
   workflow_proportion  repo_proportion
0                26.61            64.02



#### Draw a line chart for matrix strategies proportion evolution throughout the snapshots

In [119]:
data = []

for year, strategy_df in strategies_dataframes:
    row = strategy_df.iloc[0]
    data.append({
        'Year': year,
        'Type': '% workflows matrix strategy',
        'Percentage': row['workflow_proportion']
    })

    data.append({
        'Year': year,
        'Type': '% dépôts matrix strategy',
        'Percentage': row['repo_proportion']
    })

strategy_prop_df = pd.DataFrame(data)

chart = alt.Chart(strategy_prop_df).mark_line(point=True).encode(
    x=alt.X('Year:O', title="Année"),
    y=alt.Y('Percentage:Q', title="Pourcentage", scale=alt.Scale(domain=[0,70])),
    color=alt.Color('Type:N', title="Type de proportion"),
).properties(
    width=800,
    height=400
).interactive()

chart.show()
chart.save("../out/charts/matrix_strategy.png")

### Extract the proportion of global permissions through the snapshots

In [99]:
parsed_global_permissions = []

for year, parsed_workflows in parsed_snapshots:
    global_permissions = info_extractor.extract_global_permissions_from_parsed(parsed_workflows)
    parsed_global_permissions.append((year, global_permissions))

In [103]:
global_permissions_dataframes = info_extractor.count_permissions(parsed_global_permissions, repositories)

for year, global_permissions_df in global_permissions_dataframes:
    print(f"Year {year}:")
    print(global_permissions_df)
    print()

Year 2019:
   workflow_proportion  repo_proportion
0                  0.0              0.0

Year 2020:
   workflow_proportion  repo_proportion
0                  0.0              0.0

Year 2021:
   workflow_proportion  repo_proportion
0                 0.54             1.13

Year 2022:
   workflow_proportion  repo_proportion
0                 4.25             8.11

Year 2023:
   workflow_proportion  repo_proportion
0                 7.69            16.83

Year 2024:
   workflow_proportion  repo_proportion
0                10.14            22.73



### Extract the proportion of repositories and workflows using jobs.permissions

In [104]:
parsed_jobs_permissions = []

for year, parsed_workflows in parsed_snapshots:
    jobs_permissions = info_extractor.extract_job_permissions_from_parsed(parsed_workflows)
    parsed_jobs_permissions.append((year, jobs_permissions))

In [105]:
jobs_permissions_dataframes = info_extractor.count_permissions(parsed_jobs_permissions, repositories)
for year, jobs_permissions_df in jobs_permissions_dataframes:
    print(f"Year {year}:")
    print(jobs_permissions_df)
    print()

Year 2019:
   workflow_proportion  repo_proportion
0                  0.0              0.0

Year 2020:
   workflow_proportion  repo_proportion
0                  0.0              0.0

Year 2021:
   workflow_proportion  repo_proportion
0                  1.5             3.91

Year 2022:
   workflow_proportion  repo_proportion
0                 4.94            12.74

Year 2023:
   workflow_proportion  repo_proportion
0                 8.14            21.61

Year 2024:
   workflow_proportion  repo_proportion
0                10.54            28.65



#### Draw a line chart for permissions proportion evolution throughout the snapshots

In [114]:
data = []

for year, global_df in global_permissions_dataframes:
    row = global_df.iloc[0]
    data.append({
        'Year': year,
        'Type': '% workflow permissions globales',
        'Percentage': row['workflow_proportion']
    })
    data.append({
        'Year': year,
        'Type': '% dépôts permissions globales',
        'Percentage': row['repo_proportion']
    })

for year, job_df in jobs_permissions_dataframes:
    row = job_df.iloc[0]
    data.append({
        'Year': year,
        'Type': '% workflow permissions de jobs',
        'Percentage': row['workflow_proportion']
    })
    data.append({
        'Year': year,
        'Type': '% dépôts permissions de jobs',
        'Percentage': row['repo_proportion']
    })

permissions_df = pd.DataFrame(data)

chart = alt.Chart(permissions_df).mark_line(point=True).encode(
    x=alt.X('Year:O', title="Année"),
    y=alt.Y('Percentage:Q', title="Pourcentage", scale=alt.Scale(domain=[0,30])),
    color=alt.Color('Type:N', title="Type de proportion"),
).properties(
    width=800,
    height=400
).interactive()

chart.show()
chart.save("../out/charts/permissions.png")


### Extract the jobs and different metrics on them

In [107]:
parsed_jobs = []

for year, parsed_workflows in parsed_snapshots:
    jobs = info_extractor.extract_jobs_from_parsed(parsed_workflows)
    parsed_jobs.append((year, jobs))

### Median number of jobs through the different snapshots

In [122]:
yearly_jobs_median = info_extractor.count_jobs(parsed_jobs)
for year, median in yearly_jobs_median:
    print(f"Year {year}:")
    print(f"Median number of jobs per workflow: {median}")
    print()

Year 2019:
Median number of jobs per workflow: 1.0

Year 2020:
Median number of jobs per workflow: 1.0

Year 2021:
Median number of jobs per workflow: 1.0

Year 2022:
Median number of jobs per workflow: 1.0

Year 2023:
Median number of jobs per workflow: 1.0

Year 2024:
Median number of jobs per workflow: 1.0



### Median number of matrix strategies through the different snapshots

In [121]:
for year, median in yearly_strategies_median:
    print(f"Year {year}:")
    print(f"Median number of global strategies per workflow: {median}")
    print()

Year 2019:
Median number of global strategies per workflow: 1.0

Year 2020:
Median number of global strategies per workflow: 1.0

Year 2021:
Median number of global strategies per workflow: 1.0

Year 2022:
Median number of global strategies per workflow: 1.0

Year 2023:
Median number of global strategies per workflow: 1.0

Year 2024:
Median number of global strategies per workflow: 1.0



### Median number of trigger events through the different snapshots

In [124]:
for year, median in yearly_events_medians:
    print(f"Year {year}:")
    print(f"Median number of events per workflow: {median}")
    print()

Year 2019:
Median number of events per workflow: 1.0

Year 2020:
Median number of events per workflow: 2.0

Year 2021:
Median number of events per workflow: 2.0

Year 2022:
Median number of events per workflow: 2.0

Year 2023:
Median number of events per workflow: 2.0

Year 2024:
Median number of events per workflow: 2.0



### Median number of steps through the different snapshots

In [123]:
for year, median in yearly_steps_medians:
    print(f"Year {year}:")
    print(f"Median number of steps per workflow: {median}")
    print()

Year 2019:
Median number of steps per workflow: 4.0

Year 2020:
Median number of steps per workflow: 6.0

Year 2021:
Median number of steps per workflow: 6.0

Year 2022:
Median number of steps per workflow: 6.0

Year 2023:
Median number of steps per workflow: 6.0

Year 2024:
Median number of steps per workflow: 6.0



## Snapshot with filter the uid history of invalid workflows

- This was another idea of filtering workflows that was based on deleting all records of each uid that has at least one invalid workflow (valid_yaml = False).
- This method is more aggressive than the previous one as it will delete all the records of a uid that has at least one invalid workflow.

In [None]:
invalid_uids = df.loc[df['valid_yaml'] == False, 'uid'].unique()
print(f"Number of invalid uids : {len(invalid_uids)}")
print(f"Number of records that corresponds to these invalids uids : {df['uid'].isin(invalid_uids).sum()}\n")

print(f"Number of records in the dataframe before : {df.shape[0]}")
print(f"Total number of uids before filtering : {df['uid'].nunique()}\n")

filtered_df2 = snapshoter.delete_uid_with_invalid_yaml(df)
print(f"Number of records in the dataframe once {filtered_df2.shape[0]}")
print(f"Total number of uids after filtering : {filtered_df2['uid'].nunique()}")

year = 2019
while year <= 2024:
    # Snapshot with the second method of filtering
    snapshot2 = snapshoter.extract_snapshot(filtered_df2, year)

    # Drop the deleted workflows
    snapshot2 = snapshot2.dropna(subset=['file_hash'])

    print(f"Year {year}:")
    print(f"Number of workflows in the snapshot: {snapshot2.shape[0]}\n")