### File Structure of the Dataset Folder
```
├── journal-meta/
│   ├── 0968-090X.csv         # TRC
│   ├── mini-dataset.csv      # A mini dataset for tutorial, extracted from TRC
│   ├── journal-meta-dataset.csv   # the combined dataset for all journals
│   ├── github_data.json      # the GitHub repository links extracted from the papers
│   ├── url_data.json         # the links for the data availability URLs
│   └── ... (other journal CSV files)
├── journal-full-text/
│   ├── 0968-090X/
│   │   └── 10.1016_j.trc.2023.104311.xml
│   └── ... (other DOI folders)
```

In [None]:
import pandas as pd
import os
from tqdm import tqdm
from utils import *
# Point this at the root of your own downloaded copy of the dataset
dataset_root = '/Users/junyi/Work/RR/rr-measure-dataset'
# Full-text XML files, one sub-folder per journal ISSN
full_data_folder = dataset_root + '/journal-full-text'
# Metadata CSV files and the JSON link files
meta_data_folder = dataset_root + '/journal-meta'

In [None]:
# Load the combined metadata table; each row is later treated as one paper
meta_csv_path = os.path.join(meta_data_folder, 'full-meta-dataset.csv')
data = pd.read_csv(meta_csv_path)
# Preview the first rows as the cell output
data.head()

In [None]:
def _iter_section_texts(sections):
    """Yield the raw text of every section, subsection and subsubsection.

    Texts come out in document order: each section is followed by its
    subsections, and each subsection by its subsubsections.
    """
    for section in sections:
        yield section['text']
        for subsection in section['subsections']:
            yield subsection['text']
            for subsubsection in subsection['subsubsections']:
                yield subsubsection['text']


# Reset both result columns so that re-running this cell starts from zero
data['is_github'] = 0
data['num_of_github_urls'] = 0
# One record per paper with at least one GitHub URL; saved as JSON below
github_data = []
# To debug on a small sample, loop over tqdm(data.index[:1000]) instead
for i in data.index:
    # <full_data_folder>/<issn>/<unique_id>.xml
    journal_path = os.path.join(full_data_folder, data.at[i, 'issn'])
    paper_path = os.path.join(journal_path, data.at[i, 'unique_id'] + '.xml')
    # NOTE(review): what happens when the XML file is missing depends on
    # extract_sections_and_text_from_xml (defined in utils) -- confirm.
    sections = extract_sections_and_text_from_xml(paper_path)
    reorganized_sections = postprocess_sections(sections)

    github_urls = []
    github_full_urls = []
    for text in _iter_section_texts(reorganized_sections):
        # Clean each text once and share the result between both extractors
        cleaned_text = cleanup(text)
        urls = extract_github_urls(cleaned_text)
        full_urls = extract_full_github_urls(cleaned_text)
        if urls:
            github_urls.extend(urls)
        if full_urls:
            github_full_urls.extend(full_urls)

    # Papers without any GitHub URL keep the 0 defaults and get no record
    if not github_urls:
        continue

    # Mark as GitHub present
    data.at[i, 'is_github'] = 1
    # Remove duplicate URLs and count unique URLs
    unique_github_urls = set(github_urls)
    print('GitHub URLs found:', unique_github_urls)
    unique_github_full_urls = set(github_full_urls)
    data.at[i, 'num_of_github_urls'] = len(unique_github_urls)
    github_data.append({
        'issn': data.at[i, 'issn'],
        'unique_id': data.at[i, 'unique_id'],
        'title': data.at[i, 'title'],
        # Sorted so the saved JSON is identical from one run to the next
        'github_urls': sorted(unique_github_urls),
        'github_full_urls': sorted(unique_github_full_urls),
    })
# save the json file github_data
save_json(github_data, os.path.join(meta_data_folder, 'github_data.json'))

In [None]:
# Write the table, including the is_github / num_of_github_urls columns,
# to full-meta-dataset.csv in the meta folder (row index is not written)
meta_csv_path = os.path.join(meta_data_folder, 'full-meta-dataset.csv')
data.to_csv(meta_csv_path, index=False)

In [None]:
# Number of papers per (journal_name, is_github) pair, as a flat table
grouped_counts = (
    data.groupby('journal_name')['is_github']
    .value_counts()
    .reset_index(name='count')
)
# Total papers per journal, aligned row-by-row with grouped_counts
journal_totals = grouped_counts.groupby('journal_name')['count'].transform('sum')
# Share of each (journal, is_github) pair within its journal, in percent
grouped_counts['percentage'] = (grouped_counts['count'] / journal_totals) * 100
# Show the ten journals with the highest share of papers that link to GitHub
has_github = grouped_counts['is_github'] == 1
grouped_counts[has_github].sort_values('percentage', ascending=False).head(10)