In [1]:
import pandas as pd
import os
import re
# Root directory of the corpus; the cells below read from it
folder = '../journal-full-text'

# Names of every entry in the corpus folder that ends in ".csv"
journals = [entry for entry in os.listdir(folder) if entry.endswith('.csv')]

# (journal abbreviation, ISSN) pairs used to turn an ISSN back into a
# readable journal name when reporting results
_journal_issn_pairs = (
    ('TRA', '0965-8564'),
    ('TRB', '0191-2615'),
    ('TRC', '0968-090X'),
    ('TRD', '1361-9209'),
    ('TRE', '1366-5545'),
    ('TRF', '1369-8478'),
    ('TRIP', '2590-1982'),
)
journal_issn_list = [list(pair) for pair in _journal_issn_pairs]

# Same lookup as a two-column table: 'journal' (abbreviation) and 'issn'
journal_issn_df = pd.DataFrame(journal_issn_list, columns=['journal', 'issn'])

In [None]:
# For every journal, report the share of its full-text articles (.txt files)
# that mention "github.com".
for journal in journals:
    # The CSV name without its extension is used as the journal's ISSN and as
    # the name of the sub-folder that holds that journal's full texts
    journal_issn = os.path.splitext(journal)[0]
    journal_folder = os.path.join(folder, journal_issn)

    # Keep only the .txt files: anything else in the folder is not scanned and
    # therefore must not inflate the denominator of the percentage
    files = [name for name in os.listdir(journal_folder) if name.endswith('.txt')]

    count = 0
    for file in files:
        # Explicit UTF-8 makes the result independent of the platform default
        # encoding; errors='replace' stops one badly encoded article from
        # aborting the whole scan (the ASCII search string is unaffected)
        with open(os.path.join(journal_folder, file), 'r',
                  encoding='utf-8', errors='replace') as f:
            text = f.read()
        if 'github.com' in text:
            count += 1

    # Print the journal name that matches the ISSN; fall back to the raw ISSN
    # when it is missing from the lookup table instead of raising IndexError
    matching_names = journal_issn_df.loc[journal_issn_df['issn'] == journal_issn, 'journal']
    journal_name = matching_names.iloc[0] if not matching_names.empty else journal_issn

    # An empty folder yields 0% rather than a ZeroDivisionError
    share = count / len(files) * 100 if files else 0.0
    print(f"{journal_name}:{share:.2f}%")

## Example: using Llama to assess code and data availability

In [28]:
import xml.etree.ElementTree as ET
import json

def _outermost_paragraphs(parent, para_tag):
    """Yield the ``para_tag`` descendants of ``parent`` in document order,
    without descending into a paragraph once it has been found.

    Paragraphs nested inside another paragraph (e.g. list items) are skipped
    because the outer paragraph's ``itertext()`` already contains their text.
    """
    for child in parent:
        if child.tag == para_tag:
            yield child
        else:
            yield from _outermost_paragraphs(child, para_tag)


def _flat_text(elem):
    """Return all text inside ``elem`` (inline markup included), or None if
    ``elem`` is None."""
    if elem is None:
        return None
    return ''.join(elem.itertext())


def extract_sections_and_text_from_xml(file_path):
    """Extract the sections listed in an article's table of contents.

    Each ``xocs:item-toc-entry`` is matched, through its label, to the first
    ``ce:section`` in the document that has a ``ce:label`` child with the same
    text. TOC entries without a label, or without a matching section, are
    skipped.

    Args:
        file_path: Path of the XML file (anything ``ET.parse`` accepts).

    Returns:
        A list of dicts, one per matched TOC entry, with the keys:
          - "label": the TOC label text as it appears in the file
          - "title": the TOC section title, or None when the entry has none
          - "text": the section's direct ``ce:para`` children, joined by spaces
          - "subsections": a list of {"title", "text"} dicts, one per direct
            ``ce:section`` child that has a section title; "text" joins every
            paragraph below that subsection with spaces

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Namespace to handle XML namespaces
    namespaces = {
        'xocs': 'http://www.elsevier.com/xml/xocs/dtd',
        'ce': 'http://www.elsevier.com/xml/common/dtd',
        'ja': 'http://www.elsevier.com/xml/ja/dtd',
        'mml': 'http://www.w3.org/1998/Math/MathML'
    }
    section_tag = f"{{{namespaces['ce']}}}section"
    para_tag = f"{{{namespaces['ce']}}}para"

    # Index the sections by label once. This replaces a per-entry XPath lookup
    # built by string interpolation, which re-scanned the tree for every TOC
    # entry and broke on labels containing a quote. setdefault keeps the first
    # section in document order, which is what the XPath lookup returned.
    sections_by_label = {}
    for candidate in root.findall('.//ce:section', namespaces):
        for label_elem in candidate.findall('ce:label', namespaces):
            sections_by_label.setdefault(_flat_text(label_elem), candidate)

    # Extracting the sections using the item-toc element
    sections = []
    for item in root.findall('.//xocs:item-toc-entry', namespaces):
        section_label = item.find('xocs:item-toc-label', namespaces)
        if section_label is None or section_label.text is None:
            continue
        section_elem = sections_by_label.get(section_label.text.strip())
        if section_elem is None:
            continue
        section_title = item.find('xocs:item-toc-section-title', namespaces)

        section_text_parts = []
        subsections = []
        for elem in section_elem:
            if elem.tag == section_tag:
                # Direct child section = subsection; untitled ones are skipped
                subsection_title_elem = elem.find('.//ce:section-title', namespaces)
                if subsection_title_elem is None:
                    continue
                # itertext() keeps the text of inline children such as
                # <ce:cross-ref>, <ce:italic> and <mml:math>
                subsection_paragraphs = [
                    _flat_text(para) for para in _outermost_paragraphs(elem, para_tag)
                ]
                subsections.append({
                    "title": _flat_text(subsection_title_elem),
                    "text": ' '.join(subsection_paragraphs)
                })
            elif elem.tag == para_tag:
                # Paragraph that belongs directly to the section itself
                section_text_parts.append(_flat_text(elem))

        sections.append({
            "label": section_label.text,
            "title": _flat_text(section_title),
            "text": ' '.join(section_text_parts),
            "subsections": subsections
        })

    return sections
# Example usage: parse one article and render a single section (index 17, when
# the article has that many) as plain text to use as LLM prompt context.
# NOTE(review): 17 is presumably the section of interest for this particular
# article — confirm before reusing with another file.
file_path = '../10.1016_j.trb.2018.10.011.xml'
sections = extract_sections_and_text_from_xml(file_path)

rendered_parts = []
# The one-element slice is empty (rather than an IndexError) for short articles
for section in sections[17:18]:
    rendered_parts.append(f"Section {section['label']}: {section['title']}\n")
    rendered_parts.append(f"Text: {section['text']}\n")
    for subsection in section['subsections']:
        rendered_parts.append(f"  Subsection: {subsection['title']}\n")
        rendered_parts.append(f"  Text: {subsection['text']}\n")
    rendered_parts.append("-" * 50 + "\n")
sectiontext = "".join(rendered_parts)

In [None]:
# Size, in characters, of the text that will be sent to the model as context
len(sectiontext)

In [None]:
import ollama

# Ask the Llama model which programming language the experiments described in
# the extracted section were implemented in. The section text comes first and
# the question is appended to it in a single user message.
language_question = "Using the description provided above, can you determine which programming language is used to deploy the algorithm experiments mentioned?"
language_prompt = sectiontext + language_question

response = ollama.chat(
    model="llama3.2:latest",
    messages=[{"role": "user", "content": language_prompt}],
)
answer = response["message"]["content"]
print(answer)

In [None]:
import ollama

# Ask the Llama model whether the data described in the extracted section is
# publicly accessible. The section text comes first and the question is
# appended to it in a single user message.
availability_question = "Is the data described here publicly accessible? Can I get the data? Answer with Yes or No. If NOT, why?"
availability_prompt = sectiontext + availability_question

response = ollama.chat(
    model="llama3.2",
    messages=[{"role": "user", "content": availability_prompt}],
)
answer = response["message"]["content"]
print(answer)

## Toy example: Sankey diagram for presenting the results

In [7]:
import plotly.graph_objects as go

# Nodes (categories), one (label, colour) pair each; a node's position in this
# list is the index that the flows below refer to
node_specs = [
    ("Input", "blue"),
    ("Output 1", "orange"),
    ("Output 2", "green"),
    ("Output 3", "red"),
    ("Another Input", "purple"),
    ("Output A", "yellow"),
    ("Output B", "cyan"),
]
nodes = dict(
    label=[label for label, _ in node_specs],
    pad=20,        # Padding between nodes
    thickness=20,  # Thickness of nodes
    color=[colour for _, colour in node_specs],
)

# Links (flows between nodes) as (source index, target index, value, colour)
flow_specs = [
    (0, 1, 100, "blue"),
    (0, 2, 30, "orange"),
    (0, 3, 20, "green"),
    (0, 4, 50, "red"),
    (4, 5, 15, "purple"),
    (4, 6, 35, "yellow"),
]
links = dict(
    source=[flow[0] for flow in flow_specs],
    target=[flow[1] for flow in flow_specs],
    value=[flow[2] for flow in flow_specs],
    color=[flow[3] for flow in flow_specs],  # Colors for links (optional)
)

# Build the Sankey trace and wrap it in a figure
sankey_trace = go.Sankey(node=nodes, link=links)
fig = go.Figure(sankey_trace)

# Add a title and save the figure as a standalone HTML file
fig.update_layout(title_text="Sankey Diagram Example", font_size=14)
fig.write_html("../sankey_diagram.html")