In [1]:
# Import necessary libraries
import requests
import json
import csv
import os
import re
import time  # For adding delays between requests

# Figma personal access token sent with every API request.
# It is read from the FIGMA_API_TOKEN environment variable so the real secret does not
# have to be written into (and committed with) the notebook. When the variable is not
# set, the original placeholder string is used, exactly as before.
API_TOKEN = os.environ.get('FIGMA_API_TOKEN', 'add figma token')

# Figma API URL template; the '{}' placeholder is filled with the Figma file key
FIGMA_API_URL_TEMPLATE = 'https://api.figma.com/v1/files/{}'

# Headers for the API request
headers = {
    'X-Figma-Token': API_TOKEN
}

# Base URL for the GitHub repository to fetch inputs (images and graphics)
GITHUB_BASE_URL = 'https://raw.githubusercontent.com/UNDP-Data/dsc-energy-academy-pipeline/main/03_Inputs/'

## Process multiple files sequentially & Extract images and graphics URLs

In [2]:
# Function to determine the module (section) number from the file name or URL (e.g., MODULE1 -> 1)
def get_section_number(file_name):
    """Extract the module number embedded in a Figma file name or key.

    The first occurrence of ``MODULE<digits>`` is searched for anywhere in
    ``file_name``, ignoring case.

    Args:
        file_name: String such as ``'MODULE1'`` or ``'<file key>/MODULE2'``.

    Returns:
        The digits following ``MODULE`` as an ``int``, or ``None`` when the
        pattern does not occur.
    """
    found = re.search(r'MODULE(\d+)', file_name, re.IGNORECASE)
    if found is None:
        return None
    return int(found.group(1))


#Test Case
"""
The input parameter is the Figma file name, which in this case is MODULE[NUMBER].
# Example input: 'MODULE1', 'MODULE2' 
# Expected output is the module number: 1, 2
"""
      
# Example test
# Smoke check: the digits that follow "MODULE" should come back as an integer.
file_name_1 = 'MODULE1'
file_name_2 = 'MODULE2'
print(f"The module number extracted is: {get_section_number(file_name_1)}")  # Expected output: 1
print(f"The module number extracted is: {get_section_number(file_name_2)}")  # Expected output: 2

The module number extracted is: 1
The module number extracted is: 2


In [3]:
# Function to determine the appropriate header tag based on font size
def determine_header_tag(font_size):
    """Map a font size to the name of an HTML heading tag.

    Args:
        font_size: Numeric font size.

    Returns:
        ``'h1'`` for sizes of 32 and above, ``'h2'`` for [24, 32), ``'h3'`` for
        [20, 24), ``'h4'`` for [16, 20), ``'h5'`` for [14, 16) and ``'h6'`` for
        anything smaller.
    """
    # Lower bounds are tested from largest to smallest; the first one reached wins,
    # which reproduces the half-open ranges listed above.
    thresholds = ((32, 'h1'), (24, 'h2'), (20, 'h3'), (16, 'h4'), (14, 'h5'))
    for lower_bound, tag in thresholds:
        if font_size >= lower_bound:
            return tag
    return 'h6'
    
    
#Test Case
"""
The input parameter is the font size (a number). The function returns the matching header tag.
Example input: 50, 24, 20, 16, 14
Expected output: 'h1', 'h2', 'h3', 'h4', 'h5' (any size below 14 returns 'h6')
"""

# Example test
# 50 exercises the ">= 32" branch; the remaining sizes sit exactly on the lower bound
# of each narrower range. No value below 14 is tried, so the 'h6' fallback is not shown.
font_size_1 = 50
font_size_2 = 24
font_size_3 = 20
font_size_4 = 16
font_size_5 = 14

print(f"Header tag for font size {font_size_1}: {determine_header_tag(font_size_1)}")  # Expected output: 'h1'
print(f"Header tag for font size {font_size_2}: {determine_header_tag(font_size_2)}")  # Expected output: 'h2'
print(f"Header tag for font size {font_size_3}: {determine_header_tag(font_size_3)}")  # Expected output: 'h3'
print(f"Header tag for font size {font_size_4}: {determine_header_tag(font_size_4)}")  # Expected output: 'h4'
print(f"Header tag for font size {font_size_5}: {determine_header_tag(font_size_5)}")  # Expected output: 'h5'

Header tag for font size 50: h1
Header tag for font size 24: h2
Header tag for font size 20: h3
Header tag for font size 16: h4
Header tag for font size 14: h5


In [4]:
# Function to determine the template type based on frame content
def determine_template_type(frame):
    """Classify a frame by the kind of content its direct children hold.

    Only the immediate ``children`` of ``frame`` are inspected.

    * A ``TEXT`` child counts as real text when its stripped ``characters`` are
      non-empty and do not contain a label of the form ``image_<n>`` or
      ``graph_<n>``. The label pattern is searched anywhere in the text, so a
      longer sentence that mentions such a label is also ignored.
    * A child of type ``IMAGE``, ``VECTOR`` or ``RECTANGLE`` counts as an image.

    A frame without a ``children`` entry, or a child without ``type`` or
    ``characters``, is tolerated instead of raising ``KeyError``.

    Args:
        frame: Dictionary describing a frame; expected to carry a ``children`` list.

    Returns:
        ``'Photo'`` (image, no real text), ``'Text-Image'`` (image and real text),
        ``'Text'`` (real text only) or ``'Unknown'`` (neither).
    """
    contains_text = False
    contains_image = False

    for node in frame.get('children') or []:
        node_type = node.get('type')
        if node_type == 'TEXT':
            text = (node.get('characters') or '').strip()
            # Label-like text (image_1, graph_2, ...) only names an asset and is
            # therefore not treated as content.
            if text and not re.search(r'image_\d+|graph_\d+', text):
                contains_text = True
        elif node_type in ('IMAGE', 'VECTOR', 'RECTANGLE'):
            # 'VECTOR' or 'RECTANGLE' might be used for images
            contains_image = True

    if contains_image:
        return 'Text-Image' if contains_text else 'Photo'
    if contains_text:
        return 'Text'
    return 'Unknown'  # Neither real text nor an image was found
    
# Test Case
"""
The input parameter is a frame object (a dictionary with children nodes) from Figma data.
# Example input: A frame containing text or images from figma
# Expected output: 'Photo', 'Text-Image', 'Text', or 'Unknown'
"""
# Example frame input
# Only real (non-label) text and no image-like node.
frame_with_text = {
    'children': [
        {'type': 'TEXT', 'characters': 'Sample Text'}
    ]
}

# Only an image-like node and no text at all.
frame_with_image = {
    'children': [
        {'type': 'IMAGE'}
    ]
}

# Real text together with an image-like node.
frame_with_text_image = {
    'children': [
        {'type': 'TEXT', 'characters': 'Sample Text'},
        {'type': 'IMAGE'}
    ]
}

# The 'Unknown' result (neither text nor image) is not exercised by these examples.
print(f"Template type for frame with text: {determine_template_type(frame_with_text)}")  # Expected output: 'Text'
print(f"Template type for frame with image: {determine_template_type(frame_with_image)}")  # Expected output: 'Photo'
print(f"Template type for frame with text and image: {determine_template_type(frame_with_text_image)}")  # Expected output: 'Text-Image'

Template type for frame with text: Text
Template type for frame with image: Photo
Template type for frame with text and image: Text-Image


In [5]:
# Function to fetch image and graphic URL from GitHub
def get_image_url(text, section_number, extension='.png'):
    """Build the GitHub URL for the asset referenced by a label inside ``text``.

    An ``image_<n>`` label takes precedence over a ``graph_<n>`` label when both
    occur. Image URLs point into the ``Images/`` folder and do not depend on
    ``section_number``; graph URLs point into ``Graphics/Module%20<section_number>/``.

    Args:
        text: Text that may contain an ``image_<n>`` or ``graph_<n>`` label.
        section_number: Module number used in the graph folder name.
        extension: File extension appended to the asset name.

    Returns:
        The URL as a string, or ``None`` when ``text`` contains neither label.
    """
    image_hit = re.search(r'image_(\d+)', text)
    if image_hit:
        return f'{GITHUB_BASE_URL}Images/image_{image_hit.group(1)}{extension}'

    graph_hit = re.search(r'graph_(\d+)', text)
    if graph_hit:
        return f'{GITHUB_BASE_URL}Graphics/Module%20{section_number}/graph_{graph_hit.group(1)}{extension}'

    # No asset label present
    return None

# Test Case
"""
The input is a string containing references to images or graphs (e.g., "image_1" or "graph_2"), along with a section number  
The function generates a URL for that image or graph.
Example input: Text with image reference, Text with graph reference, Text without any reference
Expected output: A Github URL for the image/graph or None if no reference is found
"""
# Example test
text_with_image = 'image_1'
text_with_graph = 'graph_2'
text_with_no_image_or_graph = 'No references here'

# GITHUB_BASE_URL is intentionally not redefined here: get_image_url reads the value set
# in the configuration cell at the top of the notebook, which keeps a single definition.

print(f"Image URL: {get_image_url(text_with_image, 1)}") # Expected output: URL pointing to image_1.png
print(f"Graph URL: {get_image_url(text_with_graph, 1)}") # Expected output: URL pointing to graph_2.png
print(f"No image or graph URL found on the frame: {get_image_url(text_with_no_image_or_graph, 1)}") # Expected output: None


Image URL: https://raw.githubusercontent.com/UNDP-Data/dsc-energy-academy-pipeline/main/03_Inputs/Images/image_1.png
Graph URL: https://raw.githubusercontent.com/UNDP-Data/dsc-energy-academy-pipeline/main/03_Inputs/Graphics/Module%201/graph_2.png
No image or graph URL found on the frame: None


In [6]:
# Function to process a frame and extract relevant information
def process_frame(frame, section_number, subsection_number=None):
    """Turn one Figma frame into a flat record used by the export steps.

    The text of every direct ``TEXT`` child is wrapped in a heading tag chosen by
    ``determine_header_tag`` from the child's ``style.fontSize`` (14 when absent),
    and the wrapped pieces are concatenated in child order. Label text such as
    ``image_1`` is included in that concatenation as well. When a text contains an
    ``image_<n>``/``graph_<n>`` label, the matching URL from ``get_image_url`` is
    stored; if several children carry a label, the last one wins.

    Args:
        frame: Dictionary describing the frame.
        section_number: Module number stored in the record and used for graph URLs.
        subsection_number: Optional subsection stored in the record.

    Returns:
        A dict with the keys ``Section``, ``Subsection``, ``Section Title``,
        ``Template Type``, ``Column`` (always 0 here), ``Text``, ``x``, ``y`` and
        ``image_url``; or ``None`` when the frame has no usable
        ``absoluteBoundingBox`` (a warning is printed in that case).
    """
    # A bounding box that is absent OR null cannot provide the x/y used for sorting.
    bounding_box = frame.get('absoluteBoundingBox')
    if not bounding_box:
        print(f"Warning: 'absoluteBoundingBox' missing for frame '{frame.get('name', 'Unnamed')}'. Skipping this frame.")
        return None  # Return None to indicate skipping this frame

    frame_data = {
        'Section': section_number,
        'Subsection': subsection_number,
        # Same fallback name as the warning above, instead of a KeyError.
        'Section Title': frame.get('name', 'Unnamed'),
        # One of 'Photo', 'Text-Image', 'Text' or 'Unknown'
        'Template Type': determine_template_type(frame),
        'Column': 0,
        'Text': '',
        'x': bounding_box['x'],  # Capture the x position for sorting
        'y': bounding_box['y'],  # Capture the y position for sorting
        'image_url': None  # Placeholder for the image URL
    }

    text_parts = []
    for node in frame.get('children') or []:
        if node.get('type') != 'TEXT':
            continue

        # determine_template_type already tolerates a TEXT node without
        # 'characters'; do the same here rather than raising KeyError.
        text = node.get('characters', '')
        font_size = node.get('style', {}).get('fontSize', 14)  # Default to 14 if no fontSize is found
        header_tag = determine_header_tag(font_size)
        text_parts.append(f'<{header_tag}>{text}</{header_tag}>')

        # Add the image/graphic URL if image_[number] or graph_[number] is found
        image_url = get_image_url(text, section_number)
        if image_url:
            frame_data['image_url'] = image_url

    frame_data['Text'] = ''.join(text_parts)
    return frame_data

# Test Case
"""
The input parameters are a frame object with elements inside such as text and images and section number
The function processes this frame, extracting important information like section number, template type, position, and any image/graph URLs.
Example input: A frame with text, images, and bounding box information from Figma.
Expected output: Processed frame data with section, subsection, title, template type, and image URL if applicable.
"""
# Minimal frame: a name, a bounding box (only 'x' and 'y' are read) and one TEXT child
# without a 'style' entry, so process_frame falls back to its default font size of 14.
frame_input = {
    'name': 'Sample Frame',
    'absoluteBoundingBox': {'x': 100, 'y': 200},
    'children': [
        {'type': 'TEXT', 'characters': 'Sample Text'}
    ]
}

# Note: this binds a notebook-level variable named section_number.
section_number = 1
print(f"Processed frame data: {process_frame(frame_input, section_number)}")
# Expected output: A dictionary with processed frame data including section number, title, and text.

Processed frame data: {'Section': 1, 'Subsection': None, 'Section Title': 'Sample Frame', 'Template Type': 'Text', 'Column': 0, 'Text': '<h5>Sample Text</h5>', 'x': 100, 'y': 200, 'image_url': None}


In [7]:
# Function to recursively extract frames from the file data and assign subsections
def extract_frames(nodes, section_number=1):
    """Collect processed records for every FRAME node found in a node tree.

    The tree is walked depth-first. A node of type ``'FRAME'`` is handed to
    ``process_frame``; its record is kept unless ``process_frame`` returned a
    falsy value. Any node that has a ``children`` entry is descended into, so
    frames nested inside other frames are collected too (after their parent).

    Args:
        nodes: Iterable of node dictionaries, each with a ``type`` key.
        section_number: Module number passed through to ``process_frame``.

    Returns:
        A new flat list of records; subsections are not assigned here.
    """
    collected = []  # Fresh list per call so recursive calls never share state

    for current in nodes:
        is_frame = current['type'] == 'FRAME'
        if is_frame:
            record = process_frame(current, section_number)
            if record:  # Skipped frames come back as None
                collected.append(record)

        # Descend into child nodes (if any)
        if 'children' in current:
            collected += extract_frames(current['children'], section_number)

    return collected

# Test Case
"""
The input parameter is the list of nodes from actual Figma data.
# Example input: A list of nodes containing frames from Figma.
# Expected output: A list of processed frames extracted from the nodes.
"""
# Two sibling frames at the top level: one with a single TEXT child, one with a single
# IMAGE child. Nested frames are not part of this example.
nodes_input = [
    {
        'type': 'FRAME',
        'name': 'Frame 1',
        'absoluteBoundingBox': {'x': 100, 'y': 200},
        'children': [
            {'type': 'TEXT', 'characters': 'Sample Text'}
        ]
    },
    {
        'type': 'FRAME',
        'name': 'Frame 2',
        'absoluteBoundingBox': {'x': 150, 'y': 250},
        'children': [
            {'type': 'IMAGE'}
        ]
    }
]

# section_number is left at its default of 1.
print(f"Extracted frames: {extract_frames(nodes_input)}")
# Expected output: List of processed frames with section numbers and corresponding data.


Extracted frames: [{'Section': 1, 'Subsection': None, 'Section Title': 'Frame 1', 'Template Type': 'Text', 'Column': 0, 'Text': '<h5>Sample Text</h5>', 'x': 100, 'y': 200, 'image_url': None}, {'Section': 1, 'Subsection': None, 'Section Title': 'Frame 2', 'Template Type': 'Photo', 'Column': 0, 'Text': '', 'x': 150, 'y': 250, 'image_url': None}]


In [8]:
# Function to dynamically assign subsections based on similar y-values (rows)
def assign_subsections_by_row(frames, y_threshold=100):
    """Number frames by row, starting a new subsection at each large vertical gap.

    The list is sorted in place by ``(y, x)``. Walking the sorted frames, the
    subsection counter starts at 1 and is increased whenever a frame's ``y``
    differs from the previous frame's ``y`` by strictly more than ``y_threshold``.
    The comparison is always against the immediately preceding frame, not against
    the first frame of the current subsection.

    Args:
        frames: List of dicts carrying ``'x'`` and ``'y'``; each receives a
            ``'Subsection'`` entry.
        y_threshold: Largest vertical gap that still counts as the same subsection.

    Returns:
        The same list object, sorted and annotated. An empty (falsy) input is
        returned unchanged.
    """
    if not frames:
        return frames

    # Order top-to-bottom, then left-to-right
    frames.sort(key=lambda item: (item['y'], item['x']))

    current_subsection = 1
    previous_y = frames[0]['y']

    for item in frames:
        gap = abs(item['y'] - previous_y)
        if gap > y_threshold:
            current_subsection += 1

        item['Subsection'] = current_subsection
        previous_y = item['y']

    return frames



# Test Case
"""
The input parameter is a list of frames with y-values for sorting by row.
# Example input: A list of frames with y-values
# Expected output: Frames with subsections assigned based on y-values.
"""

# With the default y_threshold of 100 the gaps here are 50 and 100. Neither is strictly
# greater than the threshold, so all three frames end up in subsection 1.
frames_input = [
    {'y': 100, 'x': 100, 'Section': 1, 'Subsection': None},
    {'y': 150, 'x': 100, 'Section': 1, 'Subsection': None},
    {'y': 250, 'x': 100, 'Section': 1, 'Subsection': None}
]

print(f"Frames with subsections: {assign_subsections_by_row(frames_input)}")
# Expected output: Frames with subsections assigned based on y-value differences.

Frames with subsections: [{'y': 100, 'x': 100, 'Section': 1, 'Subsection': 1}, {'y': 150, 'x': 100, 'Section': 1, 'Subsection': 1}, {'y': 250, 'x': 100, 'Section': 1, 'Subsection': 1}]


In [9]:
# Function to save the frames data to a CSV file
def save_to_csv(frames, module_number, subsection_number, filename):
    """Write the frames of one subsection to a CSV file.

    The file is created at
    ``../01_Results/Module <module_number>/Subsection <subsection_number>/<filename>``
    relative to the current working directory; missing folders are created and an
    existing file is overwritten.

    Side effect: every dict in ``frames`` is modified in place. ``'Column'`` is set
    to its 1-based position in the list and the ``'x'``/``'y'`` keys are removed.

    Args:
        frames: List of frame records, already in the desired column order.
        module_number: Module number used in the folder name.
        subsection_number: Subsection number used in the folder name.
        filename: Name of the CSV file to write.
    """
    # Number the columns in list order and drop the coordinates used only for sorting
    for position, record in enumerate(frames, start=1):
        record['Column'] = position
        for coordinate in ('x', 'y'):
            record.pop(coordinate, None)

    # Target folder for the current module and subsection
    target_dir = os.path.join(
        '..', '01_Results', f'Module {module_number}', f'Subsection {subsection_number}'
    )
    os.makedirs(target_dir, exist_ok=True)
    csv_path = os.path.join(target_dir, filename)

    columns = ['Section', 'Subsection', 'Section Title', 'Template Type', 'Column', 'Text', 'image_url']
    with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=columns)
        writer.writeheader()
        writer.writerows(frames)
                            
# # Test Case
# """
# The input parameters are a list of frames, module number, subsection number, and filename.
# # Example input: A list of processed frames with structured data.
# # Expected output: The frames are written to a CSV file in the module/subsection results folder; nothing is printed by the function itself.
# """

# # Example frames data 
# frames_to_csv = [
#     {'Section': 1, 'Subsection': 1, 'Section Title': 'Title 1', 'Template Type': 'Text', 'Column': 1, 'Text': 'Sample text 1', 'image_url': None},
  
# ]

# module_number = 1
# subsection_number = 1
# csv_filename = 'output.csv'

# save_to_csv(frames_to_csv, module_number, subsection_number, csv_filename)
# print("Data saved to CSV successfully.")

In [10]:
# Function to save the frames data to JSON files
def save_to_json(frames, module_number, subsection_number, filename):
    """Write the frames of one subsection to a JSON file.

    The file is created at
    ``../00_API/Module <module_number>/Subsection <subsection_number>/<filename>``
    relative to the current working directory; missing folders are created and an
    existing file is overwritten. The list is dumped as-is with a 4-space indent
    and without escaping non-ASCII characters.

    Side effect: every dict in ``frames`` is modified in place. ``'Column'`` is set
    to its 1-based position in the list and the ``'x'``/``'y'`` keys are removed.

    Args:
        frames: List of frame records, already in the desired column order.
        module_number: Module number used in the folder name.
        subsection_number: Subsection number used in the folder name.
        filename: Name of the JSON file to write.
    """
    # Number the columns in list order and drop the coordinates used only for sorting
    for position, record in enumerate(frames, start=1):
        record['Column'] = position
        for coordinate in ('x', 'y'):
            record.pop(coordinate, None)

    # Target folder for the current module and subsection
    target_dir = os.path.join(
        '..', '00_API', f'Module {module_number}', f'Subsection {subsection_number}'
    )
    os.makedirs(target_dir, exist_ok=True)
    json_path = os.path.join(target_dir, filename)

    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(frames, json_file, indent=4, ensure_ascii=False)
        

# Test Case 
"""
The input parameters are a list of frames, module number, subsection number, and filename.
# Example input: A list of processed frames with structured data.
# Expected output: The content that would be saved in JSON format is printed to the console.
"""

# Example frames data
# frames_to_json = [
#     {'Section': 1, 'Subsection': 1, 'Section Title': 'Frame 1', 'Template Type': 'Text', 'Column': 1, 'Text': 'Sample text 1', 'image_url': None},
  
# ]
# module_number = 1
# subsection_number = 1
# json_filename = 'output.json'

# Call the function to save the frames to a JSON file
# save_to_json(frames_to_json, module_number, subsection_number, json_filename)


'\nThe input parameters are a list of frames, module number, subsection number, and filename.\n# Example input: A list of processed frames with structured data.\n# Expected output: The content that would be saved in JSON format is printed to the console.\n'

In [11]:
# Function to fetch file data from the Figma API, with delays and retry logic to handle rate limits and transient failures
def fetch_file_data(file_key, max_retries=5, delay_between_retries=10, timeout=30):
    """Download a Figma file as JSON, retrying on rate limits and transient errors.

    Only the part of ``file_key`` before the first ``'/'`` is sent to the API, so
    keys of the form ``'<figma key>/MODULE1'`` are accepted.

    Behaviour per attempt:

    * HTTP 200: the decoded JSON body is returned.
    * HTTP 429: waits for the number of seconds given by the ``Retry-After``
      header (falling back to ``delay_between_retries`` when the header is absent
      or not an integer) and retries.
    * HTTP 400/401/403/404: returns ``None`` immediately, because repeating an
      identical request cannot succeed (bad request, bad token, no access,
      unknown file).
    * Any other status, or a network error/timeout: waits
      ``delay_between_retries`` seconds and retries.

    No wait is performed after the final attempt.

    Args:
        file_key: Figma file key, optionally followed by ``'/<file name>'``.
        max_retries: Maximum number of requests to send.
        delay_between_retries: Default wait in seconds between attempts.
        timeout: Per-request timeout in seconds, so a stalled connection cannot
            block the notebook indefinitely.

    Returns:
        The parsed JSON response, or ``None`` when the file could not be fetched.
    """
    url = FIGMA_API_URL_TEMPLATE.format(file_key.split('/')[0])
    non_retryable_statuses = (400, 401, 403, 404)

    for attempt in range(1, max_retries + 1):
        wait_seconds = delay_between_retries
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as error:
            # Connection problems and timeouts are treated as transient.
            print(f"Error: Request for file key {file_key} failed: {error}")
        else:
            status = response.status_code
            if status == 200:
                return response.json()
            if status == 429:  # Rate limit hit, Figma API returns 429
                print("Rate limit hit. Waiting for rate limit reset...")
                try:
                    wait_seconds = int(response.headers.get('Retry-After', delay_between_retries))
                except (TypeError, ValueError):
                    # Header present but not a plain number of seconds
                    wait_seconds = delay_between_retries
            else:
                print(f"Error: Failed to fetch data for file key {file_key}. Status code: {status}")
                if status in non_retryable_statuses:
                    print(f"Status {status} will not change on retry for file {file_key}. Skipping...")
                    return None

        # Sleeping after the last attempt would only delay the failure report.
        if attempt < max_retries:
            time.sleep(wait_seconds)

    print(f"Max retries reached for file {file_key}. Skipping...")
    return None

# Test Case
"""
The input parameter is a real Figma file key, and the API will return actual Figma data.
Example input: A valid Figma file key
Expected output: JSON response containing file data.
"""

# Replace with an actual Figma file key that you have access to
# Format: '<figma file key>/<file name>'. fetch_file_data sends only the part before the
# '/' to the API; the 'MODULE<n>' suffix is what get_section_number parses.
file_key = 'XN57QSSgnI4exbB2OTK1QM/MODULE1'

# Uncomment to run 
# fetched_data = fetch_file_data(file_key)
# print(f"Fetched data from Figma: {fetched_data}")


<h2> Main Processing Function </h2>

In [12]:
# One entry per Figma file, written as '<figma file key>/MODULE<n>'. The module number
# is parsed from the MODULE part; entries without it are processed as module 0.
file_keys = ['XN57QSSgnI4exbB2OTK1QM/MODULE1', '5cH2oMyNQPZ1w9fUPM9Mfe/MODULE2']  # Ensure MODULE name is part of the file key or name

# Adjust the main function to save frames data for each subsection
def process_figma_files(file_keys):
    """Fetch each Figma file and export its frames per subsection as CSV and JSON.

    For every key the module number is parsed with ``get_section_number`` (0 when
    the key contains no ``MODULE<n>``, so several such keys share the "Module 0"
    output folder). The file is fetched, its frames are extracted and grouped into
    subsections by row, and each subsection is written with ``save_to_csv`` and
    ``save_to_json``. Subsections are exported in ascending order.

    Files that cannot be fetched, or whose response has no ``'document'`` entry,
    are skipped with a warning that names the actual cause.

    Args:
        file_keys: Iterable of strings of the form ``'<figma key>/MODULE<n>'``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    for file_key in file_keys:
        module_number = get_section_number(file_key) or 0
        print(f"Processing file for Module {module_number}:")

        file_data = fetch_file_data(file_key)
        if not file_data:
            # fetch_file_data returns None when all attempts failed; reporting a
            # missing 'document' key here would point at the wrong problem.
            print(f"Warning: No data could be fetched for file {file_key}. Skipping.")
            continue
        if 'document' not in file_data:
            print(f"Warning: 'document' key not found in file {file_key}. Skipping.")
            continue

        frames = extract_frames(file_data['document']['children'], module_number)

        # Assign subsections based on rows (y-coordinate differences)
        frames = assign_subsections_by_row(frames, y_threshold=100)

        # Group once, keeping the (y, x) order inside each subsection
        frames_by_subsection = {}
        for frame in frames:
            frames_by_subsection.setdefault(frame['Subsection'], []).append(frame)

        # sorted() gives a deterministic export order; a bare set does not guarantee one
        for subsection in sorted(frames_by_subsection):
            subsection_frames = frames_by_subsection[subsection]

            # Define filenames for the subsection
            csv_filename = f'figma_frames_module{module_number}_subsection{subsection}.csv'
            json_filename = f'figma_frames_module{module_number}_subsection{subsection}.json'

            # Save CSV and JSON in their respective module and subsection folders
            save_to_csv(subsection_frames, module_number, subsection, csv_filename)
            print(f"Frames data has been exported to {csv_filename} in Module {module_number}, Subsection {subsection}")

            save_to_json(subsection_frames, module_number, subsection, json_filename)
            print(f"Frames data has been exported to {json_filename} in Module {module_number}, Subsection {subsection}")
            
# Execute the main function with the list of file keys
# Running this cell performs network requests against the Figma API and writes files
# under ../01_Results and ../00_API relative to the working directory.
process_figma_files(file_keys)

Processing file for Module 1:
Frames data has been exported to figma_frames_module1_subsection1.csv in Module 1, Subsection 1
Frames data has been exported to figma_frames_module1_subsection1.json in Module 1, Subsection 1
Frames data has been exported to figma_frames_module1_subsection2.csv in Module 1, Subsection 2
Frames data has been exported to figma_frames_module1_subsection2.json in Module 1, Subsection 2
Processing file for Module 2:
Frames data has been exported to figma_frames_module2_subsection1.csv in Module 2, Subsection 1
Frames data has been exported to figma_frames_module2_subsection1.json in Module 2, Subsection 1
