In [1]:
pip install python-dotenv # library to read key-value pairs from .env file

Note: you may need to restart the kernel to use updated packages.


ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^

[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Import necessary libraries
import requests
import json
import csv
import os
import re
import time  # For adding delays between requests
from dotenv import load_dotenv # for accessing the api key in .env file

# Load environment variables from the .env file
load_dotenv()

# Access the API key (os.getenv returns None when FIGMA_API_KEY is not defined)
API_TOKEN = os.getenv('FIGMA_API_KEY')
if not API_TOKEN:
    # Flag the missing key here, at configuration time, rather than leaving it to
    # show up later as failed Figma API requests.
    print("Warning: FIGMA_API_KEY is not set. Add it to your .env file before calling the Figma API.")

# Figma API URL template; the '{}' placeholder is filled with a file key via str.format
FIGMA_API_URL_TEMPLATE = 'https://api.figma.com/v1/files/{}'

# Headers for the API request (the token is sent in the 'X-Figma-Token' header)
headers = {
    'X-Figma-Token': API_TOKEN
}

# Base URL for the GitHub repository to fetch inputs (images and graphics)
GITHUB_BASE_URL = 'https://raw.githubusercontent.com/UNDP-Data/dsc-energy-academy-pipeline/main/03_Inputs/'

## Process multiple files sequentially & Extract images and graphics URLs

In [3]:
# # Function to extract the file name as the section from the URL
# def get_file_name_from_url(url):
#     # Extract the part after '/design/' and before the first '?' (if present)
#     match = re.search(r'/design/[A-Za-z0-9]+/([^/?]+)', url)
#     if match:
#         # Return the full file name (section name)
#         return match.group(1)
#     return None


# # Example usage with your provided URL
# url = 'https://www.figma.com/design/OQwKrUjzZo7yNxlEXAgiIl/SEA-Test-Module-1?node-id=0-1&node-type=canvas&t=BKlbwkqXlDCJbIXI-0'
# file_name = get_file_name_from_url(url)

# print(f"File Name: {file_name}")

In [4]:
# Function to determine the appropriate header tag based on font size
def determine_header_tag(font_size):
    """Map a font size to an HTML heading tag name.

    Sizes are bucketed by their lower bound:
    >= 32 -> 'h1', >= 24 -> 'h2', >= 20 -> 'h3', >= 16 -> 'h4', >= 14 -> 'h5',
    and anything smaller -> 'h6'.

    Args:
        font_size: Numeric font size to classify.

    Returns:
        The tag name as a string, without angle brackets (e.g. 'h2').
    """
    # Ordered from largest to smallest so the first satisfied lower bound wins.
    lower_bounds = (
        (32, 'h1'),
        (24, 'h2'),
        (20, 'h3'),
        (16, 'h4'),
        (14, 'h5'),
    )
    for minimum, tag in lower_bounds:
        if font_size >= minimum:
            return tag
    return 'h6'

In [5]:
# Function to determine the template type based on frame content
def determine_template_type(frame):
    """Classify a frame as 'Photo', 'Text-Image', 'Text' or 'Unknown'.

    Only the frame's direct children are inspected:
      * a TEXT child counts as real text when its stripped 'characters' value is
        non-empty and does not contain a label such as image_1 or graph_2;
      * a child of type IMAGE, VECTOR or RECTANGLE counts as an image.

    Args:
        frame: Node dictionary with a 'children' list; every child has a 'type'.

    Returns:
        'Photo'      - an image is present and there is no real text (labels only or no text),
        'Text-Image' - an image and real text are both present,
        'Text'       - real text only,
        'Unknown'    - neither an image nor real text was found.
    """
    label_pattern = r'image_\d+|graph_\d+'
    image_like_types = ('IMAGE', 'VECTOR', 'RECTANGLE')  # 'VECTOR' or 'RECTANGLE' might be used for images

    has_real_text = False
    has_image = False

    for child in frame['children']:
        child_type = child['type']
        if child_type == 'TEXT':
            content = child.get('characters', '').strip()
            # Label-like text (image_N / graph_N) and empty text are both ignored.
            if content and not re.search(label_pattern, content):
                has_real_text = True
        elif child_type in image_like_types:
            has_image = True

    if has_image:
        return 'Text-Image' if has_real_text else 'Photo'
    return 'Text' if has_real_text else 'Unknown'

In [6]:
# Function to fetch image and graphic URL from GitHub
def get_image_url(text, section_name, extension='.png'):
    """Build the GitHub URL for the image or graph label found in ``text``.

    An ``image_<number>`` label takes precedence over a ``graph_<number>`` label
    when both occur in the same text.

    Args:
        text: String that may contain an image_<number> or graph_<number> label.
        section_name: Folder name inserted into graph URLs (not used for image URLs).
        extension: File extension appended to the generated file name.

    Returns:
        The URL string under GITHUB_BASE_URL, or None when no label is found.
    """
    image_match = re.search(r'image_(\d+)', text)
    if image_match:
        return f'{GITHUB_BASE_URL}Images/image_{image_match.group(1)}{extension}'

    graph_match = re.search(r'graph_(\d+)', text)
    if graph_match:
        return f'{GITHUB_BASE_URL}Graphics/{section_name}/graph_{graph_match.group(1)}{extension}'

    return None


# # Test Case
# """
# The input is a string containing references to images or graphs (e.g., "image_1" or "graph_2"), along with a section number  
# The function generates a URL for that image or graph.
# Example input: Text with image reference, Text with graph reference, Text without any reference
# Expected output: A Github URL for the image/graph or None if no reference is found
# """
# # Example test
# text_with_image = 'image_1'
# text_with_graph = 'graph_2'
# text_with_no_image_or_graph = 'No references here'

# GITHUB_BASE_URL = 'https://raw.githubusercontent.com/UNDP-Data/dsc-energy-academy-pipeline/main/03_Inputs/'

# print(f"Image URL: {get_image_url(text_with_image, 1)}") # Expected output: URL pointing to image_1.png'
# print(f"Graph URL: {get_image_url(text_with_graph, 1)}") # Expected output: URL pointing to graph_2.png
# print(f"No image or graph URL found on the frame: {get_image_url(text_with_no_image_or_graph, 1)}") # Expected output: None


In [7]:
# Function to process a frame and extract relevant information
def process_frame(frame, section_name, chapter_name, lesson_number, column_number):
    """Turn one frame node into a flat record for export.

    Every TEXT child is wrapped in a heading tag chosen from its font size
    (14 is assumed when the node has no 'style'/'fontSize') and the pieces are
    concatenated in child order. If a TEXT child contains an image_<n> or
    graph_<n> label, the matching URL is stored; when several children carry a
    label, the last one wins.

    Args:
        frame: Frame node dictionary with 'name' and 'children'.
        section_name: Value stored under 'Section' (also used for graph URLs).
        chapter_name: Value stored under 'Chapter'.
        lesson_number: Value stored under 'Lesson'.
        column_number: Value stored under 'Column'.

    Returns:
        A dict with the keys Section, Chapter, Lesson, Frame Title, Template Type,
        Column, Text, x, y and image_url, or None (after printing a warning) when
        the frame has no 'absoluteBoundingBox'.
    """
    if 'absoluteBoundingBox' not in frame:
        print(f"Warning: 'absoluteBoundingBox' missing for frame '{frame.get('name', 'Unnamed')}'. Skipping this frame.")
        return None

    bounding_box = frame['absoluteBoundingBox']
    record = {
        'Section': section_name,
        'Chapter': chapter_name,
        'Lesson': lesson_number,
        'Frame Title': frame['name'],
        'Template Type': determine_template_type(frame),
        'Column': column_number,
        'Text': '',
        'x': bounding_box['x'],
        'y': bounding_box['y'],
        'image_url': None
    }

    html_fragments = []
    for child in frame['children']:
        if child['type'] != 'TEXT':
            continue

        size = child.get('style', {}).get('fontSize', 14)  # Default to 14 if no fontSize is found
        tag = determine_header_tag(size)
        text = child['characters']
        html_fragments.append(f'<{tag}>{text}</{tag}>')

        # Record the image/graphic URL if image_[number] or graph_[number] is found
        url = get_image_url(text, section_name)
        if url:
            record['image_url'] = url

    record['Text'] = ''.join(html_fragments)
    return record

In [8]:
# Function to extract frames from lessons
def extract_frames(nodes, section_name, chapter_name, lesson_number):
    """Process every FRAME node of a lesson into an export record.

    Nodes whose type is not 'FRAME' are ignored, and frames for which
    process_frame returns nothing are dropped. Column numbers start at 1 and
    only advance for frames that were actually kept.

    Args:
        nodes: Iterable of node dictionaries, each with a 'type'.
        section_name: Passed through to process_frame.
        chapter_name: Passed through to process_frame.
        lesson_number: Passed through to process_frame.

    Returns:
        List of the records produced by process_frame, in input order.
    """
    records = []

    for candidate in nodes:
        if candidate['type'] != 'FRAME':
            continue

        # The next column is always one past the number of frames kept so far.
        next_column = len(records) + 1
        record = process_frame(candidate, section_name, chapter_name, lesson_number, next_column)
        if record:
            records.append(record)

    return records

In [9]:
# Function to dynamically assign lessons based on similar y-values (rows)
def assign_lessons_by_row(frames, y_threshold=500):  # You can adjust the y_threshold
    """Number frames by row, writing the result into each frame's 'Lesson' key.

    The list is sorted in place by (y, x) of 'absoluteBoundingBox'. Walking the
    sorted frames, the lesson counter starts at 1 and increases whenever the
    vertical distance to the previous frame exceeds ``y_threshold``.

    Args:
        frames: List of frame dictionaries, each with an 'absoluteBoundingBox'
            holding 'x' and 'y'. Sorted and annotated in place.
        y_threshold: Vertical gap above which a new lesson starts.

    Returns:
        The same list object that was passed in (returned unchanged when empty).
    """
    if not frames:
        return frames

    def vertical_then_horizontal(frame):
        box = frame['absoluteBoundingBox']
        return (box['y'], box['x'])

    frames.sort(key=vertical_then_horizontal)

    current_lesson = 1
    previous_y = frames[0]['absoluteBoundingBox']['y']

    for frame in frames:
        y_position = frame['absoluteBoundingBox']['y']

        # A gap larger than the threshold means this frame opens a new row/lesson.
        gap = abs(y_position - previous_y)
        if gap > y_threshold:
            current_lesson += 1

        frame['Lesson'] = current_lesson
        previous_y = y_position

    return frames


In [10]:
# Function to extract chapters and lessons dynamically based on vertical spacing
def extract_chapters_and_lessons(document_children, y_threshold=500):
    """Collect chapters and their lessons from the top-level document nodes.

    For every CANVAS node, each child whose name contains 'Chapter' is treated
    as a chapter. Its FRAME children (excluding frames whose name starts with
    'Vector') are given lesson numbers by assign_lessons_by_row and then grouped
    into consecutive runs that share the same lesson number.

    Chapter nodes that yield no frames (no 'children' key, no FRAME children, or
    only 'Vector...' frames) are skipped instead of raising, because there is
    nothing to group or export for them.

    Args:
        document_children: List of top-level node dictionaries of the document.
        y_threshold: Vertical gap forwarded to assign_lessons_by_row; defaults to
            the previously hard-coded value of 500.

    Returns:
        A list of dicts ``{'chapter_name': str, 'lessons': [[frame, ...], ...]}``
        in the order the chapters are encountered.
    """
    chapters = []

    for canvas_node in document_children:
        if canvas_node.get('type') != 'CANVAS':
            continue

        for chapter_node in canvas_node.get('children', []):
            chapter_name = chapter_node.get('name', '')
            if 'Chapter' not in chapter_name:  # Detect chapters by their name (e.g., "Chapter 1", "Chapter 2")
                continue

            # .get() guards against childless nodes (e.g. a text label named "Chapter 1").
            all_frames = [
                frame for frame in chapter_node.get('children', [])
                if frame.get('type') == 'FRAME' and not frame.get('name', '').startswith('Vector')
            ]
            if not all_frames:
                # Nothing to assign or group; indexing the first frame would fail here.
                continue

            # Assign lessons based on vertical spacing
            frames_with_lessons = assign_lessons_by_row(all_frames, y_threshold=y_threshold)

            # Group consecutive frames that share a lesson number
            lessons = []
            for frame in frames_with_lessons:
                starts_new_lesson = not lessons or lessons[-1][0]['Lesson'] != frame['Lesson']
                if starts_new_lesson:
                    lessons.append([])
                lessons[-1].append(frame)

            chapters.append({
                'chapter_name': chapter_name,
                'lessons': lessons
            })

    return chapters


In [11]:
# Function to save the frames data to a CSV file
def save_to_csv(frames, section_name, chapter_name, lesson_number, filename):
    """Write one lesson's frames to ``../01_Results/<section>/<chapter>/<filename>``.

    Frames that have a 'y' value are ordered by it (ties keep their input
    order); frames without 'y' follow in input order. 'Column' is renumbered
    1..n in that final order and the positional 'x'/'y' keys are left out of
    the output.

    The rows are built from copies, so the caller's frame dictionaries are not
    modified and can be passed on to other exporters as they are.

    Args:
        frames: List of frame record dictionaries.
        section_name: Name of the section folder.
        chapter_name: Name of the chapter folder inside the section folder.
        lesson_number: Not used by this function; kept for interface compatibility.
        filename: Name of the CSV file to create (e.g. 'Lesson_1.csv').

    Raises:
        ValueError: From csv.DictWriter if a frame holds a key that is neither
            one of the CSV columns nor 'x'/'y'.
    """
    fieldnames = ['Section', 'Lesson', 'Chapter', 'Frame Title', 'Template Type', 'Column', 'Text', 'image_url']

    # Stable sort: (False, y) keys come first ordered by y, (True, 0) keys keep input order.
    ordered_frames = sorted(frames, key=lambda frame: ('y' not in frame, frame.get('y', 0)))

    rows = []
    for column, frame in enumerate(ordered_frames, start=1):
        # Copy without the positional keys instead of popping them from the input.
        row = {key: value for key, value in frame.items() if key not in ('x', 'y')}
        row['Column'] = column
        rows.append(row)

    # Folder for the current Figma file (section) and chapter
    chapter_folder = os.path.join('..', '01_Results', f'{section_name}', f'{chapter_name}')
    os.makedirs(chapter_folder, exist_ok=True)  # Ensure the folder structure exists

    # Full path to the CSV file (e.g., Lesson_1.csv, Lesson_2.csv)
    csv_path = os.path.join(chapter_folder, filename)

    with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


In [12]:
# Function to save the frames data to JSON files
def save_to_json(frames, section_name, chapter_name, lesson_number, filename):
    """Write one lesson's frames to ``../00_API/<section>/<chapter>/<filename>``.

    Frames that have a 'y' value are ordered by it (ties keep their input
    order); frames without 'y' follow in input order. 'Column' is renumbered
    1..n in that final order and the positional 'x'/'y' keys are left out of
    the output.

    The records are built from copies, so the caller's frame dictionaries are
    not modified.

    Args:
        frames: List of frame record dictionaries (JSON-serialisable values).
        section_name: Name of the section folder.
        chapter_name: Name of the chapter folder inside the section folder.
        lesson_number: Not used by this function; kept for interface compatibility.
        filename: Name of the JSON file to create (e.g. 'Lesson_1.json').
    """
    # Stable sort: (False, y) keys come first ordered by y, (True, 0) keys keep input order.
    ordered_frames = sorted(frames, key=lambda frame: ('y' not in frame, frame.get('y', 0)))

    records = []
    for column, frame in enumerate(ordered_frames, start=1):
        # Copy without the positional keys instead of popping them from the input.
        record = {key: value for key, value in frame.items() if key not in ('x', 'y')}
        record['Column'] = column
        records.append(record)

    # Folder for the current Figma file (section) and chapter
    chapter_folder = os.path.join('..', '00_API', f'{section_name}', f'{chapter_name}')
    os.makedirs(chapter_folder, exist_ok=True)  # Ensure the folder structure exists

    # Full path to the JSON file (e.g., Lesson_1.json, Lesson_2.json)
    json_path = os.path.join(chapter_folder, filename)

    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(records, json_file, indent=4, ensure_ascii=False)


In [13]:
# Function to fetch data, adding delay and retry logic to handle rate limits and avoid timeouts

def fetch_file_data(file_key, max_retries=5, delay_between_retries=10, timeout=30):
    """Fetch a file's JSON from the Figma API, retrying on transient failures.

    Behaviour per attempt:
      * HTTP 200 - the decoded JSON body is returned.
      * HTTP 429 - waits for the number of seconds given in the 'Retry-After'
        header (falling back to ``delay_between_retries`` when the header is
        missing or not an integer), then retries.
      * Other 4xx - treated as permanent (the same request would fail again),
        so the function gives up immediately.
      * Any other status or a network-level error - waits
        ``delay_between_retries`` seconds, then retries.

    No wait is performed after the final attempt.

    Args:
        file_key: Key inserted into FIGMA_API_URL_TEMPLATE.
        max_retries: Maximum number of requests to send.
        delay_between_retries: Default wait in seconds between attempts.
        timeout: Seconds passed to requests.get so a stalled connection cannot
            block forever.

    Returns:
        The parsed JSON response, or None when the file could not be fetched.
    """
    url = FIGMA_API_URL_TEMPLATE.format(file_key)

    for attempt in range(1, max_retries + 1):
        wait_seconds = delay_between_retries

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as error:
            # Connection problems and timeouts are worth another attempt.
            print(f"Error: Request for file key {file_key} failed: {error}")
        else:
            status_code = response.status_code

            if status_code == 200:
                # If the request is successful, return the data
                return response.json()

            if status_code == 429:  # Rate limit hit
                print("Rate limit hit. Waiting for rate limit reset...")
                try:
                    wait_seconds = max(0, int(response.headers.get('Retry-After', delay_between_retries)))
                except (TypeError, ValueError):
                    # Retry-After may also be an HTTP date; use the default delay then.
                    wait_seconds = delay_between_retries
            else:
                print(f"Error: Failed to fetch data for file key {file_key}. Status code: {status_code}")
                if 400 <= status_code < 500:
                    # Client errors (bad key, missing token, ...) will not change on retry.
                    print(f"Status code {status_code} is not retryable for file {file_key}. Skipping...")
                    return None

        if attempt < max_retries:
            time.sleep(wait_seconds)

    print(f"Max retries reached for file {file_key}. Skipping...")
    return None


<h2> Main Processing Function </h2>

In [14]:
# Figma file keys to process; each key is inserted into FIGMA_API_URL_TEMPLATE
# by fetch_file_data when process_figma_files iterates over this list.
file_keys = ['XN57QSSgnI4exbB2OTK1QM'] 

In [15]:
# Main processing function
def process_figma_files(file_keys):
    """Fetch each Figma file and export every lesson as a CSV and a JSON file.

    For each key the file is downloaded with fetch_file_data. Files without a
    'name' or without a 'document' entry are reported and skipped. Otherwise
    the chapters and lessons are extracted, each lesson's frames are processed
    and written as ``Lesson_<n>.csv`` and ``Lesson_<n>.json`` (n counts from 1
    within each chapter), with a confirmation printed after each file.

    Args:
        file_keys: Iterable of Figma file keys.
    """
    for file_key in file_keys:
        file_data = fetch_file_data(file_key)

        # Nothing usable came back, or the payload has no file name.
        if not file_data or 'name' not in file_data:
            print(f"Error: Could not retrieve file name for file key: {file_key}")
            continue

        section_name = file_data['name']  # Retrieve file name (e.g., "Module 1")

        if 'document' not in file_data:
            print(f"Error: 'document' key not found in file {file_key}.")
            continue

        # Extract chapters and lessons dynamically based on vertical spacing
        top_level_nodes = file_data['document']['children']
        for chapter in extract_chapters_and_lessons(top_level_nodes):
            chapter_name = chapter['chapter_name']  # Example: "Chapter 1"
            chapter_lessons = chapter['lessons']

            lesson_number = 0
            for lesson_frames in chapter_lessons:
                lesson_number += 1
                frames = extract_frames(lesson_frames, section_name, chapter_name, lesson_number)

                # Save CSV and JSON for each lesson
                csv_filename = f'Lesson_{lesson_number}.csv'
                save_to_csv(frames, section_name, chapter_name, lesson_number, csv_filename)
                print(f"CSV file '{csv_filename}' has been saved in {chapter_name} 01_Results folder")

                json_filename = f'Lesson_{lesson_number}.json'
                save_to_json(frames, section_name, chapter_name, lesson_number, json_filename)
                print(f"JSON file '{json_filename}' has been saved in {chapter_name} 00_API folder")


In [16]:
# Run the main processing function for every file key listed in file_keys;
# it prints one confirmation line per CSV and JSON file written.
process_figma_files(file_keys)

CSV file 'Lesson_1.csv' has been saved in Chapter 3 01_Results folder
JSON file 'Lesson_1.json' has been saved in Chapter 3 00_API folder
CSV file 'Lesson_2.csv' has been saved in Chapter 3 01_Results folder
JSON file 'Lesson_2.json' has been saved in Chapter 3 00_API folder
CSV file 'Lesson_1.csv' has been saved in Chapter 2 01_Results folder
JSON file 'Lesson_1.json' has been saved in Chapter 2 00_API folder
CSV file 'Lesson_2.csv' has been saved in Chapter 2 01_Results folder
JSON file 'Lesson_2.json' has been saved in Chapter 2 00_API folder
CSV file 'Lesson_1.csv' has been saved in Chapter 1 01_Results folder
JSON file 'Lesson_1.json' has been saved in Chapter 1 00_API folder
CSV file 'Lesson_2.csv' has been saved in Chapter 1 01_Results folder
JSON file 'Lesson_2.json' has been saved in Chapter 1 00_API folder
