In [1]:
# Import necessary libraries
import requests
import json
import csv
import os
import re
import time  # For adding delays between requests
from dotenv import load_dotenv # for accessing the api key in .env file

# Load environment variables from the .env file so FIGMA_API_KEY becomes
# visible through os.getenv below
load_dotenv()

# Access the API key (os.getenv returns None when FIGMA_API_KEY is not set)
API_TOKEN = os.getenv('FIGMA_API_KEY')

# Figma API URL template; the '{}' placeholder is filled with a file key
# via str.format in fetch_file_data
FIGMA_API_URL_TEMPLATE = 'https://api.figma.com/v1/files/{}'

# Headers sent with every Figma API request (token-based authentication)
headers = {
    'X-Figma-Token': API_TOKEN
}

# Base URL for the GitHub repository to fetch inputs (images and graphics);
# get_image_url appends 'Images/...' or 'Graphics/module%20N/...' to it
GITHUB_BASE_URL = 'https://raw.githubusercontent.com/UNDP-Data/dsc-energy-academy-pipeline/main/03_Inputs/'

## Process multiple files sequentially & Extract images and graphics URLs

In [2]:
def extract_module_number(module_name):
    """Return the first run of digits found in ``module_name``.

    Args:
        module_name: Name to scan, e.g. "SEA Test Module 1".

    Returns:
        The first digit sequence as a string, or None when the name
        contains no digits at all.
    """
    digits = re.search(r'\d+', module_name)
    return digits.group(0) if digits else None


In [3]:
# Function to determine the appropriate header tag based on font size
def determine_header_tag(font_size):
    """Map a font size to an HTML heading tag name.

    Sizes of 32 and above map to 'h1', 24-32 to 'h2', 20-24 to 'h3',
    16-20 to 'h4', 14-16 to 'h5' (lower bound inclusive, upper bound
    exclusive); anything smaller maps to 'h6'.

    Args:
        font_size: Numeric font size.

    Returns:
        One of 'h1' ... 'h6'.
    """
    # Ordered from largest to smallest: the first lower bound reached wins.
    lower_bounds = ((32, 'h1'), (24, 'h2'), (20, 'h3'), (16, 'h4'), (14, 'h5'))
    for minimum, tag in lower_bounds:
        if font_size >= minimum:
            return tag
    return 'h6'


In [4]:
# Function to determine the template type based on frame content
def determine_template_type(frame):
    """Classify a frame as 'Photo', 'Text-Image', 'Text' or 'Unknown'.

    A TEXT child counts as real text only when it is non-empty and does not
    contain a label such as ``image_1`` or ``graph_2``. A child counts as an
    image when its type is IMAGE, VECTOR or RECTANGLE and its name does not
    contain 'ignore_rect' (case-insensitive).

    Args:
        frame: Frame node dict with a 'children' list of node dicts.

    Returns:
        'Photo' (image, no real text), 'Text-Image' (both), 'Text' (real
        text only) or 'Unknown' (neither).
    """
    has_text = False
    has_image = False

    for child in frame['children']:
        kind = child['type']

        if kind == 'TEXT':
            content = child.get('characters', '').strip()
            if re.search(r'image_\d+|graph_\d+', content):
                # Label-like text: nothing else to inspect on this node.
                continue
            if content:
                has_text = True

        # Nodes flagged as 'ignore_rect' never count as images.
        if 'ignore_rect' in child.get('name', '').lower():
            continue

        if kind in ('IMAGE', 'VECTOR', 'RECTANGLE'):
            has_image = True

    outcomes = {
        (True, False): 'Photo',
        (True, True): 'Text-Image',
        (False, True): 'Text',
    }
    return outcomes.get((has_image, has_text), 'Unknown')



### Extracting the images from GitHub

In [5]:
# Function to fetch image and graphic URL from GitHub based on image node name
def get_image_url(image_name, module_name, extension='.png'):
    """Build the raw GitHub URL for an image or graphic node.

    Args:
        image_name: Node name; only names starting with 'image_' or
            'graph_' are resolved.
        module_name: Module name; its first digit sequence selects the
            'Graphics/module N' folder for 'graph_' names.
        extension: File extension appended to the name (default '.png').

    Returns:
        The URL string, or None when the name has neither prefix or when a
        'graph_' name cannot be resolved because ``module_name`` contains
        no module number.
    """
    if image_name.startswith('image_'):
        return f'{GITHUB_BASE_URL}Images/{image_name}{extension}'

    if image_name.startswith('graph_'):
        # Graphics are stored per module, so the module number is required;
        # without it the URL would contain the literal text "None".
        module_number = extract_module_number(module_name)
        if module_number is None:
            return None
        return f'{GITHUB_BASE_URL}Graphics/module%20{module_number}/{image_name}{extension}'

    return None

### Specific frames from Module Intro

In [6]:
# Function to extract data for the Module_Title-1440 template
def extract_module_title_1440(frame, module_name, column_number):
    """Extract label, title, subtitle and module number from a Module_Title-1440 frame.

    TEXT children are visited top-to-bottom, then left-to-right, and each
    one is wrapped in the heading tag matching its font size (default 14).
    ``module_name`` and ``column_number`` are accepted but not used.

    Args:
        frame: Frame node dict with a 'children' list.
        module_name: Unused.
        column_number: Unused.

    Returns:
        dict with the keys "Label", "Title", "Subtitle" and "Module Number";
        each value is an HTML string (empty when nothing matched).
    """
    def reading_order(child):
        box = child.get('absoluteBoundingBox', {})
        return (box.get('y', 0), box.get('x', 0))

    label = ""
    module_number = ""
    title_parts = []
    subtitle_parts = []

    for child in sorted(frame['children'], key=reading_order):
        if child['type'] != 'TEXT':
            continue

        size = child.get('style', {}).get('fontSize', 14)
        tag = determine_header_tag(size)
        content = child['characters'].strip()
        wrapped = f'<{tag}>{content}</{tag}>'

        if re.match(r'\bMODULE\s*\d+\b', content, re.IGNORECASE):
            # "MODULE <n>" label; a later match replaces an earlier one.
            label = wrapped
        elif re.match(r'^\d+$', content):
            # A bare number is the module number and is always an <h4>.
            module_number = f'<h4>{content}</h4>'
        elif size >= 32 and len(content.split()) > 1:
            # Large multi-word text is (part of) the title.
            title_parts.append(wrapped)
        elif "MODULE" not in content.upper():
            # Everything else that does not mention "MODULE" is subtitle text.
            subtitle_parts.append(wrapped)

    return {
        "Label": label,
        "Title": ''.join(title_parts),
        "Subtitle": ' '.join(subtitle_parts),
        "Module Number": module_number

    }

In [7]:
# Function to extract data for the Module_Overview-1080 template
def extract_module_overview_1080(frame):
    """Extract title, subtitle and body text from a Module_Overview-1080 frame.

    TEXT children are visited top-to-bottom, then left-to-right. Text with a
    font size of at least 32 that contains "MODULE OVERVIEW" is the title,
    the first other text is the subtitle, and all remaining texts form the
    body (joined with single spaces).

    Args:
        frame: Frame node dict with a 'children' list.

    Returns:
        dict with the keys "Title", "Subtitle" and "Text" holding HTML strings.
    """
    def reading_order(child):
        box = child.get('absoluteBoundingBox', {})
        return (box.get('y', 0), box.get('x', 0))

    title = ""
    subtitle = None  # stays None until the first non-title text is seen
    body_parts = []

    for child in sorted(frame['children'], key=reading_order):
        if child['type'] != 'TEXT':
            continue

        size = child.get('style', {}).get('fontSize', 14)
        tag = determine_header_tag(size)
        content = child['characters'].strip()
        wrapped = f'<{tag}>{content}</{tag}>'

        if size >= 32 and "MODULE OVERVIEW" in content.upper():
            title = wrapped
        elif subtitle is None:
            subtitle = wrapped
        else:
            body_parts.append(wrapped)

    return {
        "Title": title,
        "Subtitle": subtitle or "",
        "Text": ' '.join(body_parts)
    }


In [8]:
# def extract_learning_objectives_1440(frame):
#     title_text = ""
#     subtitle_text = ""
#     objectives = []
#     images = []

#     sorted_nodes = sorted(frame['children'], key=lambda n: (n.get('absoluteBoundingBox', {}).get('y', 0), n.get('absoluteBoundingBox', {}).get('x', 0)))

#     current_objective_text = ""
#     current_number = None
#     subtitle_captured = False

#     expected_number = 1

#     for node in sorted_nodes:
#         if node['type'] == 'TEXT':
#             font_size = node.get('style', {}).get('fontSize', 14)
#             header_tag = determine_header_tag(font_size)
#             text = node['characters'].strip()

#             # Debug: Print detected text and font size
#             print(f"Detected text: '{text}', Font Size: {font_size}, Type: {node['type']}")

#             if font_size >= 32 and "LEARNING OBJECTIVES" in text.upper():
#                 title_text = f'<{header_tag}>{text}</{header_tag}>'
#             elif not subtitle_captured:
#                 subtitle_text = f'<{header_tag}>{text}</{header_tag}>'
#                 subtitle_captured = True
#             elif re.match(r'^\d+$', text):  # Detect numbered objectives
#                 # Save the previous objective before moving to the next
#                 if current_number and current_objective_text:
#                     objectives.append(f'<h3>{current_number}</h3>: <h4>{current_objective_text.strip()}</h4>')
#                     print(f"Added Objective {current_number}: {current_objective_text.strip()}")

#                 # Check for sequential order and update expected number
#                 if int(text) != expected_number:
#                     print(f"Warning: Expected objective number {expected_number} but found {text}.")
#                 expected_number = int(text) + 1

#                 # Set the new number and reset text for the new objective
#                 current_number = text
#                 current_objective_text = ""  # Reset to capture the new objective's text
#             else:
#                 # Accumulate text for the current objective
#                 current_objective_text += f' {text}'

#         elif node['type'] in ['IMAGE', 'RECTANGLE']:
#             image_url = get_image_url(node['name'], frame['name'])
#             if image_url:
#                 images.append(image_url)
#                 print(f"Image URL added: {image_url}")

#     # Add the last objective if present
#     if current_number and current_objective_text:
#         objectives.append(f'<h3>{current_number}</h3>: <h4>{current_objective_text.strip()}</h4>')
#         print(f"Added Objective {current_number}: {current_objective_text.strip()}")

#     print("Final Objectives:", objectives)
#     print("Final Images:", images)

#     return {
#         "Title": title_text.strip(),
#         "Subtitle": subtitle_text.strip(),
#         "Objectives": objectives,
#         "Images": images
#     }


### Specific frames from Chapter introductions

In [9]:
# Function to extract data for the Chapter Title template, returning only Content
def extract_chapter_title_1200(frame, module_name, chapter_name, column_number):
    """Extract the chapter title, chapter number and image URLs from a Chapter Title frame.

    Children are visited top-to-bottom. A TEXT child made only of digits is
    the chapter number; any other TEXT child that does not contain the word
    "chapter" (case-insensitive) is appended to the title. IMAGE and
    RECTANGLE children are resolved to URLs through ``get_image_url``.
    ``chapter_name`` and ``column_number`` are accepted but not used.

    Returns:
        dict with "Content" ({"Title", "Chapter number"}) and "image_url"
        (list of URL strings).
    """
    title_parts = []
    number_html = ""
    urls = []

    top_to_bottom = sorted(
        frame['children'],
        key=lambda child: child.get('absoluteBoundingBox', {}).get('y', 0),
    )

    for child in top_to_bottom:
        kind = child['type']

        if kind == 'TEXT':
            content = child['characters'].strip()
            size = child.get('style', {}).get('fontSize', 14)
            tag = determine_header_tag(size)
            wrapped = f"<{tag}>{content}</{tag}>"

            if re.match(r'^\d+$', content):
                # Digits only: this is the chapter number.
                number_html = wrapped
            elif "chapter" not in content.lower():
                # Skip the literal "Chapter" caption, keep the real title.
                title_parts.append(wrapped)

        elif kind in ('IMAGE', 'RECTANGLE'):
            url = get_image_url(child['name'], module_name)
            if url:
                urls.append(url)

    return {
        "Content": {
            "Title": ''.join(title_parts),
            "Chapter number": number_html
        },
        "image_url": urls
    }


In [10]:
# Function to extract data for the Chapter_Overview-720 template
def extract_chapter_overview_720(frame):
    """Extract the title and main text from a Chapter_Overview-720 frame.

    TEXT children are visited top-to-bottom, then left-to-right. The first
    text containing "OVERVIEW" (case-insensitive) becomes the title; every
    other text is appended to the main text, separated by single spaces.

    Args:
        frame: Frame node dict with a 'children' list.

    Returns:
        dict of the form {"Content": {"Title": ..., "Text": ...}}.
    """
    def reading_order(child):
        box = child.get('absoluteBoundingBox', {})
        return (box.get('y', 0), box.get('x', 0))

    title = None  # stays None until the overview heading is found
    text_parts = []

    for child in sorted(frame['children'], key=reading_order):
        if child['type'] != 'TEXT':
            continue

        size = child.get('style', {}).get('fontSize', 14)
        tag = determine_header_tag(size)
        content = child['characters'].strip()
        wrapped = f'<{tag}>{content}</{tag}>'

        if title is None and "OVERVIEW" in content.upper():
            title = wrapped
        else:
            text_parts.append(wrapped)

    return {
        "Content": {
            "Title": title or "",
            "Text": ' '.join(text_parts),
        }
    }


In [25]:
def extract_chapter_toc_1440(frame, module_name, chapter_name, column_number):
    """Extract the table of contents (title, subtitle, lessons, images) from a Chapter_ToC-1440 frame.

    Children are visited top-to-bottom, then left-to-right. Among TEXT
    children: text with font size >= 32 containing "LESSONS" is the title,
    the first other text is the subtitle, a digits-only text starts a new
    lesson, and any other text is appended to the current lesson's text.
    IMAGE and RECTANGLE children are resolved to URLs via ``get_image_url``.
    ``chapter_name`` and ``column_number`` are accepted but not used.

    Args:
        frame: Frame node dict with a 'children' list.
        module_name: Module name used to locate module-specific graphics.
        chapter_name: Unused.
        column_number: Unused.

    Returns:
        dict with "Content" ({"Title", "Subtitle", "Lessons"}) and
        "image_url" (list of URL strings). Each lesson is a dict with the
        keys "number" and "text".
    """
    title_text = ""
    subtitle_text = ""
    lessons = []
    image_urls = []  # Collect all images at top level

    def build_lesson(number, text):
        # One ToC entry; the text is left empty when nothing was collected.
        return {
            "number": f'<h3>{number}</h3>',
            "text": f'<h4>{text.strip()}</h4>' if text else ""
        }

    # Sort nodes by y-position (then x) for a consistent reading order
    sorted_nodes = sorted(
        frame['children'],
        key=lambda n: (n.get('absoluteBoundingBox', {}).get('y', 0),
                       n.get('absoluteBoundingBox', {}).get('x', 0)),
    )

    current_lesson_text = ""
    current_number = None
    subtitle_captured = False

    for node in sorted_nodes:
        if node['type'] == 'TEXT':
            font_size = node.get('style', {}).get('fontSize', 14)
            header_tag = determine_header_tag(font_size)
            text = node['characters'].strip()

            # Identify title and subtitle once at the top
            if font_size >= 32 and "LESSONS" in text.upper():
                title_text = f'<{header_tag}>{text}</{header_tag}>'
            elif not subtitle_captured:
                subtitle_text = f'<{header_tag}>{text}</{header_tag}>'
                subtitle_captured = True
            elif re.match(r'^\d+$', text):  # Detect lesson number
                # Finalize the previous lesson before starting a new one
                if current_number is not None:
                    lessons.append(build_lesson(current_number, current_lesson_text))
                    current_lesson_text = ""
                current_number = text
            else:
                # Accumulate text within the lesson
                current_lesson_text += f' {text}'

        elif node['type'] in ['IMAGE', 'RECTANGLE']:
            # Resolve against the module name, not the frame name: the frame
            # name ("Chapter_ToC-1440") contains digits that would be
            # mistaken for the module number when building graphics URLs.
            image_url = get_image_url(node['name'], module_name)
            if image_url:
                image_urls.append(image_url)

    # Finalize the last lesson if one was started
    if current_number is not None:
        lessons.append(build_lesson(current_number, current_lesson_text))

    return {
        "Content": {
            "Title": title_text.strip(),
            "Subtitle": subtitle_text.strip(),
            "Lessons": lessons
        },
        "image_url": image_urls  # Top-level image URLs
    }


### Processing all the frames

In [26]:
# Updated process_frame function to handle both module intro and chapter intro
def process_frame(frame, module_name, chapter_name, lesson_number, column_number, is_module_intro=False, is_intro=False):
    """Turn one frame into the record that is later written to CSV/JSON.

    Frames without an 'absoluteBoundingBox' are skipped with a warning.
    Known template codes (the frame name) are routed to their dedicated
    extractor; any other frame gets the default treatment: TEXT children
    are concatenated as heading-wrapped HTML in reading order, and
    IMAGE/RECTANGLE children are resolved to image URLs.

    Args:
        frame: Frame node dict.
        module_name: Stored under 'Module' and used for image lookups.
        chapter_name: Stored under 'Chapter'.
        lesson_number: Stored under 'Lesson' unless the frame is an intro.
        column_number: Stored under 'Column'.
        is_module_intro: True for module intro frames (no 'Lesson' key).
        is_intro: True for chapter intro frames (no 'Lesson' key).

    Returns:
        The record dict, or None when the frame was skipped.
    """
    if 'absoluteBoundingBox' not in frame:
        print(f"Warning: 'absoluteBoundingBox' missing for frame '{frame.get('name', 'Unnamed')}'. Skipping this frame.")
        return None

    template_type = determine_template_type(frame)
    template_code = frame['name']
    bounds = frame['absoluteBoundingBox']

    frame_data = {
        'Module': module_name,
        'Chapter': chapter_name,
        'Template Code': template_code,
        'Template Type': template_type,
        'Column': column_number,
        'Content': '',
        'x': bounds['x'],
        'y': bounds['y'],
        'image_url': [],
    }

    # Intro frames (module or chapter level) carry no lesson number.
    if not (is_module_intro or is_intro):
        frame_data['Lesson'] = lesson_number

    # Extractors that share a signature and return both content and images.
    content_and_images = {
        "Chapter_Title-1200": extract_chapter_title_1200,
        "Chapter_ToC-1440": extract_chapter_toc_1440,
    }

    if template_code in content_and_images:
        extracted = content_and_images[template_code](frame, module_name, chapter_name, column_number)
        frame_data['Content'] = extracted['Content']
        frame_data['image_url'] = extracted['image_url']
    elif template_code == "Chapter_Overview-720":
        frame_data['Content'] = extract_chapter_overview_720(frame)['Content']
    elif template_code == "Module_Title-1440":
        frame_data['Content'] = extract_module_title_1440(frame, module_name, column_number)
    elif template_code == "Module_Overview-1080":
        frame_data['Content'] = extract_module_overview_1080(frame)
    else:
        # Default processing: walk the children top-to-bottom, left-to-right.
        def reading_order(child):
            box = child.get('absoluteBoundingBox', {})
            return (box.get('y', 0), box.get('x', 0))

        html_parts = []
        for child in sorted(frame['children'], key=reading_order):
            kind = child['type']
            if kind == 'TEXT':
                size = child.get('style', {}).get('fontSize', 14)
                tag = determine_header_tag(size)
                raw_text = child['characters']  # kept as-is, not stripped
                html_parts.append(f'<{tag}>{raw_text}</{tag}>')
            elif kind in ('IMAGE', 'RECTANGLE'):
                url = get_image_url(child['name'], module_name)
                if url:
                    frame_data['image_url'].append(url)
        frame_data['Content'] = ''.join(html_parts)

    return frame_data


In [27]:
# # Main process_frame function to route frames to the correct extraction function
# def process_frame(frame, module_name, chapter_name, lesson_number, column_number, is_module_intro=False):
#     if 'absoluteBoundingBox' not in frame:
#         print(f"Warning: 'absoluteBoundingBox' missing for frame '{frame.get('name', 'Unnamed')}'. Skipping this frame.")
#         return None
    
#     template_type = determine_template_type(frame)
#     image_urls = []

#     # Base frame data structure
#     frame_data = {
#         'Module': module_name,
#         'Chapter': chapter_name,
#         'Template Code': frame['name'],
#         'Template Type': template_type,
#         'Column': column_number,
#         'Content': '',
#         'x': frame['absoluteBoundingBox']['x'],
#         'y': frame['absoluteBoundingBox']['y'],
#         'image_url': image_urls
#     }

#     if not is_module_intro:
#         frame_data['Lesson'] = lesson_number

#     # Route based on template code
#     if frame['name'] == "Module_Title-1440":
#         frame_data['Content'] = extract_module_title_1440(frame, module_name, column_number)
#     elif frame['name'] == "Module_Overview-1080":
#         frame_data['Content'] = extract_module_overview_1080(frame)
#     else:
#         # Default processing for other frames, if any
#         sorted_nodes = sorted(frame['children'], key=lambda n: (n.get('absoluteBoundingBox', {}).get('y', 0), n.get('absoluteBoundingBox', {}).get('x', 0)))

#         for node in sorted_nodes:
#             if node['type'] == 'TEXT':
#                 font_size = node.get('style', {}).get('fontSize', 14)
#                 header_tag = determine_header_tag(font_size)
#                 text = node['characters']
#                 frame_data['Content'] += f'<{header_tag}>{text}</{header_tag}>'
            
#             if node['type'] in ['IMAGE', 'RECTANGLE']:
#                 image_url = get_image_url(node['name'], module_name)
#                 if image_url:
#                     image_urls.append(image_url)

#     return frame_data


In [28]:
# Function to extract frames from lessons
def extract_frames(nodes, module_name, chapter_name, lesson_number):
    """Process every FRAME node of one lesson (or intro) row.

    Each frame is handed to ``process_frame`` with a column number that
    starts at 1 and grows by one for every frame that was kept.

    Args:
        nodes: Node dicts of one row; non-FRAME nodes are ignored.
        module_name: Passed through to ``process_frame``.
        chapter_name: Passed through to ``process_frame``.
        lesson_number: Passed through to ``process_frame``.

    Returns:
        List of the records returned by ``process_frame`` (skipped frames
        are left out).
    """
    collected = []

    for node in nodes:
        if node['type'] != 'FRAME':
            continue

        # The next column is always one past the number of frames kept so far.
        record = process_frame(node, module_name, chapter_name, lesson_number, len(collected) + 1)
        if record:
            collected.append(record)

    return collected



In [29]:
# # Function to extract frames from the Module Intro module
# def extract_module_intro(document_children):
#     module_intro_frames = []
#     column_number = 1  # Initialize column number for sequence

#     for canvas_node in document_children:
#         if canvas_node['type'] == 'CANVAS':
#             for module_node in canvas_node['children']:
#                 if 'Module Intro' in module_node['name']:
#                     module_name = module_node['name']
                    
#                     # Extract all frames
#                     frames = [frame for frame in module_node['children'] if frame['type'] == 'FRAME']
                    
#                     # Sort the frames based on their y (and x for frames with the same y) coordinates
#                     frames.sort(key=lambda f: (f['absoluteBoundingBox']['y'], f['absoluteBoundingBox']['x']))
                    
#                     # Process each frame inside Module Intro
#                     for frame in frames:
#                         processed_frame = process_frame(frame, module_name, "", 0, column_number, is_module_intro=True)
#                         if processed_frame:
#                             module_intro_frames.append(processed_frame)
#                             column_number += 1  # Increment the column number based on sequence
    
#     return module_intro_frames


In [30]:
# Function to dynamically assign lessons based on similar y-values (rows)
def assign_lessons_by_row(frames, y_threshold=500):
    """Number frames by row: a large vertical jump starts the next lesson.

    The list is sorted in place by (y, x) of each frame's
    'absoluteBoundingBox'. Walking the sorted frames, the lesson counter
    (starting at 1) is increased whenever the y-distance to the previous
    frame exceeds ``y_threshold``; the counter is written to each frame
    under the key 'Lesson'.

    Args:
        frames: List of frame dicts; modified in place.
        y_threshold: Vertical gap that separates two rows. Adjust it to the
            spacing between lesson rows in the design file.

    Returns:
        The same list object that was passed in.
    """
    if not frames:
        return frames

    def position(frame):
        box = frame['absoluteBoundingBox']
        return (box['y'], box['x'])

    frames.sort(key=position)

    y_values = [frame['absoluteBoundingBox']['y'] for frame in frames]
    # Pair every frame with the y of the frame before it; the first frame
    # is compared with itself.
    previous_y_values = [y_values[0]] + y_values[:-1]

    lesson = 1
    for frame, y, previous_y in zip(frames, y_values, previous_y_values):
        if abs(y - previous_y) > y_threshold:
            lesson += 1
        frame['Lesson'] = lesson

    return frames

In [31]:
# Function to extract frames from the Module Intro module
def extract_module_intro(document_children):
    """Collect the processed frames of every 'Module Intro' node.

    Looks inside each CANVAS node for children whose name contains
    'Module Intro', sorts that node's FRAME children by (y, x) and processes
    them with ``process_frame`` (``is_module_intro=True``). Column numbers
    start at 1 and keep counting across all matching nodes.

    Args:
        document_children: Top-level children of the Figma document.

    Returns:
        List of records returned by ``process_frame``.
    """
    collected = []

    for canvas in document_children:
        if canvas['type'] != 'CANVAS':
            continue

        for candidate in canvas['children']:
            module_name = candidate['name']
            if 'Module Intro' not in module_name:
                continue

            # FRAME children in reading order: top-to-bottom, then left-to-right.
            ordered_frames = sorted(
                (child for child in candidate['children'] if child['type'] == 'FRAME'),
                key=lambda f: (f['absoluteBoundingBox']['y'], f['absoluteBoundingBox']['x']),
            )

            for frame in ordered_frames:
                # The column number is one past the count of frames kept so far.
                record = process_frame(frame, module_name, "", 0, len(collected) + 1, is_module_intro=True)
                if record:
                    collected.append(record)

    return collected


In [32]:
# Function to extract only chapters and lessons dynamically based on vertical spacing
def extract_chapters_and_lessons(document_children, y_threshold=500):
    """Group the frames of every chapter node into lesson rows.

    Inside each CANVAS node, every child whose name contains 'Chapter' is
    treated as a chapter. Its FRAME children (except those whose name starts
    with 'Vector') are numbered by ``assign_lessons_by_row`` and then
    grouped into one list per lesson number.

    Args:
        document_children: Top-level children of the Figma document.
        y_threshold: Vertical gap that separates two lesson rows; forwarded
            to ``assign_lessons_by_row``.

    Returns:
        List of dicts with the keys 'chapter_name' and 'lessons', where
        'lessons' is a list of frame lists (one per row). A chapter without
        any usable frame gets an empty 'lessons' list.
    """
    chapters = []

    for canvas_node in document_children:
        if canvas_node['type'] != 'CANVAS':
            continue

        # Only nodes with 'Chapter' in their name are chapters; this skips
        # templates or images placed on the same canvas.
        for chapter_node in canvas_node.get('children', []):
            if 'Chapter' not in chapter_node['name']:
                continue

            # .get: a matching node without children (e.g. a text node named
            # "Chapter ...") simply contributes no frames.
            all_frames = [
                frame for frame in chapter_node.get('children', [])
                if frame['type'] == 'FRAME' and not frame['name'].startswith('Vector')
            ]

            # Assign lessons based on vertical spacing
            frames_with_lessons = assign_lessons_by_row(all_frames, y_threshold=y_threshold)

            # Group consecutive frames that share a lesson number. Starting
            # from an empty list keeps a frame-less chapter from failing.
            lessons = []
            for frame in frames_with_lessons:
                if not lessons or frame['Lesson'] != lessons[-1][-1]['Lesson']:
                    lessons.append([])
                lessons[-1].append(frame)

            chapters.append({
                'chapter_name': chapter_node['name'],  # e.g., "Chapter 1"
                'lessons': lessons
            })

    return chapters


In [33]:
# Function to save the frames data to CSV files
def save_to_csv(frames, module_name, chapter_name, filename, is_module_intro=False, is_intro=False):
    """Write frame records to a CSV file under ../01_Results.

    Records that have a 'y' value are sorted by it and placed first; records
    without one follow in their given order. 'Column' is renumbered from 1
    in that order, and 'x'/'y' are left out of the file. Module intros also
    omit 'Lesson' and 'Chapter'; chapter intros omit 'Lesson'.

    The records passed in are not modified: the function works on shallow
    copies, so the same list can afterwards be handed to another writer
    with its 'x'/'y' values intact.

    Args:
        frames: List of record dicts as produced by ``process_frame``.
        module_name: Name of the module folder.
        chapter_name: Name of the chapter folder (ignored for module intros).
        filename: Name of the CSV file to create.
        is_module_intro: Write to '<module>/Module_Intro' without the
            'Lesson' and 'Chapter' columns.
        is_intro: Write a chapter intro (no 'Lesson' column).
    """
    # Shallow copies keep the caller's dicts untouched.
    rows = [dict(frame) for frame in frames]
    rows_with_y = sorted((row for row in rows if 'y' in row), key=lambda row: row['y'])
    rows_without_y = [row for row in rows if 'y' not in row]
    sorted_rows = rows_with_y + rows_without_y

    for position, row in enumerate(sorted_rows, start=1):
        row['Column'] = position  # Column number follows the final order
        row.pop('x', None)
        row.pop('y', None)

        if is_module_intro:
            # Module intros have neither lessons nor chapters.
            row.pop('Lesson', None)
            row.pop('Chapter', None)
        elif is_intro:
            # Chapter intros have no lesson.
            row.pop('Lesson', None)

    if is_module_intro:
        folder = os.path.join('..', '01_Results', module_name, 'Module_Intro')
    else:
        folder = os.path.join('..', '01_Results', module_name, chapter_name)

    os.makedirs(folder, exist_ok=True)

    csv_path = os.path.join(folder, filename)

    fieldnames = ['Module', 'Template Code', 'Template Type', 'Column', 'Content', 'image_url']
    if not is_module_intro:
        if not is_intro:
            fieldnames.insert(1, 'Lesson')  # lessons only
        # Lands right after 'Lesson' for lessons; for chapter intros it ends
        # up after 'Template Code' (existing column order kept as-is).
        fieldnames.insert(2, 'Chapter')

    with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(sorted_rows)

In [34]:
# Function to save the frames data to JSON files with specific fields removed for intros
def save_to_json(frames, module_name, chapter_name, filename, is_module_intro=False, is_intro=False):
    """Write frame records to a JSON file under ../00_API.

    Records that have a 'y' value are sorted by it and placed first; records
    without one follow in their given order. 'Column' is renumbered from 1
    in that order, and 'x'/'y' are left out of the file. Module intros also
    omit 'Lesson' and 'Chapter'; chapter intros omit 'Lesson'.

    The records passed in are not modified: the function works on shallow
    copies of them.

    Args:
        frames: List of record dicts as produced by ``process_frame``.
        module_name: Name of the module folder.
        chapter_name: Name of the chapter folder (ignored for module intros).
        filename: Name of the JSON file to create.
        is_module_intro: Write to '<module>/Module_Intro' without the
            'Lesson' and 'Chapter' fields.
        is_intro: Write a chapter intro (no 'Lesson' field).
    """
    # Shallow copies keep the caller's dicts untouched.
    records = [dict(frame) for frame in frames]
    records_with_y = sorted((rec for rec in records if 'y' in rec), key=lambda rec: rec['y'])
    records_without_y = [rec for rec in records if 'y' not in rec]
    sorted_records = records_with_y + records_without_y

    for position, record in enumerate(sorted_records, start=1):
        record['Column'] = position
        record.pop('x', None)
        record.pop('y', None)

        if is_module_intro:
            record.pop('Lesson', None)   # no lessons in a module intro
            record.pop('Chapter', None)  # no chapters in a module intro
        elif is_intro:
            record.pop('Lesson', None)   # no lesson in a chapter intro

    # Module intros get their own folder; everything else goes per chapter.
    if is_module_intro:
        folder = os.path.join('..', '00_API', module_name, 'Module_Intro')
    else:
        folder = os.path.join('..', '00_API', module_name, chapter_name)

    os.makedirs(folder, exist_ok=True)

    json_path = os.path.join(folder, filename)

    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(sorted_records, json_file, indent=4, ensure_ascii=False)


In [35]:
# Function to fetch data, adding delay and retry logic to handle rate limits and avoid timeouts
def fetch_file_data(file_key, max_retries=5, delay_between_retries=10, timeout=30):
    """Download a Figma file as JSON, retrying on failures and rate limits.

    Args:
        file_key: Key of the Figma file; inserted into FIGMA_API_URL_TEMPLATE.
        max_retries: Maximum number of attempts.
        delay_between_retries: Seconds to wait between attempts; also the
            fallback when a 429 response has no usable 'Retry-After' header.
        timeout: Seconds to wait for the server before an attempt is
            abandoned, so a stalled connection cannot block the run forever.

    Returns:
        The decoded JSON response, or None when every attempt failed.
    """
    url = FIGMA_API_URL_TEMPLATE.format(file_key)

    for attempt in range(1, max_retries + 1):
        wait_seconds = delay_between_retries

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Connection errors and timeouts count as a failed attempt
            # instead of aborting the whole run.
            print(f"Error: Request failed for file key {file_key}: {exc}")
        else:
            if response.status_code == 200:
                # If the request is successful, return the data
                return response.json()

            if response.status_code == 429:  # Rate limit hit
                print("Rate limit hit. Waiting for rate limit reset...")
                try:
                    wait_seconds = max(0, int(response.headers.get('Retry-After', delay_between_retries)))
                except (TypeError, ValueError):
                    # 'Retry-After' may also be an HTTP date; fall back to
                    # the fixed delay rather than failing on int().
                    wait_seconds = delay_between_retries
            else:
                print(f"Error: Failed to fetch data for file key {file_key}. Status code: {response.status_code}")

        # No point in waiting once the last attempt has been used.
        if attempt < max_retries:
            time.sleep(wait_seconds)

    print(f"Max retries reached for file {file_key}. Skipping...")
    return None


<h2> Main Processing Function </h2>

In [36]:
# Keys of the Figma files to process; process_figma_files handles them one after another.
file_keys = ['XN57QSSgnI4exbB2OTK1QM']   # you can add multiple files

In [37]:
# Main processing function with correct flags for intros
def process_figma_files(file_keys):
    """Fetch each Figma file and write its module intro, chapter intros and lessons.

    For every file key the file is downloaded with ``fetch_file_data``. The
    module intro is saved as Module_Intro.csv/.json. For each chapter, the
    first row of frames is saved as intro.csv/.json and every following row
    as lesson<N>.csv/.json, with N counting from 1. Files that cannot be
    fetched, or that lack a 'name' or 'document' entry, are reported and
    skipped.

    Args:
        file_keys: Iterable of Figma file keys.
    """
    for file_key in file_keys:
        file_data = fetch_file_data(file_key)

        if not (file_data and 'name' in file_data):
            print(f"Error: Could not retrieve file name for file key: {file_key}")
            continue

        file_name = file_data['name']  # e.g., "SEA Test Module 1"
        print(f"Processing Module {file_name}")

        if 'document' not in file_data:
            print(f"Error: 'document' key not found in file {file_key}.")
            continue

        document_children = file_data['document']['children']

        # The module intro is handled separately from the chapters.
        module_intro_frames = extract_module_intro(document_children)
        if module_intro_frames:
            csv_filename = 'Module_Intro.csv'
            json_filename = 'Module_Intro.json'
            save_to_csv(module_intro_frames, file_name, "", csv_filename, is_module_intro=True)
            save_to_json(module_intro_frames, file_name, "", json_filename, is_module_intro=True)
            print(f"Module Intro saved as {csv_filename} and {json_filename}")

        # Chapters come back with their frames already grouped into rows.
        for chapter in extract_chapters_and_lessons(document_children):
            chapter_name = chapter['chapter_name']

            # Row 0 is the chapter intro; rows 1..n are lessons 1..n.
            for row_index, row_frames in enumerate(chapter['lessons']):
                is_intro = row_index == 0

                frames = extract_frames(
                    row_frames,
                    file_name,
                    chapter_name,
                    lesson_number=None if is_intro else row_index
                )

                stem = 'intro' if is_intro else f'lesson{row_index}'
                label = 'Chapter Intro' if is_intro else f'Lesson {row_index}'
                csv_filename = f'{stem}.csv'
                json_filename = f'{stem}.json'

                print(f"Saving {label} as {csv_filename} and {json_filename}")
                save_to_csv(frames, file_name, chapter_name, csv_filename, is_intro=is_intro)
                save_to_json(frames, file_name, chapter_name, json_filename, is_intro=is_intro)


In [38]:
# Run the pipeline for every key in file_keys: each file is fetched from the
# Figma API and its frames are written out as CSV and JSON files.
process_figma_files(file_keys)

Processing Module SEA Test Module 1
Module Intro saved as Module_Intro.csv and Module_Intro.json
Processing node: TEXT, Text: 5, Font Size: 84.0, Position Y: 9138.5
Saving Chapter Intro as intro.csv and intro.json
Saving Lesson 1 as lesson1.csv and lesson1.json
Saving Lesson 2 as lesson2.csv and lesson2.json
Processing node: TEXT, Text: 5, Font Size: 84.0, Position Y: 194.5
Saving Chapter Intro as intro.csv and intro.json
Saving Lesson 1 as lesson1.csv and lesson1.json
Saving Lesson 2 as lesson2.csv and lesson2.json
Processing node: TEXT, Text: 5, Font Size: 84.0, Position Y: -8406.5
Saving Chapter Intro as intro.csv and intro.json
Saving Lesson 1 as lesson1.csv and lesson1.json
Saving Lesson 2 as lesson2.csv and lesson2.json
