## Data export
docker-compose up

# Local stack for the data export: a PostgreSQL 15 database plus pgAdmin,
# both attached to the same user-defined bridge network.
# NOTE(review): the credentials below are hard-coded in plain text — confirm
# this file is only ever used for a throwaway local setup.
version: '3'
services:
  db:
    image: postgres:15
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
      POSTGRES_DB: cur_learning_module
    volumes:
      # Mount the whole sql_files directory so that every .sql file is loaded
      - ./sql_files:/docker-entrypoint-initdb.d
    networks:
      - mynetwork
    ports:
      # Host port 5432 -> container port 5432
      - "5432:5432"

  pgadmin:
    image: dpage/pgadmin4
    environment:
      PGADMIN_DEFAULT_EMAIL: sas@code.hu
      PGADMIN_DEFAULT_PASSWORD: adminpassword
    volumes:
      # Mount ./export_files at /export inside the pgAdmin container
      # (presumably the place where exported query results are saved — confirm)
      - ./export_files:/export
    ports:
      # Host port 8080 -> container port 80
      - "8080:80"
    depends_on:
      - db
    networks:
      - mynetwork

networks:
  mynetwork:
    driver: bridge


### DB Export scripts
#### Learning Modules
select id, title, learning_units from public.cur_learning_module ;
#### Projects
select id, title, story,  learn,  tasks, hints, background from public.cur_base_project ;
#### Tutorial
select id, title, description, question_id from public.cur_background_material ;

## Processing dumps

In [1]:
import os
import json
import pandas as pd

In [2]:
# Load the three CSV dumps into DataFrames.
# NOTE(review): presumably these files are the saved results of the three
# export queries (modules, projects, tutorials) — confirm against the export step.
learning_moduls = pd.read_csv("../csv/modules.csv")
projects = pd.read_csv("../csv/projects.csv")
tutorials = pd.read_csv("../csv/tutorials.csv")

In [13]:
learning_moduls.learning_units[0]

'[{"week": null, "title": "10th week", "topics": null, "description": null, "learningUnitAssignments": [{"id": 5694, "type": "TUTORIAL", "optional": false}, {"id": 15628, "type": "PROJECT", "optional": false, "assignmentType": "SOLO"}, {"id": 15306, "type": "PROJECT", "optional": false, "assignmentType": "SOLO"}]}, {"week": null, "title": "11th week", "topics": null, "description": null, "learningUnitAssignments": [{"id": 5026, "type": "TUTORIAL", "optional": false}, {"id": 5721, "type": "TUTORIAL", "optional": false}, {"id": 5012, "type": "TUTORIAL", "optional": false}, {"id": 15294, "type": "TUTORIAL", "optional": false}, {"id": 15460, "type": "PROJECT", "optional": false, "assignmentType": "SOLO"}, {"id": 15282, "type": "PROJECT", "optional": false, "assignmentType": "SOLO"}]}, {"week": null, "title": "12th week", "topics": null, "description": null, "learningUnitAssignments": [{"id": 15254, "type": "PROJECT", "optional": false, "assignmentType": "SOLO"}]}]'

In [14]:
projects.columns

Index(['id', 'title', 'story', 'learn', 'tasks', 'hints', 'background'], dtype='object')

In [3]:
# Helper for updating learningUnitAssignments entries
def update_assignment(assignment, projects_df, tutorials_df):
    """
    Replace an assignment reference with the content of the row it points to.

    Parameters:
        assignment (dict): Entry with at least a 'type' key; 'id' is read when
            the type is 'TUTORIAL' or 'PROJECT'.
        projects_df (DataFrame): Looked up by 'id' for 'PROJECT' assignments.
        tutorials_df (DataFrame): Looked up by 'id' for 'TUTORIAL' assignments.

    Returns:
        dict: The type plus the copied columns of the first matching row, or an
        empty dict when the type is neither of the two or no row matches.
    """
    kind = assignment['type']

    # Pick the lookup table, the literal type tag and the columns to copy.
    if kind == 'TUTORIAL':
        source_df = tutorials_df
        type_tag = "TUTORIAL"
        copied_columns = ('title', 'description', 'question_id')
    elif kind == 'PROJECT':
        source_df = projects_df
        type_tag = "PROJECT"
        copied_columns = ('title', 'story', 'learn', 'tasks', 'hints', 'background')
    else:
        return {}

    matches = source_df[source_df['id'] == assignment['id']]
    if matches.empty:
        return {}

    # Only the first matching row is used.
    first_match = matches.iloc[0]
    enriched = {"type": type_tag}
    for column in copied_columns:
        enriched[column] = first_match[column]
    return enriched


In [4]:
# Process the learning units of a module
def update_learning_units(row, max_assignments=5):
    """
    Parse a module's learning_units JSON and enrich the assignments of each unit.

    Every entry of a unit's "learningUnitAssignments" list is passed through
    update_assignment together with the module-level `projects` and `tutorials`
    DataFrames, and the list is replaced by the results.

    Parameters:
        row (str): JSON text holding a list of learning units.
        max_assignments (int or None): Upper bound on the number of assignments
            kept per learning unit; any further assignments are dropped. The
            default of 5 is the cap that was previously hard-coded. Pass None
            to keep all of them.

    Returns:
        list: The parsed learning units with their assignments replaced.
    """
    units = json.loads(row)
    for unit in units:
        # A missing or null assignment list is treated as empty instead of
        # raising KeyError / TypeError.
        assignments = unit.get("learningUnitAssignments") or []
        unit["learningUnitAssignments"] = [
            update_assignment(entry, projects, tutorials)
            for entry in assignments[:max_assignments]
        ]
    return units



In [6]:
# Collect one [module id, title, enriched learning units] row per selected module.
# NOTE(review): the name `all` shadows Python's built-in all(); it is kept
# because later cells refer to this table by that name.
all= []
# IDs of the learning modules to process.
id_list = [40652, 39052, 35202, 35852, 31652, 31653, 23452, 39802, 29404, 38552, 31102, 32858]
#selected = learning_moduls[:2]
selected = learning_moduls[learning_moduls['id'].isin(id_list)]
for i, t, k in zip (selected["id"], selected["title"],selected["learning_units"]):
    # k is the raw learning_units value of the module; update_learning_units parses it as JSON.
    lus = update_learning_units(k)
    all.append([i,t,lus ])
# No column names are passed, so the resulting columns are labelled 0, 1 and 2.
all = pd.DataFrame(all)

In [None]:
all

### Learning outcome generation part

In [8]:
from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
import openai
from openai import OpenAIError
import tiktoken

load_dotenv()
MAX_TOKENS = 100000
encoding = tiktoken.get_encoding("cl100k_base")

In [9]:
def count_tokens(lu, encoding):
    """
    Add up the token counts of the JSON serialisation of every item in `lu`.

    Parameters:
        lu (iterable): Items to measure; each one is serialised with json.dumps.
        encoding: Object whose encode(text) result has a length
            (a tiktoken encoding in this notebook).

    Returns:
        int: The summed token count, or 0 if anything fails while counting
        (the error is printed, not raised).
    """
    total = 0
    try:
        for item in lu:
            serialized = json.dumps(item)
            total += len(encoding.encode(serialized))
    except Exception as e:
        print(f"Error in counting tokens: {e}")
        # A partial sum is discarded on failure.
        total = 0
    return total


In [10]:
def extract_json_string(text: str) -> "dict | list":
    """
    Extracts and parses the first JSON object or array embedded in a block of text.

    The search starts at the first '{' or '[' in the text, whichever comes
    first. The value is decoded with the JSON decoder itself rather than by
    counting brackets, so brackets and braces inside string literals
    (e.g. {"a": "}"}) do not cut the value short.

    Parameters:
        text (str): The text containing the JSON value.

    Returns:
        dict or list: The parsed JSON value. On failure an empty list is
        returned if the text's first opening character was '[', otherwise an
        empty dict.

    Logs:
        Prints a message if there is an error during JSON extraction.
    """
    # Value returned on failure; switched to [] once an opening '[' is found.
    fallback = {}
    try:
        candidates = [pos for pos in (text.find('{'), text.find('[')) if pos != -1]
        if not candidates:
            raise ValueError("No JSON object or array found in the text.")

        start = min(candidates)
        if text[start] == '[':
            fallback = []

        # raw_decode parses exactly one JSON value beginning at `start` and
        # ignores whatever text follows it.
        parsed, _end = json.JSONDecoder().raw_decode(text, start)
        return parsed

    except Exception as e:
        print(f"JSON extraction error: {e}; text: {text}")
        return fallback



In [11]:
def summarize_data_with_llm(self, learning_units=None):
    """
    Summarizes a set of learning units using the LLM.

    This is a module-level function, yet its first parameter is named `self`
    and the original body called `self.query_openai`. The only call in this
    file passes a single argument, which used to fail with a TypeError. Both
    call shapes are now accepted:

      * summarize_data_with_llm(learning_units) — the module-level
        query_openai function sends the request;
      * summarize_data_with_llm(obj, learning_units) — obj.query_openai is used
        when obj provides it, as before.

    Parameters:
        self: Either the items to summarize (single-argument call) or an object
            providing a query_openai method (two-argument call).
        learning_units (list): The items to summarize; each is serialised with
            json.dumps and the results are joined with spaces.

    Returns:
        dict: A message {'role': 'system', 'content': <summary>}.

    NOTE(review): query_openai itself summarizes input above the token limit,
    so a summary request that is still above the limit would re-enter this
    function — confirm whether the input needs chunking.
    """
    if learning_units is None:
        # Single-argument call: the sole positional argument is the data.
        learning_units, self = self, None

    print("Summarise since it is too long to process!")
    # Join all learning unit contents together
    text_data = " ".join(json.dumps(unit) for unit in learning_units)

    # System message to guide the summarization
    sys_msg = """
    You are a tool for summarizing and abstracting text. 
    Your task is to reduce the provided data to less than 10000 words using markdown format. 
    The generated summary should maintain key information and follow the original language of the text.
    """

    # Prefer the method of the supplied object, otherwise the module-level function.
    ask_llm = getattr(self, "query_openai", None) or query_openai
    summary = ask_llm([{'role': 'system', 'content': sys_msg}, {'role': 'user', 'content': text_data}])

    return {'role': 'system', 'content': summary}


In [12]:
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10), retry=retry_if_exception_type(OpenAIError))
def query_openai(messages, max_response_tokens=32000):
    """
    Queries the OpenAI API with the given messages, using retries for reliability.

    If the token count of the messages exceeds MAX_TOKENS, the messages are
    first condensed with summarize_data_with_llm and only the resulting
    summary message is sent.

    The model name and temperature are read from the MODEL and TEMPERATURE
    environment variables.

    Parameters:
        messages (list): A list of messages to send to the OpenAI API.
        max_response_tokens (int): The maximum number of tokens for the response.
            Currently unused: the corresponding API argument is commented out.

    Returns:
        str: The content of the response message.

    Raises:
        OpenAIError: If an error occurs during the API call. The retry
            decorator retries only on this exception type, up to 3 attempts.
    """
    # Calculate total tokens in the input messages
    total_tokens = count_tokens(messages, encoding)

    if total_tokens > MAX_TOKENS:
        print(f"Token limit exceeded ({total_tokens} tokens). Summarizing Learning unit.")

        # Summarize the data if token count exceeds the limit
        summarized_message = summarize_data_with_llm(messages)

        # Recalculate the tokens after summarization
        total_tokens = count_tokens([summarized_message], encoding)
        print(f"Token count after summarization: {total_tokens}")

        # Replace the original messages with summarized content
        messages = [summarized_message]

    try:
        # Query the OpenAI API with the final set of messages
        completion = openai.chat.completions.create(
            model=os.getenv("MODEL"),
            messages=messages,
            #max_tokens=max_response_tokens,
            temperature=float(os.getenv("TEMPERATURE"))
        )
        return completion.choices[0].message.content
    except OpenAIError as e:
        # print() accepts no exc_info keyword (that belongs to logging calls).
        # Passing it raised a TypeError here, which replaced the OpenAIError
        # and therefore also bypassed the retry condition above.
        print(f"OpenAIError: {e}")
        raise


In [13]:
def generate_learning_outcomes(module_title, learning_units):
    """
    Ask the LLM for measurable learning outcomes for the given title.

    Parameters:
        module_title (str): Title inserted into the prompt.
        learning_units (list or str): Either a list of dict-like units, of which
            only the 'title' is listed in the prompt ('N/A' when absent), or a
            ready-made string that is appended to the prompt unchanged.

    Returns:
        str: Whatever query_openai returns for the assembled prompt.
    """

    # Prompt template; {module_title} is substituted further down.
    prompt_template = """
    Develop 4-6 measurable learning outcomes for a computer course module on {module_title}, utilizing 
    - Bloom's Taxonomy 
    - reasoning 
    - independent thinking.
    
    Each outcome should:
    
    Use one measurable action verb from Bloom's Taxonomy (e.g., remember, understand, apply, analyze, evaluate, create).
    Be clear, concise, and student-centered.
    Align with the appropriate Bloom's level for the course (e.g., introductory, intermediate, advanced).
    Avoid using vague terms like 'understand' or 'learn.'
    Collectively, the learning outcomes should cover a range of Bloom's levels suitable for the students' level.
    Incorporate Reasoning and Independent Thinking by:
    
    **Understanding the Topic**
    
    Break down the topic into core components, identifying key areas for student exploration.
    Place concepts within broader academic, social, or historical contexts to deepen understanding.
    
    **Critical Analysis**
    
    Encourage evaluation of the reliability, bias, and relevance of information sources.
    Promote assessment of logical consistency in arguments or problem-solving approaches.
    
    **Independent Reasoning**
    
    Prompt generation of original hypotheses or solutions beyond standard answers.
    Facilitate connections between course concepts and broader contexts, suggesting implications or applications.
    Challenge students to address contradictions or gaps in knowledge to foster deeper analysis.
    
    **Synthesizing and Concluding**
    
    Require synthesis of information into coherent arguments or theories.
    Encourage reflection on how learned concepts support or challenge existing knowledge.
    Suggest areas for further study based on identified gaps or limitations.

    Important: You should never add additional text to your response, just only the measurable learning outcomes!
    """

    if isinstance(learning_units, str):
        # A pre-combined string is used as it is.
        units_section = learning_units
    else:
        # For a list, emit one "- <title>" bullet per unit.
        bullet_lines = []
        for unit in learning_units:
            bullet_lines.append(f"- {unit.get('title', 'N/A')}")
        units_section = "\n".join(bullet_lines)

    # Assemble the final prompt and send it as a single user message.
    filled_prompt = prompt_template.format(module_title=module_title)
    user_message = {
        'role': 'user',
        'content': filled_prompt + "\nLearning units included:\n" + units_section,
    }
    return query_openai([user_message])


In [None]:
all

In [15]:
def process_modules_for_outcomes(selected_modules):
    """
    Generate learning outcomes for every learning unit of every module, plus one
    module-level summary per module.

    Parameters:
        selected_modules (DataFrame): Table whose columns 0, 1 and 2 hold the
            module id, the module title and the learning units (a list, or a
            JSON string that decodes to one).

    Returns:
        DataFrame: Columns "Module ID", "Module Title", "Learning_Unit_ID" and
        "Learning Outcomes". "Learning_Unit_ID" is the position of the unit
        within its module, or "Module Summary" for the module-level row.

    Modules whose learning units cannot be decoded, or do not form a list, are
    reported with a printed message and skipped. Units without a 'title' key get
    the title "N/A" written into them.
    """
    rows = []

    module_columns = zip(selected_modules[0], selected_modules[1], selected_modules[2])
    for module_id, module_title, raw_units in module_columns:
        print(f"Processing module: {module_id} - {module_title}")

        # Decode the learning units when they arrive as a JSON string.
        units = raw_units
        if isinstance(raw_units, str):
            try:
                units = json.loads(raw_units)
            except json.JSONDecodeError:
                print(f"Hiba történt a JSON feldolgozása során: {raw_units}")
                continue

        if not isinstance(units, list):
            print(f"Nem megfelelő a learningUnitAssignments formátuma: {units}")
            continue

        # Step 1: one outcome per learning unit.
        per_unit_outcomes = []
        for position, unit in enumerate(units):
            if 'title' not in unit:
                # Supply a default so the title lookup below always succeeds.
                unit['title'] = "N/A"

            outcome = generate_learning_outcomes(unit['title'], [unit])
            print(f"Learning unit outcome: {outcome}")

            rows.append([module_id, module_title, position, outcome])
            per_unit_outcomes.append(outcome)

        # Step 2: module-level summary built from the per-unit outcomes.
        module_outcome = generate_learning_outcomes(module_title, " ".join(per_unit_outcomes))
        print(f"Modul összegző outcome: {module_outcome}")
        rows.append([module_id, module_title, "Module Summary", module_outcome])

    return pd.DataFrame(
        rows,
        columns=["Module ID", "Module Title", "Learning_Unit_ID", "Learning Outcomes"],
    )


In [None]:
# Generate the learning outcomes for the modules collected in `all`.
outcome_df = process_modules_for_outcomes(all)

In [17]:
# Write the outcomes to outcomes.csv in the working directory, without the index column.
outcome_df.to_csv("outcomes.csv", index=False)