# Environment

In [1]:
from openai import OpenAI
client = OpenAI()

# Lectures Folder
# NOTE: the value already ends with '/', and the transcript paths built from it
# add another '/' right after it.
data_folder = "data/"
# Model to use for the text-only chat completion requests
model = "gpt-4-1106-preview"

# Retriever vs Context Window LLM

We compare two methodologies for question-answering using an LLM:

1. **Whole Lecture Method**: Feeding the entire lecture text directly to the LLM.
2. **Retrievers Method**: Utilizing a combination of three distinct retrievers.

The three retrievers in the Retrievers Method are as follows:

- **Semantic Section Retriever**: Splits the lecture into meaningful sections using an LLM.
- **Sub-topic Retriever**: Further divides each section into sub-topics.
- **Timestamp SQL Retriever**: Leverages a timestamp-based SQL database of the lecture.

Later, an LLM decides, based on the question, which retrievers to use and with what parameters.

## Data loading & Cleaning

In [2]:
# Load lecture transcript
lecture = f"{data_folder}/lecture3/Deep Learning State of the Art (2020).txt"
with open(lecture, 'r') as f:
    lecture_lines = f.readlines()

# Remove Timestamps
# The transcript is read in groups of three lines and only the second line of
# each group is kept. Slicing (instead of indexing `i + 1`) selects the same
# lines but does not raise IndexError when the file ends with an incomplete
# group.
cleaned_lines = [line.strip() for line in lecture_lines[1::3]]

# Join everything into one string and replace every double quote with a
# single quote
lecture_str = ' '.join(cleaned_lines)
lecture_str = lecture_str.replace("\"", '\'')
lecture_str

"- Welcome to 2020 and welcome to the Deep Learning lecture series. Let's start it off today to take a quick whirlwind tour of all the exciting things that happened in 17, 18 and 19 especially, and the amazing things we're going to see in this year in 2020. Also as part of the series is gonna be a few talks from some of the top people in learning and artificial intelligence. After today, of course, start at the broad, the celebrations from the touring award to the limitations and the debates and the exciting growth first. And first of course, a step back to the quote I've used before, I love it, I'll keep reusing it. AI began not with Alan Turing or McCarthy, but would the ancient wish to forge the gods, of course from Pamela McCorduck Machines Who Think, that visualization there is just 3% of the neurons in our brain of the thalamocortical system, that magical thing between our ears that allows us all to see and hear and think and reason and hope and dream and fear, our eventual morta

## Context Text Splitting

In the retriever module, we use semantic splitting instead of fixed-size chunks. This approach allows the LLM to first divide the lecture text into coherent sections based on content, and then further split each section into subtopics. This method enhances the relevance and context of retrieved information.

### Sections

In [3]:
# System and User message for the LLM

# System prompt for the section splitter. It asks for a JSON answer shaped like
# the example embedded below. The braces in that example are literal, so this
# string is meant to be sent as-is (str.format() would choke on them).
sections_system_message = """You will receive a lecture transcript from the user. 
Your task is to analyze this transcript segment, considering the overall lecture topic and the previous section. 
Divide the segment into meaningful sections, providing a brief description for each in one sentence and including the exact last sentence of the section as it appears in the text. 
Also the last sentence of the last section must finish with the end of file, do not deprecate anything.
The output should be in pure JSON format without additional comments or markdown text like this:
{
  "answer": {
    "sections_number": [number of sections],
    "sections": [
      {
        "section_number": [section number],
        "section_description": "[description of the section]",
        "last_sentence": "[the last sentence of the section as provided in the text, without changing anything]"
      },
      ... // Additional sections if any
    ]
  }
}
"""

# User prompt template; {last_section} and {segment} are str.format() placeholders.
sections_user_message = """
1. Lecture Topic: Deep Learning State of the Art (2020)
2. Previous Section: {last_section}
3. Current Transcript Segment:
{segment}
"""

In [4]:
import json

"""
Function that divides lecture transcript text in sections. The input tokens
argument defines roughly the size of the input segment in each pass.
"""
def extract_sections(lecture_str: str, input_tokens: int, system_message: str, user_message: str):
    """
    Divide a lecture transcript into sections with the help of the LLM.

    The transcript is processed segment by segment. For every segment the LLM
    returns section descriptions together with the last sentence of each
    section; those sentences are located in the segment to obtain character
    offsets. On every segment except the final one, the last reported section
    is assumed to be cut off by the segment boundary, so it is not stored and
    its text is sent again at the start of the next (larger) segment.

    Args:
        lecture_str: Full transcript text.
        input_tokens: Rough number of tokens the segment end advances per pass
            (converted to characters with a factor of 4).
        system_message: System prompt, sent unchanged with every request.
        user_message: Template with ``{segment}`` and ``{last_section}``
            placeholders.

    Returns:
        A list of ``(start, end, description)`` tuples; ``start`` and ``end``
        are character offsets into ``lecture_str``.
    """
    print(f"Lecture Length: {len(lecture_str)}\n")

    # Calculate roughly input Characters based on the input tokens
    input_chars = 4 * input_tokens

    # Define segment start and end
    segment_start = 0
    segment_end = segment_start + input_chars
    lecture_sections = []
    while True:
        # Get segment
        segment = lecture_str[segment_start:segment_end]
        is_last_segment = segment_end >= len(lecture_str)
        print(f"\n---Segment: {segment_start} - {segment_end}---")

        # Description of the most recent stored section, passed to the LLM as context
        last_section = lecture_sections[-1][2] if lecture_sections else "None"

        # Create User message
        user_segm_message = user_message.format(segment=segment, last_section=last_section)
        print(user_segm_message)

        # Make request and get response
        response = client.chat.completions.create(
            model=model,
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_segm_message},
            ],
            temperature=0
        )
        print("\nGot response\n")

        # Json Output
        output = json.loads(response.choices[0].message.content)['answer']
        print('--Output--\n', output)
        # Rely on the actual list rather than the model-reported
        # 'sections_number', which may disagree with it.
        output_sections = output['sections']

        # Only the final segment may keep its last section; elsewhere the last
        # section is carried over. (Previously a final segment that came back
        # as a single section was dropped entirely.)
        if is_last_segment:
            complete_sections = output_sections
        else:
            complete_sections = output_sections[:-1]

        # Offset, relative to the segment, where the previous stored section ended
        prev_section_end = 0
        if not complete_sections:
            # Nothing can be stored yet: retry with a larger segment
            print("Found a whole Section")
        else:
            for i, s in enumerate(complete_sections):
                last_sentence = s['last_sentence']
                # Search after the previous section only, so a sentence that
                # also occurs earlier in the segment is not matched by mistake.
                position = segment.find(last_sentence, prev_section_end)
                if position != -1:
                    new_end = position + len(last_sentence)
                elif is_last_segment and i == len(complete_sections) - 1:
                    # The very last section cannot be merged into a later one;
                    # let it run to the end of the transcript.
                    new_end = len(segment)
                else:
                    # The model did not quote the sentence verbatim. Skipping
                    # the boundary merges this text into the next section
                    # instead of storing a bogus offset derived from -1.
                    print(f"Warning: last sentence of section {i} not found, merging into next section")
                    continue

                # Section start is the segment start + the previous section end
                section_start = segment_start + prev_section_end
                prev_section_end = new_end
                section_end = segment_start + prev_section_end

                print(f'-Section {i}-')
                print('Start:', section_start)
                print('End:', section_end)

                # Add it to the list
                lecture_sections.append((section_start, section_end, s['section_description']))

            # Characters of this segment that were not assigned to a stored section
            remaining = len(segment) - prev_section_end
            print("\nRemaining:", remaining)

        # Commence new segment run
        if is_last_segment:
            break

        # Continue right after the last stored section; the segment end always
        # advances, so the loop terminates.
        segment_start += prev_section_end
        segment_end += input_chars

    return lecture_sections

In [None]:
# Create and store the sections
# You can use the already created ones from the 'dl_sections.json' at the next cell
# Rough number of tokens per pass handed to extract_sections
token_input = 4000
result = extract_sections(lecture_str, token_input, sections_system_message, sections_user_message)
# Persist the returned list so the LLM calls do not have to be repeated
with open('dl_sections.json', 'w') as file:
    json.dump(result, file)

In [14]:
# Load the already create sections
import json 

# Read back the stored section list
with open('dl_sections.json', 'r') as file:
    sections_result = json.load(file)

# Index the sections by their description; each value holds the start and end
# offsets of that section.
sections = {}
for start, end, desc in sections_result:
    sections.update({desc: dict(start=start, end=end)})

### Topics

Now each created section is split into sub-topics by the LLM.

In [8]:
# System prompt for the sub-topic splitter. It asks for a JSON answer shaped
# like the example embedded below. The braces in that example are literal, so
# this string is meant to be sent as-is (str.format() would choke on them).
topic_system_message = """You will receive a section of a lecture transcript from the user. 
Your task is to analyze this section, considering the section description, the lecture title and divide the section into meaningful sub-topics.
Provide a brief description for each sub-topic in one sentence and include the exact last sentence of the sub-topic as it appears in the text. 
You have to provide at least two sub-topics.
The sub-topics should be clear and not overlap.
The last sub-topic should have as last sentence the last sentence of the section, do not deprecate anything.
The output should be in pure JSON format without additional comments or markdown text like this:
{
  "answer": {
    "topics_number": [number of sub-topics],
    "topics": [
      {
        "topic_number": [sub-topic number],
        "topic_description": "[description of the sub-topic]",
        "last_sentence": "[the last sentence of the sub-topic as provided in the text, without changing anything]"
      },
      ... // Additional sub-topics
    ]
  }
}
"""

# User prompt template; {section_description} and {segment} are str.format() placeholders.
topic_user_message = """
1. Lecture Title: Deep Learning State of the Art (2020)
2. Section Description: {section_description}
3. Current Transcript Section Segment:
{segment}
"""

In [9]:
from typing import List

"""
Function that divides lecture transcript text sections into sub-topics.
"""
def extract_topics(lecture_str: str, section_list: List[tuple], system_message: str, user_message: str):
    """
    Divide every lecture section into sub-topics with the help of the LLM.

    Args:
        lecture_str: Full transcript text.
        section_list: ``(start, end, description)`` entries; ``start`` and
            ``end`` are character offsets into ``lecture_str``.
        system_message: System prompt, sent unchanged with every request.
        user_message: Template with ``{segment}`` and ``{section_description}``
            placeholders.

    Returns:
        A dict mapping each section description to a list of
        ``(start, end, topic_description)`` tuples with offsets into
        ``lecture_str``.
    """
    print(f"Lecture Length: {len(lecture_str)}\n")

    section_topics = {}
    for start, end, desc in section_list:
        section_topics[desc] = []

        # Section segment
        segment = lecture_str[start:end]
        print(f"\n--Section--\nStart: {start}\nEnd: {end}\n")

        # Create User message
        segm_user_message = user_message.format(segment=segment, section_description=desc)
        print(segm_user_message)

        # Make request and get Response
        response = client.chat.completions.create(
            model=model,
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": segm_user_message},
            ],
            temperature=0
        )
        print("\nGot response\n")

        # Json Output
        output = json.loads(response.choices[0].message.content)['answer']
        print('--Output--\n', output)
        output_topics = output['topics']

        # For each topic calculate start and end
        segment_topic_end = 0
        for index, t in enumerate(output_topics):
            last_sentence = t['last_sentence']
            # Search after the previous topic only, so a sentence that also
            # occurs earlier in the section is not matched by mistake.
            position = segment.find(last_sentence, segment_topic_end)
            if position != -1:
                new_end = position + len(last_sentence)
            elif index == len(output_topics) - 1:
                # The last topic cannot be merged into a later one; let it run
                # to the end of the section.
                new_end = len(segment)
            else:
                # The model did not quote the sentence verbatim. Skipping the
                # boundary merges this text into the next topic instead of
                # storing a bogus offset derived from -1.
                print(f'Warning: last sentence of topic {t["topic_description"]} not found, merging into next topic')
                continue

            topic_start = start + segment_topic_end
            segment_topic_end = new_end
            topic_end = start + segment_topic_end
            print(f'-Topic {t["topic_description"]}-')
            print('Start:', topic_start)
            print('End:', topic_end)
            section_topics[desc].append((topic_start, topic_end, t["topic_description"]))

    return section_topics

In [None]:
# You can also use the already created topics from the next cell
# `sections_result` is the list read from 'dl_sections.json' in an earlier cell.
topics = extract_topics(lecture_str, sections_result, topic_system_message, topic_user_message)
# Persist the result so the LLM calls do not have to be repeated
with open('dl_topics.json', 'w') as file:
    json.dump(topics, file)

In [11]:
# Load the stored sub-topics from 'dl_topics.json'
with open('dl_topics.json', 'r') as file:
    topics = json.load(file)

## Vectorstore

In [15]:
from langchain.schema.document import Document

# Convert Sections and Topics to langchain Documents
# to store them later in vectorstores

# Section Documents
# NOTE(review): `metadata` is not used anywhere in this cell.
metadata = {}
section_docs = []
topic_docs = []
# `s` is the section description; despite its name, `time_range` is used below
# as a pair of character offsets into lecture_str.
for s, time_range in sections.items():
    # Get segment
    section_segment = lecture_str[time_range["start"]:time_range["end"]]
    # Create document
    document = Document(page_content=section_segment, metadata={"description": s})
    # Add to list
    section_docs.append(document)

    # Topic documents
    t_list = topics[s]
    # Same procedure for the sub-topics
    for start, end, desc in t_list:
        topic_segment = lecture_str[start:end]
        document = Document(page_content=topic_segment, metadata={"description": desc})
        topic_docs.append(document)

In [16]:
# Create VectorStores

from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

# One FAISS store for the section documents and one for the sub-topic
# documents, both built with the same embeddings object.
embeddings = OpenAIEmbeddings()
section_db = FAISS.from_documents(section_docs, embeddings)
topics_db = FAISS.from_documents(topic_docs, embeddings)

In [17]:
related_documents = section_db.similarity_search("hello", k=5)
related_documents

[Document(page_content="- Welcome to 2020 and welcome to the Deep Learning lecture series. Let's start it off today to take a quick whirlwind tour of all the exciting things that happened in 17, 18 and 19 especially, and the amazing things we're going to see in this year in 2020. Also as part of the series is gonna be a few talks from some of the top people in learning and artificial intelligence. After today, of course, start at the broad, the celebrations from the touring award to the limitations and the debates and the exciting growth first. And first of course, a step back to the quote I've used before, I love it, I'll keep reusing it. AI began not with Alan Turing or McCarthy, but would the ancient wish to forge the gods, of course from Pamela McCorduck Machines Who Think, that visualization there is just 3% of the neurons in our brain of the thalamocortical system, that magical thing between our ears that allows us all to see and hear and think and reason and hope and dream and f

## SQL Database

In [18]:
# Create an SQL database for queries related to Timestamps or specific words for the LLM to use
import sqlite3

In [54]:
# Create Database
# Connects to the SQLite file 'dl_lecture.db'
connection = sqlite3.connect('dl_lecture.db')
cursor = connection.cursor()

# Create table
# IF NOT EXISTS keeps an existing table (and its rows) when this cell is re-run.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS transcript (
        start_time INTEGER,
        end_time INTEGER,
        text TEXT
    )
''')

<sqlite3.Cursor at 0x1bdca038bc0>

In [52]:
# Load lecture transcript
# Read the raw transcript lines again, timestamps included, for the database
lecture = f"{data_folder}/lecture3/Deep Learning State of the Art (2020).txt"
with open(lecture, 'r') as f:
    lecture_lines = f.readlines()

In [55]:
def time_to_seconds(time_str):
    """Convert an 'hours:minutes:seconds' string into a total number of seconds."""
    h, m, s = (int(part) for part in time_str.split(':'))
    return (h * 60 + m) * 60 + s

# Create Database entries
db_entries = []
# The transcript is read in groups of three lines: the first holds the
# timestamp and the second the text. Pairing the two slices with zip() skips
# an incomplete trailing group instead of raising IndexError.
for timestamp, text in zip(lecture_lines[0::3], lecture_lines[1::3]):
    # Extract timestamp and text
    timestamp = timestamp.strip()
    text = text.strip()

    # Parse the start and end time from the timestamp
    start_time, end_time = timestamp.split(' to ')
    # Remove 'From ' prefix from start time
    start_time = start_time.replace('From ', '')

    # Convert to seconds
    start_time = time_to_seconds(start_time)
    end_time = time_to_seconds(end_time)

    db_entries.append((start_time, end_time, text))

# Insert entries
# NOTE: re-running this cell appends the same rows again, because the table is
# created with IF NOT EXISTS and is never cleared.
try:
    cursor.executemany(
        "INSERT INTO transcript (start_time, end_time, text) VALUES (?, ?, ?)",
        db_entries,
    )
    connection.commit()
finally:
    # Close the connection even when the insert fails
    connection.close()

In [83]:
# Test the database

connection = sqlite3.connect('dl_lecture.db')
cursor = connection.cursor()

def query_transcript(cursor, start_query, end_query):
    """
    Return the transcript rows whose time span overlaps the queried window.

    Both bounds are 'H:M:S' strings. A row matches when it starts no later
    than the window end and ends no earlier than the window start.
    """
    window_start = time_to_seconds(start_query)
    window_end = time_to_seconds(end_query)

    overlap_sql = "SELECT text FROM transcript WHERE start_time <= ? AND end_time >= ?"
    return cursor.execute(overlap_sql, (window_end, window_start)).fetchall()

# Example usage
start_query = "0:0:10"
end_query = "0:01:00"
results = query_transcript(cursor, start_query, end_query)
connection.close()
results

[("Let's start it off today to take a quick whirlwind tour of all the exciting things",),
 ("that happened in 17, 18 and 19 especially, and the amazing things we're going to see",),
 ('in this year in 2020. Also as part of the series is gonna be a few talks from some of the top',),
 ('people in learning and artificial intelligence. After today, of course, start at the broad,',),
 ('the celebrations from the touring award to the limitations and the debates and the exciting growth first.',),
 ("And first of course, a step back to the quote I've used before, I love it, I'll keep reusing it.",),
 ('AI began not with Alan Turing or McCarthy, but would the ancient wish to forge the gods,',),
 ('of course from Pamela McCorduck Machines Who Think, that visualization there is just 3% of the neurons',)]

## Retriever decider LLM

Our system employs an LLM to select and parameterize the most suitable retrievers for a given question. This decision-making process is informed by a detailed analysis of the lecture content, including metadata like length and structure.

The LLM chooses from the following retrievers, each with specific parameters:

1. **Semantic Section Retriever**
2. **Sub-topic Retriever**
3. **Timestamp SQL Retriever**

The LLM is provided with:
- Detailed descriptions of how each retriever functions.
- Lecture information.

Depending on the chosen retriever(s), the LLM outputs:

- **For Sections and Topics Retrievers**: The 'K' parameter indicating the number of relevant documents to retrieve.
- **For the SQL Retriever**: An SQL query tailored to execute in the lecture's database.

In [19]:
# System and User message for the LLM

# System prompt template for the retriever decider. {topic}, {length}, {size},
# {s_num} and {t_num} are str.format() placeholders, so the text must not
# contain any other braces.
# Fixed in the prompt text: the unbalanced parenthesis in the "Lecture Size"
# line, the garbled "dictionairy jsut" instruction and the missing "into".
retriever_system_message = """You are tasked with selecting the most appropriate method(s) for retrieving information from a lecture transcript in response to a student's question. 
The lecture transcript can be accessed through three different retrievers: sections_retriever, topics_retriever, and SQL_retriever.

sections_retriever: The lecture transcript has been divided into contextually related sections. You can specify the number (k) of the most relevant sections to retrieve with that retriever.
topics_retriever: Each section has also been divided in topics. You can specify the number (k) of the most relevant sub-topics to retrieve with this retriever.
SQL_retriever: This retriever accesses the lecture transcript data stored in a database table called 'transcript' with three columns: start_time (INTEGER in seconds), end_time (INTEGER in seconds), and text (TEXT, which is the lecture content between start and end times). 
You should formulate an SQL query to retrieve specific parts of the transcript.

It is encouraged to use multiple retrievers to ensure the appropriate information are retrieved. 
Consider the strengths and specific capabilities of each retriever and how they might complement each other in providing a comprehensive response.

To assist you in making an informed decision, here is some complementary information about the lecture:
Lecture Topic (The general subject or theme of the lecture): {topic}
Lecture Length (Total duration of the lecture in minutes): {length}
Lecture Size (Total number of characters in the lecture transcript): {size}
Number of Sections (Total number of context-related sections in the lecture): {s_num}
Number of Topics (Total number of sub-topics within each section): {t_num}

Your response should be formatted in JSON, containing the following fields:
"answer": a JSON object including:
  "retrievers": an array listing the chosen retriever(s) from the options: SQL_retriever, sections_retriever, topics_retriever.
  "SQL_retriever": (if chosen) the specific SQL query to be executed.
  "sections_retriever": (if chosen) the value of k representing the number of sections to retrieve. Not a dictionary, just an integer.
  "topics_retriever": (if chosen) the value of k representing the number of sub-topics to retrieve. Not a dictionary, just an integer.
"""

# User prompt template; {st_question} is a str.format() placeholder.
retriever_user_message = """
Student's Question: {st_question}
"""

In [23]:
# Function to call the LLM based on a question
def retr_choices(inp_question: str):
    """
    Ask the decider LLM which retrievers to use for a student's question.

    Args:
        inp_question: The student's question.

    Returns:
        The raw chat-completion response. The request asks for a JSON object,
        so callers parse ``response.choices[0].message.content`` themselves.
    """
    # System Message
    lecture_topic = "Deep Learning State of the Art (2020)"
    lecture_length = "1 hour, 27 minutes, 17 seconds"
    r_system_message = retriever_system_message.format(
        topic=lecture_topic,
        length=lecture_length,
        size=str(len(lecture_str)),
        s_num=str(len(section_docs)),
        # Fixed: the topic count was previously taken from section_docs, so
        # the prompt reported the number of sections twice.
        t_num=str(len(topic_docs)),
    )

    r_user_message = retriever_user_message.format(st_question=inp_question)

    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": r_system_message},
            {"role": "user", "content": r_user_message},
        ],
        temperature=0
    )

    return response

# Json Output - Example usage
output = retr_choices("What is the lottery ticket hypothesis and when was discussed?")
output = json.loads(output.choices[0].message.content)['answer']
output

{'retrievers': ['sections_retriever', 'topics_retriever', 'SQL_retriever'],
 'SQL_retriever': "SELECT text FROM transcript WHERE text LIKE '%lottery ticket hypothesis%'",
 'sections_retriever': 2,
 'topics_retriever': 3}

In [24]:
from typing import List

"""
Function to remove duplicate content
"""
def remove_dupl(content_list: List[str], current_string: str):
    """
    Return the items of ``content_list`` that do not occur as a substring of
    ``current_string``, in their original order.
    """
    unseen = []
    for candidate in content_list:
        if candidate in current_string:
            continue
        unseen.append(candidate)
    return unseen

"""
Function to retrieve the appropraite text segments from the retrievers.
"""
def retrieve(response: dict, question: str):
    """
    Collect transcript text from the retrievers chosen by the decider LLM.

    Args:
        response: The decider's parsed answer: a ``"retrievers"`` list plus
            one entry per chosen retriever (an SQL query string for
            ``SQL_retriever``, the value ``k`` for the two vector-store
            retrievers).
        question: The student's question, used as the similarity-search query.

    Returns:
        A single string. For every listed retriever a newline is appended,
        followed by its space-joined results; results that already occur in
        the text gathered so far are skipped. Unknown retriever names only
        contribute the newline.
    """
    # Local import so this function does not depend on another cell for it
    from contextlib import closing

    # Store the retrieved strings here
    retrieved_str = ""
    chosen_retrievers = response["retrievers"]
    print("Retrievers:", chosen_retrievers)
    for r in chosen_retrievers:
        retrieved_str = retrieved_str + "\n"
        if r == "SQL_retriever":
            print("---SQL_retriever")
            sql_query = response[r]
            print("Query:", sql_query)
            # The query text is written by the LLM, so open the database
            # read-only; closing() releases the connection even when the
            # query fails (it was previously never closed).
            with closing(sqlite3.connect('file:dl_lecture.db?mode=ro', uri=True)) as connection:
                rows = connection.execute(sql_query).fetchall()
            # Keep every selected column as text. Taking only row[0] crashed
            # the duplicate filter and the join whenever the first column was
            # a number such as start_time.
            new_parts = [
                ' '.join(str(col) for col in row if col is not None)
                for row in rows
            ]
        elif r in ("sections_retriever", "topics_retriever"):
            print(f"---{r}")
            k = response[r]
            print("K:", k)
            if isinstance(k, dict):  # LLM mistake
                k = k['k']
            vector_db = section_db if r == "sections_retriever" else topics_db
            docs = vector_db.similarity_search(question, k=k)
            new_parts = [d.page_content for d in docs]
        else:
            continue

        # Skip pieces that an earlier retriever already contributed
        retrieved_str = retrieved_str + ' '.join(remove_dupl(new_parts, retrieved_str))

    return retrieved_str

## Comparing with Context Window

In [25]:
# Retriever based model
# System prompt template; {topic} and {segment} are str.format() placeholders.
rbased_system_message = """
As a student assistant familiar with a lecture on {topic}, 
your role is to help clarify and explain the material covered in the lecture to fellow students. 
When responding, please use only the information provided in the relevant part of the lecture transcript attached. 
Think of yourself as a peer who is helping other students understand the lecture better. 
Do not include external knowledge or general information not found in the lecture. 
If a question cannot be answered with the lecture content, kindly indicate that the information is not covered in the lecture material.

Relevant part of Lecture transcript: \n{segment}
"""

# Context based model
# System prompt template; {topic} and {transcript} are str.format() placeholders.
cbased_system_message = """
As a student assistant familiar with a lecture on {topic}, 
your role is to help clarify and explain the material covered in the lecture to fellow students. 
When responding, please use only the information provided in the the lecture transcript attached. 
Think of yourself as a peer who is helping other students understand the lecture better. 
Do not include external knowledge or general information not found in the lecture. 
If a question cannot be answered with the lecture content, kindly indicate that the information is not covered in the lecture material.
The lecture transcript is divided in timestamps with this format 'From HH:MM:SS to HH:MM:SS'
The timestamps are only useful to answer time related questions.

Lecture Transcript: \n{transcript}
"""

# User prompt template shared by both models; {question} is a str.format() placeholder.
student_user_message = """
Student's Question: {question}
"""

In [26]:
# Evaluation questions, grouped by category name (two questions per category)
question_categories = {"Simple": ["What is the Lottery Ticket Hypothesis?", 
                                  "How did AI begun?"],
                       "Semantic": ["How did the lecture compare the challenge of AI research to a significant historical accomplishment illustrating its difficulty?",
                                    "With what was AI agents resembled in terms of how society is going to treat it?"],
                        "Synthesizing":["For which subjects were there mentioned hopes for the future?",
                                        "Which games are mentioned that AI models have learnt to play"],
                        "Holistic": ["What questions have been asked from the students?", 
                                     "Which AI techniques have been thoroughly discussed in the lecture?"],
                        "Time": ["What was discussed between the 35th minute and the 37th minute of the lecture", 
                                 "When did the Q&A start?"]
}

In [None]:
# Read lecture transcript for the context based model
lecture = f"{data_folder}/lecture3/Deep Learning State of the Art (2020).txt"
with open(lecture, 'r') as f:
    lecture_transcript = f.read()
lecture_topic = "Deep Learning State of the Art (2020)"


def _ask_model(system_content, student_question):
    """Send one system prompt plus the student's question to the chat model and return the answer text."""
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": student_user_message.format(question=student_question)},
        ],
        temperature=0,
    )
    return completion.choices[0].message.content


# Answers of both approaches, keyed by question category
answers_retriever = {}
answers_context = {}
for q_cat, q_list in question_categories.items():
    print(f"-{q_cat}")
    answers_retriever[q_cat] = []
    answers_context[q_cat] = []

    for question in q_list:
        print("-Question:\n", question)

        # Retriever based model: let the decider LLM pick the retrievers,
        # gather their text and answer from that text only.
        print("--Retriever Model")
        output = json.loads(retr_choices(question).choices[0].message.content)['answer']
        retriever_input_str = retrieve(output, question)
        response = _ask_model(
            rbased_system_message.format(topic=lecture_topic, segment=retriever_input_str),
            question,
        )
        print("--Retriever model answer:\n", response)
        answers_retriever[q_cat].append(response)

        # Context based model: answer from the complete transcript
        print("--Cotnext Model")
        response = _ask_model(
            cbased_system_message.format(topic=lecture_topic, transcript=lecture_transcript),
            question,
        )
        print("Context model answer:\n", response)
        answers_context[q_cat].append(response)
        

In [24]:
# Store the answers
import pickle

# Pickle the answers of the retriever based model
with open('retr_asnwers.pkl', 'wb') as file:
    pickle.dump(answers_retriever, file)

# Pickle the answers of the context based model
with open('cont_asnwers.pkl', 'wb') as file:
    pickle.dump(answers_context, file)

In [25]:
# Load the answers
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickle files that come from a trusted source.
with open('retr_asnwers.pkl', 'rb') as file:
    answers_retriever = pickle.load(file)

with open('cont_asnwers.pkl', 'rb') as file:
    answers_context = pickle.load(file)

In [28]:
# Store as json
# Same two answer dicts as in the pickle files, written as JSON
with open('retr_asnwers.json', 'w') as file:
    json.dump(answers_retriever, file)

with open('cont_asnwers.json', 'w') as file:
    json.dump(answers_context, file)

# Notes

This feature creates summaries in LaTeX and Markdown for each section of the lecture. We manually link each section to a set of corresponding lecture note slides, which have been converted to images. The LLM then processes them and creates a summary in Markdown and LaTeX.

### Process

1. **Image Conversion**: Each lecture note slide in PDF is converted into an image format.
2. **GPT-4 Vision Preview Model**: This model, capable of interpreting images, is employed to understand the content in these converted slides.
3. **LLM Association**: For each lecture section, the LLM accesses the current topic and its associated image set.
4. **Content Generation**: The LLM is instructed to produce a summary in Markdown and LaTeX formats, based on the lecture topic and images.
5. **Parsing and Splitting**: The output is parsed to separate the LaTeX content from Markdown.
6. **Content Utilization**: The resulting Markdown and LaTeX content is prepared for integration into the app.

In [29]:
# Maps each lecture section title to the slide images sent to the vision model
# for that section.
# NOTE(review): the last entry lists the same two images as
# "Laws of Matrix Operations", only in reverse order - confirm this is intended.
math_sections = {"Matrix Multiplication":[
    "data/lecture4/matrix_operations-1.png",
    "data/lecture4/matrix_operations-2.png",
    "data/lecture4/matrix_operations-3.png"
    ],
    "Laws of Matrix Operations":[
        "data/lecture4/matrix_operations-4.png",
        "data/lecture4/matrix_operations-5.png"
    ],
    "Block Matricies and Block Multiplication":[
        "data/lecture4/matrix_operations-5.png",
        "data/lecture4/matrix_operations-4.png"
    ]
}

In [166]:
# Instruction template for the slide summaries; {topic} and {prev_topic} are
# str.format() placeholders.
l4_user_message = """
Please analyze the attached lecture note images that focus on the topic of {topic}. 
Your task is to generate a brief and concise educational summary, highlighting the key concepts, principles, and examples relevant to {topic}. 
Organize the material into sections if it aids in clarity and comprehension. 
The summary should be formatted in Markdown, embedding necessary mathematical equations and notations in LaTeX, like matrix operations etc.
You need to specify when latex code is embedded in markdown with '```latex'.
Use single dollar signs '$' for markdown. For larger math equations not supported in markdown embed LaTeX code.
This summary is aimed at providing a quick and effective understanding of the essentials of {topic} based on the lecture note images.
Have in mind in your summary to not include information about the following topic {prev_topic}.

Your output should contain only the Markdown and Latex code and nothing else.
"""

In [None]:
import base64
# Encode Image to base64
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

import mimetypes


def _split_fenced_cells(response_str):
    """
    Split an LLM answer into alternating ('markdown' | 'latex', text) cells.

    Every '```' fence is located; the text between two consecutive fences
    becomes one cell. The first cell is labelled 'markdown' and the labels
    alternate from there. Text before the first fence and after the last one
    is not included.
    """
    fence = '```'

    # Positions of all fences
    places = []
    start = 0
    while True:
        position = response_str.find(fence, start)
        if position == -1:
            break
        places.append(position)
        print(f"Found at position: {position}")
        start = position + len(fence)

    cells = []
    content = "markdown"
    for begin, end in zip(places, places[1:]):
        text = response_str[begin + len(fence):end]
        # Drop only the language label that directly follows the fence. The
        # earlier blanket replace() also removed the words 'markdown' and
        # 'latex' wherever they appeared inside the summary itself.
        for label in ("markdown", "latex"):
            if text.startswith(label):
                text = text[len(label):]
                break
        cells.append((content, text))
        content = "latex" if content == "markdown" else "markdown"
    return cells


prev_topic = "None"
contents_asnwer = {}
# For each section
for ms in math_sections.keys():
    print("Section:", ms)
    # The request content starts with the instructions message ...
    content_list = [{"type": "text", "text": l4_user_message.format(topic=ms, prev_topic=prev_topic)}]
    # ... followed by one entry per image
    for img in math_sections[ms]:
        print("Image:", img)
        base64_image = encode_image(img)
        # Declare the media type that matches the file extension (the slides
        # are .png files, which were previously sent as image/jpeg); keep the
        # old value as fallback when the type cannot be guessed.
        mime_type = mimetypes.guess_type(img)[0] or "image/jpeg"
        content_list.append({
            "type": "image_url",
            "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
        })

    # Model call
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        temperature=0,
        messages=[
            {
                "role": "user",
                "content": content_list,
            }
        ],
        max_tokens=4000,
    )
    # Get response
    response_str = response.choices[0].message.content
    # Load it in a file just for debugging; an explicit encoding avoids
    # platform-dependent failures on characters outside the default code page.
    with open(f"test_{ms}.md", 'w', encoding='utf-8') as f:
        f.write(response_str)

    # Extract the Md and Latex content
    print("Locating cells...")
    content_list = _split_fenced_cells(response_str)

    contents_asnwer[ms] = content_list
    # The next request is told not to repeat this topic
    prev_topic = ms

In [165]:
# Output
contents_asnwer

{'Matrix Multiplication': [('markdown',
   '\n# Matrix Multiplication Summary\n\n## Basic Rules for Matrix Multiplication\n\n- Matrices $A$ with $n$ columns can multiply matrices $B$ with $n$ rows: $A_{m \\times n} B_{n \\times p} = C_{m \\times p}$.\n- The entry in $AB = C$ is a dot product: $C_{ij} = (row\\ i\\ of\\ A) \\cdot (column\\ j\\ of\\ B)$.\n- Matrix multiplication is associative: $(AB)C = A(BC)$, but not commutative: $AB \\neq BA$ in most cases.\n- Block multiplication is possible: $A = [A_1, A_2]$ and $B = \\begin{bmatrix} B_1 \\\\ B_2 \\end{bmatrix}$, then $AB = A_1B_1 + A_2B_2$.\n\n## Conditions for Multiplication\n\n- To multiply $AB$, if $A$ has $n$ columns, $B$ must have $n$ rows.\n- The resulting matrix $AB$ has as many rows as $A$ and as many columns as $B$.\n\n## Examples\n\n- **Example 1**: Square matrices multiplication.\n  $$\n  \\begin{bmatrix}\n  1 & 1 \\\\\n  2 & -1 \\\\\n  \\end{bmatrix}\n  \\begin{bmatrix}\n  2 & 2 \\\\\n  3 & 4 \\\\\n  \\end{bmatrix}\n  =\

In [None]:
# Store
# Pickle the parsed notes dictionary to disk so it can be reloaded later.
import pickle

with open('notes_data2.pkl', 'wb') as file:
    pickle.dump(contents_asnwer, file)


In [None]:
import pickle

# Load the notes that were pickled earlier
with open('notes_data2.pkl', 'rb') as file:
    data_loaded = pickle.load(file)
data_loaded

# See results: walk the sections and print every stored piece,
# preceded by a tag telling whether it is markdown or latex.
for ms in math_sections.keys():
    print(ms)
    for content, text in data_loaded[ms]:
        tag = "-markdown" if content == "markdown" else "-latex"
        print(tag)
        print(text)

# Student Adaptation

In this project we wanted to experiment with ways in which an LLM could adapt the learning experience to specific student needs.

One way to do this is to detect the student's current emotional state and adapt the form of the LLM's answers to it.
For example, if by the wording of a student's question we realize that they are starting to feel frustrated, we can adapt our answers so that they will positively reinforce the student.

In [8]:
class Student:
    """
    Container for what we know about the student.

    Keeping this in a small object is meant to be more efficient than keeping
    just a list of conversation pieces (Langchain Memory): at each prompt we
    can "remind" the model of the student's level, interests etc.
    """

    def __init__(self) -> None:
        # Current emotional state of the student; starts out neutral.
        self.state: str = "Neutral"

from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.llms import HuggingFaceHub
from langchain.chat_models import ChatOllama
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Prompt that asks the LLM to name the student's sentiment as a single word.
# `{question}` is the only placeholder in the template.
# NOTE(review): `output_variable` is not a PromptTemplate field I can confirm
# from this file -- verify that it has any effect.
sentiment_analysis_prompt = PromptTemplate(
        input_variables=["question"],
        output_variable=["sentiment"],
        template="""
You are a tutor. You will be given a student's question about a subject.
Based on the question, identify the student's sentiment. 
Respond with their sentiment as a single word (nothing else) like: Neutral, Frustrated, Happy, Sad, Dissapointed, Interested

Question: {question}
""")

# Prompt that asks the LLM to rephrase an existing answer for the student's
# emotional state. Placeholders: `{sentiment}` and `{answer}`.
# NOTE(review): `output_variable` is not a PromptTemplate field I can confirm
# from this file -- verify that it has any effect.
modified_answer_template = PromptTemplate(
        input_variables=["sentiment", "answer"],
        output_variable=["answer"],
        template="""
This is an answer to a student's question.
Please rephrase it based on their current emotional state in order to help the student better cope with the learning process. 

The student's state: {sentiment}

The answer: {answer}
        """
)

# Model used by every chain below: a local Llama 2 served through Ollama.
#
# SECURITY: never paste an API key into the notebook, not even in a comment --
# anything committed here is leaked (a key that was previously written on this
# line must be revoked). To switch to OpenAI, export OPENAI_API_KEY in the
# environment and use:
# llm = ChatOpenAI(model="gpt-3.5-turbo-1106")
llm = ChatOllama(model="llama2:13b")

# This is the chain responsible for giving answers to the student's questions.
# It can be replaced by an agent running the ReAct framework or by a zero-shot LLM pass.
# For this example, we simply send the student's question to an LLM.
# The last step wraps the LLM output in a dict under the "answer" key.
answer_chain = (
    ChatPromptTemplate.from_template("{question}")
    | llm
    | {"answer": RunnablePassthrough()}
)

student = Student()

def found_sentiment(sentiment):
    """Record the detected sentiment on the module-level ``student``.

    Stores ``sentiment.content`` in ``student.state`` and returns ``sentiment``
    unchanged, so the function can sit inside a chain without altering the
    value that flows through it.
    """
    student.state = sentiment.content
    return sentiment

# Classifies the sentiment of the question; `found_sentiment` also stores the
# result on `student` as a side effect and passes the LLM message through.
sentiment_analysis_chain = (
    sentiment_analysis_prompt
    | llm
    | RunnableLambda(found_sentiment)
)

# Rephrases an answer for a given sentiment (expects "sentiment" and "answer").
modified_answer_chain = (
    modified_answer_template
    | llm
)

# Full pipeline: question -> (sentiment, raw answer) -> adapted answer.
#
# `sentiment_analysis_chain` yields a chat message and `answer_chain` yields
# {"answer": <chat message>}. Feeding those straight into the prompt template
# would interpolate their reprs (e.g. "content='Confused' ...") rather than the
# text, so the plain `.content` strings are extracted first.
student_adaptation_chain = (
    {
        "sentiment": sentiment_analysis_chain
        | RunnableLambda(lambda message: message.content),
        "answer": answer_chain
        | RunnableLambda(lambda result: result["answer"].content),
    }
    | modified_answer_chain
)

In [14]:
# Run the adaptation chain on two sample questions and show, for each one,
# the question, the sentiment recorded on `student`, and the adapted answer.
for prompt in (
    "What was Plato about? I don't get it",
    "How do I add two matrices, I need to know!",
):
    response = student_adaptation_chain.invoke({"question": prompt})
    print("Question: ", prompt)
    print("Student's sentiment: ", student.state)
    print("Response: ", response.content)

Question:  What was Plato about? I don't get it
Student's sentiment:  Confused
Response:  Plato was a philosopher in ancient Greece who founded the Academy in Athens, the first institution of higher learning in the Western world. He is widely considered one of the most important figures in the development of philosophy, and his writings cover a wide range of topics including ethics, politics, metaphysics, and epistemology. Plato's most famous work is "The Republic," in which he discusses his ideas about justice, the ideal society, and the nature of the human soul. He also wrote extensively about the nature of reality and the importance of reason and logic in understanding the world. Overall, Plato's philosophy is characterized by his belief in the existence of an eternal and unchanging realm of ideas, which he believed could be accessed through the use of reason and critical thinking. I hope this helps clarify things for you!
Question:  How do I add two matrices, I need to know!
Studen

# Test Creation 

For tests, our approach is fairly straightforward. The teacher
inputs a set of learning objectives for the course.
Then, our retriever returns a set of relevant documents based
on the learning objectives. Finally, these documents are given
to the LLM with a prompt instructing it to create
multiple-choice questions.

In [5]:
import glob
import os
from pathlib import Path
from typing import Any, List

from langchain.docstore.document import Document
from langchain.document_loaders import SRTLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders.base import BaseLoader


class LectureLoader(BaseLoader):
    """Load every ``.srt`` and ``.txt`` file found under a path as Documents.

    Args:
        lecture_file: Path that is searched recursively for transcript files.
        add_lecture_info: When true, each loaded document's metadata gets
            ``lecture_name`` (name of the file's parent directory), ``source``
            (file name without extension) and ``type`` (``"transcript"``).
    """

    def __init__(self,
                 lecture_file: str,
                 add_lecture_info: bool = False
                ):
        self.add_lecture_info = add_lecture_info
        self.lecture_file = lecture_file

    @classmethod
    def from_folder(cls, folder_name: str, **kwargs: Any) -> 'LectureLoader':
        """Alternate constructor; keyword arguments are forwarded to ``__init__``."""
        # Unpack the kwargs: passing the dict positionally would bind the dict
        # itself to `add_lecture_info`, making `add_lecture_info=False` truthy.
        return cls(folder_name, **kwargs)

    def load(self) -> List[Document]:
        """Return the documents of all transcript files under ``lecture_file``.

        Files with a suffix other than ``.srt`` or ``.txt`` are ignored.
        """
        documents = []

        for file_path in Path(self.lecture_file).rglob('*'):
            if not file_path.is_file():
                continue

            # Pick the loader from the extension; the loaders read the file
            # themselves, so there is no need to open it here.
            if file_path.suffix == ".srt":
                loader = SRTLoader(file_path)
            elif file_path.suffix == ".txt":
                loader = TextLoader(file_path)
            else:
                continue

            metadata = {}
            if self.add_lecture_info:
                metadata["lecture_name"] = file_path.parent.name
                metadata["source"] = file_path.stem
                metadata["type"] = "transcript"

            for doc in loader.load():
                doc.metadata.update(metadata)
                documents.append(doc)

        return documents

from langchain.vectorstores import FAISS
from langchain.vectorstores.base import VectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.embeddings import Embeddings
from langchain.docstore.document import Document

from typing import Type, Iterable, Optional, List

class LectureIndex(FAISS):
    """FAISS vector store that chunks documents before indexing them."""

    @classmethod
    def from_documents(cls, documents: List[Document], embedding: Embeddings):
        """Split ``documents`` into overlapping chunks and index the chunks.

        Chunks are 500 characters long with a 100-character overlap.
        """
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        chunks = splitter.split_documents(documents)

        # Defer to the inherited constructor, still building an instance of `cls`.
        return super().from_documents(chunks, embedding)

    def similarity_search_with_score_threshold(self, query: str, threshold: float):
        """Return the documents matching ``query`` whose score is below ``threshold``.

        NOTE(review): this presumes a lower score means a closer match
        (distance-like score) -- confirm against the FAISS store configuration.
        """
        scored = self.similarity_search_with_score(query)
        return [doc for doc, score in scored if score < threshold]

In [9]:
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.chat_models import ChatOpenAI


def get_lecture_index(lecture_folder="../data/lecture1",
                      embedding_model='sentence-transformers/all-MiniLM-L6-v2'):
    """Build a ``LectureIndex`` over the transcripts found in a lecture folder.

    Args:
        lecture_folder: Folder searched for transcripts. Defaults to the
            lecture that was previously hard-coded here.
        embedding_model: Name of the HuggingFace embedding model to use.
            Defaults to the model that was previously hard-coded here.

    Returns:
        The index built from the loaded (and chunked) lecture documents.
    """
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    lecture_loader = LectureLoader.from_folder(lecture_folder, add_lecture_info=True)
    lecture_docs = lecture_loader.load()

    return LectureIndex.from_documents(lecture_docs, embeddings)

lecture_index = get_lecture_index()

# Fields every generated quiz item must contain: the question text, the list
# of choices, and the index of the correct choice.
response_schemas = [
    ResponseSchema(name="question", description="A multiple choice question from input text snippet.", type="string"),
    ResponseSchema(name="options", description="Possible choices for the multiple choice question.", type="List[string]"),
    ResponseSchema(name="answer", description="Index of the correct answer for the question.", type="int"),
]


# Parser built from the schemas; its format instructions are injected into the prompt.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions(only_json=True)

# Chat prompt for quiz generation. The system message carries the task and the
# output-format instructions (pre-filled through `partial_variables`); the
# human message carries the retrieved material and the learning objectives.
# NOTE: the system string is not a raw string, so the `\n` after ```json is a
# real newline in the prompt text.
test_prompt = ChatPromptTemplate(
    messages = [
        SystemMessagePromptTemplate.from_template("""
        Given a text input, generate multiple choice questions along with the correct answer.
        
        {format_instructions}. 
        Make sure to surround each question with ```json\n and ```.
        """),
        HumanMessagePromptTemplate.from_template("""
        Relevant material:
        {user_prompt}
        Make sure the questions follow these learning objectives: {learning_objectives}
        Make three questions.
        """)
    ],
    input_variables=["user_prompt", "learning_objectives"],
    partial_variables={"format_instructions": format_instructions}
)

def get_relevant_snippets(learning_objectives):
    """Return the concatenated lecture text retrieved for the learning objectives.

    Each objective is used as a query against ``lecture_index`` with a score
    threshold of 1.0; the page contents of all hits are joined, in order,
    without any separator.
    """
    pieces = []
    for objective in learning_objectives:
        hits = lecture_index.similarity_search_with_score_threshold(objective, 1.0)
        pieces.extend(hit.page_content for hit in hits)
    return "".join(pieces)

In [11]:
import json

# Retrieve material for the objectives and ask the LLM for quiz questions.
learning_objectives = ["Plato's Life", "Plato's Work", "Plato's connection to Yale"]
snippets = get_relevant_snippets(learning_objectives)
user_query = test_prompt.format_prompt(user_prompt=snippets,
                                       learning_objectives=learning_objectives)
response = llm(user_query.to_messages())

# Each question is expected inside its own ```json fence: cut the reply at the
# opening fences, drop whatever precedes the first one, strip the remaining
# fence markers, then parse each piece as JSON.
response = response.content
response = [piece.replace("```", "") for piece in response.split('```json\n')[1:]]
response = [json.loads(piece) for piece in response]

for res in response:
    question, options, answer = res['question'], res['options'], res['answer']

    print("Question: ", question)
    print("Options: ", options)
    print("Answer: ", answer)

Question:  What was the name of Plato's famous teacher?
Options:  ['Socrates', 'Aristotle', 'Pythagoras', 'Heraclitus']
Answer:  0
Question:  Which of the following is not one of Plato's famous works?
Options:  ['The Republic', 'Symposium', 'The Iliad', 'Phaedrus']
Answer:  2
Question:  What is Plato's connection to Yale?
Options:  ['He was a student at Yale', 'He was a professor at Yale', "Yale is home to a famous collection of Plato's works", 'He was a founder of Yale University']
Answer:  2
