# Strategy 1: Using RAG Chain from Assignment 3
## From Assignment 3, we discovered that a RAG chain built using the following combination yielded the best results:
- Changing the chunk size to 512 with a chunk overlap of 100.
- Adding more data sources.
- Incorporating a prompt template to guide the LLM.
## Hence, we will be using the same RAG chain to generate training data, and expect this RAG chain to provide better answers compared to the RAG chain used in the baseline strategy of assignment 4.

# Strategy 1 Part 1

In [1]:
# Installing required packages
# ----------------
! pip install -q -U peft==0.6.2 transformers==4.35.2 datasets==2.15.0 bitsandbytes==0.41.2.post2 trl==0.7.4 accelerate==0.24.1 wandb==0.16.3
! pip install -q -U langchain==0.1.13 
! pip install -q -U safetensors>=0.3.1
! pip install -q -U faiss-cpu==1.7.4
! pip install -q tiktoken==0.6.0
! pip install -q sentence-transformers==2.3.1
! pip install -q pypdf==4.0.1
! pip install -q protobuf==4.25.2
! pip install -q lxml==5.1.0
! pip install -q rouge_score==0.1.2
! pip install jq
# ----------------



In [2]:
# Importing required packages
# ----------------
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.llms import HuggingFacePipeline
from langchain.callbacks import StdOutCallbackHandler
from langchain_community.document_loaders import BSHTMLLoader, TextLoader, JSONLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.document_loaders.csv_loader import CSVLoader
from bs4 import BeautifulSoup

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from datasets import load_dataset, Dataset
from rouge_score import rouge_scorer

import torch
import re
import os
import pickle
import requests
import json

# ----------------


  from .autonotebook import tqdm as notebook_tqdm


# Clear data folder to avoid duplicates

In [3]:
import os
import shutil

folder = 'data'
for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    try:
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)
    except Exception as e:
        print('Failed to delete %s. Reason: %s' % (file_path, e))

# Download PDF documents

In [4]:
# Download SUTD's annual reports
! mkdir -p ./data

! wget -nc -P data https://www.sutd.edu.sg/SUTD/media/SUTD/SUTD_AnnualReport_2022_23.pdf
! wget -nc -P data https://www.sutd.edu.sg/SUTD/media/SUTD/SUTD_AnnualReport_2021.pdf
! wget -nc -P data https://www.sutd.edu.sg/SUTD/media/SUTD/SUTD_AnnualReport_2020.pdf

# Download SUTD's brochures
! wget -nc -P data https://www.sutd.edu.sg/SUTD/media/SUTD/SUTD.pdf
! wget -nc -P data https://www.sutd.edu.sg/SUTD/media/SUTD/ASD.pdf
! wget -nc -P data https://www.sutd.edu.sg/SUTD/media/SUTD/CSD.pdf
! wget -nc -P data https://www.sutd.edu.sg/SUTD/media/SUTD/DAI.pdf
! wget -nc -P data https://www.sutd.edu.sg/SUTD/media/SUTD/EPD.pdf
! wget -nc -P data https://www.sutd.edu.sg/SUTD/media/SUTD/ESD.pdf
! wget -nc -P data https://www.sutd.edu.sg/SUTD/media/SUTD/HASS.pdf
! wget -nc -P data https://www.sutd.edu.sg/SUTD/media/SUTD/SUTD-Capstone.pdf


--2024-04-16 14:27:17--  https://www.sutd.edu.sg/SUTD/media/SUTD/SUTD_AnnualReport_2022_23.pdf
Resolving www.sutd.edu.sg (www.sutd.edu.sg)... 10.1.1.61
Connecting to www.sutd.edu.sg (www.sutd.edu.sg)|10.1.1.61|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16229772 (15M) [application/pdf]
Saving to: ‘data/SUTD_AnnualReport_2022_23.pdf’


2024-04-16 14:27:18 (61.7 MB/s) - ‘data/SUTD_AnnualReport_2022_23.pdf’ saved [16229772/16229772]

--2024-04-16 14:27:19--  https://www.sutd.edu.sg/SUTD/media/SUTD/SUTD_AnnualReport_2021.pdf
Resolving www.sutd.edu.sg (www.sutd.edu.sg)... 10.1.1.61
Connecting to www.sutd.edu.sg (www.sutd.edu.sg)|10.1.1.61|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9129649 (8.7M) [application/pdf]
Saving to: ‘data/SUTD_AnnualReport_2021.pdf’


2024-04-16 14:27:19 (67.8 MB/s) - ‘data/SUTD_AnnualReport_2021.pdf’ saved [9129649/9129649]

--2024-04-16 14:27:20--  https://www.sutd.edu.sg/SUTD/media/SUTD/SUTD_AnnualRepor

# Download FAQ questions as JSON

In [5]:
def extract_faqs(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    faqs = []
    faq_items = soup.find_all('div', class_='accordion-item')

    for item in faq_items:
        question = item.find('div', class_='accordion-title').get_text(strip=True)
        answer = item.find('div', class_='accordion-content').get_text(strip=True)
        faqs.append({"question": question, "answer": answer})
    
    return faqs

url = "https://www.sutd.edu.sg/Admissions/Undergraduate/FAQs"
response = requests.get(url)

if response.status_code == 200:
    faqs = extract_faqs(response.content)

    # Save as JSON
    with open('data/faqs.json', 'w') as f:
        json.dump(faqs, f, ensure_ascii=False, indent=4)
    print("FAQs saved to data/faqs.json")
else:
    print(f"Failed to fetch FAQs. Status code: {response.status_code}")

FAQs saved to data/faqs.json


# Add all relevant SUTD urls

In [6]:
# Define SUTD urls

url_links = [
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Admission-Requirements",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Application-Timeline",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Admission-Requirements/Singapore-Cambridge-GCE-A-Level",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Admission-Requirements/Local-Diploma",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Admission-Requirements/NUS-High-School-Diploma",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Admission-Requirements/International-Baccalaureate-Diploma-\(Singapore\)",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Admission-Requirements/International-Qualifications",
    "https://sutd.edu.sg/About/Overview/Mission-and-Values",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Unique-Curriculum/undergraduate-curriculum",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Programmes",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Minors-and-Tracks",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Unique-Curriculum/Freshmore-Subjects",
    "https://www.sutd.edu.sg/Capstone",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Programmes/Special-Programmes",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Application-Guide",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Early-Consideration-Programmes/SUTD-Early-Admissions-Scholarship-Application",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Financing-Your-Studies/Educational-Expenses",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Financing-Your-Studies/Educational-Expenses/Student-Insurance-Scheme",
    "https://www.sutd.edu.sg/Campus-Life/Housing/Freshmore-Terms-1-2",
    "https://www.sutd.edu.sg/Campus-Life/Housing/Freshmore-Terms-1-2/Room-Type",
    "https://www.sutd.edu.sg/Campus-Life/Housing/Freshmore-Terms-1-2/What-To-Bring",
    "https://www.sutd.edu.sg/Campus-Life/Housing/Undergraduate",
    "https://www.sutd.edu.sg/Campus-Life/Housing/Off-campus-Accommodation",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Financing-Your-Studies/Financial-Options-Financial-Aid/Financial-Aid",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Financing-Your-Studies/Financial-Options-Financial-Aid/Other-Financing-Options",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Financing-Your-Studies/SUTD-Community-Grant",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Financing-Your-Studies/Financial-Options-Financial-Aid/Financial-Aid/SUTD-Education-Opportunity-Grant",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Scholarship/Application-for-scholarships",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Scholarship/External-Sponsoring-Organisations",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Scholarship/Awards",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Early-Matriculation",
    "https://www.sutd.edu.sg/Admissions/Undergraduate/Transition-Into-SUTD/Integrated-Learning-Programme",
    "https://www.sutd.edu.sg/Student-Development/Student-Life/Student-Organisations-Fifth-Row",
    "https://www.sutd.edu.sg/Student-Development/Student-Life/Students@Part-Time-Work-Scheme",
    "https://www.sutd.edu.sg/Student-Development/Global/Student-Exchange/Outbound/GEXP",
    "https://www.sutd.edu.sg/Student-Development/Global/Student-Exchange/Outbound/SUSEP",
    "https://www.sutd.edu.sg/Student-Development/Undergrad-Opportunities-Programme/UTOP",
    "https://www.sutd.edu.sg/Student-Development/Undergrad-Opportunities-Programme/UROP",
    "https://www.sutd.edu.sg/Student-Development/Career-Services/Student/Career-Workshops",
    "https://www.sutd.edu.sg/Student-Development/Global/SUTD-FACT",
    "https://www.sutd.edu.sg/Student-Development/Career-Services/Student/Internships",
    "https://www.sutd.edu.sg/Student-Development/Career-Services/Student/Career-Advisory",
    "https://www.sutd.edu.sg/Student-Development/Career-Services/Student/Career-Resources",
    "https://www.sutd.edu.sg/Student-Development/Career-Services/Student/GEMS-Career-Portal",
    "https://www.sutd.edu.sg/Student-Development/Career-Services/Student/Events",
]

# Append the urls for various scholarships that SUTD offers 
def scrape_scholarship_urls(urls):
    base_url = 'https://www.sutd.edu.sg'
    all_urls = []
    
    for url in urls:
        response = requests.get(url)
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        links = soup.find_all('a')
        for link in links:
            if 'href' in link.attrs:
                href = link.attrs['href']
                if (href.startswith('/Admissions')):
                    full_url = base_url + href
                    if full_url not in all_urls: 
                        all_urls.append(full_url)
    
    return all_urls

urls_scholarships = [
    'https://www.sutd.edu.sg/Admissions/Undergraduate/Scholarship/Awards',
    'https://www.sutd.edu.sg/Admissions/Undergraduate/Scholarship/Application-for-scholarships'
]

scholarship_urls = scrape_scholarship_urls(urls_scholarships)
for url in scholarship_urls:
    url_links.append(url)

# Download HTML files (This takes a while to run...)

In [7]:
import subprocess
import os

def curl_urls_to_files(urls, dest_dir):
    os.makedirs(dest_dir, exist_ok=True)
    
    for url in urls:
        filename = "{}.html".format(get_last_word_after_last_slash(url))
        html_filenames.append(filename)
        filepath = os.path.join(dest_dir, filename)
        
        with open(filepath, 'w') as file:
            try:
                subprocess.check_call(['curl', '-s', url], stdout=file)
                print(f"Saved {url} to {filepath}")
            except subprocess.CalledProcessError as e:
                print(f"Failed to fetch {url}: {e}")

def get_last_word_after_last_slash(url):
    _, _, last_word = url.rpartition('/')
    return last_word

html_filenames = []

dest_dir = "data"
curl_urls_to_files(url_links, dest_dir)

Saved https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Admission-Requirements to data/Admission-Requirements.html
Saved https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Application-Timeline to data/Application-Timeline.html
Saved https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Admission-Requirements/Singapore-Cambridge-GCE-A-Level to data/Singapore-Cambridge-GCE-A-Level.html
Saved https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Admission-Requirements/Local-Diploma to data/Local-Diploma.html
Saved https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Admission-Requirements/NUS-High-School-Diploma to data/NUS-High-School-Diploma.html
Saved https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Admission-Requirements/International-Baccalaureate-Diploma-\(Singapore\) to data/International-Baccalaureate-Diploma-\(Singapore\).html
Saved https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Admission-Requirements/Internatio

# Download csv files for tables

In [8]:
import pandas as pd
import requests

table_urls = [
    'https://www.sutd.edu.sg/Campus-Life/Housing/Undergraduate/Payment-AY2023',
    'https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Application-Timeline',
    'https://www.sutd.edu.sg/Admissions/Undergraduate/Application/Application-Guide',
    'https://www.sutd.edu.sg/Admissions/Undergraduate/Unique-Curriculum/Freshmore-Subjects',
    'https://www.sutd.edu.sg/Capstone',
    'https://www.sutd.edu.sg/Campus-Life/Housing/Freshmore-Terms-1-2/What-To-Bring',
    'https://www.sutd.edu.sg/Campus-Life/Housing/Undergraduate',
    'https://www.sutd.edu.sg/Campus-Life/Housing/Off-campus-Accommodation',
    'https://www.sutd.edu.sg/Admissions/Undergraduate/Financing-Your-Studies/Financial-Options-Financial-Aid/Financial-Aid/SUTD-Education-Opportunity-Grant',
    'https://www.sutd.edu.sg/Admissions/Undergraduate/Transition-Into-SUTD/Integrated-Learning-Programme',
    'https://www.sutd.edu.sg/Student-Development/Global/Student-Exchange/Outbound/GEXP',
    'https://www.sutd.edu.sg/Student-Development/Global/Student-Exchange/Outbound/SUSEP',
    'https://www.sutd.edu.sg/Student-Development/Global/SUTD-FACT',
]

csv_filenames = []
    
def parse_table_to_csv(url):
    r = requests.get(url)
    
    if r.status_code != 200:
        print(f'Failed to fetch the URL. Status code: {response.status_code}')
        return

    tables = pd.read_html(url)
    for t in range(len(tables)):
        filename = 'data/{}-{}.csv'.format(get_last_word_after_last_slash(url), t)
        csv_filenames.append('{}-{}.csv'.format(get_last_word_after_last_slash(url), t))
        df = tables[t]
        df.fillna('', inplace=True)
        df.to_csv(filename, index=False)

for url in table_urls:
    parse_table_to_csv(url)

# Download fees related information as text file

In [9]:
import requests
from bs4 import BeautifulSoup

fees_url = 'https://www.sutd.edu.sg/Admissions/Undergraduate/Financing-Your-Studies/Educational-Expenses/fees'

response = requests.get(fees_url)

if response.status_code == 200:
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    fees_dict = {}

    # Find the table.
    table = soup.find('table')
    table_rows = table.find_all('tr')
    data = []
    for row in table.find_all('tr'):
      row_data = []
      for cell in row.find_all('td'):
          row_data.append(cell.text.replace('\n', '').replace('\t', '').replace('\xa0', ''))
      data.append(row_data)

per_academic_year = data[2][1:]
per_term = data[3][1:]

file_content = ""

file_content += "The subsidised tuition fees for Singapore Citizens (SC) is as follows:\n"
file_content += f"- Per academic year: {per_academic_year[0]}\n"
file_content += f"- Per Term: {per_term[0]}\n\n"

file_content += "The subsidised tuition fees for Singapore Permanent Residents (SPR) is as follows:\n"
file_content += f"- Per academic year: {per_academic_year[1]}\n"
file_content += f"- Per Term: {per_term[1]}\n\n"

file_content += "The subsidised tuition fees for International Students (IS) inclusive of GST is as follows:\n"
file_content += f"- Per academic year: {per_academic_year[2]}\n"
file_content += f"- Per Term: {per_term[2]}\n\n"

file_content += "The non-subsidised tuition fees inclusive of GST is as follows:\n"
file_content += f"- Per academic year: {per_academic_year[3]}\n"
file_content += f"- Per Term: {per_term[3]}\n"

file_path = 'data/fees.txt'
with open(file_path, 'w') as file:
    file.write(file_content)

text_filenames = []
text_filenames.append("fees.txt")

# Split documents

In [10]:
data_root = "./data/"

pdf_filenames = [
    'SUTD_AnnualReport_2020.pdf',
    'SUTD_AnnualReport_2021.pdf',
    'SUTD_AnnualReport_2022_23.pdf',
    'SUTD.pdf',
    'CSD.pdf',
    'DAI.pdf',
    'EPD.pdf',
    'ESD.pdf',
    'HASS.pdf',
    'SUTD-Capstone.pdf'  
]

pdf_metadata = [
    dict(year=2020, source=pdf_filenames[0]),
    dict(year=2021, source=pdf_filenames[1]),
    dict(year=2023, source=pdf_filenames[2]),
    dict(year=2024, source=pdf_filenames[3]),
    dict(year=2024, source=pdf_filenames[4]),
    dict(year=2024, source=pdf_filenames[5]),
    dict(year=2024, source=pdf_filenames[6]),
    dict(year=2024, source=pdf_filenames[7]),
    dict(year=2024, source=pdf_filenames[8]),
    dict(year=2024, source=pdf_filenames[9]),  
]

csv_metadata = [
    dict(year=2023, source=csv_filenames[0]),
    dict(year=2022, source=csv_filenames[1]),
    dict(year=2021, source=csv_filenames[2]),
    dict(year=2020, source=csv_filenames[3]),
    dict(year=2019, source=csv_filenames[4]),
    dict(year=2018, source=csv_filenames[5]),
    dict(year=2017, source=csv_filenames[6]),
    dict(year=2016, source=csv_filenames[7]),
    dict(year=2024, source=csv_filenames[8]),
    dict(year=2024, source=csv_filenames[9]),
    dict(year=2024, source=csv_filenames[9]),
    dict(year=2024, source=csv_filenames[10]),
    dict(year=2024, source=csv_filenames[11]),
]

for filename in csv_filenames[12:]:
    csv_metadata.append(dict(year=2024, source=filename))

html_metadata = []
for filename in html_filenames:
    html_metadata.append(dict(year=2024, source=filename))

json_filenames = ["faqs.json"]
json_metadata = []

for filename in json_filenames:
    json_metadata.append(dict(year=2024, source=filename))

text_metadata = []
for filename in text_filenames:
    text_metadata.append(dict(year=2024, source=filename))

documents = []

# load csv files, attach meta data
for idx, file in enumerate(csv_filenames):
    print("Load file", file)
    loader = CSVLoader(data_root + file)
    document = loader.load()
    for document_fragment in document:
        document_fragment.metadata = csv_metadata[idx]
    documents += document

# load json files, attach meta data
for idx, file in enumerate(json_filenames):
    print("Load file", file)
    loader = JSONLoader(data_root + file, jq_schema='.[] | .question + ": " + .answer')
    document = loader.load()
    for document_fragment in document:
        document_fragment.metadata = json_metadata[idx]
    documents += document

# load pdf files, attach meta data
for idx, file in enumerate(pdf_filenames):
    print("Load file", file)
    loader = PyPDFLoader(data_root + file)
    document = loader.load()
    for document_fragment in document:
        document_fragment.metadata = pdf_metadata[idx]
    documents += document

#load html files, attach meta data
for idx, file in enumerate(html_filenames):
    print("Load file", file)
    loader = BSHTMLLoader(data_root + file)
    document = loader.load()
    for document_fragment in document:
        # remove duplicate whitespace
        document_fragment.page_content = repr(re.sub(r"(?<=\n)(\s+)",r" ", document_fragment.page_content))
        document_fragment.metadata = html_metadata[idx]
    documents += document

#load text files, attach meta data
for idx, file in enumerate(text_filenames):
    print("Load file", file)
    loader = TextLoader(data_root + file)
    document = loader.load()
    for document_fragment in document:
        document_fragment.metadata = text_metadata[idx]
    documents += document


text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=512,
    chunk_overlap=100
)

docs = text_splitter.split_documents(documents)


#------------------------------
print(f'# of Document Pages {len(documents)}')
print(f'# of Document Chunks: {len(docs)}')

Load file Payment-AY2023-0.csv
Load file Application-Timeline-0.csv
Load file Application-Timeline-1.csv
Load file Application-Guide-0.csv
Load file Freshmore-Subjects-0.csv
Load file Freshmore-Subjects-1.csv
Load file Freshmore-Subjects-2.csv
Load file Capstone-0.csv
Load file Capstone-1.csv
Load file Capstone-2.csv
Load file What-To-Bring-0.csv
Load file Undergraduate-0.csv
Load file Off-campus-Accommodation-0.csv
Load file SUTD-Education-Opportunity-Grant-0.csv
Load file SUTD-Education-Opportunity-Grant-1.csv
Load file SUTD-Education-Opportunity-Grant-2.csv
Load file SUTD-Education-Opportunity-Grant-3.csv
Load file SUTD-Education-Opportunity-Grant-4.csv
Load file SUTD-Education-Opportunity-Grant-5.csv
Load file SUTD-Education-Opportunity-Grant-6.csv
Load file SUTD-Education-Opportunity-Grant-7.csv
Load file SUTD-Education-Opportunity-Grant-8.csv
Load file SUTD-Education-Opportunity-Grant-9.csv
Load file SUTD-Education-Opportunity-Grant-10.csv
Load file Integrated-Learning-Programme-

In [11]:
# Create embeddings of document chunks and store them in vector store for fast lookup
store = LocalFileStore("./cache/")

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

core_embeddings_model = HuggingFaceEmbeddings(
    model_name=embed_model_id
)

embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id
)

vector_store = FAISS.from_documents(docs, embedder)


In [12]:
# Load Llama-2 13B LLM model 

model_id = "NousResearch/Llama-2-13b-chat-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_config = AutoConfig.from_pretrained(
    model_id
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto'
)

tokenizer = AutoTokenizer.from_pretrained(model_id)


Loading checkpoint shards: 100%|██████████| 3/3 [01:55<00:00, 38.62s/it]


In [13]:
# check that the model can generate text
prompt = "Today was an amazing day because"
inputs = tokenizer(prompt, return_tensors="pt")

outputs = model.generate(**inputs, do_sample=True, num_beams=1, max_new_tokens=100)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))



['Today was an amazing day because…\n\n1. I woke up early and had a great breakfast, which gave me the energy to tackle my tasks for the day.\n2. I had a productive meeting with my team, where we brainstormed new ideas and made progress on our current projects.\n3. I took a walk outside during my lunch break and enjoyed the beautiful weather.\n4. I had a great conversation with a friend who I haven’t spoken to in a while,']


In [14]:
# Create a text generation pipeline with the LLM model 
generate_text = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    return_full_text=False,
    temperature=0.5,
    do_sample=True,
    max_new_tokens=500
)

llm = HuggingFacePipeline(pipeline=generate_text)

In [15]:
# instantiate retriever model and callback handler for QA results
retriever = vector_store.as_retriever()
handler = StdOutCallbackHandler()


In [16]:
# Custom prompt template

template = """You are a helpful assistant. Use the following pieces of context to answer the question at the end.
Answer the following questions about the Singapore University of Technology and Design (SUTD).
You are answering to their prospective students and current students. 
Use three sentences maximum and keep the answer as concise as possible directly addressing the asked question.

Context: {context}

Question: {question}

Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)


In [17]:
# Test RAG with example question
rag_chain.invoke("What types of student organizations and clubs are available on campus?")

' As a student at SUTD, you have access to a variety of student organizations and clubs that cater to different interests and passions. These include academic clubs, cultural and ethnic clubs, sports and fitness clubs, and community service clubs. You can explore these options and join the ones that align with your goals and interests.'

In [19]:

# QUESTION: When generating data with LLMs, it is helpful to parse the LLLM output into structured data formats. 
# Create a JsonOutputParser from langchain. Name the variable 'output_parser'. Print the format instructions that come with the parser.

#--- ADD YOUR SOLUTION HERE (5 points)---
output_parser = JsonOutputParser()
print(output_parser.get_format_instructions())

#---------------------------------



Return a JSON object.


In [20]:
# When generating data, it is often helpful to guide the generation process through some hierachical structure. 
# Before we create question-answer pairs, let's generate some topics which the questions should be about.

# QUESTION: Create a function 'generate_topics' which takes an integer n_length as input and outputs a dictionary with key 'topics' 
# and as value a list of n_length topics which prospective students might care about such as financial aid, campus life etc.
# Use the LLM and an appropriate prompt to generate these topics and the Json parser to parse the LLM output (use the format instructions). 
# Make sure your function is robust to non well-formed LLM output.

#--- ADD YOUR SOLUTION HERE (20 points)---
def generate_topics(n_length):

    prompt = PromptTemplate(
        template="Generate a dictionary with key 'topics' and as value a list of {n_length} topics which prospective students of the Singapore University of Technology and Design (SUTD) might care about such as financial aid, campus life, undergraduate programs, etc. Keep your answer concise without any unnecessary details or information and ensure there are {n_length} topics. {format_instructions}\nYou can use the provided context to help you generate the list of topics.\nContext: {context}\nJSON Object:",
        input_variables=["n_length"],
        partial_variables={"format_instructions": output_parser.get_format_instructions(), "context": retriever | format_docs},
    )

    try:
        chain = prompt | llm | output_parser
        output = chain.invoke({"n_length": n_length})
        return output
    except Exception as e:
        print(f"Error: {e}")
        return {"topics": []}
#---------------------------------



In [21]:
# Now let's generate a list of 20 topics 
# We save a copy to disk and reload it from there if the file exists


# generate topics
if os.path.exists("topics_strategy1.txt"):
    print("File with topics from strategy1 exists. Read topics from file..")
    with open("topics_strategy1.txt", "r") as fin:
        topics = {"topics": fin.read().splitlines()}
else:
    print("Generate topics..")
    n_topics = 20
    topics = generate_topics(n_topics)
    with open("topics_strategy1.txt", "w") as fout:
        fout.write("\n".join(topics['topics']))
print(topics)


Generate topics..
{'topics': ['financial aid', 'campus life', 'undergraduate programs', 'graduate programs', 'scholarships', 'career services', 'student organizations', 'academic calendar', 'course catalog', 'admissions requirements', 'application process', 'tuition fees', 'housing options', 'meal plans', 'student life', 'campus resources', 'library services', 'research opportunities', 'internship opportunities', 'study abroad programs', 'student reviews']}


In [22]:
# Now we need another function to generate questions for the topics.

# QUESTION: Create a function 'generate_questions' which a topic string and takes an integer n_length as input and outputs a dictionary with key 'questions' 
# and as value a list of at least n_length questions which prospective students might have about this topic.
# Again, use the LLM and an appropriate prompt and the Json parser to parse the LLM output (use the format instructions). 
# Make sure your function is robust to non well-formed LLM output.

#--- ADD YOUR SOLUTION HERE (20 points)---

example_output={'questions': ['What types of library resources are available at SUTD?', 'How do I access library resources remotely?']}

def generate_questions(topic, n_length):
    
    prompt = PromptTemplate(
        template="Generate a dictionary with key 'questions' and as value a list of {n_length} questions which prospective students of the Singapore University of Technology and Design (SUTD) might have regarding the topic {topic}. Keep your answer concise without any unnecessary details or information and ensure there are exactly {n_length} questions. Example output with 2 questions on the topic 'library': {example_output}. {format_instructions}\nJSON Object:",
        input_variables=["topic", "n_length"],
        partial_variables={"format_instructions": output_parser.get_format_instructions(), "example_output": example_output},
    )

    chain = prompt | llm | output_parser
    output = chain.invoke({"topic": topic, "n_length": n_length})
    return output

#---------------------------------

In [23]:
# Now let's generate some questions for the topics.

# QUESTION: For every topic, generate at least 10 questions. 
# LLM generation can take time, save intermediate results to disk and reload them if necessary to speed up subsequent runs.
# Store all questions in a list of strings 'questions_all'
# Extra points: check that there is diversity in the generated questions, i.e. they are not all the same or too similar.
# You can achieve this by checking that questions are not too similar to each other

n_questions_per_topic = 10
questions_all = []

#--- ADD YOUR SOLUTION HERE (20 points)---
for topic in topics['topics']:
    while True:
        try:
            questions = generate_questions(topic, n_questions_per_topic)['questions']
        except Exception as e:
            print(f"Error: {e}")
        else:
            break
    questions_all.extend(questions)

#---------------------------------

Error: Invalid json output: {'questions': [
    'What are the admission requirements for undergraduate programs at SUTD?',
    'What are the available undergraduate majors and minors at SUTD?',
    'How do I apply for financial aid and scholarships at SUTD?',
    'What is the academic calendar for undergraduate programs at SUTD?',
    'What are the on-campus housing options available for undergraduate students at SUTD?',
    'What are the extracurricular activities and clubs available for undergraduate students at SUTD?',
    'How do I stay updated on the latest developments and announcements related to undergraduate programs at SUTD?',
    'What are the career prospects and placement statistics for undergraduate students at SUTD?',
    'How do I contact the undergraduate admissions office at SUTD?',
    'What are the international study opportunities available for undergraduate students at SUTD?'
]}

Note: The questions should be relevant and useful for prospective students who are co



Error: Invalid json output: {
"questions": [
"What is the tuition fee structure at SUTD?",
"Are there any scholarships or financial aid options available?",
"What does the tuition fee cover?",
"Can I pay the tuition fees in installments?",
"What if I need to withdraw from my course?",
"How do I appeal for a refund of my tuition fees?",
"What are the deadlines for paying tuition fees?",
"Can I pay my tuition fees using a credit card or online payment methods?",
"Are there any penalties for late payment of tuition fees?"
"How do I check my tuition fee balance?"
]
}

Note: The questions should be clear and concise, and directly related to the topic of tuition fees at SUTD.




Error: Invalid json output: {'questions': ['What is the total tuition fee for the undergraduate programme?', 'Are there any scholarships or financial aid available for international students?', 'How do I pay my tuition fees?', 'What is the deadline for paying tuition fees?', 'Can I pay my tuition fees in installments?', 'Are there any additional fees or charges that I need to pay?', 'What is the refund policy if I withdraw from the programme?', 'Can I use my tuition fees to pay for other expenses such as living expenses or textbooks?', 'Are there any discounts or waivers available for tuition fees?']}




Error: Invalid json output: {'questions': [
'What types of meal plans are available at SUTD?',
'How do I purchase a meal plan?',
'What is the cost of a meal plan?',
'Can I customize my meal plan?',
'How do I access my meal plan account?',
'What is the deadline to purchase a meal plan?',
'Can I use my meal plan at off-campus locations?',
'How do I check my meal plan balance?',
'What happens if I run out of funds in my meal plan account?',
'Can I transfer funds from my meal plan account to another account?']}

Note: Please ensure that the questions are concise and to the point, avoiding any unnecessary details or information.




Error: Invalid json output: {'questions': ['What types of meal plans are available at SUTD?', 'How do I purchase a meal plan?', 'What are the meal plan options for vegetarians/vegans?', 'Can I customize my meal plan?', 'How do I use my meal plan at the dining hall?', 'Can I use my meal plan at off-campus locations?', 'What is the cost of a meal plan?', 'How do I cancel my meal plan?', 'What is the refund policy for meal plans?', 'How do I change my meal plan?']}

Please note that the questions are based on general assumptions and may not be actual questions that prospective students of SUTD might have.




Error: Invalid json output: {
"questions": [
"What types of internship opportunities are available at SUTD?",
"How do I apply for an internship at SUTD?",
"What are the eligibility criteria for internships at SUTD?",
"How many hours do I need to commit to an internship at SUTD?",
"What kind of support can I expect from SUTD during my internship?",
"How do I find a suitable internship at SUTD?",
"What are the deadlines for applying to internships at SUTD?",
"Can I apply for multiple internships at SUTD?",
"How will my performance in my internship be evaluated at SUTD?"
"What are the benefits of completing an internship at SUTD?"
]
}




Error: Invalid json output: {'questions': ['What scholarships are available for study abroad programs at SUTD?', 'How do I apply for study abroad programs at SUTD?', 'What are the eligibility criteria for study abroad programs at SUTD?', 'How do I select the right study abroad program for my interests and career goals?', 'What are the benefits of studying abroad through SUTD?', 'How do I maintain my academic progress while studying abroad?', 'What kind of support services does SUTD offer for study abroad students?', 'How do I stay connected with SUTD while studying abroad?', 'What are the opportunities for internships and career development while studying abroad through SUTD?']}

Here are ten potential questions that prospective students of the Singapore University of Technology and Design (SUTD) might have regarding study abroad programs:

1. What scholarships are available for study abroad programs at SUTD?
2. How do I apply for study abroad programs at SUTD?
3. What are the eligibil



In [24]:
# save questions to disk 
if not os.path.exists("questions_strategy1.txt"):
    print("Write all questions to questions_strategy1.txt")
    with open("questions_strategy1.txt", "w") as fout:
        fout.write("\n".join(questions_all))
else:
      print("File questions_strategy1.txt exists. skip")

Write all questions to questions_strategy1.txt


In [25]:
# Now create answers to questions using the RAG pipeline

# QUESTION: For every question, generate an answer using the RAG system
# Store all answers in a list of strings 'answers_all'
# Extra points: check that there is diversity in the generated questions, i.e. they are not all the same or too similar.
# You can achieve this by checking that questions are not too similar to each other

answers_all = []

#--- ADD YOUR SOLUTION HERE (10 points)---
for question in questions_all:
    answers_all.append(rag_chain.invoke(question))


#---------------------------------



In [26]:
# save a copy of the answers to disk

if not os.path.exists("answers_strategy1.txt"):
    print("Write all answers to answers_strategy1.txt")
    with open("answers_strategy1.txt", "w") as fout:
        fout.write("\n".join(answers_all))
else:
    print("File answers_strategy1.txt exists. skip")
   

Write all answers to answers_strategy1.txt


In [27]:
# create huggingface dataset to make it easier to work with the data

# QUESTION: create a huggingface dataset object with the keys 'question' and 'answer' and the questions and answers you have generated, respectively
# shuffle the dataset. use a fixed seed.

#--- ADD YOUR SOLUTION HERE (5 points)---
sutd_qa_dataset = Dataset.from_dict({"question": questions_all, "answer": answers_all})
sutd_qa_dataset = sutd_qa_dataset.shuffle(seed=42)

#---------------------------------


In [28]:
# inspect schema and size of dataset
sutd_qa_dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 206
})

In [29]:
# inspect first instance
sutd_qa_dataset[0]

{'question': 'How do I access the digital library?',
 'answer': ' You can access the digital library by logging into the SUTD portal and clicking on the "Library" tab. From there, you can access a variety of online resources, including e-books, academic journals, and other publications. Additionally, you can use the library\'s online catalog to search for physical resources and request items for delivery to the library.'}

In [30]:
# save dataset to disk
with open('sutd_qa_dataset_strategy1.pkl', 'wb') as f:
    pickle.dump(sutd_qa_dataset, f)



In [31]:
from huggingface_hub import login

# log in to huggingface, you need to put your huggingface access token
# https://huggingface.co/docs/hub/en/security-tokens

hf_access_token = ""
login(token=hf_access_token)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/jovyan/.cache/huggingface/token
Login successful


In [32]:
# push dataset to huggingface
sutd_qa_dataset.push_to_hub("sutd_qa_dataset_strategy1")



Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 146.61ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.57s/it]


### This concludes the first part of Strategy 1. Continue with the next part.