# Preprocessing Data

Fetch each URL, parse the page with BeautifulSoup, and store every section as a map holding its content, relevant links, tables, lecture URL and paragraph number.

In [15]:
# Libraries

import markdown
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup

In [2]:
!pip install groq langchain==0.1.16 langchain-core langchain-groq
# LangChain and Groq client packages used to talk to the LLM in this project

Collecting groq
  Downloading groq-0.9.0-py3-none-any.whl (103 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.5/103.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain==0.1.16
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core
  Downloading langchain_core-0.2.7-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.6/315.6 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-groq
  Downloading langchain_groq-0.1.5-py3-none-any.whl (11 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain==0.1.16)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain==0.1.16)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.32 (f

In [3]:
sections = []
# Module-level accumulator: extract_text() appends one map per section found in the lecture URLs and the README file

In [4]:
# Function to extract text from HTML content
def extract_text(html_content, url):
    """Split an HTML document into heading-delimited sections and record them.

    For every heading (h1-h6) in ``html_content``, the sibling elements that
    follow it, up to the next heading sibling, are gathered into one entry
    which is appended to the module-level ``sections`` list.

    Args:
        html_content: HTML markup to parse.
        url: Source URL, stored on every entry as ``lecture_url``.

    Returns:
        None. Each appended entry is a dict with the keys ``lecture_url``,
        ``title``, ``content``, ``links``, ``tables`` and
        ``paragraph_number`` (1-based position of the heading in the page).
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    headers = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

    for index, header_tag in enumerate(soup.find_all(headers), start=1):
        section_title = header_tag.text.strip()
        text_parts = []
        links = []
        tables = []

        for tag in header_tag.find_next_siblings():
            # The next heading at this level starts a new section.
            if tag.name in headers:
                break
            # href=True skips anchors that have no href, which would otherwise
            # be stored as None entries.
            links.extend(a_tag['href'] for a_tag in tag.find_all('a', href=True))
            if tag.name == 'table':
                # First row of table_data is the list of <th> texts (possibly
                # empty), followed by one list per row that has <td> cells.
                table_data = [[header.text.strip() for header in tag.find_all('th')]]
                for row in tag.find_all('tr'):
                    row_data = [cell.text.strip() for cell in row.find_all('td')]
                    if row_data:
                        table_data.append(row_data)
                tables.append(table_data)
            else:
                # Use a space separator so that words on either side of an
                # inline tag are not glued into a single token.
                text = tag.get_text(separator=' ', strip=True)
                if text:
                    text_parts.append(text)

        # Storing the data as maps containing content of paragraph of lectures, url, title, relevant links in paragraph and paragraph number in url
        sections.append({
            'lecture_url': url,
            'title': section_title,
            # Join sibling elements with a space for the same reason as above.
            'content': ' '.join(text_parts),
            'links': links,
            'tables': tables,
            'paragraph_number': index
        })

    return

# URL of the webpage
# url = 'https://stanford-cs324.github.io/winter2022/lectures/introduction/'

# # Send a GET request to the URL
# response = requests.get(url)

# # Check if the request was successful (status code 200)
# if response.status_code == 200:
#     # Get the HTML content of the webpage
#     html_content = response.text

#     # Extract text from HTML content
#     sections = extract_text(html_content)

#     # Print sections
#     for section in sections:
#         print("Title:", section['title'])
#         print("Content:", section['content'])
#         print("Links:", section['links'])
#         print("Tables:", section['tables'])
#         print()
# else:
#     print("Failed to retrieve HTML content. Status code:", response.status_code)

In [5]:
# Extracting the html content from the url
def extract_html_content(url, timeout=30):
  """Download ``url`` and pass its HTML to ``extract_text``.

  Args:
      url: Page to fetch with an HTTP GET request.
      timeout: Seconds to wait for the server before giving up.

  Returns:
      None. On a 200 response the page is handed to ``extract_text``;
      otherwise the status code is printed and nothing is extracted.
  """
  # Without a timeout, requests.get can block indefinitely on a stalled server.
  response = requests.get(url, timeout=timeout)

  # Only a 200 response carries a page worth parsing.
  if response.status_code == 200:
    extract_text(response.text, url)
    return
  print("Failed to retrieve HTML content. Status code:", response.status_code)
  return

In [6]:
# Stanford CS324 (winter 2022) lecture pages, built from their shared URL prefix
lectures_list = [
    f'https://stanford-cs324.github.io/winter2022/lectures/{slug}/'
    for slug in (
        'introduction',
        'capabilities',
        'harms-1',
        'harms-2',
        'data',
        'modeling',
        'training',
        'selective-architectures',
        'adaptation',
        'environment',
    )
]
# Raw Markdown README files
md_file_list = ['https://raw.githubusercontent.com/Hannibal046/Awesome-LLM/main/README.md']

In [7]:
def convert_readme_to_html(url, timeout=30):
  """Download a Markdown document and render it to HTML.

  Args:
      url: Location of the raw Markdown file.
      timeout: Seconds to wait for the server before giving up.

  Returns:
      The HTML produced by ``markdown.markdown`` for the downloaded text, or
      an empty string when the server does not answer with status 200.
  """
  # Without a timeout, requests.get can block indefinitely on a stalled server.
  response = requests.get(url, timeout=timeout)

  if response.status_code != 200:
    print("Failed to retrieve Markdown content. Status code:", response.status_code)
    # The old code fell through to an unbound `markdown_text` here and
    # crashed; return empty HTML so the caller simply finds no sections.
    return ''

  return markdown.markdown(response.text)

In [8]:
# Storing paragraphs of each url

# Empty the accumulator in place so that re-running this cell does not append
# every section a second time.
sections.clear()

# Lecture pages are already HTML.
for url in lectures_list:
  extract_html_content(url)

# README files are Markdown and must be rendered to HTML first.
for url in md_file_list:
  extract_text(convert_readme_to_html(url),url)

In [9]:
len(sections)

133

# Method - 1
This method uses TF-IDF cosine similarity to find the paragraphs in the data that best match the user's query. The top-matching paragraphs are appended to the query, so the prompt sent to the LLM already contains the data it needs to answer.

In [10]:
# Text of every section, index-aligned with `sections`
contents = [section['content'] for section in sections]

In [11]:
def check_similarity(num_top_paragraphs, query):
  """Return the stored sections whose content is most similar to ``query``.

  A TF-IDF vectorizer is fitted on the module-level ``contents`` list, the
  query is projected into the same space, and sections are ranked by cosine
  similarity.

  Args:
      num_top_paragraphs: Upper bound on the number of sections returned.
      query: Text to compare against the section contents.

  Returns:
      A list of entries from ``sections``, highest similarity first.
  """
  vectorizer = TfidfVectorizer()

  # Fit on the paragraph texts, then vectorize the query with the same model.
  paragraph_vectors = vectorizer.fit_transform(contents)
  query_vector = vectorizer.transform([query])

  # Only one query was passed, so keep the first (and only) row of scores.
  scores = cosine_similarity(query_vector, paragraph_vectors)[0]

  # Sorting on the negated score yields descending order; sorted() is stable,
  # so equal scores stay in their original position order.
  ranking = sorted(range(len(scores)), key=lambda position: -scores[position])

  return [sections[position] for position in ranking[:num_top_paragraphs]]

In [18]:
n = 2 # Declare here the number of top-matching paragraphs that enhanced_question requests from check_similarity

def enhanced_question(user_question):
  """Append the best-matching sections to a question to form the LLM prompt.

  The ``n`` most similar sections (per ``check_similarity``) are appended to
  the question as ``Content`` / ``Links`` / ``Tables`` lines. The distinct
  source URLs of those sections are printed as a side effect.

  Args:
      user_question: The question as typed by the user.

  Returns:
      The question followed by an instruction line and the retrieved data.
  """
  similar_sections = check_similarity(n,user_question)

  text = user_question
  # This used to start with "/n", a typo for the newline escape that glued a
  # literal slash-n onto the end of the question.
  text += "\n Kindly answer the question based on the details provided below along with the links and tables, it contains certain contents, relevant links, and tables one by one. \n"
  for item in similar_sections:
    text += f"Content: {item['content']} \n"
    text += f"Links: {item['links']} \n"
    text += f"Tables: {item['tables']} \n\n"

  # dict.fromkeys removes duplicates while keeping first-seen order.
  unique_urls = list(dict.fromkeys(item['lecture_url'] for item in similar_sections))
  print(unique_urls)
  return text

In [23]:
import os

from langchain.chains import LLMChain
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain_core.messages import SystemMessage
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain_groq import ChatGroq


def main():
    """
    Run an interactive command-line chat against a Groq-hosted model.

    Every non-empty question typed by the user is expanded with
    ``enhanced_question`` and sent through an ``LLMChain`` whose memory keeps
    a sliding window of the conversation. Typing "exit" or "quit" ends the
    loop.
    """
    import getpass

    # Credentials must not live in the notebook: read the key from the
    # environment, or ask for it without echoing. The key that used to be
    # hardcoded here has been exposed and should be revoked.
    groq_api_key = os.environ.get("GROQ_API_KEY") or getpass.getpass("Groq API key: ")
    model = 'llama3-8b-8192'
    # Initialize Groq Langchain chat object and conversation
    groq_chat = ChatGroq(
            groq_api_key=groq_api_key,
            model_name=model
    )

    print("Hello! I'm your friendly Groq chatbot. I can help answer your questions, provide information, or just chat. I'm also super fast! Let's start our conversation!")

    system_prompt = 'You are a friendly conversational chatbot'
    conversational_memory_length = 5 # number of previous messages the chatbot will remember during the conversation

    memory = ConversationBufferWindowMemory(k=conversational_memory_length, memory_key="chat_history", return_messages=True)

    # The prompt layout and the chain do not depend on the question, so they
    # are built once instead of on every turn.
    prompt = ChatPromptTemplate.from_messages(
        [
            # Persistent system prompt, always first.
            SystemMessage(content=system_prompt),
            # Filled from `memory` with the recent conversation history.
            MessagesPlaceholder(variable_name="chat_history"),
            # Receives the current (enhanced) user input.
            HumanMessagePromptTemplate.from_template("{human_input}"),
        ]
    )
    conversation = LLMChain(
        llm=groq_chat,  # The Groq LangChain chat object initialized earlier.
        prompt=prompt,  # The constructed prompt template.
        verbose=False,   # TRUE Enables verbose output, which can be useful for debugging.
        memory=memory,  # The conversational memory object that stores and manages the conversation history.
    )

    while True:
        user_question = input("Ask a question: ").strip()

        # Check the raw input: the enhanced text is never empty, so the old
        # check after enhancement could not skip blank questions.
        if not user_question:
            continue
        if user_question.lower() in ("exit", "quit"):
            break

        # The chatbot's answer is generated by sending the full prompt to the Groq API.
        response = conversation.predict(human_input=enhanced_question(user_question))
        print("Chatbot:", response)

if __name__ == "__main__":
    main()

Hello! I'm your friendly Groq chatbot. I can help answer your questions, provide information, or just chat. I'm also super fast! Let's start our conversation!
Ask a question: What are some milestone model architectures and papers in the last few years?
['https://stanford-cs324.github.io/winter2022/lectures/environment/', 'https://raw.githubusercontent.com/Hannibal046/Awesome-LLM/main/README.md']
Chatbot: What a fascinating topic! You're interested in exploring the milestone model architectures and papers in the last few years, primarily focusing on the intersection of artificial intelligence (AI) and climate change. I'd be delighted to help you explore this area.

To start with, let's focus on the climate change aspect. Did you know that the average global temperature has increased by 2.14˚F (1.19˚C) since 1900? The top 10 warmest years have all occurred since 2005. Rising temperatures are causing natural disasters like heatwaves, floods, and wildfires, and also threatening coastal com

KeyboardInterrupt: Interrupted by user

# Method 2

This method provides all the data to the LLM once, at the start of the conversation, so that it already holds the whole dataset and can then answer the input queries from it.

In [None]:
'''Not tried because data is huge and GROQ api can't handle this much data,
but this approach will surely solve your issue of providing data to LLMs and in
return get the relevant links, url, paragraph from where data is recovered
'''
import os

from langchain.chains import LLMChain
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain_core.messages import SystemMessage
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain_groq import ChatGroq


def main(preload=False):
    """
    Run the "Method 2" interactive chat against a Groq-hosted model.

    Args:
        preload: When True, an introductory message and then every entry of
            ``sections`` are sent to the model (one request each) before the
            chat starts, and questions are afterwards sent as typed. When
            False (the default) nothing is sent up front and each question is
            expanded once with ``enhanced_question``. The default matches
            what the previous code effectively did: it built the per-section
            text but never sent it.

    Typing "exit" or "quit" ends the loop.
    """
    import getpass

    # Credentials must not live in the notebook: read the key from the
    # environment, or ask for it without echoing. The key that used to be
    # hardcoded here has been exposed and should be revoked.
    groq_api_key = os.environ.get("GROQ_API_KEY") or getpass.getpass("Groq API key: ")
    model = 'llama3-8b-8192'
    # Initialize Groq Langchain chat object and conversation
    groq_chat = ChatGroq(
            groq_api_key=groq_api_key,
            model_name=model
    )

    print("Hello! I'm your friendly Groq chatbot. I can help answer your questions, provide information, or just chat. I'm also super fast! Let's start our conversation!")

    system_prompt = 'You are a friendly conversational chatbot'
    conversational_memory_length = 5 # number of previous messages the chatbot will remember during the conversation

    memory = ConversationBufferWindowMemory(k=conversational_memory_length, memory_key="chat_history", return_messages=True)

    intro_message = '''I am going to provide you some data which contains url from where data is taken,
    paragraph number, content of parargraph, relevant links and tables.
    Kindly give the answers on the basis of these data along with url of lectures provided,
    paragraph number of content, related links and content but and here is the whole data! \n'''

    # One prompt layout and one chain serve the whole session. The old code
    # rebuilt identical objects once per section and again on every turn.
    prompt = ChatPromptTemplate.from_messages(
        [
            # Persistent system prompt, always first.
            SystemMessage(content=system_prompt),
            # Filled from `memory` with the recent conversation history.
            MessagesPlaceholder(variable_name="chat_history"),
            # Receives the current user input.
            HumanMessagePromptTemplate.from_template("{human_input}"),
        ]
    )
    conversation = LLMChain(
        llm=groq_chat,  # The Groq LangChain chat object initialized earlier.
        prompt=prompt,  # The constructed prompt template.
        verbose=False,   # TRUE Enables verbose output, which can be useful for debugging.
        memory=memory,  # The conversational memory object that stores and manages the conversation history.
    )

    if preload:
        # NOTE(review): the memory is a window of
        # `conversational_memory_length` exchanges, so presumably only the
        # most recently sent sections remain visible to the model. Confirm
        # before relying on preload for the full dataset.
        conversation.predict(human_input=intro_message)
        for item in sections:
            section_text = (
                f"Lecture Url: {item['lecture_url']} \n"
                f"Paragraph Number of content: {item['paragraph_number']} \n"
                f"Content: {item['content']} \n"
                f"Links related to content: {item['links']} \n"
                f"Tables related to content: {item['tables']} \n\n"
            )
            conversation.predict(human_input=section_text)

    while True:
        user_question = input("Ask a question: ").strip()

        # Check the raw input: the enhanced text is never empty, so the old
        # check after enhancement could not skip blank questions.
        if not user_question:
            continue
        if user_question.lower() in ("exit", "quit"):
            break

        # Enhance at most once. The old code ran enhanced_question twice, so
        # the already-expanded prompt was used as the retrieval query and the
        # retrieved context was appended a second time.
        human_input = user_question if preload else enhanced_question(user_question)

        # The chatbot's answer is generated by sending the full prompt to the Groq API.
        response = conversation.predict(human_input=human_input)
        print("Chatbot:", response)

if __name__ == "__main__":
    main()

Hello! I'm your friendly Groq chatbot. I can help answer your questions, provide information, or just chat. I'm also super fast! Let's start our conversation!
Ask a question: What about the Content moderation in LLMs?
Chatbot: You're asking about content moderation in Large Language Models (LLMs). Before diving into LLMs, it's essential to understand the critical issue of content moderation.

Content moderation is the process of regulating and managing user-generated content on online platforms. This includes social media sites like Facebook, Twitter, and YouTube, which are constantly battling harmful content, such as hate speech, harassment, pornography, violence, fraud, disinformation, and copyright infringement.

To tackle this issue, companies have employed AI to automate content moderation. The outcome of moderation can be either hard (blocking or deletion) or soft (flagging or hiding). However, the decision on what is allowed is fundamentally political, as it depends on various f

KeyboardInterrupt: Interrupted by user