In [30]:
import streamlit as st
from langchain_community.document_loaders import PyPDFLoader
from openai import OpenAI
from youtube_transcript_api import YouTubeTranscriptApi
import os
import json
from pydantic import BaseModel
from typing import List
from dotenv import load_dotenv
import requests

load_dotenv()

api_key = os.environ.get("API_KEY")
endpoint = os.environ.get("END_POINT")

headers = {
    'Content-Type': 'application/json',
    'api-key': api_key
}

# Define a function to get the chat completion
def get_chat_completion(messages):
    data = {
        "messages": messages,
        # "max_tokens": 500,
        "temperature": 0.7,
        "response_format": {"type": "json_object"}
    }

    # Make the POST request to the API
    response = requests.post(endpoint, headers=headers, data=json.dumps(data))

    # Handle the response
    if response.status_code == 200:
        result = response.json()
        return result['choices'][0]['message']['content']
    else:
        # st.error(f"Error: {response.status_code}, {response.text}")
        print(f"Error: {response.status_code}, {response.text}")
        return None

# Define the MCQ and List_of_MCQs data models
class MCQ(BaseModel):
    question: str
    options: List[str]
    answer: str

class List_of_MCQs(BaseModel):
    mcqs: List[MCQ]

In [31]:
def split_text(text, max_length):
    """Split text into chunks of a maximum length."""
    return [text[i:i+max_length] for i in range(0, len(text), max_length)]

In [32]:
def generate_mcqs_from_text(text):
    """Generate MCQs from text using OpenAI."""
    example_json = [
        {
            "question": "What is the capital of France?",
            "options": ["Paris", "London", "Berlin", "Madrid"],
            "answer": "Paris"
        },
        {
            "question": "Which planet is known as the Red Planet?",
            "options": ["Earth", "Mars", "Jupiter", "Saturn"],
            "answer": "Mars"
        }
    ]
    
    text_chunks = split_text(text, 1000)  # Adjust chunk size as needed
    for chunk in text_chunks:
        print(f"chunk size : {len(chunk)} and chunk : {chunk}")
        
    all_mcqs = []

    for chunk in text_chunks:
        prompt = f"Generate multiple-choice questions (MCQs) from the following text in JSON format:\n\n{chunk}"
        messages = [
            {"role": "system", "content": f"Generate MCQs from the following text in JSON format:\n{json.dumps(example_json, indent=4)}"},
            {"role": "user", "content": chunk}
        ]
        response = get_chat_completion(messages)
        print(response)
        mcqs = json.loads(response)
        print(f"\nMCQs Extacted\n*************************:\n {mcqs}")
        try:
            all_mcqs.extend(mcqs['mcqs'])
        except KeyError:
            all_mcqs.extend(mcqs['questions'])
        except:
            print("Error in extracting MCQs")
    
    return all_mcqs

In [1]:
test_text = """
welcome back to another machine learning explained video by assembly ai in this video we talk about supervised learning which is arguably the most important type of machine learning you will learn what it means examples of supervised learning or this data and training types of supervised learning and we touch on specific algorithms of supervised learning let's begin with the very basics what does machine learning mean machine learning is a sub-area of artificial intelligence and it's the study of algorithms that give computers the ability to learn and make decisions based on data and not from explicit instructions a popular example is learning to predict whether an email is spam or no spam by reading many different emails of these two types we typically differentiate between three types of machine learning supervised learning unsupervised learning and reinforcement learning in supervised learning the computer learns by making use of labeled data so we know the corresponding label or target of our data an example is again the spam prediction algorithm where we show many different emails to the computer and for each email we know if this was a spam email or not on the other hand in unsupervised learning the computer learns by making use of unlabeled data so we have data but we don't know the corresponding target an example is to cluster books into different categories on the basis of the title and other book information but not by knowing its actual category and then there is also reinforcement learning where so-called intelligent software agents take actions in an environment and automatically try to improve its behavior this usually works with a system of rewards and punishments and popular examples are games for example a computer can learn to be good in the snake game only by playing the game itself and every time 
it eats an apple or it dies it learns from this actions now in this video we are going to focus on supervised learning where we learn from labeled data now what is data data can be any relevant information we collect for our algorithm this can be for example user information like age and gender or text data or images or information within an image like measurements or color information the possibilities are endless here let's look at a concrete example in the popular iris flower data set we want to predict the type of iris flower based on different measurements we have 150 records of flowers with different attributes that have been measured before so for each flower we have the sepal 
length saypal width petal length and petal width these are called the features and we also have the corresponding species this is called the class the label or the target so this is a supervised case where we know the label we can 
represent this table in a mathematical way so we put each feature into a vector this is the feature vector and then we do this for all the different samples and when we do this for all the different samples we end up in a 2d representation which is also called a matrix additionally we can put all labels into one vector this is called the target vector now in supervised learning we take the features and the labels and show it to the computer so that it learns we call this the training step and the data we use is called the training data training is performed by specific algorithms that usually try to minimize an error during this training process and this is done by mathematical optimization methods which i won't go into more detail here after training we want to show new data to the computer that it has never seen before and where we don't know the label this is called our test data and now the trained computer should be able to make a decision based on the information it has seen and determine the correct target value and this is how supervised learning works there are two types of supervised learning classification and regression in classification we predict a discrete class label in the previous flower classification example our target values can only have the values 0 1 and 2 corresponding to the three different classes if we have more than two possible labels like here we call this a multi-class classification problem if we only have two labels usually zero and one is used then we call this a binary classification problem for example spam or no spam on the other hand in regression we try to predict a continuous target value meaning the target value can have a more or less arbitrary value one example is to predict house prices based on given information about the house and the neighborhood the target variable which is the price can basically have any value here now that we know what supervised learning means let's have a look at concrete algorithms i will not explain them in detail here i simply name them so that you have heard of them they all have a unique design and can be different in the way how it stores the information mathematically how it solves the training process through mathematical operations and how it transforms the data this list is not exhaustive but here are 10 algorithms that are nice to know some of them can be used for either regression or classification and some can even be used for both cases popular algorithms are linear regression logistic regression decision trees random forest naive bayes perceptron and multi-layer perceptron support vector machines or short svm k-nearest neighbors or short knn adaboost and neural networks which are part of the deep learning field alright i hope you enjoyed 
this video if you did so then please hit the like button and consider subscribing to the channel also if you want to try assembly ai for free then grab your free api token using the link in the description below and then i hope to 
see you in the next video bye
"""

In [34]:
all_mcqs = generate_mcqs_from_text(test_text)

chunk size : 1000 and chunk : 
welcome back to another machine learning explained video by assembly ai in this video we talk about supervised learning which is arguably the most important type of machine learning you will learn what it means examples of supervised learning or this data and training types of supervised learning and we touch on specific algorithms of supervised learning let's begin with the very basics what does machine learning mean machine learning is a sub-area of artificial intelligence and it's the study of algorithms that give computers the ability to learn and make decisions based on data and not from explicit instructions a popular example is learning to predict whether an email is spam or no spam by reading many different emails of these two types we typically differentiate between three types of machine learning supervised learning unsupervised learning and reinforcement learning in supervised learning the computer learns by making use of labeled data so we kno

In [35]:
all_mcqs

[{'question': 'What is the main topic of the video by Assembly AI?',
  'options': ['Unsupervised Learning',
   'Reinforcement Learning',
   'Supervised Learning',
   'Deep Learning'],
  'answer': 'Supervised Learning'},
 {'question': 'What is machine learning a sub-area of?',
  'options': ['Computer Science',
   'Artificial Intelligence',
   'Data Mining',
   'Statistics'],
  'answer': 'Artificial Intelligence'},
 {'question': 'What is an example given for supervised learning?',
  'options': ['Predicting the weather',
   'Recommending movies',
   'Predicting whether an email is spam',
   'Detecting anomalies in networks'],
  'answer': 'Predicting whether an email is spam'},
 {'question': 'How many types of machine learning are mentioned in the text?',
  'options': ['Two', 'Three', 'Four', 'Five'],
  'answer': 'Three'},
 {'question': 'In supervised learning, what type of data does the computer learn from?',
  'options': ['Unlabeled data',
   'Synthetic data',
   'Labeled data',
   'Rand

In [39]:
len(all_mcqs)

38

In [36]:
# example_json = [
#         {
#             "question": "What is the capital of France?",
#             "options": ["Paris", "London", "Berlin", "Madrid"],
#             "answer": "Paris"
#         },
#         {
#             "question": "Which planet is known as the Red Planet?",
#             "options": ["Earth", "Mars", "Jupiter", "Saturn"],
#             "answer": "Mars"
#         }
#     ]

# prompt = f"Generate multiple-choice questions (MCQs) from the following text in JSON format:\n\n{test_text}"
# messages = [
#     {"role": "system", "content": f"Generate MCQs from the following text in JSON format:\n{json.dumps(example_json, indent=4)}"},
#     {"role": "user", "content": test_text}
# ]
# response = get_chat_completion(messages)
# print(response)

In [37]:
# mcqs = json.loads(response)

In [38]:
# mcqs['questions']