In [1]:
!pip install langchain google-search-results langchain-google-genai langchain_groq youtube_search
!pip install langchain-community tavily-python crewai crewai_tools
!pip install --upgrade langchain langchain-google-genai langchain-core

Collecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl.metadata (7.0 kB)
Collecting google-search-results
  Downloading google_search_results-2.4.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting langchain-google-genai
  Downloading langchain_google_genai-1.0.7-py3-none-any.whl.metadata (3.8 kB)
Collecting langchain_groq
  Downloading langchain_groq-0.1.6-py3-none-any.whl.metadata (2.8 kB)
Collecting youtube_search
  Downloading youtube_search-2.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting langchain-core<0.3.0,>=0.2.10 (from langchain)
  Downloading langchain_core-0.2.10-py3-none-any.whl.metadata (6.0 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17

In [2]:
from google.colab import userdata
import os

os.environ["TAVILY_API_KEY"] = userdata.get('TAVILY_API_KEY')
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')
os.environ["SERPER_API_KEY"] = userdata.get('SERPER_API_KEY')
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')
os.environ["YOUTUBE_API_KEY"] = userdata.get('YOUTUBE_API_KEY')

In [3]:
from langchain_google_genai import ChatGoogleGenerativeAI

gemini_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [4]:
from crewai import Agent, Task, Crew
from crewai_tools import SerperDevTool,WebsiteSearchTool,YoutubeVideoSearchTool
from langchain_community.tools.tavily_search import TavilySearchResults

In [5]:
from crewai import Agent

def planner_Agent(llm, tools, max_iter):
    study_planner_agent = Agent(
        role='Study Planner Agent',
        goal="Break down the user's problem into small, achievable steps for learning {topic} in {duration} with daily sessions of {studytime}. Return the steps as a JSON object with day numbers as keys and study tasks as values.",
        backstory="""
        You are a 'Smart Study Guide' who helps the user create a study plan to learn {topic} in {duration} with daily sessions of {studytime}.
        The user's learning style is {style}, their grade is {grade}, and they belong to the {stream} stream. The user has {knowledge} level of prior knowledge about the domain.
        And the User has distracktion tollerence is {distraction_tolerance}.
        Prepare steps to achieve this goal, ensuring each day includes a new task and the last day includes a small project or revision.

        Here's a possible study plan breakdown for learning {topic} in {duration}:

        The title should not be more than 5 words.

        Make sure to include some exercises for the user to practice.


        Example JSON output:
        (
          'Day 1': 'Task 1',
          'Day 2': 'Task 2',
          'Day 3': 'Task 3',
          ...
          'Day N': 'Project or Revision'
        )
        """,
        llm=llm,
        verbose=True,
        allow_delegation=True,
        max_iter=max_iter,
        cache=False
    )

    return study_planner_agent

# Example usage
study_planner_agent = planner_Agent(
    llm=gemini_llm,
    tools=[],
    max_iter=3
)


In [6]:
from crewai.task import TaskOutput
import json
import os
import pandas as pd

os.makedirs('final_outputs', exist_ok=True)

# Function to clean the raw output
def clean_raw_output(raw_output):
    return raw_output.replace("```", "").replace("json", "").replace(r"\n", '')

# Function to parse JSON with fallback
def parse_json_with_fallback(cleaned_output, output_description):
    try:
        return json.loads(cleaned_output)
    except json.JSONDecodeError as e:
        # Log the error and save the raw output for debugging
        with open(f'final_outputs/{output_description}_raw.txt', 'w') as f:
            f.write(cleaned_output)
        print(f"Error decoding JSON: {e}")
        print(f"Raw output stored in {output_description}_raw.txt")
        return None

# Callback function for Task 1
def planner_callback_function(output: TaskOutput):
    print("Task completed!")
    task1_output = output.raw_output
    cleaned_output = clean_raw_output(task1_output)
    output_dict = parse_json_with_fallback(cleaned_output, output.description)
    if output_dict is None:
        return None

    final_output = list(output_dict.values())
    with open(f'final_outputs/{output.description}.txt', 'w') as f:
        for item in final_output:
            f.write(str(item) + '\n')
    return final_output


planner_task = Task(
    description='Prepare steps to achieve the goal of learning {topic} within {duration}.The title should not be more than 5 words and it should be such that it should be wasy to search it on internet',
    expected_output='A json response where the keys are Day Numbers and values are the titles of important things to study.',
    agent=study_planner_agent,
    callback=planner_callback_function,
    output_file='outputs/studyplan.json',
    create_directory=True
)

In [7]:
# Crew definition
from crewai.process import Process

crew = Crew(
    agents=[study_planner_agent],
    tasks=[planner_task],
    process=Process.sequential,
    verbose=True
)

# Input data
data_input = {
    'topic': 'Deep Learning',
    'duration': '5 days',
    'studytime': '2 hrs/day',
    'style': 'blogs',
    'grade': 'First year',
    'stream': 'Computer Science',
    'knowledge': 'Python advance and Machine Learning',
    'learning Time': 'Evening',
    'distraction_tolerance': 'High',
    'feedback frequency': 'Daily'
}

result = crew.kickoff(data_input)
print(clean_raw_output(result))

[1m[95m [2024-06-29 08:53:51][DEBUG]: == Working Agent: Study Planner Agent[00m
[1m[95m [2024-06-29 08:53:51][INFO]: == Starting Task: Prepare steps to achieve the goal of learning Deep Learning within 5 days.The title should not be more than 5 words and it should be such that it should be wasy to search it on internet[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mThought: I now can give a great answer.
Final Answer:
```json
{
  "Day 1": "Deep Learning Introduction & Neural Networks",
  "Day 2": "Feedforward Neural Networks and Backpropagation",
  "Day 3": "Convolutional Neural Networks (CNNs) for Image Recognition",
  "Day 4": "Recurrent Neural Networks (RNNs) for Sequential Data",
  "Day 5": "Deep Learning Project: Image Classification with CNN"
}
```[0m

[1m> Finished chain.[0m
Task completed!
[1m[92m [2024-06-29 08:53:53][DEBUG]: == [Study Planner Agent] Task output: ```json
{
  "Day 1": "Deep Learning Introduction & Neural Networks",
  "Day 2": "F

In [8]:
json.loads(clean_raw_output(result))

{'Day 1': 'Deep Learning Introduction & Neural Networks',
 'Day 2': 'Feedforward Neural Networks and Backpropagation',
 'Day 3': 'Convolutional Neural Networks (CNNs) for Image Recognition',
 'Day 4': 'Recurrent Neural Networks (RNNs) for Sequential Data',
 'Day 5': 'Deep Learning Project: Image Classification with CNN'}

In [9]:
study_plan = json.loads(clean_raw_output(result))

In [10]:
import pandas as pd

def get_related_blogs(study_plan):
  tavilytool = TavilySearchResults()
  final_output = {}
  for day,title in study_plan.items():
    search_query = f"Find Relevant blogs for {title}"
    search_results = tavilytool.invoke(search_query)  # Ensure tavilytool is properly initialized
    blogs = [result['url'] for result in search_results]  # Extract URLs from search results
    final_output[title] = blogs
  df = pd.DataFrame()
  df['Day'] = list(study_plan.keys())
  df['Title'] = list(study_plan.values())
  df['Blogs'] = list(final_output.values())
  return df

get_related_blogs(study_plan)


Unnamed: 0,Day,Title,Blogs
0,Day 1,Deep Learning Introduction & Neural Networks,[https://sebastianraschka.com/blog/2021/dl-cou...
1,Day 2,Feedforward Neural Networks and Backpropagation,[https://www.baeldung.com/cs/neural-networks-b...
2,Day 3,Convolutional Neural Networks (CNNs) for Image...,[https://www.edge-ai-vision.com/2015/11/using-...
3,Day 4,Recurrent Neural Networks (RNNs) for Sequentia...,[https://shelf.io/blog/recurrent-neural-networ...
4,Day 5,Deep Learning Project: Image Classification wi...,[https://github.com/On-Power-Studio/Image-clas...


In [11]:
blog_df = get_related_blogs(study_plan)

# Blog Scraping

In [12]:
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
import re

class SimpleDocument:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata or {}

def scrape_blog(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    else:
        return ""

def clean_text(text):
    # Remove extra whitespaces
    cleaned_text = ' '.join(text.split())

    # Remove HTML artifacts
    cleaned_text = BeautifulSoup(cleaned_text, "html.parser").get_text()

    # Remove non-textual content (if any)
    cleaned_text = re.sub(r'\[[^\]]*\]', '', cleaned_text)  # Remove text within square brackets
    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', cleaned_text)
    return cleaned_text.strip()

def selecting_best_blog(blog_df):
    # Initialize the dictionary to store the results
    result_dict = {}

    # Convert the dataframe to a list of dictionaries
    selecting_blog = blog_df[['Title', 'Blogs']].to_dict('records')

    # Loop through each blog entry
    for dic in selecting_blog:
        title = dic['Title']
        urls = dic['Blogs'] # Assuming URLs are comma-separated strings

        # Initialize list to hold content for each URL under the same topic
        content_list = []

        for url in urls:
            url = url.strip()  # Remove any extra whitespace
            if url:  # Ensure URL is not empty
                # Scrape the content
                full_text = scrape_blog(url)

                # Create a SimpleDocument object
                doc = SimpleDocument(page_content=full_text)

                # Split the documents to get the first 2500 words
                ts = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=0)
                fd = ts.split_documents([doc])

                # Store only the first chunk (first 2500 words)
                first_2500_words = clean_text(fd[0].page_content) if fd else ""

                # Add the URL and extracted content to the content list
                content_list.append({url: first_2500_words})

        # Store the result in the dictionary
        result_dict[title] = content_list

    return result_dict

result = selecting_best_blog(blog_df)
print(result)


{'Deep Learning Introduction & Neural Networks': [{'https://sebastianraschka.com/blog/2021/dl-course.html': ''}, {'https://www.dataquest.io/blog/tutorial-introduction-to-deep-learning/': 'Tutorial: Introduction to Deep Learning Dashboard Learning Path Catalog Full Catalog Career Paths Skill Paths Individual Courses Data Science Projects Success Stories Resources How to Learn Data Science A Better Way to Learn Understanding Data Roles Live Project Walkthroughs Learning Resources For Teams Sign In Start Free Profile Account Subscription Teams Help Logout March 31, 2023 Tutorial: Introduction to Deep Learning'}, {'https://www.datacamp.com/tutorial/introduction-to-deep-neural-networks': ''}, {'https://www.datacamp.com/blog/how-to-learn-deep-learning': ''}, {'https://towardsdatascience.com/intro-to-deep-learning-c025efd92535': ''}], 'Feedforward Neural Networks and Backpropagation': [{'https://jonaslalin.com/2021/12/10/feedforward-neural-networks-part-1/': 'Feedforward Neural Networks in De

In [13]:
#result['Django Models: Creating Data Structures']

In [14]:
'''from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

class BestURL(BaseModel):
    """Best Url """
    Title: str = Field(description="The name of the title")
    url: str = Field(description="the best url among the urls")

structured_llm = gemini_llm.with_structured_output(BestURL)

template = PromptTemplate(
  input_variables=['topic','distraction_tolerance','content'],
  template="Select the best 1 blog url on the topic {topic} by analysing the {content}"
)
prompt = template.format(topic='Django Models: Creating Data Structures',distraction_tolerance=5,content=result['Django Models: Creating Data Structures'])

print(structured_llm.invoke(prompt))'''

'from langchain.prompts import PromptTemplate\nfrom langchain_core.pydantic_v1 import BaseModel, Field\n\nclass BestURL(BaseModel):\n    """Best Url """\n    Title: str = Field(description="The name of the title")\n    url: str = Field(description="the best url among the urls")\n\nstructured_llm = gemini_llm.with_structured_output(BestURL)\n\ntemplate = PromptTemplate(\n  input_variables=[\'topic\',\'distraction_tolerance\',\'content\'],\n  template="Select the best 1 blog url on the topic {topic} by analysing the {content}"\n)\nprompt = template.format(topic=\'Django Models: Creating Data Structures\',distraction_tolerance=5,content=result[\'Django Models: Creating Data Structures\'])\n\nprint(structured_llm.invoke(prompt))'

In [15]:
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
import time

class BestURL(BaseModel):
  """Best Url """
  Title: str = Field(description="The name of the title")
  url: str = Field(description="the best url among the urls")

structured_llm = gemini_llm.with_structured_output(BestURL)

# Modified template to avoid multiple function calls
template = PromptTemplate(
  input_variables=['topic', 'distraction_tolerance', 'content'],
  template="Select the best blog 1 url on the topic {topic} by analyzing the {content}"
)

best_video_urls = {}

for topic, content in result.items():
  prompt = template.format(topic=topic, distraction_tolerance=5, content=content)

  try:
    response = structured_llm.invoke(prompt)

    # Check if response is None or not
    if response:
      if response.url.startswith("https://"):
        best_video_urls[topic] = {'Title': response.Title, 'blog_url': response.url}
      else:
        # Handle non-YouTube URLs (optional)
        # You can choose to ignore them, log them, or take other actions
        pass
    else:
      # Handle case where response is None (optional)
      best_video_urls[topic] = {'Title': topic, 'blog_url': content[0].keys()[0]}

  except:
    # Handle potential exceptions (optional)
    pass

best_blog_urls = best_video_urls.values()
print(best_blog_urls)


dict_values([{'Title': 'Deep Learning Introduction & Neural Networks', 'blog_url': 'https://www.dataquest.io/blog/tutorial-introduction-to-deep-learning/'}, {'Title': 'Feedforward Neural Networks and Backpropagation', 'blog_url': 'https://jonaslalin.com/2021/12/10/feedforward-neural-networks-part-1/'}, {'Title': 'CNNs for Image Recognition', 'blog_url': 'https://www.edge-ai-vision.com/2015/11/using-convolutional-neural-networks-for-image-recognition/'}, {'Title': 'Recurrent Neural Networks (RNNs) for Sequential Data', 'blog_url': 'https://neptune.ai/blog/recurrent-neural-network-guide'}, {'Title': 'Deep Learning Project: Image Classification with CNN', 'blog_url': 'https://www.analyticsvidhya.com/blog/2021/06/develop-your-first-image-classification-project-with-convolutional-neural-network/'}])


In [16]:
df = pd.DataFrame(best_blog_urls)
df

Unnamed: 0,Title,blog_url
0,Deep Learning Introduction & Neural Networks,https://www.dataquest.io/blog/tutorial-introdu...
1,Feedforward Neural Networks and Backpropagation,https://jonaslalin.com/2021/12/10/feedforward-...
2,CNNs for Image Recognition,https://www.edge-ai-vision.com/2015/11/using-c...
3,Recurrent Neural Networks (RNNs) for Sequentia...,https://neptune.ai/blog/recurrent-neural-netwo...
4,Deep Learning Project: Image Classification wi...,https://www.analyticsvidhya.com/blog/2021/06/d...


# Video Scraping

In [17]:
import requests
from google.colab import userdata

def request_video(topic: str, api_key: str):
    base_url = "https://www.googleapis.com/youtube/v3/search"
    params = {
        'part': 'snippet',
        'q': topic,
        'type': 'video',
        'maxResults': 5,
        'videoCaption': 'closedCaption',  # Filter for videos with captions
        'key': api_key
    }
    response = requests.get(base_url, params=params)
    response.raise_for_status()
    result = response.json()
    video_links = []
    for item in result.get('items', []):
        title = item['snippet']['title']
        video_id = item['id']['videoId']
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        video_links.append(video_url)
    return {topic: video_links}

def get_related_videos(study_plan, api_key: str):
    study_topics = list(study_plan.values())
    video_links = []
    for topic in study_topics:
        video_links.append(request_video(topic, api_key))
    return video_links

video_data = get_related_videos(study_plan, userdata.get('YOUTUBE_API_KEY'))
print(video_data)

[{'Deep Learning Introduction & Neural Networks': ['https://www.youtube.com/watch?v=aircAruvnKk', 'https://www.youtube.com/watch?v=jmmW0F0biz0', 'https://www.youtube.com/watch?v=q6kJ71tEYqM', 'https://www.youtube.com/watch?v=oV3ZY6tJiA0', 'https://www.youtube.com/watch?v=CqOfi41LfDw']}, {'Feedforward Neural Networks and Backpropagation': ['https://www.youtube.com/watch?v=Ilg3gGewQ5U', 'https://www.youtube.com/watch?v=S5AGN9XfPK4', 'https://www.youtube.com/watch?v=y0wNuFFPGuI', 'https://www.youtube.com/watch?v=jTzJ9zjC8nU', 'https://www.youtube.com/watch?v=CqOfi41LfDw']}, {'Convolutional Neural Networks (CNNs) for Image Recognition': ['https://www.youtube.com/watch?v=QzY57FaENXg', 'https://www.youtube.com/watch?v=K_BHmztRTpA', 'https://www.youtube.com/watch?v=pj9-rr1wDhM', 'https://www.youtube.com/watch?v=KuXjwB4LzSA', 'https://www.youtube.com/watch?v=CYvBjQTOdf4']}, {'Recurrent Neural Networks (RNNs) for Sequential Data': ['https://www.youtube.com/watch?v=AsNTP8Kwu80', 'https://www.you

In [18]:
def make_df_for_videos(video_data):
  keys = []
  values = []
  for dic in video_data:
    for k,v in dic.items():
      keys.append(k)
      values.append(v)
  df = pd.DataFrame({'Title':keys,'Videos':values})
  return df
make_df_for_videos(video_data)

Unnamed: 0,Title,Videos
0,Deep Learning Introduction & Neural Networks,"[https://www.youtube.com/watch?v=aircAruvnKk, ..."
1,Feedforward Neural Networks and Backpropagation,"[https://www.youtube.com/watch?v=Ilg3gGewQ5U, ..."
2,Convolutional Neural Networks (CNNs) for Image...,"[https://www.youtube.com/watch?v=QzY57FaENXg, ..."
3,Recurrent Neural Networks (RNNs) for Sequentia...,"[https://www.youtube.com/watch?v=AsNTP8Kwu80, ..."
4,Deep Learning Project: Image Classification wi...,"[https://www.youtube.com/watch?v=K_BHmztRTpA, ..."


In [19]:
video_df = make_df_for_videos(video_data)

In [1]:
!pip install youtube-transcript-api



In [21]:
from youtube_transcript_api import YouTubeTranscriptApi

def extract_video_id(video_link):
  """Extracts the video ID from a YouTube video link."""
  video_id = video_link.split("v=")[1]
  return video_id

def request_data_using_api(video_id):
  """Requests transcript data for a YouTube video using the YouTube Transcript API.

  Handles potential exceptions like disabled subtitles and generic errors.
  """
  try:
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
  except Exception as e:
    if "Subtitles are disabled for this video" in str(e):
      transcript = 'Subtitles are disabled for this video'
    else:
      transcript = f'An unexpected error occurred: {e}'
  return transcript

def extract_text(transcript):
  """Extracts text from the transcript data.

  If the transcript is a list, iterates through each dictionary and concatenates the text.
  Handles cases where the transcript is not a list (e.g., error message).

  Returns the first 1500 words of the extracted text and the full transcript.
  """
  if isinstance(transcript, list):
    video_text = ''
    word_count = 0
    for dictionary in transcript:
      data = dictionary['text'].strip()  # Remove leading/trailing whitespace
      words = data.split()
      word_count += len(words)
      video_text += ' '.join(words) + ' '  # Add space between sentences
      if word_count >= 1000:
        break
    return video_text, transcript
  else:
    video_text = ''
    return video_text, transcript

def return_first_1000_words(video_link):
  """Returns the first 1000 words of the transcript text and the full transcript.

  Calls the helper functions to extract video ID, request transcript data,
  and extract the desired portion of the text.
  """
  video_id = extract_video_id(video_link)
  transcript = request_data_using_api(video_id)
  first_1000_words, full_transcript = extract_text(transcript)
  return first_1000_words, full_transcript

# Example usage
video_link = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # Replace with your video link
first_1000_words, full_transcript = return_first_1000_words(video_link)

print(f"First 1000 words of transcript:\n{first_1000_words}")


First 1000 words of transcript:
[Music] we're no strangers to love you know the rules and so do I I full commitments while I'm thinking of you wouldn't get this from any other guy I just want to tell you how I'm feeling got to make you understand Never Going To Give You Up never going to let you down never going to run around and desert you never going to make you cry never going to say goodbye never going to tell a lie and hurt you we've known each other for so long your heart's been aching but your to sh to say it inside we both know what's been going on we know the game and we're going to playing and if you ask me how I'm feeling don't tell me you're too my you see Never Going To Give You Up never going to let you down never to run around and desert you never going to make you cry never going to say goodbye never going to tell a lie and hurt you never going to give you up never going to let you down never going to run around and desert you never going to make you cry never going to 

In [22]:
def prepare_data_for_best_video(video_df):
    # Initialize the dictionary to store the results
    result_dict = {}
    selecting_video = video_df[['Title', 'Videos']].to_dict('records')

    # Loop through each blog entry
    for dic in selecting_video:
        title = dic['Title']
        urls = dic['Videos']  # Assuming URLs are comma-separated strings

        # Initialize list to hold content for each URL under the same topic
        content_list = []

        for url in urls:
            url = url.strip()  # Remove any extra whitespace
            first_1500_words, full_transcript = return_first_1000_words(url)
            # Add the URL and extracted content to the content list
            content_list.append({url: first_1500_words})

        # Store the result in the dictionary
        result_dict[title] = content_list

    return result_dict

result_dict = prepare_data_for_best_video(video_df)
print(result_dict)



In [23]:
#result_dict['Django Basics: Setting up Environment']

In [24]:
'''from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

class BestURL(BaseModel):
    """Best Url """
    Title: str = Field(description="The name of the title")
    url: str = Field(description="the best url among the urls")

structured_llm = gemini_llm.with_structured_output(BestURL)

template = PromptTemplate(
  input_variables=['topic','distraction_tolerance','content'],
  template="Select the best 1 video url on the topic {topic} by analysing the {content}"
)
prompt = template.format(topic='Django Basics: Setting up Environment',distraction_tolerance=5,content=result_dict['Django Basics: Setting up Environment'])

print(structured_llm.invoke(prompt))'''

'from langchain.prompts import PromptTemplate\nfrom langchain_core.pydantic_v1 import BaseModel, Field\n\nclass BestURL(BaseModel):\n    """Best Url """\n    Title: str = Field(description="The name of the title")\n    url: str = Field(description="the best url among the urls")\n\nstructured_llm = gemini_llm.with_structured_output(BestURL)\n\ntemplate = PromptTemplate(\n  input_variables=[\'topic\',\'distraction_tolerance\',\'content\'],\n  template="Select the best 1 video url on the topic {topic} by analysing the {content}"\n)\nprompt = template.format(topic=\'Django Basics: Setting up Environment\',distraction_tolerance=5,content=result_dict[\'Django Basics: Setting up Environment\'])\n\nprint(structured_llm.invoke(prompt))'

In [25]:
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
import time

class BestURL(BaseModel):
  """Best Url """
  Title: str = Field(description="The name of the title")
  url: str = Field(description="the best url among the urls")

structured_llm = gemini_llm.with_structured_output(BestURL)

# Modified template to avoid multiple function calls
template = PromptTemplate(
  input_variables=['topic', 'distraction_tolerance', 'content'],
  template="Select the best 1 url on the topic {topic} by analyzing the {content}"
)

best_video_urls = {}

for topic, content in result_dict.items():
  prompt = template.format(topic=topic, distraction_tolerance=5, content=content)

  try:
    response = structured_llm.invoke(prompt)

    # Check if response is None or not
    if response:
      if response.url.startswith("https://www.youtube.com/watch?"):
        best_video_urls[topic] = {'Title': response.Title, 'youtube_url': response.url}
      else:
        # Handle non-YouTube URLs (optional)
        # You can choose to ignore them, log them, or take other actions
        pass
    else:
      # Handle case where response is None (optional)
      best_video_urls[topic] = {'Title': topic, 'youtube_url': content[0].keys()[0]}

  except:
    # Handle potential exceptions (optional)
    pass

best_video_urls = best_video_urls.values()
print(best_video_urls)


dict_values([{'Title': 'Deep Learning Introduction & Neural Networks', 'youtube_url': 'https://www.youtube.com/watch?v=aircAruvnKk'}, {'Title': 'Feedforward Neural Networks and Backpropagation', 'youtube_url': 'https://www.youtube.com/watch?v=S5AGN9XfPK4'}, {'Title': 'CNNs for Image Recognition', 'youtube_url': 'https://www.youtube.com/watch?v=K_BHmztRTpA'}, {'Title': 'Recurrent Neural Networks (RNNs) for Sequential Data', 'youtube_url': 'https://www.youtube.com/watch?v=Or9QSDqzOK0'}])


In [26]:
df2 = pd.DataFrame(best_video_urls)
df2

Unnamed: 0,Title,youtube_url
0,Deep Learning Introduction & Neural Networks,https://www.youtube.com/watch?v=aircAruvnKk
1,Feedforward Neural Networks and Backpropagation,https://www.youtube.com/watch?v=S5AGN9XfPK4
2,CNNs for Image Recognition,https://www.youtube.com/watch?v=K_BHmztRTpA
3,Recurrent Neural Networks (RNNs) for Sequentia...,https://www.youtube.com/watch?v=Or9QSDqzOK0


In [27]:
final_df = df.merge(df2, on='Title')
final_df

Unnamed: 0,Title,blog_url,youtube_url
0,Deep Learning Introduction & Neural Networks,https://www.dataquest.io/blog/tutorial-introdu...,https://www.youtube.com/watch?v=aircAruvnKk
1,Feedforward Neural Networks and Backpropagation,https://jonaslalin.com/2021/12/10/feedforward-...,https://www.youtube.com/watch?v=S5AGN9XfPK4
2,CNNs for Image Recognition,https://www.edge-ai-vision.com/2015/11/using-c...,https://www.youtube.com/watch?v=K_BHmztRTpA
3,Recurrent Neural Networks (RNNs) for Sequentia...,https://neptune.ai/blog/recurrent-neural-netwo...,https://www.youtube.com/watch?v=Or9QSDqzOK0
