# Scrape the HTML content from a website

## Step 1: Extract the content using the requests module

In [None]:
import requests

# Fetch the article. The timeout keeps the cell from hanging on an
# unresponsive server, and raise_for_status() stops here on a 4xx/5xx reply
# instead of handing an error page to the parsing steps.
response = requests.get(
    url="https://blog.londonappbrewery.com/how-a-doctor-ran-a-tech-startup-while-working-80-hour-weeks-d47e7b4988cb",
    timeout=30,
)
response.raise_for_status()
print(response.text)


## Step 2: Use BeautifulSoup (bs4) to parse the HTML and format it with prettify

In [None]:
from bs4 import BeautifulSoup
# Parse the downloaded markup with the "html.parser" backend, then print an
# indented rendering of the resulting tree.
page_markup = response.text
soup = BeautifulSoup(page_markup, "html.parser")
print(soup.prettify())



### a. Use the find_all method to extract only the necessary elements and store them in a list

In [None]:
# Heading levels 1-3 and paragraphs are the elements kept from the page.
tags_list = ["h1", "h2", "h3", "p"]
li = soup.find_all(name=tags_list)


### b. Using the "li" list, extract the text of each element and store it in the list "website"

In [None]:
# Collect the text of every matched element, in document order.
website = [element.get_text() for element in li]
website

# Summarize the scraped data with a local LLM

## Set up Ollama and create the client


In [None]:
# Check that the local server on port 11434 answers before going further.
# The timeout makes the check fail fast instead of blocking indefinitely when
# something accepts the connection but never replies.
requests.get("http://localhost:11434", timeout=5).content

In [None]:
# Base URL for API calls: the local server address plus the /v1 path prefix.
OLLAMA_BASE_URL = "http://localhost:11434" + "/v1"

In [None]:
from openai import OpenAI
# OpenAI client pointed at the local base URL instead of the hosted API.
# NOTE(review): the api_key value looks like a placeholder for the local
# server rather than a real secret - confirm.
ollama = OpenAI(
    api_key="ollama",
    base_url=OLLAMA_BASE_URL,
)

In [None]:
# Define our system prompt - you can experiment with this later, e.g. by
# changing the last sentence to "Respond in markdown in Spanish."
#
# The prompt is sent to the model verbatim, so it must not contain unfilled
# template placeholders or an instruction to wait for more input: the text to
# summarize arrives in the user message of the same request.

system_prompt = """
You are a professional content summarizer. Your task is to read the text provided by the user and generate a concise, clear, and accurate summary. Follow these rules:

1. Identify and include only the main ideas and key points.
2. Remove any redundant, trivial, or irrelevant information.
3. Preserve the original meaning and tone of the text.
4. Make the summary easy to read and understand.
5. Provide the summary as a short paragraph unless the user asks for bullet points or a numbered list.
6. Keep the summary concise, ideally under 150 words unless the user specifies another length.

Respond in markdown. Do not wrap the markdown in a code block - respond just with the markdown.
"""

In [None]:
# Preamble of the user message; the page text is appended directly after it.

user_prompt_prefix = (
    "\n"
    "Here are the contents of a website.\n"
    "Provide a short summary of this website.\n"
    "If it includes news or announcements, then summarize these too.\n"
    "\n"
)

In [None]:
# Build the two-message chat payload: system instructions, then the user turn.

def messages_for(website):
    """Return the chat messages used to summarize the given website text.

    The system message carries ``system_prompt``; the user message is
    ``user_prompt_prefix`` followed directly by ``website``.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_prefix + website}
    return [system_message, user_message]

In [None]:
# Join the extracted fragments one per line so heading and paragraph
# boundaries stay visible to the model; a comma separator would blend them
# with the commas already present in the prose.
website_content = "\n".join(website)
website_content

In [None]:
# Preview the message list built from the joined page text.
messages_for(website_content)

In [None]:
# # And now: call the OpenAI API. You will get very familiar with this!

# def summarize(url):
#     response = ollama.chat.completions.create(
#         model = "gpt-4.1-mini",
#         messages = messages_for(content)
#     )
#     return response.choices[0].message.content

In [None]:
# Ask the local model for the summary. The result gets its own name so the
# HTTP `response` from the scraping step is not overwritten; rebinding it
# would break re-running the parsing cell, which reads `response.text`.
summary_response = ollama.chat.completions.create(
    model="mistral:latest",
    messages=messages_for(website=website_content),
)

summary_response.choices[0].message.content

In [None]:
# Quick sanity check of the model with a one-off prompt. The result is stored
# under its own name so the HTTP `response` from the scraping step (read via
# `response.text` by the parsing cell) is left intact.
fun_fact_response = ollama.chat.completions.create(
    model="mistral:latest",
    messages=[{"role": "user", "content": "Tell me a fun fact"}],
)

fun_fact_response.choices[0].message.content