In [3]:
# Load environment variables in a file called .env

import os
from dotenv import load_dotenv

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [5]:
# Step 1: Create your prompts

system_prompt = """
You are a snarky assistant that analyzes the contents of a website,
and provides a short, snarky, humorous summary, ignoring text that might be navigation related.
Respond in markdown. Do not wrap the markdown in a code block - respond just with the markdown.
"""
user_prompt = """
Here are the contents of a document
    Databricks Certified Data Engineer Professional — Complete Study
Guide (Expanded)
# Databricks Certified Data Engineer Professional — Complete Study Guide (Expanded)
This document is an expanded, exam-focused, hands-on friendly guide covering all essential topics for the Databricks
Certified Data Engineer Professional exam. It includes concepts, code-level examples, layman analogies, exam tips,
and practical checklists.
---
## PART 1 — FOUNDATIONS & ARCHITECTURE
### Lakehouse & Databricks Overview
- **Lakehouse** merges data lake flexibility with data warehouse management. Databricks implements the Lakehouse
using Delta Lake on cloud storage.
- **Control plane** (Databricks-managed): user interface, workspace, job orchestration, Unity Catalog control.
**Compute plane** (customer cloud): clusters, DBFS, cloud storage (S3/ADLS/GCS).
**Layman analogy:** Lakehouse = a modern library: raw manuscripts in basement (lake), curated books on shelves
(gold tables), librarian catalog (Unity Catalog).
**Exam tip:** Know the roles of control plane vs compute plane and where metadata lives.
---
### Delta Lake Core Concepts
- **ACID transactions**: transaction log (`_delta_log`) ensures commits are atomic and durable.
- **Delta log**: JSON + Parquet transaction log; time travel uses versions recorded here.
- **Time travel**: `SELECT * FROM table VERSION AS OF n` or `TIMESTAMP AS OF '...'`.
- **VACUUM**: removes old files; retains data for default retention; be careful with retention windows.
- **Schema enforcement** and **schema evolution**: `mergeSchema`, `overwriteSchema`, `addNewColumns`.
- **Optimistic Concurrency Control (OCC)**: concurrent writers create commits, Delta verifies against latest state and
throws conflicts.
**Commands:**
```
DESCRIBE HISTORY mydb.mytable;
RESTORE TABLE mydb.mytable TO VERSION AS OF 3;
VACUUM mydb.mytable RETAIN 168 HOURS;
ALTER TABLE mydb.mytable SET TBLPROPERTIES ('delta.enableChangeDataFeed' = true);
```
**Layman analogy:** Delta log is a journal of changes — like saved drafts + edits history for a document.
---
## PART 2 — CHANGE DATA: CDC & CDF
### CDC (Change Data Capture)
- Implemented typically via `MERGE INTO` operations using source snapshot or CDC flags.
- Pattern: dedupe source, identify inserts/updates/deletes, `MERGE` into target.
**Typical SQL MERGE pattern:**
```
MERGE INTO target t
USING source s
ON t.key = s.key
WHEN MATCHED AND THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
```
**Exam tip:** Understand how to pre-process source to avoid multiple matches (use ROW_NUMBER over partition).
### Change Data Feed (CDF)
- Enable per-table: `ALTER TABLE t SET TBLPROPERTIES (delta.enableChangeDataFeed = true)`.
- Read changes:
"""

from openai import OpenAI
openai = OpenAI()

# Step 2: Make the messages list
messages = [{"role": "system", "content": "Wonderful, you are a helpful assistant"},
{"role": "user", "content": "Here is the summary of the document: " + user_prompt}
] # fill this in

# Step 3: Call OpenAI
response = openai.chat.completions.create(model = "gpt-4.1-nano", messages = messages)

# Step 4: print the result
print(response.choices[0].message.content)

Certainly! Here's a concise summary of the document content you provided:

---

**Title:** Databricks Certified Data Engineer Professional — Complete Study Guide (Expanded)

**Overview:**
An exam-focused, practical guide covering essential concepts, examples, tips, and checklists for the Databricks Certified Data Engineer Professional exam.

---

### PART 1 — FOUNDATIONS & ARCHITECTURE

**Lakehouse & Databricks Overview:**
- Lakehouse combines data lake flexibility and data warehouse management via Delta Lake.
- **Control plane:** Managed by Databricks; includes UI, workspace, job orchestration, Unity Catalog.
- **Compute plane:** Customer cloud components; clusters, cloud storage (S3, ADLS, GCS).
- **Analogy:** Lakehouse = a library combining raw manuscripts (lake) and curated books (tables), with a librarian (Unity Catalog).
- **Exam tip:** Know roles of control vs compute plane and metadata location.

**Delta Lake Core Concepts:**
- **ACID Transactions:** Managed via transaction log

In [6]:
system_prompt = """
You are a snarky assistant that analyzes the contents of a website,
and provides a short, snarky, humorous summary, ignoring text that might be navigation related.
Respond in markdown. Do not wrap the markdown in a code block - respond just with the markdown.
"""

user_prompt = """
Here are the contents of a website.
Provide a short summary of this website.
If it includes news or announcements, then summarize these too.
"""


def messages_for(website):
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt + website}
    ]



In [15]:
# Let's try out this utility

from scraper import fetch_website_contents

ka= fetch_website_contents("https://blog.khanacademy.org/")
print(ka)

ModuleNotFoundError: No module named 'scraper'

In [None]:
messages_for("https://blog.khanacademy.org/")

In [None]:
def summarize(url):
    website = fetch_website_contents(url)  #pyright: ignore[reportUndefinedVariable]
    response = openai.chat.completions.create(model = "gpt-4.1-nano", messages = messages_for(website))
    return response.choices[0].message.content


In [None]:
summarize("https://blog.khanacademy.org/")

In [None]:
def display_summary(url):
    summary = summarize(url)
    display(Markdown(summary))

In [None]:
display_summary("https://blog.khanacademy.org/")