In [None]:
# Install the OpenAI SDK and the Hugging Face `datasets` package used by later cells.
pip install openai datasets

In [None]:
import os

import openai

# Configure the module-level API key from the environment so the secret never
# appears in the notebook. `openai` has to be imported in this cell: without
# the import the assignment below raises NameError on a fresh kernel.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Data Processing

In [None]:
import json

# Load the training file into `data`; later cells index into `data` directly.
with open("/content/train_spider.json") as f:
    data = json.load(f)
# Pretty-print the first record to inspect its layout.
print(json.dumps(data[0], indent=2))


In [None]:
import json

# Read the file whose records are about to be checked.
# NOTE(review): this path is the training file even though the variable is
# named `dev_data`; the name is kept because it is notebook-global state.
with open("/content/train_spider.json", "r") as f:
    dev_data = json.load(f)

# Keys every record must carry.
required_fields = ["db_id", "question", "query"]

# Collect (position, record) pairs for records missing any required key.
incomplete_entries = [
    (position, record)
    for position, record in enumerate(dev_data)
    if any(field not in record for field in required_fields)
]

# Report the count and show at most three offending records.
print(f"Total incomplete/malformed entries found: {len(incomplete_entries)}")
if len(incomplete_entries) > 0:
    print("First few problematic entries:")
    for idx, item in incomplete_entries[:3]:
        print(f"\nIndex {idx}: {item}")


In [None]:
import json

# Read the dev file whose records are about to be checked.
with open("/content/dev.json", "r") as f:
    dev_data = json.load(f)

# Keys every record must carry.
required_fields = ["db_id", "question", "query"]


def _has_all_required(record):
    """Return True when `record` contains every key in `required_fields`."""
    return all(field in record for field in required_fields)


# Keep (position, record) for each record that fails the check.
incomplete_entries = [
    (position, record)
    for position, record in enumerate(dev_data)
    if not _has_all_required(record)
]

# Report the count and show at most three offending records.
print(f"Total incomplete/malformed entries found: {len(incomplete_entries)}")
if len(incomplete_entries) > 0:
    print("First few problematic entries:")
    for idx, item in incomplete_entries[:3]:
        print(f"\nIndex {idx}: {item}")


In [None]:
# Install dependencies (if running in Colab or new env)
!pip install openai sqlparse

import os
import json
import sqlparse
from openai import OpenAI
from getpass import getpass

# Secure API key handling (never hardcode it): use the environment variable
# when it is set and non-empty, otherwise prompt without echoing the input.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    api_key = getpass("Enter your OpenAI API key: ")
client = OpenAI(api_key=api_key)

# ------------------ SQL Normalization ------------------
def normalize_sql(query):
    """Re-render the first parsed statement of `query` with lower-case keywords.

    Comments are stripped by the formatter. If parsing or formatting raises
    for any reason, the input is returned unchanged.
    """
    try:
        statements = sqlparse.parse(query)
        first_statement = str(statements[0])
        return sqlparse.format(
            first_statement,
            keyword_case='lower',
            strip_comments=True,
        )
    except Exception:
        # Best effort: hand back the raw text rather than failing the caller.
        return query

# ------------------ Schema Prompt Generation ------------------
def get_schema_prompt(db_id, tables_data):
    """Render the schema of database `db_id` as prompt text.

    Args:
        db_id: Identifier to look up among the entries of `tables_data`.
        tables_data: Iterable of dicts with keys `db_id`,
            `table_names_original`, `column_names_original` (pairs of
            table index and column name) and optionally `primary_keys`
            (column indices) and `foreign_keys` (pairs of column indices).

    Returns:
        A newline-joined description ("Table:" / "Columns:" lines, then
        optional "Primary Keys:" and "Foreign Keys:" lines) for the first
        matching entry, or "Schema for <db_id> not found." when none matches.
    """
    for db_schema in tables_data:
        if db_schema['db_id'] != db_id:
            continue

        tbl_names = db_schema["table_names_original"]
        cols = db_schema["column_names_original"]
        primary = db_schema.get("primary_keys", [])
        foreign = db_schema.get("foreign_keys", [])

        # Group column names under their owning table; index -1 is skipped.
        columns_by_table = {name: [] for name in tbl_names}
        for owner_idx, col in cols:
            if owner_idx == -1:
                continue
            columns_by_table[tbl_names[owner_idx]].append(col)

        lines = []
        for name in tbl_names:
            lines.append(f"Table: {name}")
            lines.append("Columns: " + ", ".join(columns_by_table[name]))

        if primary:
            rendered_pk = [
                f"{tbl_names[cols[col_idx][0]]}({cols[col_idx][1]})"
                for col_idx in primary
            ]
            lines.append("Primary Keys: " + ", ".join(rendered_pk))

        if foreign:
            rendered_fk = []
            for src_idx, dst_idx in foreign:
                src_tbl, src_col = cols[src_idx]
                dst_tbl, dst_col = cols[dst_idx]
                rendered_fk.append(
                    f"{tbl_names[src_tbl]}({src_col}) → {tbl_names[dst_tbl]}({dst_col})"
                )
            lines.append("Foreign Keys: " + ", ".join(rendered_fk))

        return "\n".join(lines)

    return f"Schema for {db_id} not found."

# ------------------ JSONL Builder ------------------
def prepare_jsonl(train_path, tables_path, output_jsonl):
    """Convert training examples into a chat-format JSONL fine-tuning file.

    Each example becomes one JSON line with a system message (instructions
    plus the database schema), a user message (the question) and an assistant
    message (the normalized SQL).

    Args:
        train_path: Path to a JSON list of examples with `db_id`, `question`
            and `query` keys.
        tables_path: Path to the JSON schema descriptions passed to
            `get_schema_prompt`.
        output_jsonl: Destination path; overwritten if it exists.

    Returns:
        `output_jsonl`, unchanged.
    """
    # Explicit UTF-8 so reading does not depend on the platform default encoding.
    with open(train_path, 'r', encoding='utf-8') as f:
        train_data = json.load(f)
    with open(tables_path, 'r', encoding='utf-8') as f:
        tables_data = json.load(f)

    # Many examples share a database; render each schema only once instead of
    # re-scanning `tables_data` for every example.
    schema_cache = {}

    jsonl_data = []
    for example in train_data:
        db_id = example['db_id']
        question = example['question']
        sql = normalize_sql(example['query'])

        if db_id not in schema_cache:
            schema_cache[db_id] = get_schema_prompt(db_id, tables_data)
        schema_str = schema_cache[db_id]

        system_prompt = (
            "You are a SQL query generator. Given a natural language question and database schema, "
            "generate the correct SQL query. Only return the SQL query.\n\n"
            f"Database Schema:\n{schema_str}\n"
        )

        jsonl_data.append({
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question},
                {"role": "assistant", "content": sql}
            ]
        })

    with open(output_jsonl, 'w', encoding='utf-8') as f:
        for entry in jsonl_data:
            f.write(json.dumps(entry) + '\n')

    print(f"✅ Prepared {len(jsonl_data)} samples → {output_jsonl}")
    return output_jsonl

In [None]:
# Load the schema descriptions once for the quick checks below.
with open("/content/tables.json") as f:
    tables_data = json.load(f)

from pprint import pprint
# Render the schema for the first training example's database.
# Relies on `data` from an earlier cell and on the two-argument
# `get_schema_prompt(db_id, tables_data)` defined above.
db_id = data[0]['db_id']
schema_str = get_schema_prompt(db_id, tables_data)
pprint(schema_str)

In [None]:
# ------------------ Run Script ------------------

# Input and output locations for dataset preparation.
train_file = "/content/train_spider.json"     # Update if different path
tables_file = "/content/tables.json"          # Update if different path
# `output_jsonl` is also read by the fine-tuning launch cell further down.
output_jsonl = "/content/spider_finetune.jsonl"

# Step 1: Prepare dataset
prepare_jsonl(train_file, tables_file, output_jsonl)

In [None]:
# Illustrative chat-format record built from the first training example.
# NOTE(review): the system message here is the bare schema string, whereas
# prepare_jsonl() wraps the schema in an instruction prompt and normalizes
# the SQL, so this is not byte-identical to a line of the generated file.
prepared_example = {
    "messages": [
        {"role": "system", "content": schema_str},
        {"role": "user", "content": data[0]['question']},
        {"role": "assistant", "content": data[0]['query']}
    ]
}
print(json.dumps(prepared_example, indent=2))

In [None]:
# Show the first training query before and after normalize_sql().
print("Original SQL:\n", data[0]['query'])
print("Normalized SQL:\n", normalize_sql(data[0]['query']))

In [None]:
# Report how many examples each file contains.
print("Train examples:", len(data))
# Rebinds `dev_data` to the contents of dev.json.
with open("/content/dev.json") as f:
    dev_data = json.load(f)
print("Validation examples:", len(dev_data))

# Fine-Tune the Model

In [None]:
# ------------------ Fine-tuning Launch ------------------
def launch_finetune(jsonl_path, model="gpt-4o-mini-2024-07-18", n_epochs=4, lr_mult=0.05):
    """Upload a training file and start a fine-tuning job on it.

    Uses the notebook-global `client`.

    Args:
        jsonl_path: Path of the JSONL training file to upload.
        model: Base model name to fine-tune.
        n_epochs: Value sent as the `n_epochs` hyperparameter.
        lr_mult: Value sent as the `learning_rate_multiplier` hyperparameter.

    Returns:
        The ID of the created fine-tuning job.
    """
    with open(jsonl_path, 'rb') as training_fh:
        uploaded = client.files.create(file=training_fh, purpose="fine-tune")
    print(f"📤 Uploaded training file: {uploaded.id}")

    hyperparams = {
        "n_epochs": n_epochs,
        "learning_rate_multiplier": lr_mult,
    }
    job = client.fine_tuning.jobs.create(
        training_file=uploaded.id,
        model=model,
        hyperparameters=hyperparams,
    )

    finetune_job_id = job.id
    print(f"🚀 Fine-tuning started: Job ID = {finetune_job_id}")
    print(f"📊 Monitor at: https://platform.openai.com/finetune/{finetune_job_id}")
    return finetune_job_id

In [None]:
# Step 2: Fine-tune GPT-4o-mini
# Uploads the file prepared in step 1 and keeps the returned job ID in `job_id`.
job_id = launch_finetune(output_jsonl)

In [None]:
from openai import OpenAI
from getpass import getpass
# Rebinds the notebook-global `client`, prompting for the key again.
client = OpenAI(api_key=getpass("Enter your OpenAI API key: "))
# NOTE(review): the job ID is pasted in by hand and is not taken from the
# `job_id` returned by launch_finetune(); confirm it is the intended job.
fine_tune_id = "ftjob-AmI5FbeFOPSdYwXxt6mtyNiN"  # Replace with the ID from the output
# Fetch and print the current state of the fine-tuning job.
status = client.fine_tuning.jobs.retrieve(fine_tune_id)
print(status)

# Inference

In [None]:
import os
from getpass import getpass

from openai import OpenAI

# Quick smoke test of the fine-tuned model.
# The key comes from the environment (or a hidden prompt) rather than an empty
# string literal: an empty key makes every request fail authentication, and
# because `client` is notebook-global it would also break later cells that
# reuse it without creating their own client.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: "))
response = client.chat.completions.create(
    model="ft:gpt-4o-mini-2024-07-18:sjsu::BV5M5MaD",
    messages=[
        {"role": "system", "content": "You are a SQL query generator. Given a natural language question and a database schema, generate the correct SQL query. The schema is provided below:\n\nDatabase: department\nSchema details are contextually inferred from the question.\n\nReturn only the SQL query without any explanation."},
        {"role": "user", "content": "Find all courses offered by the Computer Science department."}
    ]
)
print(response.choices[0].message.content)

In [None]:

from sqlparse.tokens import Keyword, Name, String
from getpass import getpass
import json
import pandas as pd
from openai import OpenAI
from datasets import load_dataset

# Evaluate the fine-tuned model

In [None]:
# Fine-tuning job ID. NOTE(review): not referenced by any code visible in this notebook.
FINE_TUNE_ID = "ftjob-M8mm3GxzFuIGqxzU82YRUFZh"
# Model name that main() passes to evaluate_model() in this cell.
FINE_TUNED_MODEL = "ft:gpt-4o-mini-2024-07-18:sjsu::BV5M5MaD"

# Load Spider validation dataset
def load_spider_validation():
    """Load the "spider" dataset via `load_dataset` and return its validation split."""
    spider = load_dataset("spider")
    validation_split = spider['validation']
    return validation_split

def get_schema_prompt(db_id, tables_data=None):
    """Generate a schema prompt for `db_id` from Spider-format table metadata.

    This definition replaces the earlier two-argument function of the same
    name once the cell runs. `tables_data` is therefore accepted (optionally)
    so that existing two-argument calls keep working, and so callers can pass
    already-loaded metadata instead of re-reading the file on every call.
    Unlike the earlier function, the output lists tables and columns only
    (no primary/foreign key lines).

    Args:
        db_id: Database identifier to look up.
        tables_data: Optional list of schema dicts. When None, the list is
            loaded from /content/tables.json.

    Returns:
        "Table:" / "Columns:" lines joined by newlines for the first matching
        entry; a "Schema not found" message when no entry matches; or an
        "Error loading schema ..." message when anything raises.
    """
    try:
        if tables_data is None:
            with open('/content/tables.json', 'r') as f:
                tables_data = json.load(f)

        for table in tables_data:
            if table['db_id'] != db_id:
                continue

            table_names = table["table_names_original"]
            column_names = table["column_names_original"]

            # Map each table name to its columns; index -1 is skipped.
            table_column_map = {t: [] for t in table_names}
            for table_idx, col_name in column_names:
                if table_idx != -1:
                    table_column_map[table_names[table_idx]].append(col_name)

            # Construct schema string
            schema_lines = []
            for table_name in table_names:
                schema_lines.append(f"Table: {table_name}")
                schema_lines.append("Columns: " + ", ".join(table_column_map[table_name]))

            return "\n".join(schema_lines)

        return f"Database: {db_id}\nSchema not found in tables.json"
    except Exception as e:
        # Deliberate best effort: the message is returned in place of a schema.
        return f"Error loading schema for {db_id}: {e}"


import sqlparse
import pandas as pd
from sqlparse.tokens import Keyword, Name, String, Number, Punctuation, Operator

def normalize_sql(query):
    """Normalize `query` for comparison.

    Keyword and name tokens of the first parsed statement are lower-cased and
    every other token is kept verbatim; runs of whitespace are then collapsed
    to single spaces and trailing semicolons removed. If anything raises, the
    error is printed and a stripped, semicolon-trimmed, fully lower-cased copy
    of the input is returned instead.
    """
    try:
        statement = sqlparse.parse(query)[0]

        pieces = []
        for leaf in statement.flatten():  # leaf tokens, including nested ones
            is_word = leaf.ttype in Keyword or leaf.ttype in Name
            pieces.append(leaf.value.lower() if is_word else leaf.value)

        # Collapse whitespace, then drop any trailing semicolon.
        collapsed = ' '.join(''.join(pieces).split())
        return collapsed.strip().rstrip(";")
    except Exception as e:
        print(f"Error normalizing SQL: {e}")
        return query.strip().rstrip(";").lower()


def evaluate_model(validation_data, model_id, max_examples=None):
    """Evaluate the fine-tuned model on the validation set with case-insensitive matching.

    For each example a schema prompt is built, the model is queried through
    the notebook-global `client`, and the normalized prediction is compared
    with the normalized ground truth. Per-example records are written to
    /content/evaluation_results_FINETUNED.csv.

    Args:
        validation_data: Iterable of items with `db_id`, `question`, `query`.
        model_id: Model name passed to the chat completions API.
        max_examples: Stop after this many examples; a falsy value means no limit.

    Returns:
        (exact_match_accuracy, results), where `results` is the list of
        per-example dicts that was also written to the CSV.
    """
    exact_match_count = 0
    total_examples = 0
    results = []

    for item in validation_data:
        if max_examples and total_examples >= max_examples:
            break

        db_id = item['db_id']
        question = item['question']
        ground_truth_sql = item['query']

        # Prepare system prompt
        # NOTE(review): this wording differs from the system prompt that
        # prepare_jsonl() writes into the training file — confirm that the
        # mismatch between training and evaluation prompts is intended.
        schema_prompt = get_schema_prompt(db_id)
        system_prompt = (
            "You are a SQL query generator. Given a natural language question and a database schema, "
            "generate the correct SQL query. The schema is provided below:\n\n"
            f"{schema_prompt}\n\n"
            "Return only the SQL query without any explanation."
        )

        # Generate predicted SQL query
        try:
            response = client.chat.completions.create(
                model=model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question}
                ],
                max_tokens=200,
                temperature=0.0
            )
            predicted_sql = response.choices[0].message.content.strip()
        except Exception as e:
            # A failed request is recorded as an empty prediction (a mismatch).
            print(f"Error generating SQL for question '{question}': {e}")
            predicted_sql = ""

        # Normalize both queries once, then compare.
        normalized_predicted = normalize_sql(predicted_sql) if predicted_sql else ""
        normalized_ground_truth = normalize_sql(ground_truth_sql)
        is_exact_match = normalized_predicted == normalized_ground_truth

        if is_exact_match:
            exact_match_count += 1
        else:
            # Log mismatch for debugging
            print(f"Mismatch for question '{question}':")
            print(f"Ground Truth: {normalized_ground_truth}")
            print(f"Predicted: {normalized_predicted}")

        # Store results. `db_id` is recorded explicitly so downstream analysis
        # does not have to recover it from the schema prompt text.
        results.append({
            "db_id": db_id,
            "question": question,
            "ground_truth_sql": ground_truth_sql,
            "predicted_sql": predicted_sql,
            "normalized_ground_truth": normalized_ground_truth,
            "normalized_predicted": normalized_predicted,
            "is_exact_match": is_exact_match,
            "schema_prompt": schema_prompt
        })

        total_examples += 1
        if total_examples % 10 == 0:
            print(f"Processed {total_examples} examples...")

    # Calculate accuracy
    exact_match_accuracy = exact_match_count / total_examples if total_examples > 0 else 0
    print(f"\nEvaluation complete!")
    print(f"Total examples: {total_examples}")

    # Save results to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv("/content/evaluation_results_FINETUNED.csv", index=False)
    print("Results saved to /content/evaluation_results_FINETUNED.csv")

    print(f"Correct predictions after normalization: {exact_match_count}/{total_examples}")
    print(f"Accuracy: {exact_match_accuracy:.2%}")

    return exact_match_accuracy, results

def main():
    """Evaluate FINE_TUNED_MODEL on at most 100 validation examples."""
    print("Ensure tables.json is uploaded for accurate schema details.")
    print("Loading Spider validation dataset...")
    spider_validation = load_spider_validation()

    print(f"Evaluating model {FINE_TUNED_MODEL}...")
    # Capped at 100 examples for testing; the returned values are not used here.
    evaluate_model(spider_validation, FINE_TUNED_MODEL, max_examples=100)

    # NOTE(review): only this message is printed; no download is triggered here.
    print("Downloading evaluation results...")


if __name__ == "__main__":
    main()

# Evaluate Base model

In [None]:
import json
import pandas as pd
from openai import OpenAI
from datasets import load_dataset
from google.colab import files
import sqlparse
from sqlparse.tokens import Keyword, Name, String
from getpass import getpass

# Prompt for OpenAI API key (input is not echoed)
api_key = getpass("Enter your OpenAI API key: ")

# Initialize OpenAI client; this rebinds the notebook-global `client`
# that evaluate_model() below reads.
client = OpenAI(api_key=api_key)

# Base model ID, passed to evaluate_model() by main() in this cell
BASE_MODEL = "gpt-4o-mini-2024-07-18"

# Load Spider validation dataset
def load_spider_validation():
    """Return the validation split of the "spider" dataset loaded via `load_dataset`."""
    return load_dataset("spider")['validation']

def get_schema_prompt(db_id, tables_data=None):
    """Generate a schema prompt for `db_id` from Spider-format table metadata.

    `tables_data` is optional: pass already-loaded metadata to avoid
    re-reading /content/tables.json on every call, and to stay compatible
    with the two-argument call made by prepare_jsonl() in an earlier cell.
    The output lists tables and columns only (no key information).

    Args:
        db_id: Database identifier to look up.
        tables_data: Optional list of schema dicts. When None, the list is
            loaded from /content/tables.json.

    Returns:
        "Table:" / "Columns:" lines joined by newlines for the first matching
        entry; a "Schema not found." message when no entry matches; or an
        "Error loading schema ..." message when anything raises.
    """
    try:
        if tables_data is None:
            with open('/content/tables.json', 'r') as f:
                tables_data = json.load(f)

        for table in tables_data:
            if table['db_id'] != db_id:
                continue

            table_names = table['table_names_original']
            column_names = table['column_names_original']

            table_column_map = {t: [] for t in table_names}
            for table_idx, col_name in column_names:
                if table_idx != -1:  # -1 means it's "*"
                    table_column_map[table_names[table_idx]].append(col_name)

            schema_lines = []
            for table_name in table_names:
                schema_lines.append(f"Table: {table_name}")
                schema_lines.append("Columns: " + ", ".join(table_column_map[table_name]))

            return "\n".join(schema_lines)

        return f"Database: {db_id}\nSchema not found."
    except Exception as e:
        # Deliberate best effort: the message is returned in place of a schema.
        return f"Error loading schema for {db_id}: {e}"


def normalize_sql(query):
    """Normalize a SQL query for case-insensitive comparison.

    Keyword and identifier tokens are lower-cased, other tokens (string
    literals, numbers, operators, punctuation) are kept verbatim, whitespace
    runs are collapsed to one space and trailing semicolons are removed.

    The statement is walked with `flatten()` so that tokens nested inside
    grouped nodes are reached; iterating only the top-level token list leaves
    grouped identifiers untouched. This also matches the normalization used
    when evaluating the fine-tuned model, so both models are scored the same
    way. The earlier top-level "AS" skipping is intentionally not reproduced.

    Returns:
        The normalized text. If parsing fails, the error is printed and a
        whitespace-collapsed, semicolon-trimmed, lower-cased copy of the
        input is returned (non-string input is returned unchanged).
    """
    try:
        parsed = sqlparse.parse(query)[0]
        normalized_tokens = []

        for token in parsed.flatten():
            if token.ttype in Keyword or token.ttype in Name:
                normalized_tokens.append(token.value.lower())
            else:
                normalized_tokens.append(token.value)

        # Join tokens, clean up extra whitespace, drop trailing semicolons.
        normalized = ' '.join(''.join(normalized_tokens).split())
        return normalized.rstrip(';').strip()
    except Exception as e:
        print(f"Error normalizing SQL '{query}': {e}")
        if not isinstance(query, str):
            return query
        return ' '.join(query.split()).rstrip(';').strip().lower()

def evaluate_model(validation_data, model_id, max_examples=None):
    """Evaluate the model on the validation set.

    For each example a schema prompt is built, the model is queried through
    the notebook-global `client`, and the normalized prediction is compared
    with the normalized ground truth. Per-example records are written to
    /content/base_evaluation_results_basemodel.csv.

    Args:
        validation_data: Iterable of items with `db_id`, `question`, `query`.
        model_id: Model name passed to the chat completions API.
        max_examples: Stop after this many examples; a falsy value means no limit.

    Returns:
        (exact_match_accuracy, results), where `results` is the list of
        per-example dicts that was also written to the CSV.
    """
    exact_match_count = 0
    total_examples = 0
    results = []

    for item in validation_data:
        if max_examples and total_examples >= max_examples:
            break

        db_id = item['db_id']
        question = item['question']
        ground_truth_sql = item['query']

        # Prepare system prompt. get_schema_prompt() reports its own failures
        # as text instead of raising, so no FileNotFoundError handler is
        # needed around this call (the previous one could never trigger).
        schema_prompt = get_schema_prompt(db_id)

        system_prompt = (
            "You are a SQL query generator. Given a natural language question and a database schema, "
            "generate the correct SQL query. The schema is provided below:\n\n"
            f"{schema_prompt}\n\n"
            "Return only the SQL query without any explanation."
        )

        # Generate predicted SQL query
        try:
            response = client.chat.completions.create(
                model=model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question}
                ],
                max_tokens=200,
                temperature=0.0
            )
            predicted_sql = response.choices[0].message.content.strip()
        except Exception as e:
            # A failed request is recorded as an empty prediction (a mismatch).
            print(f"Error generating SQL for question '{question}': {e}")
            predicted_sql = ""

        # Normalize SQL queries for comparison
        normalized_predicted = normalize_sql(predicted_sql) if predicted_sql else ""
        normalized_ground_truth = normalize_sql(ground_truth_sql)

        # Check exact match
        is_exact_match = normalized_predicted == normalized_ground_truth
        if is_exact_match:
            exact_match_count += 1

        # Store results. `db_id` is recorded explicitly so downstream analysis
        # does not have to recover it from the schema prompt text.
        results.append({
            "db_id": db_id,
            "question": question,
            "ground_truth_sql": ground_truth_sql,
            "predicted_sql": predicted_sql,
            "normalized_ground_truth": normalized_ground_truth,
            "normalized_predicted": normalized_predicted,
            "is_exact_match": is_exact_match,
            "schema_prompt": schema_prompt
        })

        total_examples += 1
        if total_examples % 10 == 0:
            print(f"Processed {total_examples} examples...")

    # Calculate accuracy
    exact_match_accuracy = exact_match_count / total_examples if total_examples > 0 else 0
    print(f"\nEvaluation complete!")
    print(f"Total examples: {total_examples}")
    print(f"Exact match accuracy: {exact_match_accuracy:.4f} ({exact_match_count}/{total_examples})")

    # Save results to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv("/content/base_evaluation_results_basemodel.csv", index=False)
    print("Results saved to /content/base_evaluation_results_basemodel.csv")

    return exact_match_accuracy, results

def main():
    """Evaluate BASE_MODEL on at most 100 validation examples.

    Any exception raised while loading or evaluating is printed, not propagated.
    """
    print("Ensure tables.json is uploaded to /content/tables.json for accurate schema details.")
    try:
        print("Loading Spider validation dataset...")
        spider_validation = load_spider_validation()

        print(f"Evaluating base model {BASE_MODEL}...")
        # Capped at 100 examples for testing; the returned values are not used here.
        evaluate_model(spider_validation, BASE_MODEL, max_examples=100)

        # NOTE(review): only this message is printed; no download is triggered here.
        print("Downloading evaluation results...")
    except Exception as err:
        print(f"Error during evaluation: {err}")


if __name__ == "__main__":
    main()

# Evaluation Script for comparison

In [None]:
# Extract the Spider archive; later cells read from /content/database/spider_data/.
!unzip /content/spider_data.zip -d /content/database

In [None]:
import pandas as pd
import sqlite3
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Paths to your evaluation CSVs (written by the two evaluate_model() cells above)
finetuned_csv_path = "/content/evaluation_results_FINETUNED.csv"
base_csv_path = "/content/base_evaluation_results_basemodel.csv"

# Path to the root Spider database directory.
# evaluate_file() looks for <DB_ROOT>/<db_id>/<db_id>.sqlite beneath it.
DB_ROOT = "/content/database/spider_data/test_database"

# --- Helper functions ---

def compute_bleu(reference_sql, predicted_sql):
    """Return the smoothed sentence-level BLEU of `predicted_sql` against `reference_sql`.

    Both queries are stripped, lower-cased and split on whitespace before scoring.
    """
    def _tokens(sql):
        return sql.strip().lower().split()

    reference = _tokens(reference_sql)
    hypothesis = _tokens(predicted_sql)
    return sentence_bleu(
        [reference],
        hypothesis,
        smoothing_function=SmoothingFunction().method1,
    )

def run_query(query, db_path):
    """Run SQL query on a SQLite database.

    Args:
        query: SQL text to execute.
        db_path: Path to an existing SQLite file (or ":memory:").

    Returns:
        The list of rows from `fetchall()` on success, or the string
        "ERROR: <message>" when anything fails.
    """
    import os
    from contextlib import closing

    try:
        # sqlite3.connect() creates a new empty database when the path does
        # not exist; refuse instead so a wrong path is reported as an error
        # rather than leaving stray files behind.
        if db_path != ":memory:" and not os.path.isfile(db_path):
            raise FileNotFoundError(f"database file not found: {db_path}")
        # closing() guarantees the connection is released even if the query raises.
        with closing(sqlite3.connect(db_path)) as conn:
            return conn.execute(query).fetchall()
    except Exception as e:
        return f"ERROR: {e}"

def is_execution_match(predicted_sql, ground_truth_sql, db_path):
    """Return True when both queries execute and yield the same rows (order ignored).

    A query that fails never matches: run_query() reports failures as
    strings, and comparing two identical error strings must not count as
    agreement between the queries.
    """
    predicted_rows = run_query(predicted_sql, db_path)
    gold_rows = run_query(ground_truth_sql, db_path)

    if isinstance(predicted_rows, str) or isinstance(gold_rows, str):
        return False

    try:
        return sorted(predicted_rows) == sorted(gold_rows)
    except TypeError:
        # Rows whose values cannot be ordered against each other (e.g. NULL
        # next to a number) cannot be sorted directly; compare them as
        # multisets of their textual form instead.
        return sorted(map(repr, predicted_rows)) == sorted(map(repr, gold_rows))


def normalize_sql(query):
    """Trim surrounding whitespace, lower-case the query, and drop trailing semicolons."""
    trimmed = query.strip()
    lowered = trimmed.lower()
    return lowered.rstrip(';')

def evaluate_file(csv_path, label):
    """Evaluate BLEU, Execution Accuracy, and Exact Match for a model output CSV.

    Args:
        csv_path: CSV with `predicted_sql`, `ground_truth_sql` and
            `schema_prompt` columns, and optionally a `db_id` column.
        label: Name returned unchanged as the first element of the result.

    Returns:
        (label, average BLEU, execution accuracy, exact-match accuracy),
        each metric rounded to 4 decimals.
    """
    df = pd.read_csv(csv_path)
    has_db_id = "db_id" in df.columns

    bleu_scores = []
    execution_matches = []
    exact_matches = []
    unresolved = 0

    for _, row in df.iterrows():
        # An empty prediction is read back from CSV as NaN; treat it as "".
        pred = "" if pd.isna(row["predicted_sql"]) else str(row["predicted_sql"])
        gold = "" if pd.isna(row["ground_truth_sql"]) else str(row["ground_truth_sql"])

        # Resolve the database id. Prefer an explicit column. The schema
        # prompt only carries the id in its "Database: <id>" fallback form;
        # a normal prompt starts with "Table: <name>", which names a table,
        # not the database, so it is not used.
        db_id = None
        if has_db_id and not pd.isna(row["db_id"]):
            db_id = str(row["db_id"])
        else:
            first_line = str(row["schema_prompt"]).split("\n")[0]
            if first_line.startswith("Database: "):
                db_id = first_line.split(": ")[-1].lower()

        # BLEU score
        bleu_scores.append(compute_bleu(gold, pred))

        # Execution match (False when the database cannot be identified)
        if db_id is None:
            unresolved += 1
            exec_match = False
        else:
            db_path = f"{DB_ROOT}/{db_id}/{db_id}.sqlite"
            try:
                exec_match = bool(is_execution_match(pred, gold, db_path))
            except Exception:
                exec_match = False
        execution_matches.append(exec_match)

        # Exact match
        exact_matches.append(normalize_sql(pred) == normalize_sql(gold))

    if unresolved:
        print(
            f"[{label}] {unresolved} row(s) had no resolvable db_id; "
            "their execution match was recorded as False."
        )

    df["bleu_score"] = bleu_scores
    df["execution_match"] = execution_matches
    df["exact_match"] = exact_matches

    avg_bleu = round(df["bleu_score"].mean(), 4)
    exec_acc = round(df["execution_match"].mean(), 4)
    exact_acc = round(df["exact_match"].mean(), 4)

    return label, avg_bleu, exec_acc, exact_acc

# --- Run evaluation on both models ---

# Score each model's saved predictions; each call returns
# (label, average BLEU, execution accuracy, exact-match accuracy).
f_label, f_bleu, f_exec, f_exact = evaluate_file(finetuned_csv_path, "Finetuned Model")
b_label, b_bleu, b_exec, b_exact = evaluate_file(base_csv_path, "Base Model")

# --- Show comparison ---

# One row per metric, one column per model (column names are the labels).
comparison_df = pd.DataFrame({
    "Metric": ["Average BLEU Score", "Execution Accuracy", "Exact Match Accuracy"],
    f_label: [f_bleu, f_exec, f_exact],
    b_label: [b_bleu, b_exec, b_exact]
})

from IPython.display import display
display(comparison_df)


# Inference

In [None]:
from openai import OpenAI
import pandas as pd
from getpass import getpass
import json

# Prompt for API key securely (rebinds the notebook-global `client`)
api_key = getpass("Enter your OpenAI API key: ")
client = OpenAI(api_key=api_key)

# Model IDs compared side by side in the loop below
BASE_MODEL = "gpt-4o-mini-2024-07-18"
FINETUNED_MODEL = "ft:gpt-4o-mini-2024-07-18:sjsu::BV5M5MaD"

# Path to Spider schema info (inside the directory extracted by the unzip cell)
TABLES_PATH = "/content/database/spider_data/tables.json"

# Load Spider validation questions
from datasets import load_dataset
questions = load_dataset("spider")["validation"]

# Load tables.json for schema prompts; get_schema_prompt() below reads this global
with open(TABLES_PATH, 'r') as f:
    tables_data = json.load(f)

def get_schema_prompt(db_id):
    """Describe the tables and columns of `db_id` using the global `tables_data`.

    Returns "Table:" / "Columns:" lines joined by newlines for the first entry
    whose `db_id` matches, or "Schema not found." when there is none.
    """
    match = next((entry for entry in tables_data if entry["db_id"] == db_id), None)
    if match is None:
        return "Schema not found."

    lines = []
    for tbl_idx, tbl_name in enumerate(match["table_names_original"]):
        # Columns whose owning-table index equals this table's position.
        owned = [
            column[1]
            for column in match["column_names_original"]
            if column[0] == tbl_idx
        ]
        lines.append(f"Table: {tbl_name}")
        lines.append("Columns: " + ", ".join(owned))
    return "\n".join(lines)

# Generate SQL for a question using a specific model
def generate_sql(model_id, question, schema_prompt):
    """Ask `model_id` for SQL answering `question`, given `schema_prompt`.

    Uses the notebook-global `client`. Returns the stripped model reply, or
    the string "ERROR: <message>" if anything raises.
    """
    try:
        system_message = {
            "role": "system",
            "content": f"You are a SQL generator. Given a question and a schema, generate SQL.\n\n{schema_prompt}",
        }
        user_message = {"role": "user", "content": question}
        completion = client.chat.completions.create(
            model=model_id,
            messages=[system_message, user_message],
            temperature=0.0,
            max_tokens=200,
        )
        reply = completion.choices[0].message.content
        return reply.strip()
    except Exception as e:
        return f"ERROR: {e}"

# Run inference on N examples
N = 10
results = []

# For each of the first N validation items, query both models with the same
# schema prompt and keep their answers next to the ground truth.
for item in questions.select(range(N)):
    q = item["question"]
    db_id = item["db_id"]
    ground_truth = item["query"]
    schema_prompt = get_schema_prompt(db_id)

    # generate_sql() returns an "ERROR: ..." string instead of raising, so a
    # failed request shows up as that text in the table below.
    base_pred = generate_sql(BASE_MODEL, q, schema_prompt)
    finetuned_pred = generate_sql(FINETUNED_MODEL, q, schema_prompt)

    results.append({
        "question": q,
        "ground_truth": ground_truth,
        "base_model_prediction": base_pred,
        "finetuned_model_prediction": finetuned_pred
    })

# Convert to DataFrame
df = pd.DataFrame(results)
from IPython.display import display
display(df)