In [51]:
import json
from pathlib import Path

def extract_code_from_notebook(notebook_path):
    """Return the source of every code cell of a notebook as one string.

    Args:
        notebook_path: Path of the ``.ipynb`` file; it is opened as UTF-8 text
            and parsed as JSON.

    Returns:
        The sources of all cells whose ``cell_type`` is ``'code'``, in notebook
        order, separated by a single newline. An empty string is returned when
        the notebook has no ``cells`` key or no code cells.
    """
    with open(notebook_path, 'r', encoding='utf-8') as handle:
        notebook = json.load(handle)

    # First pass: keep the raw 'source' value of each code cell
    raw_sources = []
    for cell in notebook.get('cells', []):
        if cell['cell_type'] == 'code':
            raw_sources.append(cell['source'])

    # Second pass: glue the pieces of each source together, then put one
    # newline between consecutive cells
    flattened = ["".join(pieces) for pieces in raw_sources]
    return "\n".join(flattened)

# Example usage: flatten the test notebook's code cells into one string.
# NOTE(review): this is an absolute, user-specific Windows path, so the cell
# only runs on the machine where this file exists.
notebook_path = Path(r"C:\Users\samle\OneDrive\Documents\Utils\Notebook_Test.ipynb")
code_string = extract_code_from_notebook(notebook_path)


# Build the lineage-extraction prompt around the flattened code.
# NOTE(review): the instruction text says "abstract syntax tree", but the value
# interpolated below is the raw source returned by extract_code_from_notebook,
# not an AST dump - confirm which wording is intended.
# `prompt` and `code_string` are read by other cells in this notebook.
prompt = f"""Extract and list the following details from the provided abstract syntax tree:
- **Source Tables** (table names)
- **Source Columns** (column names)
- **Target Tables** (table names)
- **Target Columns** (column names)
- **Transformations** (describe operations applied)

Code:
{code_string}
"""

# Echo the full prompt so it can be inspected before it is sent to a model
print(prompt)


Extract and list the following details from the provided abstract syntax tree:
- **Source Tables** (table names)
- **Source Columns** (column names)
- **Target Tables** (table names)
- **Target Columns** (column names)
- **Transformations** (describe operations applied)

Code:
from pyspark.sql import SparkSession

def main():
    # Start Spark session
    spark = SparkSession.builder.appName("SQL_ETL_with_Variable").getOrCreate()

    # -----------------------------
    # 1. EXTRACT
    # -----------------------------
    source_table = "source_db.people"
    df = spark.sql(f"SELECT id, name, age FROM {source_table}")
    print("Original Data:")
    df.show()

    # -----------------------------
    # 2. TRANSFORM using SQL query
    # -----------------------------
    # Register temp view
    df.createOrReplaceTempView("people_view")

    # SQL query as a variable
    sql_query = """
        SELECT 
            id,
            name,
            age,
            CASE 
                W

In [52]:
from transformers import AutoTokenizer

# Load the tokenizer for CodeT5 (same checkpoint name as the summarization
# pipeline created elsewhere in this notebook)
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base-multi-sum")


# Tokenize the text and count the tokens.
# `prompt` is not defined in this cell; it must already exist in the kernel
# from one of the prompt-building cells, so the count depends on which of
# those cells ran last.
tokens = tokenizer.tokenize(prompt)
num_tokens = len(tokens)

# Report the size of the prompt in tokenizer tokens
print(f"Number of tokens: {num_tokens}")


Number of tokens: 422


In [None]:
import json
import ast

# NOTE(review): absolute, user-specific Windows path; this cell only runs on
# the machine where this file exists.
notebook_path = r"C:\Users\samle\OneDrive\Documents\Utils\Notebook_Test.ipynb"

# Load the Jupyter Notebook. The encoding is given explicitly (as in the other
# cells that read this file) instead of relying on the platform default.
with open(notebook_path, "r", encoding="utf-8") as f:
    notebook = json.load(f)

# Extract the raw source of every code cell
code_cells = [cell['source'] for cell in notebook['cells'] if cell['cell_type'] == 'code']

# Parse each code cell into an AST and keep the dump of every cell that parses
ast_dumps = []
for i, code in enumerate(code_cells):
    try:
        # Join the pieces with "" (not "\n"): when the source is a list of
        # lines each piece already carries its own newline, and when it is a
        # single string a "\n" separator would land between every character.
        tree = ast.parse("".join(code))
    except SyntaxError as e:
        # Cells that are not plain Python are reported and left out of the prompt
        print(f"Syntax Error in Code Cell {i + 1}: {e}")
        continue

    dumped = ast.dump(tree, indent=4)
    print(f"AST for Code Cell {i + 1}:")
    print(dumped)  # Pretty-print the AST
    ast_dumps.append(dumped)

# Use the textual dumps of all parsed cells in the prompt. Interpolating the
# `tree` object itself would insert only its object repr, and only for the
# last cell that happened to parse.
ast_text = "\n\n".join(ast_dumps)

prompt = f"""Extract and list the following details from the provided abstract syntax tree:
- **Source Tables** (table names)
- **Source Columns** (column names)
- **Target Tables** (table names)
- **Target Columns** (column names)
- **Transformations** (describe operations applied)

Code:
{ast_text}
"""


AST for Code Cell 1:
Module(
    body=[
        ImportFrom(
            module='pyspark.sql',
            names=[
                alias(name='SparkSession')],
            level=0),
        FunctionDef(
            name='main',
            args=arguments(
                posonlyargs=[],
                args=[],
                kwonlyargs=[],
                kw_defaults=[],
                defaults=[]),
            body=[
                Assign(
                    targets=[
                        Name(id='spark', ctx=Store())],
                    value=Call(
                        func=Attribute(
                            value=Call(
                                func=Attribute(
                                    value=Attribute(
                                        value=Name(id='SparkSession', ctx=Load()),
                                        attr='builder',
                                        ctx=Load()),
                                    attr='appName',
         

In [42]:
from transformers import pipeline


In [43]:
import torch
# Show the version string of the PyTorch build installed in this kernel
print(torch.__version__)

2.6.0+cpu


In [46]:
import re
import math

def chunk_code(code_string, chunk_size=500, overlap=100):
    """Split code into overlapping chunks of whitespace-delimited tokens.

    The text is split on runs of whitespace, and the whitespace runs are kept
    as tokens of their own so that joining a chunk's tokens reproduces the
    original text exactly. Because of that, separators count toward
    ``chunk_size`` just like words do.

    Args:
        code_string: Text to split.
        chunk_size: Maximum number of tokens (words and whitespace runs) in
            one chunk. Must be positive.
        overlap: Number of tokens shared between the end of one chunk and the
            start of the next. Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings in document order. The last chunk ends at the
        end of the text; no further chunk consisting only of already-covered
        overlap is produced.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``. (Previously an overlap
            larger than the chunk size silently produced no chunks at all.)
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    tokens = re.split(r'(\s+)', code_string)  # capture group keeps the whitespace runs
    total_tokens = len(tokens)
    step = chunk_size - overlap  # guaranteed >= 1 by the checks above
    chunks = []

    for start in range(0, total_tokens, step):
        end = min(start + chunk_size, total_tokens)
        chunks.append("".join(tokens[start:end]))
        if end == total_tokens:
            # The text is fully covered; another window would only repeat the
            # tail of this chunk.
            break

    return chunks



# The overlap has to be smaller than the chunk size: chunk_code advances its
# window by (chunk_size - overlap) tokens, and the previous overlap=100 with
# chunk_size=50 made that step negative, so nothing was chunked or printed.
chunks = chunk_code(code_string, chunk_size=50, overlap=10)
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:\n{chunk}\n")


In [38]:
# Build the Hugging Face pipeline used to summarize code
summarizer = pipeline(
    "text2text-generation",                     # tells HF the task type
    model="Salesforce/codet5-base-multi-sum",   # pretrained model for code summarization
    tokenizer="Salesforce/codet5-base-multi-sum"
)


# Run the summarization pipeline on `prompt`, which must already exist in the
# kernel from one of the prompt-building cells.
# The generation budget is capped at 512 new tokens, matching the chunked
# summarization cell in this notebook; the previous value of 1000000 was
# effectively no limit at all.
summary = summarizer(prompt, max_new_tokens=512)


Device set to use cpu


In [39]:
# Show the generated text of the first result in `summary`, which is produced
# by the summarizer call in another cell
print(summary[0]['generated_text'])

Extract and list all of the details from a module s abstract syntax tree .


In [57]:
import json
from pathlib import Path
from transformers import pipeline
import textwrap

def extract_code_from_notebook(notebook_path):
    """Read a notebook file and return all of its code as a single string.

    Args:
        notebook_path: Location of the ``.ipynb`` file, read as UTF-8 JSON.

    Returns:
        The source of each cell whose ``cell_type`` is ``'code'``, in order,
        with one newline between consecutive cells. Returns an empty string
        if the notebook has no ``cells`` entry or no code cells.
    """
    with open(notebook_path, 'r', encoding='utf-8') as fh:
        document = json.load(fh)

    # Lazily select the code cells; everything else is skipped
    code_only = (
        entry for entry in document.get('cells', [])
        if entry['cell_type'] == 'code'
    )

    # Collapse each cell's source into one string, then join the cells
    return "\n".join("".join(entry['source']) for entry in code_only)

def chunk_text(text, max_tokens=512):
    """Group the lines of ``text`` into chunks that respect a token budget.

    Tokens are estimated crudely as whitespace-separated words per line.
    Lines are never split, and a chunk is closed *before* a line that would
    push its estimate past ``max_tokens``, so a chunk only exceeds the budget
    when a single line is larger than the budget on its own. (Previously the
    line was added first and the check ran afterwards, so chunks regularly
    overshot ``max_tokens``.)

    Args:
        text: Text to split; it is broken into lines with ``str.splitlines``.
        max_tokens: Upper bound on the estimated token count of a chunk.

    Returns:
        List of chunks in order, each being consecutive lines joined with
        ``"\\n"``. Empty input yields an empty list.
    """
    chunks = []
    current_lines = []
    current_tokens = 0

    for line in text.splitlines():
        line_tokens = len(line.split())  # crude token estimate

        # Close the running chunk first if this line would not fit into it
        if current_lines and current_tokens + line_tokens > max_tokens:
            chunks.append("\n".join(current_lines))
            current_lines, current_tokens = [], 0

        current_lines.append(line)
        current_tokens += line_tokens

    # Flush whatever is left after the last line
    if current_lines:
        chunks.append("\n".join(current_lines))

    return chunks

# --- Load and flatten notebook ---
# NOTE(review): absolute, user-specific Windows path; this cell only runs on
# the machine where this file exists.
notebook_path = Path(r"C:\Users\samle\OneDrive\Documents\Utils\Notebook_Test.ipynb")
flattened_code = extract_code_from_notebook(notebook_path)
code_chunks = chunk_text(flattened_code, max_tokens=256)

# --- Initialize summarization pipeline ---
summarizer = pipeline(
    "text2text-generation",
    model="Salesforce/codet5-base-multi-sum",
    tokenizer="Salesforce/codet5-base-multi-sum"
)

# --- Build the fixed part of the prompt ---
# The template is dedented on its own, before any code is inserted.
# Dedenting after interpolation does not work: the inserted code has lines at
# column 0, so there is no common indentation to strip and the instruction
# lines would keep their 8 leading spaces.
# NOTE(review): "from the provided columns" looks like it should read
# "from the provided code" - wording left as is, confirm the intent.
prompt_header = textwrap.dedent("""\
    Extract and list the following details from the provided columns:
    - Source Tables (table names)
    - Source Columns (column names)
    - Target Tables (table names)
    - Target Columns (column names)
    - Transformations (describe operations applied)

    Code:
    """)

# --- Summarize each chunk ---
summaries = []
for chunk in code_chunks:
    # The chunk is appended verbatim so its own indentation is preserved
    prompt = f"{prompt_header}{chunk}\n"
    result = summarizer(prompt, max_new_tokens=512)
    generated_text = result[0]['generated_text']
    print((chunk, generated_text))
    summaries.append(generated_text)

# --- Print combined result ---
print("\n=== Combined Summary ===\n")
print("\n\n".join(summaries))


Device set to use cpu


('from pyspark.sql import SparkSession\n\ndef main():\n    # Start Spark session\n    spark = SparkSession.builder.appName("SQL_ETL_with_Variable").getOrCreate()\n\n    # -----------------------------\n    # 1. EXTRACT\n    # -----------------------------\n    source_table = "source_db.people"\n    df = spark.sql(f"SELECT id, name, age FROM {source_table}")\n    print("Original Data:")\n    df.show()\n\n    # -----------------------------\n    # 2. TRANSFORM using SQL query\n    # -----------------------------\n    # Register temp view\n    df.createOrReplaceTempView("people_view")\n\n    # SQL query as a variable\n    sql_query = """\n        SELECT \n            id,\n            name,\n            age,\n            CASE \n                WHEN age > 30 THEN \'senior\'\n                ELSE \'junior\'\n            END AS status\n        FROM people_view\n        WHERE age IS NOT NULL\n    """\n\n    transformed_df = spark.sql(sql_query)\n    print("Transformed Data:")\n    transformed_

Device set to use cpu



--- Summary for Cell 1 ---
Extract and list the details from the provided code . This code extracts the data from the source_db . people table and transforms the data from the source_db . people_status table to the target_db . people_status table .

--- Summary for Cell 2 ---
Extract and list the details from the provided code .
