In [1]:
from neo4j_runway import Discovery, GraphDataModeler, PyIngest, UserInput
from neo4j_runway.code_generation import PyIngestConfigGenerator
from neo4j_runway.llm.openai import OpenAIDiscoveryLLM, OpenAIDataModelingLLM

In [None]:
# data_directory = "/home/jovyan/Elastic_DS/csv_downloads/conn.csv"

# data_dictionary = {
#                 '_index': 'unique id for index.',
#                 '@timestamp': 'timestamp for event.',
#                 'message': 'event message.',
#                 'agent.type': 'elastic agent type, e.g. filebeat, auditbeat, etc.',
#                 'host.hostname': "hostname.",
#                 'host.ip': 'host machine IP address.',
#                 'host.mac': 'host machine mac address.',
#                 'host.os.type': 'OS family ex. linux or windows.',
#                 'host.os.version': 'distro version number.',
#                 }

# use_cases = [
#         "which events had failures in the message",
#         "how many failed login attempts",
#     ]

# data = load_local_files(data_directory=data_directory,
#                         data_dictionary=data_dictionary,
#                         general_description="This is zeek network data that will be used alongside filebeat for association analysis.",
#                         use_cases=use_cases,
#                         include_files=["output.csv"])

In [None]:
import os
import json
import re
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

# Pull any variables defined in a local .env file into the process environment.
load_dotenv()

# Model used by the helper functions in this cell (edit to switch models).
model_name = "gpt-4o"

# The key is mandatory: stop immediately when it is missing or empty.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError(
        "API key not found. Ensure .env file contains 'OPENAI_API_KEY' "
        "and restart the script."
    )

# Shared OpenAI client for every request issued below.
client = OpenAI(api_key=api_key)

# Helper function to dynamically generate completion parameters based on the model
def get_completion_params(model_name, prompt, tokens=700, temperature=0.1):
    """Build the keyword arguments for ``client.chat.completions.create``.

    OpenAI reasoning ("o-series") models take their output budget as
    ``max_completion_tokens`` and do not accept a custom ``temperature``;
    every other model gets ``max_tokens`` plus ``temperature``.

    Args:
        model_name: Name of the chat model, e.g. ``"gpt-4o"`` or ``"o3-mini"``.
        prompt: Text sent as the single user message.
        tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; ignored for reasoning models.

    Returns:
        dict: Keyword arguments ready to be splatted into the create() call.
    """
    # Match the whole o-series by prefix so that switching ``model_name`` to
    # e.g. "o1" or "o4-mini" does not send parameters those models reject.
    is_reasoning_model = model_name.startswith(("o1", "o3", "o4"))

    token_param = "max_completion_tokens" if is_reasoning_model else "max_tokens"
    params = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        token_param: tokens,
    }
    # Only include temperature if the model supports it
    if not is_reasoning_model:
        params["temperature"] = temperature
    return params

# Location of the CSV export to analyse
data_directory = "/home/jovyan/Elastic_DS/csv_downloads/conn.csv"

# Abort early when the export is missing
if not os.path.exists(data_directory):
    raise FileNotFoundError(f"File not found: {data_directory}")

# Only the first 50 rows are read; they supply the headers and a tiny sample
df = pd.read_csv(data_directory, nrows=50)
columns_list = [*df.columns]
sample_data = df.head(3).to_dict(orient="records")  # first three rows as dicts
print("CSV Columns:", columns_list)

# Function to safely extract JSON from GPT response
def extract_json(text):
    """Extract the first JSON object embedded in a GPT response.

    The reply may wrap the object in prose or markdown fences, so every ``{``
    is tried in order as a possible start. ``JSONDecoder.raw_decode`` parses
    exactly one value from that position and ignores whatever follows it, so
    stray braces before or after the object do not break the extraction.

    Args:
        text: Raw response text.

    Returns:
        dict: The first JSON object that parses successfully.

    Raises:
        ValueError: If ``text`` contains no ``{`` at all.
        json.JSONDecodeError: (a ``ValueError`` subclass) if braces are
            present but no JSON object can be parsed from any of them.
    """
    start = text.find("{")
    if start == -1:
        raise ValueError("No valid JSON object detected in GPT response.")

    decoder = json.JSONDecoder()
    first_error = None
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError as err:
            # Remember the earliest failure; it is the most useful to report.
            if first_error is None:
                first_error = err
            start = text.find("{", start + 1)
        else:
            return parsed

    print("Error parsing JSON. GPT Output:\n", text)
    raise first_error

# Function to dynamically generate dataset context using GPT
def generate_dynamic_context(columns, sample):
    """Ask the configured model to describe the dataset and suggest use cases.

    Args:
        columns: Column headers of the CSV.
        sample: A few rows of the CSV as JSON-serializable records.

    Returns:
        The JSON object parsed from the model's reply by ``extract_json``; the
        prompt asks for the keys "general_description" and "use_cases".
    """
    sample_json = json.dumps(sample, indent=2)
    prompt = f"""I have a CSV file with the following column headers:
{columns}
Here is a small sample of the data:
{sample_json}

Analyze this data and generate:
1. A general description of what this dataset represents.
2. A list of 5 potential use cases relevant to analyzing this data.
3. general description and usecases should be targeted at scoping an llm to look at potential assosiations in data.

Output a JSON object with two keys: "general_description" (a single-line string) and "use_cases" (a list of short bullet points).
Output only a complete valid JSON object (including the curly braces) without any markdown formatting or code fences.
"""
    completion = client.chat.completions.create(
        **get_completion_params(model_name, prompt, tokens=700, temperature=0.1)
    )
    reply = completion.choices[0].message.content.strip()
    print("GPT Context Response:\n", reply)  # raw reply, shown for debugging
    return extract_json(reply)

# Generate dynamic description and use cases
# One GPT request is issued here, based on the headers and the sample rows read above.
context_info = generate_dynamic_context(columns_list, sample_data)
# Plain indexing: a KeyError is raised if the reply lacks either key.
general_description = context_info["general_description"]
use_cases = context_info["use_cases"]

print("\nGenerated General Description:", general_description)
print("Generated Use Cases:", use_cases)

# Function to generate data dictionary using GPT
def generate_data_dictionary(columns, general_description, use_cases):
    """Ask the configured model for a one-line description of every column.

    Args:
        columns: Column headers of the CSV.
        general_description: Dataset description inserted into the prompt.
        use_cases: Use cases inserted into the prompt.

    Returns:
        The JSON object parsed from the model's reply by ``extract_json``; the
        prompt asks for a mapping of column header to description.
    """
    prompt = f"""I have a CSV file with the following column headers:
{columns}
The file contains network data.
General description: "{general_description}"
Use cases: {use_cases}
Generate a JSON object that maps each column header to a concise description of what the column represents.
Ensure the descriptions are tailored to this data context and are targeted and stearing an llm to building data assostiations.
Output only a complete valid JSON object (including the curly braces) without any markdown formatting or code fences.
"""
    completion = client.chat.completions.create(
        **get_completion_params(model_name, prompt, tokens=700, temperature=0.1)
    )
    reply = completion.choices[0].message.content.strip()
    print("\nGPT Data Dictionary Response:\n", reply)  # raw reply, shown for debugging
    return extract_json(reply)

# Generate the data dictionary using GPT, with error handling to define data_dictionary
# ValueError also covers json.JSONDecodeError (its subclass), so both a reply
# without a JSON object and a reply with malformed JSON end up in the handler.
try:
    data_dictionary = generate_data_dictionary(columns_list, general_description, use_cases)
    print("\nGenerated Data Dictionary:", data_dictionary)
except ValueError as e:
    print("Failed to generate data dictionary:", e)
    data_dictionary = {}  # Set to an empty dictionary to avoid NameError

# Function to load and structure local files
def load_local_files(data_directory, data_dictionary, general_description, use_cases, include_files):
    """Bundle a CSV's metadata and a ten-row preview into one dictionary.

    Args:
        data_directory: Path of the CSV file to read.
        data_dictionary: Column descriptions, stored unchanged.
        general_description: Dataset description, stored unchanged.
        use_cases: Use cases, stored unchanged.
        include_files: File names, stored unchanged under "included_files".

    Returns:
        dict with the keys "file_path", "columns", "data_dictionary",
        "general_description", "use_cases", "sample_data" and "included_files".

    Raises:
        FileNotFoundError: If ``data_directory`` does not exist.
    """
    if os.path.exists(data_directory):
        frame = pd.read_csv(data_directory)  # the whole file is read
        preview_rows = frame.head(10).to_dict(orient="records")
        return {
            "file_path": data_directory,
            "columns": [*frame.columns],
            "data_dictionary": data_dictionary,
            "general_description": general_description,
            "use_cases": use_cases,
            "sample_data": preview_rows,
            "included_files": include_files,
        }

    raise FileNotFoundError(f"File not found: {data_directory}")

# Load and structure the dataset
# Packs the generated description, use cases and data dictionary together
# with a preview of the CSV into a single dict.
data = load_local_files(
    data_directory=data_directory,
    data_dictionary=data_dictionary,
    general_description=general_description,
    use_cases=use_cases,
    include_files=["output.csv"]
)

# Pretty-print the whole structure (including the preview rows) for inspection.
print("\nLoaded Data Structure:", json.dumps(data, indent=2))


In [None]:
import os
import pandas as pd
import json
import re
from openai import OpenAI

# Initialize the OpenAI client
# os.environ.get returns None when the variable is unset; no check is made here.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def generate_general_description(df):
    """
    Ask GPT-4o for a short description of a DataFrame.

    Only the row count and the list of column names are sent to the model;
    the stripped text of its reply is returned.
    """
    prompt = f"""I have a dataframe with {len(df)} rows and the following columns: {list(df.columns)}.
Please provide a concise general description of this dataset focusing on its key attributes, potential applications, and any notable insights.
Output only the description text with no additional commentary.
"""
    request = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 150,
        "temperature": 0.5,
    }
    completion = client.chat.completions.create(**request)
    return completion.choices[0].message.content.strip()

def load_local_files(data_directory, data_dictionary, use_cases, include_files):
    """Load the CSV at ``data_directory`` into a DataFrame and print a summary.

    Args:
        data_directory: Path of the CSV file to read.
        data_dictionary: Column-description mapping. Not used by this
            function; kept so existing calls remain valid.
        use_cases: Use cases, only echoed in the printed summary.
        include_files: Not used by this function; kept so existing calls
            remain valid.

    Returns:
        The loaded DataFrame, or ``None`` when the file could not be read
        (the error is printed).
    """
    # Only the read itself decides whether data is returned.
    try:
        df = pd.read_csv(data_directory)
    except Exception as e:
        print(f"Error loading file: {e}")
        return None

    # The GPT description is purely informational, so a failure while
    # producing it must not throw away the data that was just loaded.
    try:
        general_description = generate_general_description(df)
    except Exception as e:
        general_description = f"unavailable ({e})"

    # Print summary information
    print(f"Loaded data from {data_directory}")
    print(f"General Description: {general_description}")
    print(f"Available Use Cases: {use_cases}")

    return df

# Now you can call the function
# Rebinds `data`: from here on it is the DataFrame returned by the
# load_local_files defined in this cell (or None when loading failed).
data = load_local_files(data_directory=data_directory,
                        data_dictionary=data_dictionary,
                        use_cases=use_cases,
                        include_files=["/csv_downloads/conn.csv"])

# Display first few rows
if data is not None:
    print(data.head())


In [None]:
import os
from dotenv import load_dotenv

# Load .env file
load_dotenv()

# The LLM wrappers below are constructed without an api_key argument, so the
# key has to be present in the process environment (presumably that is where
# they read it from — confirm against neo4j_runway). Fail with a clear message
# when it is missing: copying os.getenv()'s None into os.environ would only
# raise an unrelated TypeError.
if "OPENAI_API_KEY" not in os.environ:
    raise ValueError("API key not found. Ensure .env file contains 'OPENAI_API_KEY' and restart the script.")

# Initialize LLMs (without api_key argument)
# Discovery uses temperature 0; data modeling uses temperature 0.5.
llm_disc = OpenAIDiscoveryLLM(
    model_name='gpt-4o',
    model_params={"temperature": 0}
)

llm_dm = OpenAIDataModelingLLM(
    model_name='gpt-4o',
    model_params={"temperature": 0.5}
)

# Verify the API key is available
print(f"API Key Loaded: {bool(os.getenv('OPENAI_API_KEY'))}")


In [None]:
# Initialize Discovery instance
# `data` is whatever the most recently executed loading cell bound to that name.
disc = Discovery(llm=llm_disc, data=data)

# Run Discovery
disc.run()

# Run again with additional parameters
# NOTE(review): this is a second run() call on the same instance; presumably it
# repeats the LLM discovery request — confirm whether the first call is needed.
disc.run(show_result=True, notebook=True)


In [None]:
# Build the graph data modeler on top of the finished discovery and request a
# first model; later cells read the result from gdm.current_model.
gdm = GraphDataModeler(llm=llm_dm, discovery=disc)
gdm.create_initial_model()

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

def normalize_label(node_obj):
    """
    Turn a node-like object into a plain string usable as a graph node key.

    Resolution order: the object's ``id`` attribute, then its ``name``
    attribute, then its string form. A string form shaped like '(:Label)'
    is reduced to the text between '(:' and ')', with surrounding
    whitespace removed.
    """
    # An explicit identifier attribute wins; 'id' is checked before 'name'.
    for attr in ("id", "name"):
        if hasattr(node_obj, attr):
            return str(getattr(node_obj, attr))

    # Otherwise fall back to the string form, unwrapping '(:Label)'.
    text = str(node_obj)
    is_wrapped = text.startswith('(:') and text.endswith(')')
    return text[2:-1].strip() if is_wrapped else text

def visualize_graph(model):
    """Draw the model's nodes and relationships as a directed graph.

    Nodes come from ``model.nodes`` (if present) and edges from
    ``model.relationships`` (if present); every endpoint is converted to a
    string key with ``normalize_label``. The figure is shown with matplotlib.
    """
    graph = nx.DiGraph()

    # One graph node per model node, keyed by its normalized label.
    for model_node in getattr(model, 'nodes', []):
        graph.add_node(normalize_label(model_node))

    # One edge per relationship that exposes both endpoints, either as
    # source/target or as start/end.
    for rel in getattr(model, 'relationships', ()):
        source = getattr(rel, 'source', getattr(rel, 'start', None))
        target = getattr(rel, 'target', getattr(rel, 'end', None))
        if not (source and target):
            continue
        graph.add_edge(normalize_label(source), normalize_label(target))

    # Render with a spring layout.
    plt.figure(figsize=(8, 6))
    layout = nx.spring_layout(graph)
    draw_options = {
        "with_labels": True,
        "node_color": "lightblue",
        "edge_color": "gray",
        "node_size": 2000,
        "font_size": 10,
        "font_color": "black",
        "arrows": True,
    }
    nx.draw(graph, layout, **draw_options)
    plt.title("Model Visualization")
    plt.show()

# Example usage:
# Draws the model currently held by the graph data modeler.
visualize_graph(gdm.current_model)


In [None]:
# import pandas as pd

# # 🚀 STEP 1: Load CSV & Clean Data
# csv_path = "/home/jovyan/Autogen/output.csv"  # Update if needed
# clean_csv_path = "/home/jovyan/Autogen/output_clean.csv"

# # Load the CSV
# df = pd.read_csv(csv_path)

# # Normalize column names (strip spaces, lowercase)
# df.columns = df.columns.str.strip()

# # Rename columns to match Neo4j expectations
# column_mapping = {
#     "agent.id": "agentId",  # Fix agent ID reference
#     "host.id": "hostId",  # Ensure hostId consistency
#     "log.file.device_id": "logFileDeviceId",  # Standardize log file ID
# }
# df.rename(columns=column_mapping, inplace=True)

# # Replace NaN/null with default values to avoid MERGE issues
# df.fillna({"agentId": "UNKNOWN_AGENT"}, inplace=True)
# df.fillna("", inplace=True)

# # Ensure agentId is a string
# df["agentId"] = df["agentId"].astype(str)

# # Save the cleaned CSV
# df.to_csv(clean_csv_path, index=False)
# print("✅ Data cleaned and saved to:", clean_csv_path)

# # 🚀 STEP 2: Generate PyIngest YAML Configuration
# gen = PyIngestConfigGenerator(
#     data_model=gdm.current_model,
#     uri="neo4j://192.168.2.2:7687",
#     database="neo4j",
#     file_directory="/",
#     source_name="output_clean.csv"
# )

# pyingest_yaml = gen.generate_config_string()

# # Fix incorrect column references in YAML
# pyingest_yaml = pyingest_yaml.replace("row.agent.id", "row.agentId")
# pyingest_yaml = pyingest_yaml.replace("row.host.id", "row.hostId")
# pyingest_yaml = pyingest_yaml.replace("row.log.file.device_id", "row.logFileDeviceId")

# # Debugging: Print YAML to verify correct mappings
# print("🔍 Updated YAML Config:\n", pyingest_yaml)

# # 🚀 STEP 3: Ingest Cleaned Data into Neo4j
# PyIngest(config=pyingest_yaml, verbose=True)

# print("✅ Data successfully ingested into Neo4j!")


In [None]:
import os
import re
import json
import yaml  # Requires PyYAML
import time
import threading
import math
import pandas as pd
import numpy as np
from tqdm import tqdm
from openai import OpenAI

# Initialize the OpenAI client using the new interface
# os.environ.get returns None when the variable is unset; no check is made here.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# 🚀 STEP 1: Load CSV & Initial Clean
# Reuses the path bound to `data_directory` by an earlier cell.
csv_path = data_directory  # Update if needed

# Define directory for cleaned CSV and ensure it exists.
clean_dir = "/home/jovyan/Elastic_DS/csv_downloads/clean"
os.makedirs(clean_dir, exist_ok=True)
# Append a trailing separator; clean_dir is later handed to the config
# generator as `file_directory`.
if not clean_dir.endswith(os.sep):
    clean_dir += os.sep

# Define the full path for the cleaned CSV file.
clean_csv_file = os.path.join(clean_dir, "output_clean.csv")

# Load the CSV and normalize column names
# The whole file is read; only surrounding whitespace is removed from headers.
df = pd.read_csv(csv_path)
df.columns = df.columns.str.strip()

# 🚀 STEP 2: Dynamically Generate Column Mapping using GPT‑4o
def generate_column_mapping(columns):
    """Ask GPT‑4o for a camelCase name for every CSV column.

    Args:
        columns: List of the current column names.

    Returns:
        dict: Maps original column name to new column name. Only entries
        whose key is one of ``columns`` and whose value is a non-empty string
        are kept, so a column the model skipped simply keeps its current
        name when the mapping is passed to ``DataFrame.rename``.

    Raises:
        ValueError: If the reply contains no JSON object, or if applying the
            mapping would give two columns the same name.
        json.JSONDecodeError: If the extracted text is not valid JSON.
    """
    prompt = f"""I have a list of CSV column names:
{columns}
Generate a JSON object mapping each column name to a new column name following Neo4j's camelCase naming convention.
Output only a complete JSON object (including the opening and closing curly braces) and nothing else.
"""
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gpt-4o",
        max_tokens=300,
        temperature=0.0,
    )

    # The message content may be None; treat that like an empty reply.
    mapping_str = (response.choices[0].message.content or "").strip()
    match = re.search(r'\{.*\}', mapping_str, re.DOTALL)
    if not match:
        raise ValueError("Invalid JSON response from GPT‑4o")

    mapping = json.loads(match.group())

    # Drop entries that cannot be applied: unknown columns and new names
    # that are not usable strings.
    known_columns = set(columns)
    mapping = {
        old: new
        for old, new in mapping.items()
        if old in known_columns and isinstance(new, str) and new.strip()
    }

    # Two columns ending up with the same name would make every later
    # per-column operation ambiguous, so refuse such a mapping outright.
    seen = set()
    clashes = set()
    for final_name in (mapping.get(col, col) for col in columns):
        if final_name in seen:
            clashes.add(final_name)
        seen.add(final_name)
    if clashes:
        raise ValueError(f"Column mapping produces duplicate column names: {sorted(clashes)}")

    return mapping

# Request a new name for every column and rename the DataFrame in place;
# `column_mapping` is reused further down to rewrite the generated YAML.
columns_list = list(df.columns)
column_mapping = generate_column_mapping(columns_list)
df.rename(columns=column_mapping, inplace=True)

# --- Dynamic DataFrame Cleaning Function ---
def clean_dataframe(df):
    """Fill missing values in every column of ``df``, in place.

    * numeric columns: missing values become ``0``
    * datetime columns: missing values become ``1970-01-01``
    * all other columns: values are cast to ``str``; missing values and the
      strings "nan", "none", "null" and "" (any letter case) become "UNKNOWN"

    Args:
        df: DataFrame to clean. Its columns are reassigned in place.

    Returns:
        The same DataFrame object, for convenient chaining.
    """
    placeholder_strings = ["nan", "none", "null", ""]
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(0)
        elif pd.api.types.is_datetime64_any_dtype(df[col]):
            df[col] = df[col].fillna(pd.Timestamp("1970-01-01"))
        else:
            # fillna first so missing values never turn into the text "nan";
            # then catch placeholder words that were literal strings already.
            as_text = df[col].fillna("UNKNOWN").astype(str)
            is_placeholder = as_text.str.lower().isin(placeholder_strings)
            df[col] = as_text.mask(is_placeholder, "UNKNOWN")
    return df

# First cleaning pass over the renamed columns, followed by a blanket fill of
# anything still missing.
df = clean_dataframe(df)
df.fillna("UNKNOWN", inplace=True)

# 🚀 STEP 3: Generate PyIngest YAML Configuration
gen = PyIngestConfigGenerator(
    data_model=gdm.current_model,
    uri="neo4j://192.168.2.2:7687",
    database="neo4j",
    file_directory=clean_dir,       # clean_dir ends with a slash
    source_name="output_clean.csv"
)
pyingest_yaml = gen.generate_config_string()

# Replace column references dynamically.
# All names are rewritten in a single regex pass, longest name first, and only
# where the match is not followed by another identifier character. Chained
# str.replace calls would corrupt references when one column name is a prefix
# of another (e.g. "id" vs "id_orig") or when an already-substituted new name
# happens to start with another original name.
if column_mapping:
    row_ref_pattern = re.compile(
        r"row\.("
        + "|".join(re.escape(name) for name in sorted(column_mapping, key=len, reverse=True))
        + r")(?![A-Za-z0-9_])"
    )
    pyingest_yaml = row_ref_pattern.sub(
        lambda m: f"row.{column_mapping[m.group(1)]}", pyingest_yaml
    )

# Parse YAML and fix file URLs.
# The "$BASE" placeholder is removed from every file URL so that only the
# path remains.
config_dict = yaml.safe_load(pyingest_yaml)
if "files" in config_dict:
    for file_entry in config_dict["files"]:
        if "url" in file_entry:
            url = file_entry["url"].replace("$BASE", "")
            url = re.sub(r'^/+', '/', url)  # collapse multiple leading slashes
            file_entry["url"] = url
pyingest_yaml = yaml.dump(config_dict)

def ensure_dynamic_required_columns(df, yaml_str):
    """Add a placeholder column for every ``row.<name>`` the YAML refers to.

    Names are collected with the pattern ``row\\.([a-zA-Z0-9_.]+)``. Each name
    that is not yet a column of ``df`` is added in place, filled with the
    constant ``UNKNOWN_<NAME>`` (dots turned into underscores, upper-cased).
    The same DataFrame object is returned.
    """
    referenced = set(re.findall(r"row\.([a-zA-Z0-9_.]+)", yaml_str))
    for name in referenced:
        if name in df.columns:
            continue
        df[name] = "UNKNOWN_" + name.replace('.', '_').upper()
    return df

# Add any column the generated YAML refers to but the DataFrame lacks, then
# repeat the cleaning pass so the added columns are covered as well.
df = ensure_dynamic_required_columns(df, pyingest_yaml)
df = clean_dataframe(df)
df.fillna("UNKNOWN", inplace=True)

def final_clean_value(x):
    """Return ``x`` as a stripped string, or "UNKNOWN" for empty-like values.

    Missing values, values whose string conversion fails, and values whose
    stripped text is "", "nan", "none" or "null" (any letter case) all map
    to "UNKNOWN".
    """
    if not pd.isna(x):
        try:
            text = str(x).strip()
        except Exception:
            return "UNKNOWN"
        if text.lower() not in ("", "nan", "none", "null"):
            return text
    return "UNKNOWN"

# Final pass: every cell goes through final_clean_value, which returns a
# string for every value it handles.
df = df.apply(lambda col: col.map(final_clean_value))
# Whitespace-only cells and any remaining missing values become "UNKNOWN".
df.replace(r'^\s*$', "UNKNOWN", regex=True, inplace=True)
df.fillna("UNKNOWN", inplace=True)

# Save the cleaned CSV
df.to_csv(clean_csv_file, index=False)

# 🚀 STEP 4: Ingest Cleaned Data into Neo4j with a Progress Bar

# Calculate an estimated total number of batches.
# NOTE(review): chunk_size is assumed to mirror the ingest batch size — confirm
# against the generated config; it only feeds the progress-bar estimate.
chunk_size = 100
num_batches_per_file = math.ceil(len(df) / chunk_size)
num_file_entries = len(config_dict.get("files", []))
total_batches = num_batches_per_file * num_file_entries

def run_ingestion():
    """Run PyIngest with the generated YAML configuration."""
    PyIngest(config=pyingest_yaml, verbose=False)


# Exceptions raised inside a worker thread do not reach the main thread on
# their own, so the worker records them here for inspection after join().
ingest_errors = []


def _run_ingestion_guarded():
    """Thread target: run the ingestion and record any exception it raises."""
    try:
        run_ingestion()
    except Exception as exc:  # boundary of the worker thread
        ingest_errors.append(exc)


print("Starting ingestion into Neo4j...")
ingest_thread = threading.Thread(target=_run_ingestion_guarded)
ingest_thread.start()

progress = 0
# Use a tqdm progress bar with the calculated total batches.
with tqdm(total=total_batches, desc="Ingesting", bar_format="{l_bar}{bar} [ elapsed: {elapsed} ]") as pbar:
    # Simulate progress updates: the bar advances on a timer while the worker
    # is alive; it does not reflect the real number of ingested batches.
    while ingest_thread.is_alive():
        time.sleep(0.2)
        # Update progress by a fixed increment until we reach total_batches - 1.
        if progress < total_batches - 1:
            progress += 1
            pbar.update(1)
    # Only show a completed bar when the ingestion actually succeeded.
    if not ingest_errors:
        pbar.n = total_batches
        pbar.refresh()

ingest_thread.join()
if ingest_errors:
    # Surface the worker's failure instead of reporting success.
    raise RuntimeError("Neo4j ingestion failed") from ingest_errors[0]
print("✅ Data successfully ingested into Neo4j!")
