# Getting Started with Nemesis Data

This notebook demonstrates how to connect to and query Nemesis data using Hasura GraphQL.

## Setup

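First, let's import the required libraries and set up our connection to Hasura. The endpoint and admin secret are read from the `HASURA_GRAPHQL_URL` and `HASURA_ADMIN_SECRET` environment variables.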

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from gql import Client, gql
from gql.transport.requests import RequestsHTTPTransport

# Set up the GraphQL client.
# Endpoint and credential are read from the environment so neither is hardcoded
# in the notebook; the URL falls back to http://hasura:8080/v1/graphql.
hasura_url = os.getenv("HASURA_GRAPHQL_URL", "http://hasura:8080/v1/graphql")
admin_secret = os.getenv("HASURA_ADMIN_SECRET", "")

# Only attach the admin-secret header when a secret is actually configured.
# A blank header value cannot be a valid credential, so sending it would only
# produce a confusing authentication error later on.
# NOTE(review): how Hasura treats a request without the header depends on the
# server's auth configuration - confirm for your deployment.
headers = {"x-hasura-admin-secret": admin_secret} if admin_secret else {}
if not admin_secret:
    print("Warning: HASURA_ADMIN_SECRET is not set; requests will be sent without an admin secret.")

transport = RequestsHTTPTransport(url=hasura_url, headers=headers)

# fetch_schema_from_transport=True asks gql to obtain the schema from the server
# through this transport.
client = Client(transport=transport, fetch_schema_from_transport=True)
print(f"Connected to Hasura at: {hasura_url}")

## Basic Queries

Let's start with some basic queries to explore the data.

In [None]:
# Count the rows of files_enriched through the table's aggregate field,
# so only a single number is returned instead of the rows themselves.
query = gql("""
    query {
        files_enriched_aggregate {
            aggregate {
                count
            }
        }
    }
""")

result = client.execute(query)

# The response mirrors the query shape: aggregate field -> "aggregate" -> "count".
aggregate = result["files_enriched_aggregate"]["aggregate"]
file_count = aggregate["count"]
print(f"Total enriched files: {file_count}")

In [None]:
# Pull a handful of files_enriched rows (at most 10) to see what the columns look like.
query = gql("""
    query {
        files_enriched(limit: 10) {
            object_id
            file_name
            extension
            size
            magic_type
            mime_type
            is_plaintext
            created_at
        }
    }
""")

result = client.execute(query)

# The response holds one dict per file, which pandas turns into one row each.
sample_rows = result["files_enriched"]
files_df = pd.DataFrame(sample_rows)

print("Sample files:")
print(files_df.head())

## Data Analysis Examples

Let's analyze the file types and sizes in the dataset.

In [None]:
# Analyze file extensions.
# Only the two columns needed for the extension and size analyses are requested.
# NOTE(review): the query has no limit, so every files_enriched row is fetched;
# consider paginating or aggregating server-side for very large datasets.
query = gql("""
    query {
        files_enriched {
            extension
            size
        }
    }
""")

result = client.execute(query)

# Naming the columns explicitly keeps "extension" and "size" present even when
# the query returns no rows, so later lookups do not raise KeyError.
files_df = pd.DataFrame(result["files_enriched"], columns=["extension", "size"])

# Plot file extensions (value_counts leaves out rows whose extension is null).
extension_counts = files_df["extension"].value_counts().head(10)

if extension_counts.empty:
    # Nothing to draw; say so instead of failing inside the plotting call.
    print("No file extensions to plot.")
else:
    fig, ax = plt.subplots(figsize=(12, 6))
    extension_counts.plot(kind="bar", ax=ax)
    ax.set_title("Top 10 File Extensions")
    ax.set_xlabel("Extension")
    ax.set_ylabel("Count")
    ax.tick_params(axis="x", labelrotation=45)
    fig.tight_layout()
    plt.show()

In [None]:
# Analyze file sizes.
# Requires files_df (with a "size" column) from the file-extension cell above.
BYTES_PER_MB = 1024 * 1024

# Coerce sizes to numbers before dividing: numeric strings are parsed, while
# nulls and unparseable values become NaN instead of raising.
files_df["size_mb"] = pd.to_numeric(files_df["size"], errors="coerce") / BYTES_PER_MB

# Plot only rows with a known size rather than relying on how hist treats NaN.
size_mb_known = files_df["size_mb"].dropna()

if size_mb_known.empty:
    print("No file sizes to plot.")
else:
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.hist(size_mb_known, bins=50, alpha=0.7)
    ax.set_title("File Size Distribution")
    ax.set_xlabel("Size (MB)")
    ax.set_ylabel("Frequency")
    ax.set_yscale("log")  # log scale keeps sparsely populated bins visible
    plt.show()

    # mean() and median() skip NaN, so these match the plotted values.
    print(f"Average file size: {files_df['size_mb'].mean():.2f} MB")
    print(f"Median file size: {files_df['size_mb'].median():.2f} MB")

## Findings Analysis

Let's explore the security findings in the dataset.

In [None]:
# One request, two root fields: the overall findings count (via the aggregate
# field) and up to 10 findings rows to preview.
query = gql("""
    query {
        findings_aggregate {
            aggregate {
                count
            }
        }
        findings(limit: 10) {
            finding_name
            category
            severity
            origin_name
            created_at
        }
    }
""")

result = client.execute(query)

# Each root field of the query comes back under its own key.
findings_aggregate = result["findings_aggregate"]["aggregate"]
findings_count = findings_aggregate["count"]
findings_df = pd.DataFrame(result["findings"])

print(f"Total findings: {findings_count}")
print("\nSample findings:")
print(findings_df.head())

In [None]:
# Analyze findings by category, severity and finding name.
# NOTE(review): the query has no limit, so every findings row is fetched and
# counted client-side.
query = gql("""
    query {
        findings {
            category
            severity
            finding_name
        }
    }
""")

result = client.execute(query)

# Naming the columns keeps them present even when there are no findings at all,
# so the value_counts calls below cannot raise KeyError.
all_findings_df = pd.DataFrame(result["findings"], columns=["category", "severity", "finding_name"])

category_counts = all_findings_df["category"].value_counts()
severity_counts = all_findings_df["severity"].value_counts()
finding_name_counts = all_findings_df["finding_name"].value_counts().head(10)

# One (counts, title) pair per panel, drawn left to right.
panels = [
    (category_counts, "Findings by Category"),
    (severity_counts, "Findings by Severity"),
    (finding_name_counts, "Top 10 Finding Types"),
]

if all_findings_df.empty:
    print("No findings returned; nothing to plot.")
else:
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for ax, (counts, title) in zip(axes, panels):
        ax.set_title(title)
        if counts.empty:
            # The column held only nulls: label the panel instead of plotting nothing.
            ax.text(0.5, 0.5, "No data", ha="center", va="center", transform=ax.transAxes)
            continue
        counts.plot(kind="bar", ax=ax)
        ax.set_title(title)
        ax.set_ylabel("Count")
        ax.tick_params(axis="x", labelrotation=45)

    fig.tight_layout()
    plt.show()

## Advanced Queries

Here is a more advanced query that joins data across tables: it fetches each file together with its related findings through the `findingsByObjectId` relationship.

In [None]:
# Files with findings - joined query.
# Each file row carries its related findings through the findingsByObjectId
# relationship field.
query = gql("""
    query {
        files_enriched {
            object_id
            file_name
            extension
            size
            findingsByObjectId {
                finding_name
                category
                severity
            }
        }
    }
""")

result = client.execute(query)

# Flatten the nested response into one record per (file, finding) pair.
# Files whose findingsByObjectId is empty (or null) contribute no records.
files_with_findings = [
    {
        "object_id": file_item["object_id"],
        "file_name": file_item["file_name"],
        "extension": file_item["extension"],
        "size": file_item["size"],
        "finding_name": finding["finding_name"],
        "category": finding["category"],
        "severity": finding["severity"],
    }
    for file_item in result["files_enriched"]
    for finding in (file_item["findingsByObjectId"] or [])
]

# Naming the columns keeps the frame's schema stable even when no file has findings.
files_findings_columns = [
    "object_id",
    "file_name",
    "extension",
    "size",
    "finding_name",
    "category",
    "severity",
]
files_findings_df = pd.DataFrame(files_with_findings, columns=files_findings_columns)

# A file with several findings occupies several rows, so the number of files is
# the number of distinct object_ids, not the number of rows.
# NOTE(review): assumes object_id uniquely identifies a file - confirm against the schema.
files_with_findings_count = files_findings_df["object_id"].nunique()
print(f"Files with findings: {files_with_findings_count}")
print(f"File/finding pairs: {len(files_findings_df)}")
print(files_findings_df.head())

## Custom Analysis

This is where you can add your own custom queries and analysis!

In [None]:
# Add your custom analysis here.
# The names set up in the first code cell (`client`, `gql`, `pd`, `plt`) are
# available once that cell has been run.
print("Ready for your custom analysis!")