In [3]:
import pandas as pd
import numpy as np

# Read the processed agent dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('processed_agent_dataset.csv')

# Orientation: the (rows, columns) shape first, then every column label.
print("Dataset shape:", df.shape)
print("\nColumn names:", df.columns.tolist(), sep="\n")

# Null / empty-string audit: print a banner, then one line per column with its
# null count and share of rows, plus an extra line when the column also holds
# empty strings, and finally the overall row/column totals.
print("\n" + "="*50)
print("NONE/NULL VALUES ANALYSIS")
print("="*50)

# Per-column tallies, both indexed by column label.
null_counts = df.isnull().sum()
# Exact '' matches only; whitespace-only strings are not counted here.
empty_string_counts = (df == '').sum()

# Computed once, before the loop: the row count does not vary per column, and
# the summary print below must work even when the loop body never runs (a frame
# with no columns would otherwise leave `total_rows` undefined -> NameError).
total_rows = len(df)
# With zero rows every count is 0, so dividing by 1 reports 0.0% instead of nan.
pct_denominator = max(total_rows, 1)

print("\nNull/NaN values per column:")
for col, null_count in null_counts.items():
    empty_count = empty_string_counts[col]
    print(f"{col}: {null_count} null values ({null_count/pct_denominator*100:.1f}%)")
    if empty_count > 0:
        print(f"  + {empty_count} empty strings ({empty_count/pct_denominator*100:.1f}%)")

print(f"\nTotal rows: {total_rows}")
print(f"Total columns: {len(df.columns)}")

# Rank the columns that contain at least one null, largest count first, and
# print each with its share of all rows; say so explicitly when there are none.
print("\nColumns with missing data (sorted by count):")
missing_data = null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)
if missing_data.empty:
    print("  No null values found in any column")
else:
    for col, count in missing_data.items():
        percentage = count / total_rows * 100
        print(f"  {col}: {count} ({percentage:.1f}%)")

# Same ranking as the null report, but for cells equal to the empty string:
# list affected columns (largest count first) or state that none were found.
print("\nColumns with empty strings (sorted by count):")
empty_data = empty_string_counts.loc[empty_string_counts.gt(0)].sort_values(ascending=False)
if empty_data.empty:
    print("  No empty strings found in any column")
else:
    for col, count in empty_data.items():
        percentage = count / total_rows * 100
        print(f"  {col}: {count} ({percentage:.1f}%)")

Dataset shape: (36, 20)

Column names:
['user_id', 'task_id', 'turn_id', 'ground_truth', 'expected_tool_call', 'agent_name', 'agent_role', 'agent_task', 'system_prompt', 'agent_response', 'trace', 'tools_available', 'tool_calls', 'parameters_passed', 'tool_call_results', 'retrieval_query', 'retrieved_context', 'exit_status', 'agent_exit', 'metadata']

NONE/NULL VALUES ANALYSIS

Null/NaN values per column:
user_id: 36 null values (100.0%)
task_id: 0 null values (0.0%)
turn_id: 0 null values (0.0%)
ground_truth: 36 null values (100.0%)
expected_tool_call: 36 null values (100.0%)
agent_name: 0 null values (0.0%)
agent_role: 0 null values (0.0%)
agent_task: 0 null values (0.0%)
system_prompt: 0 null values (0.0%)
agent_response: 2 null values (5.6%)
trace: 28 null values (77.8%)
tools_available: 0 null values (0.0%)
tool_calls: 0 null values (0.0%)
parameters_passed: 0 null values (0.0%)
tool_call_results: 0 null values (0.0%)
retrieval_query: 34 null values (94.4%)
retrieved_context: 36 n

In [8]:
# Frequency of each distinct value in the `tool_calls` column. This shows how
# often each value occurs, not the column's dtype; with pandas' defaults NaN
# entries are left out of the tally.
tool_call_counts = df['tool_calls'].value_counts()
print(tool_call_counts)


tool_calls
[]                                                                                                                                                                                                  29
[{"tool_name": "user_input", "parameters": {"input": "Please provide more information"}, "call_id": "367a0df1-2f33-45fb-8e8f-42d5ecf2f68a"}]                                                         1
[{"tool_name": "user_input", "parameters": {"input": "Please provide more information"}, "call_id": "ff5dd99b-17d2-4385-b1cb-1f1949bfd81a"}]                                                         1
[{"tool_name": "user_input", "parameters": {"input": "Please clarify if you want me to stay active until you tell me to exit."}, "call_id": "3c8fba0a-8897-4e97-8bc2-243b2d7d0a41"}]                 1
[{"tool_name": "user_input", "parameters": {"input": "Please provide more information about what you mean by \"exit after saying 5 texts."}, "call_id": "bf651234-ff75-4083-9b93-26b0445ff21f"}] 