# Exploratory Analysis

Notebook for exploring the dataset and model predictions.

In [None]:
import sys
sys.path.insert(0, "..")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn's "whitegrid" style, plus a 12x6
# default size for figures that do not pass their own figsize.
sns.set_style("whitegrid")
plt.rc("figure", figsize=(12, 6))

## Load Dataset

In [None]:
# Relative path to the CSV that is loaded into `df`.
data_path = "../data/splits.csv"
df = pd.read_csv(data_path)

# Report the row count, then leave the first rows as the cell's output.
print(f"Total samples: {len(df)}")
df.head()

## Class Distribution

In [None]:
# Two side-by-side panels: overall label counts (left) and the same counts
# broken down by split (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
overall_ax, by_split_ax = axes

# Left panel: one bar per label.
label_counts = df["label"].value_counts()
label_counts.plot.bar(ax=overall_ax)
overall_ax.set(
    title="Overall Class Distribution",
    xlabel="Class",
    ylabel="Count",
)

# Right panel: count rows per (split, label) pair, pivot labels into columns
# (missing pairs become 0), then transpose so labels sit on the x-axis and
# each split becomes its own bar series.
split_dist = df.groupby(["split", "label"]).size().unstack(fill_value=0)
split_dist.T.plot.bar(ax=by_split_ax)
by_split_ax.set(
    title="Class Distribution by Split",
    xlabel="Class",
    ylabel="Count",
)
by_split_ax.legend(title="Split")

plt.tight_layout()
plt.show()

## Sequence Length Distribution

In [None]:
# Per-row sequence length, stored as a new column. `len` is applied to each
# value as-is, so this assumes "sequence" has no missing values — TODO confirm.
df["seq_length"] = df["sequence"].apply(len)

# Create both panels up front so every plot targets an explicit Axes.
length_fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: histogram of all sequence lengths.
hist_ax.hist(df["seq_length"], bins=50, edgecolor="black")
hist_ax.set_xlabel("Sequence Length")
hist_ax.set_ylabel("Count")
hist_ax.set_title("Sequence Length Distribution")

# Right panel: sequence length grouped by label.
# DataFrame.boxplot(by=...) opens a new figure unless it is given an Axes,
# which left the second panel empty and put the labels on the stray figure.
# Passing ax= keeps the boxplot inside this figure.
df.boxplot(column="seq_length", by="label", rot=45, ax=box_ax)
box_ax.set_xlabel("Class")
box_ax.set_ylabel("Sequence Length")
box_ax.set_title("Sequence Length by Class")
length_fig.suptitle("")  # Remove the figure title pandas adds for by= plots

length_fig.tight_layout()
plt.show()

# Summary statistics of the length column.
print(f"Mean length: {df['seq_length'].mean():.1f}")
print(f"Median length: {df['seq_length'].median():.1f}")
print(f"Max length: {df['seq_length'].max()}")