# EDA – Seq2Seq Dataset (src/tgt)

This notebook performs quick exploratory data analysis for a sequence-to-sequence dataset
with **TSV files** and **columns**:
- `src` – input text
- `tgt` – target text

It assumes files live under:
```
data/processed/train.tsv
data/processed/dev.tsv
data/processed/test.tsv
```
You can change the paths below if needed.

**What you'll get:**
- Basic dataset sizes and missing-value checks
- Token length distributions (src & tgt)
- Character length distributions
- Vocabulary size (whitespace tokenization by default)
- Sample pairs and longest/shortest examples
- Character n-gram frequency peek (optional)


In [5]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Render figures at a higher resolution. No explicit colors are configured here,
# so plots fall back to matplotlib's defaults.
plt.rcParams['figure.dpi'] = 120

# Dataset locations (edit as needed); all three splits share one directory
DATA_DIR = Path('data') / 'processed'
TRAIN_PATH, DEV_PATH, TEST_PATH = (
    DATA_DIR / f'{split}.tsv' for split in ('train', 'dev', 'test')
)

# Names of the input-text and target-text columns
SRC_COL, TGT_COL = 'src', 'tgt'

def read_tsv(p: Path) -> pd.DataFrame:
    """Load a tab-separated file as an all-string DataFrame without missing values.

    Every column is read as ``str``. Literal tokens such as ``NA``, ``null``,
    ``None`` or ``nan`` are kept verbatim (``keep_default_na=False``); with
    pandas' default NA parsing they would become NaN and then be blanked out
    by the ``fillna('')`` below, silently corrupting the text.

    Args:
        p: Path to the TSV file; its first line is used as the header.

    Returns:
        A DataFrame of strings in which any remaining missing cell is ''.

    Raises:
        FileNotFoundError: If ``p`` does not exist.
    """
    if not p.exists():
        raise FileNotFoundError(f"Missing file: {p}. Please adjust the path or add the file.")
    frame = pd.read_csv(p, sep='\t', dtype=str, keep_default_na=False)
    # Safety net for cells pandas may still report as missing
    return frame.fillna('')

# Load the three splits
train = read_tsv(TRAIN_PATH)
dev = read_tsv(DEV_PATH)
test = read_tsv(TEST_PATH)

# Fail fast when a split lacks either of the expected text columns
required_cols = {SRC_COL, TGT_COL}
for name, df in (('train', train), ('dev', dev), ('test', test)):
    if not required_cols.issubset(df.columns):
        raise ValueError(f"{name}.tsv must have columns '{SRC_COL}' and '{TGT_COL}' – got {list(df.columns)}")


FileNotFoundError: Missing file: data/processed/train.tsv. Please adjust the path or add the file.

In [None]:
# Sizes and basic info
def dataset_info(df: pd.DataFrame, name: str):
    """Print the row count and per-column null / empty-string counts for one split.

    Only the SRC_COL and TGT_COL columns are inspected. Frames loaded through
    read_tsv have their missing cells replaced by '', so for those frames
    missing values show up in the empty-string count.
    """
    print(f"=== {name.upper()} ===")
    print(f"Rows: {len(df)}")
    text_cols = df[[SRC_COL, TGT_COL]]
    print("Nulls per column:\n", text_cols.isna().sum())
    print("Empty strings per column:\n", text_cols.eq('').sum())
    print('-' * 40)

# Report the same summary for every split
for split_name, split_df in (('train', train), ('dev', dev), ('test', test)):
    dataset_info(split_df, split_name)


In [None]:
# Token & character lengths
def to_tokens(s: str):
    """Split ``s`` into tokens on runs of whitespace.

    This is the notebook's tokenizer hook: swap the body for a real tokenizer
    if whitespace splitting is too crude for your data.
    """
    tokens = s.split()
    return tokens

def add_lengths(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with character- and token-length columns added.

    Adds ``src_char_len``, ``tgt_char_len``, ``src_tok_len`` and
    ``tgt_tok_len`` (in that order); token counts come from ``to_tokens``.
    The input frame is left untouched.
    """
    def _token_count(text):
        return len(to_tokens(text))

    sides = (('src', SRC_COL), ('tgt', TGT_COL))
    out = df.copy()
    # Character lengths first, then token lengths, to keep the column order stable
    for prefix, column in sides:
        out[f'{prefix}_char_len'] = out[column].str.len()
    for prefix, column in sides:
        out[f'{prefix}_tok_len'] = out[column].apply(_token_count)
    return out

# Length-augmented copies of each split
train_len, dev_len, test_len = (add_lengths(split) for split in (train, dev, test))

def plot_hist(series, title, bins=50):
    """Draw a histogram of ``series`` on a fresh figure and show it.

    Args:
        series: pandas Series of lengths to bin.
        title: Figure title.
        bins: Number of histogram bins.
    """
    _, ax = plt.subplots()
    series.hist(bins=bins, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Length')
    ax.set_ylabel('Count')
    plt.show()

# One histogram per length column of the training split
for length_col, plot_title in (
    ('src_tok_len', 'Train: Source token length'),
    ('tgt_tok_len', 'Train: Target token length'),
    ('src_char_len', 'Train: Source char length'),
    ('tgt_char_len', 'Train: Target char length'),
):
    plot_hist(train_len[length_col], plot_title)


In [None]:
# Show a few examples
# Widen text columns so long examples stay readable
pd.options.display.max_colwidth = 160

pair_cols = [SRC_COL, TGT_COL]
ranked_cols = pair_cols + ['src_tok_len']
n_samples = min(5, len(train))  # never request more rows than exist

print('Random samples:')
display(train.sample(n=n_samples, random_state=42)[pair_cols])

print('\nShortest by src tokens:')
display(train_len.nsmallest(5, 'src_tok_len')[ranked_cols])

print('\nLongest by src tokens:')
display(train_len.nlargest(5, 'src_tok_len')[ranked_cols])


In [None]:
# Simple vocabulary stats (whitespace tokens)
from collections import Counter

def vocab_stats(df: pd.DataFrame, col: str, top_k=20, tokenizer=None):
    """Count token frequencies in ``df[col]`` and print a short summary.

    Prints the number of unique tokens followed by the ``top_k`` most
    frequent tokens as ``token<TAB>count`` lines.

    Args:
        df: Frame holding the text column.
        col: Name of the column to analyse.
        top_k: How many of the most frequent tokens to print.
        tokenizer: Callable mapping a string to an iterable of tokens.
            Defaults to the notebook's ``to_tokens`` so vocabulary stats stay
            consistent with the token-length columns if that hook is replaced.

    Returns:
        A ``collections.Counter`` mapping token -> frequency.
    """
    if tokenizer is None:
        tokenizer = to_tokens
    counter = Counter()
    # Count row by row instead of materialising one giant token list first
    for text in df[col]:
        counter.update(tokenizer(text))
    print(f"Unique tokens in {col}: {len(counter)}")
    print(f"Top {top_k} tokens:")
    for tok, cnt in counter.most_common(top_k):
        print(f"{tok}\t{cnt}")
    return counter

# Vocabulary of the training sources
print('=== Source vocab (train) ===')
src_counter = vocab_stats(df=train, col=SRC_COL)

# Vocabulary of the training targets
print('\n=== Target vocab (train) ===')
tgt_counter = vocab_stats(df=train, col=TGT_COL)


In [None]:
# Optional: quick n-gram peek (character n-grams)
from collections import Counter

def char_ngrams(s: str, n: int = 3):
    """Return every contiguous character n-gram of ``s``, left to right.

    Whitespace is kept as-is, so grams may contain spaces. A string shorter
    than ``n`` yields an empty list.

    Args:
        s: Text to slice.
        n: Gram length in characters; must be at least 1.

    Returns:
        List of ``len(s) - n + 1`` substrings of length ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1 (such values would otherwise
            silently produce empty or truncated "grams").
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    return [s[i:i + n] for i in range(len(s) - n + 1)]

def top_char_ngrams(df: pd.DataFrame, col: str, n=3, top_k=30):
    """Print the ``top_k`` most frequent character ``n``-grams found in ``df[col]``.

    Grams are produced per row by ``char_ngrams`` and printed as
    ``gram<TAB>count`` lines under a one-line header.
    """
    gram_counts = Counter(
        gram
        for text in df[col].tolist()
        for gram in char_ngrams(text, n)
    )
    print(f"Top {top_k} char {n}-grams in {col}:")
    for gram, cnt in gram_counts.most_common(top_k):
        print(f"{gram}\t{cnt}")

# Same n-gram peek for the source and target sides of the training split
for text_col in (SRC_COL, TGT_COL):
    top_char_ngrams(train, text_col, n=3, top_k=30)
