# Exploratory Data Analysis of Persian Ezafe Constructions

This notebook prepares a preprocessed dataset of head-modifier pairs from raw `.conllu` data (or loads it if it has already been built) and explores it ahead of model training.
We will inspect ezafe labels, positional patterns, and head/modifier lemma frequencies.


In [None]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make the parent directory of the notebook importable (registered only once)
project_root = os.path.abspath(os.pardir)
already_registered = project_root in sys.path
if not already_registered:
    sys.path.append(project_root)

# Project-local preprocessing entry point (needs the path entry above)
from scripts.preprocess_data import preprocess_conllu_folder

# Show every column when a DataFrame is rendered
pd.options.display.max_columns = None


In [None]:
import os
import sys

# Add the project root (the directory above the notebook) to the import path.
# The membership check keeps this cell idempotent: re-running it, or running
# it after another cell has already registered the same directory, must not
# pile up duplicate entries in sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

print("Project root added to PYTHONPATH:", project_root)


In [None]:
# Load the head-modifier dataset into `df`.
# An existing preprocessed CSV is reused; otherwise it is built from the raw
# .conllu files. Raises FileNotFoundError when neither source is available.

# Define paths relative to the notebook
conllu_folder = "../data/conllu_files"
preprocessed_csv = "../data/preprocessed/head_modifier_pairs.csv"

# Ensure the output folder exists. It is derived from the CSV path so the two
# cannot drift apart if the path above is edited.
os.makedirs(os.path.dirname(preprocessed_csv), exist_ok=True)

if os.path.exists(preprocessed_csv):
    print("✔ Preprocessed CSV found. Loading...")
    df = pd.read_csv(preprocessed_csv)
else:
    print("⚠ Preprocessed CSV not found.")

    # Look for real .conllu files (in subfolders too) rather than "any entry":
    # stray files such as .gitkeep or .DS_Store must not count as raw data.
    # os.walk yields nothing for a missing folder, so that case is covered.
    has_conllu_files = any(
        file_name.lower().endswith(".conllu")
        for _, _, file_names in os.walk(conllu_folder)
        for file_name in file_names
    )

    if has_conllu_files:
        print("✔ Raw .conllu files found. Running preprocessing...")
        # NOTE(review): assumes preprocess_conllu_folder returns the DataFrame
        # it writes to output_csv -- confirm against scripts/preprocess_data.py
        df = preprocess_conllu_folder(conllu_folder, output_csv=preprocessed_csv)
    else:
        raise FileNotFoundError(
            "❌ No preprocessed CSV and no raw .conllu files found.\n"
            "Please place raw files in: data/conllu_files/"
        )

print(f"Loaded dataset with {len(df)} rows and {df.shape[1]} columns.")
df.head()


In [None]:
# Textual overview of the dataset: structure, summary statistics, and the
# distributions of the two categorical columns.

# info() prints its report directly
df.info()

# describe() only returns a DataFrame. A notebook cell renders just its last
# expression, so the summary has to be printed explicitly to be visible here.
print(df.describe(include="all"))

print("Ezafe label distribution:")
print(df["ezafe_label"].value_counts(dropna=False))

print("\nHead position distribution:")
print(df["position"].value_counts(dropna=False))


In [None]:
# Bar chart with one bar per ezafe label value
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x="ezafe_label", ax=ax)
ax.set_title("Ezafe Label Distribution")
fig.tight_layout()
plt.show()


In [None]:
# Bar chart with one bar per value of the `position` column
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x="position", ax=ax)
ax.set_title("Head Position (before/after)")
fig.tight_layout()
plt.show()


In [None]:
# One 40-bin histogram per frequency column, each drawn in its own figure
for freq_column, plot_title in (
    ("head_freq", "Head Lemma Frequencies"),
    ("modifier_freq", "Modifier Lemma Frequencies"),
):
    plt.figure(figsize=(6, 4))
    sns.histplot(df[freq_column], bins=40)
    plt.title(plot_title)
    plt.tight_layout()
    plt.show()


In [None]:
# Eyeball ten randomly drawn rows
df.sample(n=10)
