In [5]:
import pandas as pd
from pathlib import Path
import numpy as np

# Input files (relative, Windows-style separators)
df1_path = r"nlp_output\classified_.csv"
df2_path = r"csv_output\300_manually_filtered\_final.csv"

# Read both tables
df1, df2 = pd.read_csv(df1_path), pd.read_csv(df2_path)

# Build the shared join key "doi_norm" on each table (vectorized string ops).
# csv1: start from the trimmed, lower-cased "file" value ...
df1["doi_norm"] = df1["file"].astype(str).str.strip().str.lower()
# ... strip a trailing ".json" extension ...
df1["doi_norm"] = df1["doi_norm"].str.replace(r"\.json$", "", regex=True)
# ... and turn only the first "_" into "/".
df1["doi_norm"] = df1["doi_norm"].str.replace("_", "/", n=1)

# csv2: the "DOI" column only needs trimming and lower-casing.
df2["doi_norm"] = df2["DOI"].astype(str).str.strip().str.lower()

# Merge + flags
# The outer join keeps every row from both tables. The merge indicator records
# which side each row came from, so the flags below do not rely on the "file"
# and "DOI" columns: those would be renamed by `suffixes` if both tables had a
# column of the same name, and a null cell in either would misreport the row's
# origin.
merged = df1.merge(
    df2,
    on="doi_norm",
    how="outer",
    suffixes=("_csv1", "_csv2"),
    indicator="_merge_side",
)
merged["in_csv1"] = merged["_merge_side"].isin(["both", "left_only"])
merged["in_csv2"] = merged["_merge_side"].isin(["both", "right_only"])

# Every row gets exactly one label. Mapping the indicator avoids np.select's
# integer default (0), which has no common dtype with the string labels.
merged["match_type"] = (
    merged["_merge_side"]
    .astype(str)
    .map({"both": "both", "left_only": "csv1_only", "right_only": "csv2_only"})
)

# Drop the helper column so downstream CSVs keep the same columns as before.
merged = merged.drop(columns="_merge_side")

# Subsets
matching_query = merged[merged["match_type"] == "both"]
only_in_csv1   = merged[merged["match_type"] == "csv1_only"]
only_in_csv2   = merged[merged["match_type"] == "csv2_only"]

# Write results next to the first input file, reusing its stem and extension.
p1 = Path(df1_path)
out_dir = p1.parent
base = p1.stem
ext = p1.suffix

# The matched rows are always written, even when there are none.
matching_query.to_csv(out_dir / f"{base}matching{ext}", index=False)

# The one-sided subsets are written only when they contain rows.
optional_outputs = {
    f"{base}_only_in_csv1{ext}": only_in_csv1,
    f"{base}_only_in_csv2{ext}": only_in_csv2,
}
for out_name, subset in optional_outputs.items():
    if subset.empty:
        continue
    subset.to_csv(out_dir / out_name, index=False)


In [None]:
import pandas as pd

def _norm_doi(s: pd.Series) -> pd.Series:
    return s.astype(str).str.strip().str.lower()

# Paths
# NOTE: out_dir, base and ext are not defined in this cell; they must already
# exist in the kernel (they locate the "matching" file read below).
verification_path = r"csv_output\300_manually_filtered\_verification-list.csv"
matching_path     = out_dir / f"{base}matching{ext}"

# Load
matching_df = pd.read_csv(matching_path)
verify_df   = pd.read_csv(verification_path)

# Normalize verification DOIs
verify_df["doi_norm"] = _norm_doi(verify_df["DOI"])

# Ensure matching has doi_norm (derive if missing)
if "doi_norm" not in matching_df.columns:
    if "DOI" in matching_df.columns:
        matching_df["doi_norm"] = _norm_doi(matching_df["DOI"])
    elif "file" in matching_df.columns:
        # File names carry the DOI: drop a trailing ".json" and turn the
        # first "_" into "/".
        matching_df["doi_norm"] = (
            matching_df["file"].astype(str).str.strip().str.lower()
                      .str.replace(r"\.json$", "", regex=True)
                      .str.replace("_", "/", n=1)
        )
    else:
        # Fail here with a clear message instead of inside merge() below.
        raise KeyError(
            "matching file has none of the columns 'doi_norm', 'DOI' or "
            "'file'; cannot derive the DOI join key"
        )

# One lookup row per DOI: a DOI listed more than once in the verification
# file would otherwise duplicate every matching row in the inner merge.
verify_keys = (
    verify_df[["doi_norm", "DOI"]]
    .drop_duplicates(subset="doi_norm")
    .rename(columns={"DOI": "DOI_verify"})
)

# Semi-join: keep FULL rows from matching for DOIs in verification list
review_full = matching_df.merge(verify_keys, on="doi_norm", how="inner")

# Save full-column review file
review_full_out = out_dir / f"{base}review{ext}"
review_full.to_csv(review_full_out, index=False)