In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

# --- CONFIG ---
# Replace with your sheet link
SHEET_URL = "https://docs.google.com/spreadsheets/d/1ZeeuWtUgenEp8wBagQxCFyP-Kcw61kIKxzXorudzNUk/export?format=csv&gid="

# Corrected GIDs
SHEETS = {
    "GPTZero": "0",
    "ZeroGPT": "2140979269"
}

# --- FUNCTIONS ---

def load_sheet(sheet_name, gid):
    url = SHEET_URL + gid
    df = pd.read_csv(url)
    df["sheet"] = sheet_name
    return df

def parse_detector_result(col):
    """Split 'AI 100' → ('AI', 100)."""
    if pd.isna(col):
        return pd.Series([None, None])
    parts = str(col).split()
    if len(parts) == 2 and parts[0] in ["AI", "Human"]:
        try:
            return pd.Series([parts[0], float(parts[1])])
        except:
            return pd.Series([parts[0], None])
    return pd.Series([col, None])

def tfidf_similarity(o, a):
    if not isinstance(o, str) or not isinstance(a, str):
        return np.nan
    v = TfidfVectorizer().fit_transform([o, a])
    return cosine_similarity(v[0], v[1])[0][0]

# --- MAIN CHECK PIPELINE ---
all_dfs = []
for name, gid in SHEETS.items():
    df = load_sheet(name, gid)
    print(f"\n--- {name} sheet loaded: {df.shape[0]} rows, {df.shape[1]} cols ---")

    # 1. Schema & missing-values
    print("Missing values per column:")
    print(df.isna().sum())

    # 2. Label consistency
    if "source" in df.columns:
        bad_labels = df[~df["source"].isin(["Human", "AI"])]
        print(f"Inconsistent labels: {len(bad_labels)}")

    # 3. Detector parsing (ORG & ADV)
    if "detector_result_ORG" in df.columns:
        df[["pred_label_ORG", "pred_score_ORG"]] = df["detector_result_ORG"].apply(parse_detector_result)
    if "detector_result_ADV" in df.columns:
        df[["pred_label_ADV", "pred_score_ADV"]] = df["detector_result_ADV"].apply(parse_detector_result)

    # 4. Length & drift check
    df["len_org"] = df["original_text"].astype(str).str.split().str.len()
    df["len_adv"] = df["adversarial_text"].astype(str).str.split().str.len()
    df["len_ratio"] = df["len_adv"] / (df["len_org"] + 1e-9)
    print("Length ratio summary:")
    print(df["len_ratio"].describe())

    outliers = df[(df["len_ratio"] < 0.6) | (df["len_ratio"] > 1.7)]
    print(f"Length ratio outliers: {outliers.shape[0]}")

    # 5. TF-IDF similarity
    df["sim"] = df.apply(lambda r: tfidf_similarity(r["original_text"], r["adversarial_text"]), axis=1)
    print("Similarity summary:")
    print(df["sim"].describe())
    low_sim = df[df["sim"] < 0.6]
    print(f"Low similarity pairs (<0.6): {low_sim.shape[0]}")

    # 6. Manual validation sample (print random 10)
    sample = df.sample(min(10, len(df)), random_state=42)
    print("\nManual validation sample:")
    for _, row in sample.iterrows():
        print("---")
        print("Original:", row["original_text"][:200], "...")
        print("Adversarial:", row["adversarial_text"][:200], "...")

    all_dfs.append(df)

final_df = pd.concat(all_dfs, ignore_index=True)
print("\nFinal combined dataset:", final_df.shape)



--- GPTZero sheet loaded: 300 rows, 19 cols ---
Missing values per column:
sample_id                  0
topic                      0
Category                   0
genre                      0
source                     0
LLM                        0
if_llm_version           300
length_words               0
original_text              0
detector_result_ORG        0
detector_ai_score_ORG      0
edit_type                  0
editor_tool                0
adversarial_length         0
adversarial_text           0
detector_results_ADV       0
detector_ai_score_ADV      0
expected_impact            1
sheet                      0
dtype: int64
Inconsistent labels: 0
Length ratio summary:
count    300.000000
mean       0.997307
std        0.195501
min        0.470430
25%        0.905297
50%        1.005814
75%        1.050057
max        1.895522
Name: len_ratio, dtype: float64
Length ratio outliers: 9
Similarity summary:
count    300.000000
mean       0.821800
std        0.191126
min        0.16055

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- CONFIG ---
SHEET_URL = "https://docs.google.com/spreadsheets/d/1ZeeuWtUgenEp8wBagQxCFyP-Kcw61kIKxzXorudzNUk/export?format=csv&gid="
SHEETS = {
    "GPTZero": "0",
    "ZeroGPT": "2140979269"
}

# --- FUNCTIONS ---
def load_sheet(sheet_name, gid):
    url = SHEET_URL + gid
    df = pd.read_csv(url)
    df["sheet"] = sheet_name
    return df

def parse_detector_result(col):
    """Split 'AI 100' → ('AI', 100)."""
    if pd.isna(col):
        return pd.Series([None, None])
    parts = str(col).split()
    if len(parts) == 2 and parts[0] in ["AI", "Human"]:
        try:
            return pd.Series([parts[0], float(parts[1])])
        except:
            return pd.Series([parts[0], None])
    return pd.Series([col, None])

def tfidf_similarity(o, a):
    if not isinstance(o, str) or not isinstance(a, str):
        return np.nan
    v = TfidfVectorizer().fit_transform([o, a])
    return cosine_similarity(v[0], v[1])[0][0]

def analyze_sheet(df, sheet_name):
    """Run all checks and return summary stats as a dict"""
    # Parse detector outputs
    if "detector_result_ORG" in df.columns:
        df[["pred_label_ORG", "pred_score_ORG"]] = df["detector_result_ORG"].apply(parse_detector_result)
    if "detector_result_ADV" in df.columns:
        df[["pred_label_ADV", "pred_score_ADV"]] = df["detector_result_ADV"].apply(parse_detector_result)

    # Length ratio
    df["len_org"] = df["original_text"].astype(str).str.split().str.len()
    df["len_adv"] = df["adversarial_text"].astype(str).str.split().str.len()
    df["len_ratio"] = df["len_adv"] / (df["len_org"] + 1e-9)
    len_stats = df["len_ratio"].describe()

    # Similarity
    df["sim"] = df.apply(lambda r: tfidf_similarity(r["original_text"], r["adversarial_text"]), axis=1)
    sim_stats = df["sim"].describe()

    # Counts
    missing_vals = df.isna().sum().sum()
    inconsistent_labels = df[~df["source"].isin(["Human", "AI"])].shape[0]
    outliers_len = df[(df["len_ratio"] < 0.6) | (df["len_ratio"] > 1.7)].shape[0]
    low_sim = df[df["sim"] < 0.6].shape[0]

    return {
        "Sheet": sheet_name,
        "Rows": df.shape[0],
        "Cols": df.shape[1],
        "Missing Values": missing_vals,
        "Inconsistent Labels": inconsistent_labels,
        "LenRatio Mean": round(len_stats["mean"], 3),
        "LenRatio Std": round(len_stats["std"], 3),
        "LenRatio Outliers": outliers_len,
        "Sim Mean": round(sim_stats["mean"], 3),
        "Sim Std": round(sim_stats["std"], 3),
        "LowSim Pairs (<0.6)": low_sim,
    }, df


# --- MAIN ---
summaries = []
all_dfs = []

for name, gid in SHEETS.items():
    df = load_sheet(name, gid)
    summary, df_cleaned = analyze_sheet(df, name)
    summaries.append(summary)
    all_dfs.append(df_cleaned)

# Compile summary table
summary_table = pd.DataFrame(summaries)

# Final combined dataset
final_df = pd.concat(all_dfs, ignore_index=True)

# --- OUTPUT ---
print("\n=== Summary Table ===")
print(summary_table.to_string(index=False))

print("\nFinal combined dataset:", final_df.shape)

# Save if needed
summary_table.to_csv("detector_dataset_summary.csv", index=False)
final_df.to_csv("detector_dataset_full.csv", index=False)



=== Summary Table ===
  Sheet  Rows  Cols  Missing Values  Inconsistent Labels  LenRatio Mean  LenRatio Std  LenRatio Outliers  Sim Mean  Sim Std  LowSim Pairs (<0.6)
GPTZero   300    25             601                    0          0.997         0.196                  9     0.822    0.191                   34
ZeroGPT   300    25             600                    0          0.997         0.196                  9     0.822    0.191                   34

Final combined dataset: (600, 25)
