In [1]:
import pandas as pd
import re
from html import unescape


def clean_markdown(md_text):
    """Strip common Markdown/HTML markup from ``md_text`` and return plain text.

    Substitutions run in a fixed order: fenced code, inline code, images and
    links first; then line-level constructs (headings, blockquotes, horizontal
    rules, list markers); then inline emphasis; finally pipe-delimited table
    cells, raw HTML tags and HTML entities. Line-level markers are handled
    before emphasis so that a ``*`` bullet or a ``***`` rule is not consumed
    as an emphasis delimiter.

    Args:
        md_text: Markdown-formatted string.

    Returns:
        The string with markup removed. Code (fenced and inline), images,
        blockquote lines and anything between two pipe characters are dropped
        entirely; link text, heading text and emphasised text are kept.
    """
    # Remove fenced code blocks (DOTALL lets the match span several lines)
    md_text = re.sub(r'```.*?```', '', md_text, flags=re.DOTALL)
    # Remove inline code
    md_text = re.sub(r'`[^`]*`', '', md_text)
    # Remove images (must run before links, which share the [..](..) shape)
    md_text = re.sub(r'!\[.*?\]\(.*?\)', '', md_text)
    # Remove links but keep link text
    md_text = re.sub(r'\[([^\]]+)\]\(.*?\)', r'\1', md_text)
    # Remove heading markers; anchored to the line start so that a "# " in
    # running text (e.g. "C# is ...") is left alone
    md_text = re.sub(r'^[ \t]*#+ ', '', md_text, flags=re.MULTILINE)
    # Remove blockquotes (the whole quoted line is dropped)
    md_text = re.sub(r'^>.*$', '', md_text, flags=re.MULTILINE)
    # Remove horizontal rules
    md_text = re.sub(r'^\s*[-*_]{3,}\s*$', '', md_text, flags=re.MULTILINE)
    # Remove list markers; the optional indent applies to bullets and to
    # numbered items alike
    md_text = re.sub(r'^[ \t]*(?:[-*+]|\d+\.)\s+', '', md_text, flags=re.MULTILINE)
    # Remove bold and italic (groups of *, _); doubled delimiters first
    md_text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', md_text)
    md_text = re.sub(r'(\*|_)(.*?)\1', r'\2', md_text)
    # Remove tables (everything between a pair of pipes on one line)
    md_text = re.sub(r'\|.*?\|', '', md_text)
    # Remove raw HTML tags
    md_text = re.sub(r'<.*?>', '', md_text)
    # Decode HTML entities
    md_text = unescape(md_text)
    return md_text
    
def clean_text(t):
    """Strip Markdown from ``t`` and flatten it to one single-spaced line.

    Args:
        t: Raw text, possibly containing Markdown/HTML markup.

    Returns:
        The text after ``clean_markdown``, with newlines, tabs, carriage
        returns and the literal two-character sequence ``^M`` replaced by
        spaces, runs of spaces collapsed to one, and any space directly
        before a comma removed.
    """
    t = clean_markdown(t)
    # "^M" is matched as literal text (caret + M), presumably caret-notation
    # carriage returns left in the data; real "\r" characters are handled too.
    for token in ("\n", "\t", "^M", "\r"):
        t = t.replace(token, " ")
    # Collapse runs of spaces BEFORE fixing commas: once at most one space can
    # precede a comma, a single replace removes it. Doing it the other way
    # round left "a  ," as "a ,".
    t = re.sub(" +", " ", t)
    t = t.replace(" ,", ",")
    return t

# Evaluation split, read from the Kaggle input dataset mounted under /kaggle/input.
TESTING_PARQUET = "/kaggle/input/dactyl-complete-release-adversarial-included/complete_testing.parquet"
testing_frame = pd.read_parquet(TESTING_PARQUET)


In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline
from tqdm import tqdm
# Hugging Face checkpoint used as the AI-text detector.
MODEL_ID = "fakespot-ai/roberta-base-ai-text-detection-v1"
pipe = pipeline(task="text-classification", model=MODEL_ID)

def data():
    """Lazily yield the cleaned text of each row of ``testing_frame["text"]``, in order."""
    yield from map(clean_text, testing_frame["text"].values)
# Collect the detector's "AI" score for every input text, one entry per row.
y_pred = []
predictions = pipe(
    data(),
    batch_size=16,
    max_length=512,
    truncation=True,
    # Ask for the score of every label; this is the documented replacement
    # for the deprecated return_all_scores=True.
    top_k=None,
)
for result in tqdm(predictions, total=len(testing_frame)):
    ai_scores = [output["score"] for output in result if output["label"] == "AI"]
    if len(ai_scores) != 1:
        # Fail on the first bad result instead of silently skipping it, which
        # would only surface later as a length mismatch against testing_frame.
        raise ValueError(
            f"expected exactly one 'AI' score per text, got labels "
            f"{[output['label'] for output in result]}"
        )
    y_pred.append(ai_scores[0])



config.json:   0%|          | 0.00/848 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cuda:0
100%|██████████| 94240/94240 [34:02<00:00, 46.15it/s]


In [3]:
# Attach the scores to the evaluation frame (pandas raises if the lengths differ).
testing_frame["fakespot_pred"] = y_pred
# Write a text-free copy under its own name instead of rebinding testing_frame:
# the "text" column stays available, so this cell and the inference cell can
# both be re-run without a KeyError.
results_frame = testing_frame.drop(columns="text")
results_frame.to_csv("fakespot-results.csv", index=False)
