In [None]:
from huggingface_hub import login
HF_TOKEN="<HF_TOKEN>"
login(HF_TOKEN)

In [1]:
import pandas as pd
df_full=pd.read_csv("health_misinfo_speed_run.csv")

In [None]:
import re, html

def clean_comment(text):
    text = html.unescape(text)

    # Fix tokenized URLs and slashes
    text = re.sub(r"h\s*t\s*t\s*p\s*s?\s*:\s*/\s*/", "http://", text, flags=re.I)
    text = re.sub(r"w\s*w\s*w\s*\.\s*", "www.", text, flags=re.I)
    text = re.sub(r"\s*\.\s*", ".", text)
    text = re.sub(r"\s*/\s*", "/", text)
    text = re.sub(r"\s*-\s*", "-", text)

    # Fix contractions: i ' m → I'm, don ' t → don't, etc.
    text = re.sub(r"\b([A-Za-z])\s*'\s*([A-Za-z])\b", r"\1'\2", text)
    
    # Remove any remaining isolated apostrophes
    text = re.sub(r"\s+'\s*", "'", text)

    # Normalize subreddit and user mentions
    text = re.sub(r"/\s*r\s*/\s*", "r/", text)
    text = re.sub(r"/\s*u\s*/\s*", "u/", text)

    # Remove markdown artifacts
    text = re.sub(r"\[(.*?)\]\(.*?\)", r"\1", text)

    # Remove repeated punctuation spaces
    text = re.sub(r"\s+([.,!?;:])", r"\1", text)

    # Add space after sentence-ending punctuation if missing
    text = re.sub(r"([.!?])([A-Za-z])", r"\1 \2", text)

    # Fix repeated spaces
    text = re.sub(r"\s{2,}", " ", text).strip()

    # Capitalize first character
    if text:
        text = text[0].upper() + text[1:]

    return text
df_full['body'].apply(clean_comment)

In [26]:
def pick_relevant_data(data):
    health_keywords = [
    # --- The "Miracle" Foods & Herbs ---
    "moringa", "turmeric", "ginger", "garlic", "lemon", "honey", 
    "apple cider vinegar", "acv", "coconut oil", "soursop", "graviola",
    "apricot kernels", "b17", "laetrile", "black seed", "nigella sativa",
    "neem", "aloe vera", "oregano oil", "manuka", "elderberry",
    "colloidal silver", "silver solution", "sea moss", "alkaline water",
    
    # --- Substances & controversial treatments ---
    "ivermectin", "hydroxychloroquine", "hcq", "fenbendazole", 
    "mms", "miracle mineral solution", "chlorine dioxide", "bleach",
    "turpentine", "kerosene", "black salve", "bloodroot", 
    "urine therapy", "aged urine", "dmso", "hydrogen peroxide",
    "ozone therapy", "chelation", "coffee enema",
    
    # --- Buzzwords & Claims ---
    "cure", "cures", "cured", "curative", "treats", "treated", "treatment", "miracle", "remedy", "panacea",
    "detox", "detoxify", "cleanse", "purify", "parasite", "worm", "shedding",
    "natural alternative", "big pharma", "suppressed", "hidden truth",
    "ancient wisdom", "secret", "pharmaceutical mafia", "depopulation",
    "graphene", "5g", "spike protein", "clot shot", "jab", "bioweapon",
    "vaxx", "anti-vax", "vaccine injury", "herd immunity", "natural immunity",
    
    # --- Target Diseases (often subjects of misinformation) ---
    "cancer", "tumor", "diabetes", "malaria", "tuberculosis", "tb", 
    "aids", "hiv", "autism", "autistic", "lyme", "morgellons", 
    "candida", "leaky gut", "adrenal fatigue", "chronic fatigue", 
    "fibromyalgia", "lupus", "epilepsy", "alzheimer"
]
    text = str(data).lower()
    pattern = '|'.join(health_keywords)
    return bool(re.search(pattern, text, re.IGNORECASE))

# Filter dataframe to only keep rows with health keywords
mask = df_full['body'].apply(pick_relevant_data)
df_filtered = df_full[mask].copy()

print(f"Original rows: {len(df_full)}")
print(f"Filtered rows (with keywords): {len(df_filtered)}")
df_filtered.head()

Original rows: 100000
Filtered rows (with keywords): 86539


Unnamed: 0,author,body,controversiality,created_utc,link_id,score,subreddit,subreddit_id,id
0,UdonUdon,i wish i could truly speak for my father here ...,0,1325382013,t3_nxfwf,5,nba,t5_2qo4s,c3cur8t
1,nakp88d,"finally , some strong yet honest criticism , r...",0,1325382400,t3_nxf5a,2,india,t5_2qh1q,c3cusw7
2,The_Evil_Within,"yeah , d & amp ; d requires very little , hone...",0,1325384584,t3_nx1rb,1,rpg,t5_2qh2s,c3cv1rb
3,[deleted],hrrrrrrm . . . . well some of the more common ...,0,1325388991,t3_ny00l,1,funny,t5_2qh33,c3cvikf
4,srta_idiotica,i ' ll be shedding tear of joys if something e...,0,1325390493,t3_nxsrx,2,toronto,t5_2qi63,c3cvo9s


In [33]:
from datasets import Dataset
ds_filtered=Dataset.from_pandas(df_filtered)
ds_filtered.save_to_disk("reddit-health-86k")

Saving the dataset (1/1 shards): 100%|██████████| 86539/86539 [00:00<00:00, 911703.07 examples/s]


In [1]:
from datasets import load_from_disk

ds = load_from_disk("reddit-health-cleaned")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

import pandas as pd
import numpy as np

df=pd.DataFrame(ds)
np.where(df['controversiality']==1)

df.iloc[2]

author                                           chiropractorsananton
body                best chiropractor in san antonio http : / / ww...
controversiality                                                    0
created_utc                                                1325378875
link_id                                                      t3_ny3q0
score                                                               1
subreddit                                                      Health
subreddit_id                                                 t5_2qh9z
id                                                            c3cud7a
clean_text          Best chiropractor in san antonio http://www. d...
Name: 2, dtype: object

In [None]:
# Dependency parsing
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_triples(text):
    doc = nlp(text)
    triples = []
    for sent in doc.sents:
        for token in sent:
            if token.dep_ == "ROOT" and token.pos_ == "VERB":
                subj = [w.text for w in token.lefts if w.dep_ in ("nsubj", "nsubjpass")]
                obj = [w.text for w in token.rights if w.dep_ in ("dobj", "attr", "pobj")]
                if subj and obj:
                    triples.append((" ".join(subj), token.lemma_, " ".join(obj)))
    print(triples)
    return triples

def extract_triples_batch(batch):
    return {"triples": [extract_triples(t) for t in batch["clean_text"]]}

triples_ds = ds.map(extract_triples_batch, batched=True)


In [50]:
triples_df=pd.DataFrame(triples_ds)
triples_df_nonempty = triples_df[triples_df['triples'].apply(lambda x: len(x) > 0)]
print(triples_df_nonempty['triples'])



3                                [[friend, have, burping]]
5                               [[states, have, programs]]
10                                  [[glass, cause, much]]
12                                  [[people, take, look]]
13                  [[russians, do, it], [you, see, list]]
                               ...                        
17279              [[i, find, rebuttal], [i, get, access]]
17280    [[gt mandache, leave, career], [most, leave, c...
17281    [[They, refuse, babies], [hospital, find, doct...
17285                                 [[I, read, studies]]
17291                        [[anything, have, potential]]
Name: triples, Length: 7610, dtype: object


In [None]:
#gpt

import openai

model="gpt-4.1"

client=openai.Client(api_key="<openai key>")


system_prompt=f"""
You are an information extraction system designed to transform informal Reddit comments into factual knowledge triples.

Your task: Extract all factual or causal claim triples from the given text in the format:
(subject, relation, object)

Guidelines:
- Focus only on factual, scientific, or health-related claims (not opinions or emotions).
- Each triple must represent a single relationship that can, in principle, be verified.
- Use simple canonical verbs like: treats, causes, prevents, leads_to, associated_with, contains, increases, decreases, helps_with, used_for, etc.
- Avoid non-informative relations such as “is”, “have”, “make”, “thing”.
- Combine multi-word entities where appropriate (e.g., “vitamin D deficiency”, “blood pressure”).
- If no factual claim is found, return an empty list.
- Output strictly as tuple in this format:
[
  (subject, relation, object),
  ...
]

Sentence:

{ds[17280]["clean_text"]}
"""

response= client.responses.create(
    instructions=system_prompt,
    input=system_prompt,
    model=model
)

print(response.output_text)

[
  (high quality health care, associated_with, extra payment),
  (high quality health care, associated_with, black money),
  (low salaries, causes, corruption),
  (low salaries, leads_to, black market),
  (low salaries, decreases, applications for medical colleges),
  (most medical college graduates, leaves, country),
  (private clinics, associated_with, zero kickbacks),
  (private clinics, associated_with, decent services),
  (universal insurance system, allows, free procedures in private clinics),
  (universal insurance system, covers, most procedures in private clinics),
  (medics in state-run hospitals, paid, low salaries),
  (low salaries, leads_to, corruption)
]


In [None]:
import os, json
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


qwen_model = "Qwen/Qwen3-8B"
llama_model= "meta-llama/Meta-Llama-3-8B-Instruct"


def load_model(model_name=qwen_model):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        dtype=torch.float16,
        device_map="auto",
        attn_implementation="sdpa",
        low_cpu_mem_usage=True
    )

    return tokenizer, model


def extract_triples(clean_text, tokenizer, model, max_new_tokens=256):
    system_prompt=f"""
You are an information extraction system designed to transform informal Reddit comments into factual knowledge triples.

Your task: Extract all factual or causal claim triples from the given text in the format:
(subject, relation, object)

Guidelines:
- Focus only on factual, scientific, or health-related claims (not opinions or emotions).
- Each triple must represent a single relationship that can, in principle, be verified.
- Use simple canonical verbs like: treats, causes, prevents, leads_to, associated_with, contains, increases, decreases, helps_with, used_for, etc.
- Avoid non-informative relations such as “is”, “have”, “make”, “thing”.
- Combine multi-word entities where appropriate (e.g., “vitamin D deficiency”, “blood pressure”).
- If no factual claim is found, return an empty list.
- Output strictly as tuple in this format:
[
  (subject, relation, object),
  ...
]
""" 
    prompt = f"{system_prompt}\n\nSentence:\n{clean_text}\n\nTriples:\n"
    
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    triples = tokenizer.decode(output[0], skip_special_tokens=True)
    if "Triples:" in triples:
        triples = triples.split("Triples:")[-1].strip()
        
    return triples



# def extract_triples_batch(batch, tokenizer, model):
#     results = []
#     for text in batch["clean_text"]:
#         res = extract_triples(text, tokenizer, model)
#         results.append(res)
#     return {"triples": results}


# tokenizer, model = load_model()
# triples_ds = ds.map(
#     lambda b: extract_triples_batch(b, tokenizer, model),
#     batch_size=10,
#     writer_batch_size=10,
#     batched=True
# )

In [3]:
# Efficient dataset write-back
from datasets import load_dataset,Dataset,DatasetInfo
from datasets.arrow_writer import ArrowWriter
from datasets.features import Features, Value


def process_dataset(input_dataset: Dataset, output_path: str):
    tokenizer, model = load_model()

    # Define new dataset schema
    new_features = Features({
        **input_dataset.features,
        "triples": Value("string")
    })

    # Create writer with correct signature
    writer = ArrowWriter(
        path=output_path,
        features=new_features
    )

    # Process each row efficiently
    for row in input_dataset:
        triples = extract_triples(row["clean_text"], tokenizer, model)
        row_out = {**row, "triples": triples}
        writer.write(row_out)

    writer.finalize()

In [4]:
process_dataset(ds, "output.arrow")
final_ds = Dataset.from_file("output.arrow")

Loading checkpoint shards: 100%|██████████| 5/5 [00:10<00:00,  2.12s/it]
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [81]:
from datasets import load_dataset, load_from_disk
ds_triples = load_from_disk("triples_output")

In [78]:
import ast
import re

def clean_and_parse_triples(raw_text: str):
    """
    Cleans model output and extracts triples in the form:
    [("subj","rel","obj"), ...]
    Robust against:
      - Qwen headers (<|eot_id|>, <|start_header_id|>...)
      - Extra prompts or assistant/user text
      - Multiple lists in single output
      - Trailing commas, newlines, stray characters
    """

    # 1. Remove Qwen / special token garbage
    raw_text = re.sub(r"<\|.*?\|>", "", raw_text)
    raw_text = re.sub(r"<.*?header.*?>", "", raw_text)

    # 2. Remove repeated chunks like "Sentence:" etc.
    raw_text = re.sub(r"Sentence:.*?Triples:", "", raw_text, flags=re.DOTALL)

    # 3. Find ALL list-like fragments: [ (...), (...) ]
    candidates = re.findall(r"\[\s*\(.*?\)\s*\]", raw_text, flags=re.DOTALL)

    if not candidates:
        return []  # no triples found

    triples = []

    for c in candidates:
        try:
            parsed = ast.literal_eval(c)     # safely parse Python literal
            if isinstance(parsed, list):
                # filter valid tuples only
                parsed = [
                    tup for tup in parsed
                    if isinstance(tup, tuple) and len(tup) == 3
                ]
                triples.extend(parsed)
        except:
            continue

    return triples


In [93]:

def normalize_triples(cell):
    return str(cell)


In [94]:
import pandas as pd

df= pd.DataFrame(ds_triples)

df["triples"] = df["triples"].apply(clean_and_parse_triples)
df["triples"] = df["triples"].apply(normalize_triples)



In [95]:
from datasets import Dataset

ds_clean = Dataset.from_pandas(df, preserve_index=False)
ds_clean.save_to_disk("triples_output_cleaned")


Saving the dataset (1/1 shards): 100%|██████████| 17292/17292 [00:00<00:00, 784645.31 examples/s]


In [108]:
ds_clean_pd=ds_clean.to_pandas()
example=ast.literal_eval(ds_clean_pd["triples"][17287])

example


'time spent in front of screens'

In [1]:
from datasets import load_dataset, load_from_disk
ds_triples = load_from_disk("triples_output_cleaned")

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
from datasets import load_dataset, load_from_disk
ds_linked = load_from_disk("reddit-health-small-sapbert-full")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
ds_1=ds_linked.to_pandas()
ds_1=ds_1.sample(50)
for i in ds_1['umls_triples']:
    if i!=[]:
        print(i)

  if i!=[]:
