In [None]:
import os

from huggingface_hub import login

# Read the Hugging Face token from the environment instead of embedding it in
# the notebook, so the secret is never saved alongside the code. When the
# variable is unset this is None and `login` falls back to prompting for it.
HF_TOKEN = os.environ.get("HF_TOKEN")
login(HF_TOKEN)

In [63]:
from datasets import load_dataset

# Load the "train" split of the Vijayrathank/reddit-health-small dataset.
dataset = load_dataset(
    path="Vijayrathank/reddit-health-small",
    split="train",
)

In [64]:
import re, html

def clean_comment(text):
    """Normalize a tokenized, HTML-escaped comment into more readable text.

    The steps run in a fixed order because later steps rely on the spacing
    produced by earlier ones:

    1. Unescape HTML entities.
    2. Re-join URL schemes and ``www.`` prefixes that were split into single
       characters, keeping ``https`` as ``https``.
    3. Remove whitespace around ``.``, ``/`` and ``-``.
    4. Rewrite ``/r/name`` and ``/u/name`` mentions as ``r/name`` / ``u/name``.
    5. Re-join contractions and tighten apostrophes.
    6. Replace markdown links ``[label](target)`` with ``label``.
    7. Fix spacing around punctuation, collapse whitespace runs, strip, and
       upper-case the first character.

    Args:
        text: The raw comment body, or None.

    Returns:
        The cleaned string. None and empty input yield ``""``.
    """
    # Treat a missing body as empty text instead of failing inside
    # html.unescape.
    if text is None:
        return ""

    text = html.unescape(text)

    # Fix tokenized URLs and slashes. The optional "s" is captured so that an
    # https URL is not silently downgraded to http.
    text = re.sub(
        r"h\s*t\s*t\s*p\s*(s?)\s*:\s*/\s*/",
        lambda m: "http" + m.group(1).lower() + "://",
        text,
        flags=re.I,
    )
    text = re.sub(r"w\s*w\s*w\s*\.\s*", "www.", text, flags=re.I)
    text = re.sub(r"\s*\.\s*", ".", text)

    # Normalize subreddit and user mentions. This has to happen before the
    # slash collapse below: once whitespace around "/" is gone, a mention can
    # no longer be told apart from a path segment and would be glued onto the
    # preceding word. The lookbehind skips slashes that directly follow a word
    # character or another slash, i.e. "/r/" inside a path or right after a
    # URL scheme.
    text = re.sub(r"(?<![\w/])/\s*([ru])\s*/\s*", r"\1/", text)

    text = re.sub(r"\s*/\s*", "/", text)
    text = re.sub(r"\s*-\s*", "-", text)

    # Fix single-letter contractions such as: i ' m -> i'm
    text = re.sub(r"\b([A-Za-z])\s*'\s*([A-Za-z])\b", r"\1'\2", text)

    # Tighten any remaining apostrophe that is preceded by whitespace
    # (this is what turns "don ' t" into "don't").
    text = re.sub(r"\s+'\s*", "'", text)

    # Remove markdown link syntax, keeping only the link label
    text = re.sub(r"\[(.*?)\]\(.*?\)", r"\1", text)

    # Remove whitespace in front of punctuation
    text = re.sub(r"\s+([.,!?;:])", r"\1", text)

    # Add space after sentence-ending punctuation if missing.
    # NOTE: this also fires inside dotted host names
    # ("example.com" becomes "example. com").
    text = re.sub(r"([.!?])([A-Za-z])", r"\1 \2", text)

    # Fix repeated spaces
    text = re.sub(r"\s{2,}", " ", text).strip()

    # Capitalize first character
    if text:
        text = text[0].upper() + text[1:]

    return text


In [65]:
# Derive a "clean_text" column from each row's raw "body", then write the
# augmented dataset to the local "reddit-health-cleaned" directory.
dataset = dataset.map(
    lambda row: {"clean_text": clean_comment(row["body"])}
)
dataset.save_to_disk("reddit-health-cleaned")

Saving the dataset (0/1 shards):   0%|          | 0/17292 [00:00<?, ? examples/s]

In [66]:
from datasets import load_from_disk

# Load the dataset stored in the local "reddit-health-cleaned" directory.
ds = load_from_disk(dataset_path="reddit-health-cleaned")

In [None]:

import pandas as pd
import numpy as np

# Materialize the dataset as a DataFrame for ad-hoc inspection.
df = pd.DataFrame(ds)

# Positional indices of the rows whose "controversiality" equals 1. They are
# kept in a named variable because a bare expression in the middle of a cell
# is neither displayed nor stored, so its result would simply be lost.
controversial_idx = np.flatnonzero(df["controversiality"].to_numpy() == 1)

# Last expression of the cell: the notebook renders this example row.
df.iloc[2]

In [None]:
# Dependency parsing
import spacy

# Module-level spaCy pipeline; extract_triples() reads this global.
nlp = spacy.load(name="en_core_web_sm")

def extract_triples(text, verbose=False):
    """Extract (subject, verb lemma, object) triples from ``text``.

    The text is parsed with the module-level ``nlp`` pipeline. For every
    sentence, each token that is the dependency ROOT and tagged as a VERB is
    inspected: children to its left labelled ``nsubj``/``nsubjpass`` form the
    subject, children to its right labelled ``dobj``/``attr``/``pobj`` form
    the object. A triple is emitted only when both sides are non-empty;
    multiple matching children are joined with single spaces.

    Args:
        text: The text to parse. None or an empty string yields no triples.
        verbose: When True, print the extracted triples. Off by default so
            that mapping this function over a whole dataset does not flood
            the notebook output.

    Returns:
        A list of ``(subject, lemma, object)`` tuples, possibly empty.
    """
    # Nothing to parse; this also avoids handing None to the pipeline.
    if not text:
        return []

    subject_deps = ("nsubj", "nsubjpass")
    object_deps = ("dobj", "attr", "pobj")

    triples = []
    for sent in nlp(text).sents:
        for token in sent:
            # Only the verbal root of each sentence is considered.
            if token.dep_ != "ROOT" or token.pos_ != "VERB":
                continue
            subj = [w.text for w in token.lefts if w.dep_ in subject_deps]
            obj = [w.text for w in token.rights if w.dep_ in object_deps]
            if subj and obj:
                triples.append((" ".join(subj), token.lemma_, " ".join(obj)))

    if verbose:
        print(triples)
    return triples

def extract_triples_batch(batch):
    """Batched ``map`` callback that builds a "triples" column.

    Args:
        batch: Mapping with a "clean_text" entry holding the texts of the
            current batch.

    Returns:
        A dict with a single "triples" key: one ``extract_triples`` result
        per text, in the same order as ``batch["clean_text"]``.
    """
    per_comment = []
    for comment in batch["clean_text"]:
        per_comment.append(extract_triples(comment))
    return {"triples": per_comment}

# Apply the batch extractor to `ds` in batched mode, adding a "triples" column.
triples_ds = ds.map(
    function=extract_triples_batch,
    batched=True,
)


In [50]:
# Convert the mapped dataset to a DataFrame, keep only the rows whose
# "triples" entry is non-empty, and print that column.
triples_df = pd.DataFrame(triples_ds)
triples_df_nonempty = triples_df.loc[triples_df["triples"].map(len).gt(0)]
print(triples_df_nonempty["triples"])



3                                [[friend, have, burping]]
5                               [[states, have, programs]]
10                                  [[glass, cause, much]]
12                                  [[people, take, look]]
13                  [[russians, do, it], [you, see, list]]
                               ...                        
17279              [[i, find, rebuttal], [i, get, access]]
17280    [[gt mandache, leave, career], [most, leave, c...
17281    [[They, refuse, babies], [hospital, find, doct...
17285                                 [[I, read, studies]]
17291                        [[anything, have, potential]]
Name: triples, Length: 7610, dtype: object


In [None]:
# GPT-based triple extraction on a single sample comment.

import os

import openai

model = "gpt-4.1"

# Take the API key from the environment rather than embedding it in the
# notebook.
client = openai.Client(api_key=os.environ.get("OPENAI_API_KEY"))

# Task description only. The comment to analyse is sent separately as `input`;
# passing one combined prompt as both `instructions` and `input` would hand
# the model the task text and the comment twice.
system_prompt = """
You are an information extraction system designed to transform informal Reddit comments into factual knowledge triples.

Your task: Extract all factual or causal claim triples from the given text in the format:
(subject, relation, object)

Guidelines:
- Focus only on factual, scientific, or health-related claims (not opinions or emotions).
- Each triple must represent a single relationship that can, in principle, be verified.
- Use simple canonical verbs like: treats, causes, prevents, leads_to, associated_with, contains, increases, decreases, helps_with, used_for, etc.
- Avoid non-informative relations such as “is”, “have”, “make”, “thing”.
- Combine multi-word entities where appropriate (e.g., “vitamin D deficiency”, “blood pressure”).
- If no factual claim is found, return an empty list.
- Output strictly as tuple in this format:
[
  (subject, relation, object),
  ...
]
"""

# The single cleaned comment used for this trial run.
comment_text = ds[17280]["clean_text"]

response = client.responses.create(
    model=model,
    instructions=system_prompt,
    input=comment_text,
)

print(response.output_text)

[
  (high quality health care, associated_with, extra payment),
  (high quality health care, associated_with, black money),
  (low salaries, causes, corruption),
  (low salaries, leads_to, black market),
  (low salaries, decreases, applications for medical colleges),
  (most medical college graduates, leaves, country),
  (private clinics, associated_with, zero kickbacks),
  (private clinics, associated_with, decent services),
  (universal insurance system, allows, free procedures in private clinics),
  (universal insurance system, covers, most procedures in private clinics),
  (medics in state-run hospitals, paid, low salaries),
  (low salaries, leads_to, corruption)
]


In [None]:
import os, json
from huggingface_hub import InferenceClient


# Model identifiers for the inference client. Only `qwen_model` is in use;
# `llama_model` is referenced solely from a commented-out line.
qwen_model = "Qwen/Qwen3-8B"
llama_model = "meta-llama/Meta-Llama-3-8B-Instruct"


def extract_triples_via_hf(clean_text, max_tokens=256, verbose=False):
    """Ask a hosted chat model to extract claim triples from one comment.

    The extraction instructions and the comment are sent together as a single
    user message to an ``InferenceClient`` created for ``qwen_model``.

    Args:
        clean_text: The cleaned comment text to analyse.
        max_tokens: Upper bound on generated tokens, forwarded to
            ``chat_completion``.
        verbose: When True, print the input text and the model's answer.
            Off by default so that mapping this function over a whole dataset
            does not flood the notebook output.

    Returns:
        The raw text content of the first completion choice (not parsed into
        Python tuples); an empty string when the model returned no content.
    """
    if verbose:
        print(clean_text)

    prompt = f"""
You are an information extraction system designed to transform informal Reddit comments into factual knowledge triples.

Your task: Extract all factual or causal claim triples from the given text in the format:
(subject, relation, object)

Guidelines:
- Focus only on factual, scientific, or health-related claims (not opinions or emotions).
- Each triple must represent a single relationship that can, in principle, be verified.
- Use simple canonical verbs like: treats, causes, prevents, leads_to, associated_with, contains, increases, decreases, helps_with, used_for, etc.
- Avoid non-informative relations such as “is”, “have”, “make”, “thing”.
- Combine multi-word entities where appropriate (e.g., “vitamin D deficiency”, “blood pressure”).
- If no factual claim is found, return an empty list.
- Output strictly as tuple in this format:
[
  (subject, relation, object),
  ...
]

Sentence:

{clean_text}
"""
    messages = [{"role": "user", "content": prompt}]
    client = InferenceClient(qwen_model)
    # client = InferenceClient(llama_model)

    # Forward max_tokens so the parameter actually limits the generation.
    response = client.chat_completion(messages=messages, max_tokens=max_tokens)

    # Normalise a missing content field to "" so callers always get a string.
    triples = response.choices[0].message.content or ""

    if verbose:
        print(triples if len(triples) > 0 else "No claim made")
    return triples

def extract_triples_batch(batch):
    """Batched ``map`` callback that builds a "triples" column via the HF model.

    Args:
        batch: Mapping with a "clean_text" entry holding the texts of the
            current batch.

    Returns:
        A dict with a single "triples" key: one ``extract_triples_via_hf``
        result per text, in the same order as ``batch["clean_text"]``.
    """
    model_answers = []
    for comment in batch["clean_text"]:
        model_answers.append(extract_triples_via_hf(comment))
    return {"triples": model_answers}


# Apply the batch extractor to `ds` in batched mode, adding a "triples" column.
triples_ds = ds.map(
    function=extract_triples_batch,
    batched=True,
)


