**1**-INSTALL

In [None]:
!pip install torch torch_geometric pandas networkx scikit-learn --quiet
# Required for Groq API
!pip install groq

# Torch + PyTorch Geometric
!pip install torch torchvision torchaudio --quiet
!pip install torch==2.4.0 torch-geometric==2.5.3 torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.4.0+cpu.html
#!pip install  torch-geometric --quiet


# Core graph + ML + data tools
!pip install networkx pandas numpy scikit-learn

# Optional: if you use progress bars
!pip install tqdm


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

KeyboardInterrupt: 

2-**AGENT1**

In [None]:
import os
import pandas as pd
import logging
import time
import json
from typing import Dict, List
from groq import Groq
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score
from datetime import datetime

# Configuration
start_row = 0
end_row = 2000

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('agent1_history.txt', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Initialize Groq client
GROQ_API_KEY = ""
client = Groq(api_key=GROQ_API_KEY)

# Cache configuration
CACHE_FILE = "llm_cache.json"
CACHE_EXPIRY_SECONDS = 7 * 24 * 60 * 60  # 7 days

# Normalization
DISEASE_NORMALIZATION = {
    'corona': 'كورونا',
    'Covid-19': 'كورونا',
    'coronavirus': 'كورونا',
    'فيروس كورونا': 'كورونا'
}
EXCLUDED_TERMS = {'الصحية', 'santé', 'health', 'pandémie', 'pandemic', 'cas', 'cases', 'vaccine', 'حالات'}

def load_cache() -> Dict:
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, 'r', encoding='utf-8') as f:
                cache = json.load(f)
                current_time = time.time()
                return {
                    k: v for k, v in cache.items()
                    if v.get('timestamp', 0) + CACHE_EXPIRY_SECONDS > current_time
                }
        except Exception as e:
            logger.error(f"Error loading cache: {e}")
            return {}
    return {}

def save_cache(cache: Dict):
    try:
        with open(CACHE_FILE, 'w', encoding='utf-8') as f:
            json.dump(cache, f, ensure_ascii=False, indent=2)
    except Exception as e:
        logger.error(f"Error saving cache: {e}")

def summarize_article(text: str) -> str:
    cache = load_cache()
    cache_key = f"summarize_{hash(text)}"
    if cache_key in cache:
        logger.info(f"Using cached summary for text: {text[:50]}...")
        return cache[cache_key]['summary']

    prompt = (
        "You are a health news expert. Your task is to extract country–disease or country–vaccine pairs "
        "specifically mentioned in the headline below.\n\n"

        "Include only:\n"
        "- Specific diseases (e.g., كورونا, إنفلونزا, كوليرا), or\n"
        "- Recognized vaccine names (e.g., فايزر, موديرنا, أسترازينيكا) paired with a clearly identifiable country (e.g., البرازيل, أمريكا).\n\n"

        "Normalization rules:\n"
        "- All forms of corona (corona, Covid-19, فيروس كورونا) should become: كورونا\n"
        "- All vaccine names should become: لقاح\n\n"

        "Exclude:\n"
        "- Generic terms (e.g., حالات, الوضع, الصحية)\n"
        "- Vague or regional terms like: العالم, \n"
        "- Any headline without a clear (country, disease/vaccine) pair\n\n"

        "Expected output:\n"
        "- A single line with one or more pairs in the format: 'country disease' or 'country لقاح', separated by spaces.\n"
        "- If no valid pair exists, return an empty string.\n"
        "- Do NOT add any explanation or extra text.\n\n"

        f"Title: {text}\nResult:"
    )

    try:
        response = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150,
            temperature=0.2
        )
        result = response.choices[0].message.content.strip()
        pairs = result.split()
        cleaned_pairs = []
        for i in range(0, len(pairs), 2):
            if i + 1 < len(pairs):
                country, name = pairs[i], pairs[i + 1]
                if name in DISEASE_NORMALIZATION:
                    name = DISEASE_NORMALIZATION[name]
                if name and country and name not in EXCLUDED_TERMS and country not in {'العالم', 'أوروبا', ''}:
                    cleaned_pairs.append(f"{country} {name}")
        cleaned_result = " ".join(cleaned_pairs)
        cache[cache_key] = {
            'summary': cleaned_result,
            'timestamp': time.time()
        }
        save_cache(cache)
        logger.info(f"Generated summary for: {text[:50]}... -> {cleaned_result}")
        return cleaned_result
    except Exception as e:
        logger.error(f"LLM summarization error for text {text[:50]}... : {e}")
        return ""

BATCH_SIZE = 10

def batch_summarize(texts):
    return [summarize_article(text) for text in texts]

logger.info("Loading inputfilegraph.csv")
try:
    df = pd.read_csv("inputfilegraph.csv")
    logger.info(f"File has {len(df)} rows. Processing rows {start_row} to {end_row}")
    df = df.iloc[start_row:end_row].copy()
except FileNotFoundError:
    logger.error("File inputfilegraph.csv not found")
    raise
except Exception as e:
    logger.error(f"Error loading file: {e}")
    raise

if 'New_Text' not in df.columns:
    logger.error("Missing 'New_Text' column in inputfilegraph.csv")
    raise ValueError("Missing column 'New_Text'")

start_time = time.time()
summary_file = "summarized_inputfilegraph.csv"
logger.info("Running Agent 1 to summarize articles")

if os.path.exists(summary_file):
    try:
        summaries_df = pd.read_csv(summary_file, encoding='utf-8')
        if 'Summary' in summaries_df.columns:
            logger.info(f"Loaded summaries from {summary_file}")
            summaries_df = summaries_df.iloc[start_row:end_row].copy()
            df = df.iloc[:len(summaries_df)]
            df['Summary'] = summaries_df['Summary']
            df['Original'] = summaries_df['Original'] if 'Original' in summaries_df.columns else df['New_Text']
        else:
            raise Exception("Missing 'Summary' column in summary file.")
    except Exception as e:
        logger.warning(f"Problem loading summary file: {e}. Regenerating summaries.")
        texts = df['New_Text'].fillna("").astype(str).tolist()
        summaries = []
        for i in range(0, len(texts), BATCH_SIZE):
            summaries.extend(batch_summarize(texts[i:i+BATCH_SIZE]))
            time.sleep(1.0)
        df['Summary'] = summaries
        df['Original'] = df['New_Text']
        df[['Original', 'Summary']].to_csv(summary_file, index=False, encoding='utf-8')
        logger.info(f"Summaries saved to {summary_file}")
else:
    logger.info(f"Generating summaries for {len(df)} articles")
    texts = df['New_Text'].fillna("").astype(str).tolist()
    summaries = []
    for i in range(0, len(texts), BATCH_SIZE):
        summaries.extend(batch_summarize(texts[i:i+BATCH_SIZE]))
        time.sleep(1.0)
    df['Summary'] = summaries
    df['Original'] = df['New_Text']
    df[['Original', 'Summary']].to_csv(summary_file, index=False, encoding='utf-8')
    logger.info(f"Summaries saved to {summary_file}")

execution_time = time.time() - start_time
logger.info(f"Total execution time: {execution_time:.2f} seconds")

-MERGING

In [None]:
import pandas as pd

file_paths = [
    "0_2000.csv",
    "2000_4000.csv",
    "4000_6000.csv",
    "6000_8000.csv",
    "8000_10000.csv"
]

dfs = [pd.read_csv(fp, encoding='utf-8') for fp in file_paths]
merged_df = pd.concat(dfs, ignore_index=True)

# Optional: check if row count matches original dataset
print("Merged rows:", len(merged_df))

merged_df.to_csv("merged_summaries.csv", index=False, encoding='utf-8')


Merged rows: 9917


3-BERT SCORE (2 PARTS)

In [None]:
#code Bertscore(arabic + other)
!pip install bert-score
import pandas as pd
import logging
from bert_score import score

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Load the cleaned CSV file
try:
    df = pd.read_csv("merged_summaries.csv")
    logger.info("Cleaned CSV file loaded successfully.")
except FileNotFoundError:
    logger.error("The cleaned CSV file was not found.")
    raise
except Exception as e:
    logger.error(f"Error while loading the CSV file: {e}")
    raise

# Prepare the texts
df['Original'] = df['Original'].fillna("").astype(str)
df['Summary'] = df['Summary'].fillna("").astype(str)
original_texts = df['Original'].tolist()
summaries = df['Summary'].tolist()

# Compute BERTScore
logger.info("Calculating BERTScore...")
P, R, F1 = score(summaries, original_texts, lang="ar", model_type="bert-base-multilingual-cased")
avg_f1 = F1.mean().item()

# Display the results directly
print("\nBERTScore Results:")
print(f"Average BERTScore F1: {avg_f1:.4f}")
logger.info("BERTScore metric successfully displayed.")





BERTScore Results:
Average BERTScore F1: 0.6495


In [None]:
#code Bertscore (arabic)
import pandas as pd
import logging
from bert_score import score
import re

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Load the cleaned CSV file
try:
    df = pd.read_csv("merged_summaries.csv")
    logger.info("Cleaned CSV file loaded successfully.")
except FileNotFoundError:
    logger.error("The cleaned CSV file was not found.")
    raise
except Exception as e:
    logger.error(f"Error while loading the CSV file: {e}")
    raise

# Filtrer pour ne garder que les lignes où 'Summary' contient au moins un caractère arabe
import re
arabic_pattern = re.compile(r'[\u0600-\u06FF]')
df = df[df['Summary'].apply(lambda x: bool(arabic_pattern.search(str(x))))]
logger.info(f"Nombre de summaries en arabe : {len(df)}")

# Prepare the texts
df['Original'] = df['Original'].fillna("").astype(str)
df['Summary'] = df['Summary'].fillna("").astype(str)
original_texts = df['Original'].tolist()
summaries = df['Summary'].tolist()

# Compute BERTScore
logger.info("Calculating BERTScore...")
P, R, F1 = score(summaries, original_texts, lang="ar", model_type="bert-base-multilingual-cased")
avg_f1 = F1.mean().item()

# Display the results directly
print("\nBERTScore Results:")
print(f"Average BERTScore F1: {avg_f1:.4f}")
logger.info("BERTScore metric successfully displayed.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Original'] = df['Original'].fillna("").astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Summary'] = df['Summary'].fillna("").astype(str)


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]


BERTScore Results:
Average BERTScore F1: 0.6973


4-AGENT 1 METRICS

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import re

# Download tokenizer
nltk.download('punkt_tab')

# Load your CSV
df = pd.read_csv("agent1_ground_truth_template.csv")  # Replace with your path

# Preprocessing: remove anything not Arabic
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.strip()
    # If not purely Arabic text (letters, spaces only), discard
    if not re.fullmatch(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\s]+', text):
        return ""
    return text.lower()

df["Clean_Summary"] = df["Summary"].apply(clean_text)
df["Clean_Manual"] = df["Manual_Ground_Truth"].apply(clean_text)

# Helper for ROUGE-L
def lcs(X, Y):
    m, n = len(X), len(Y)
    dp = [[0]*(n+1) for _ in range(m+1)]
    for i in range(m):
        for j in range(n):
            if X[i] == Y[j]:
                dp[i+1][j+1] = dp[i][j] + 1
            else:
                dp[i+1][j+1] = max(dp[i+1][j], dp[i][j+1])
    return dp[m][n]

# Compute all metrics
def compute_all_metrics(summary, reference):
    if summary.strip() == "" and reference.strip() == "":
        return {
            "ROUGE-1": 1.0,
            "ROUGE-L": 1.0,
            "BLEU": 1.0,
            "Exact_Match": 1.0,
            "Jaccard": 1.0,
            "Token_F1": 1.0
        }

    summary_tokens = word_tokenize(summary)
    reference_tokens = word_tokenize(reference)

    # ROUGE-1
    overlap = set(summary_tokens) & set(reference_tokens)
    p = len(overlap) / len(summary_tokens) if summary_tokens else 0
    r = len(overlap) / len(reference_tokens) if reference_tokens else 0
    rouge1_f1 = 2 * p * r / (p + r) if p + r > 0 else 0

    # ROUGE-L
    lcs_len = lcs(summary_tokens, reference_tokens)
    p_lcs = lcs_len / len(summary_tokens) if summary_tokens else 0
    r_lcs = lcs_len / len(reference_tokens) if reference_tokens else 0
    rougeL_f1 = 2 * p_lcs * r_lcs / (p_lcs + r_lcs) if p_lcs + r_lcs > 0 else 0

    # BLEU
    smoothie = SmoothingFunction().method1
    bleu = sentence_bleu([reference_tokens], summary_tokens, smoothing_function=smoothie)

    # Exact Match
    exact = float(summary.strip() == reference.strip())

    # Jaccard
    union = set(summary_tokens) | set(reference_tokens)
    jaccard = len(overlap) / len(union) if union else 0

    # Token-level F1
    token_f1 = (2 * p * r) / (p + r) if p + r > 0 else 0

    return {
        "ROUGE-1": rouge1_f1,
        "ROUGE-L": rougeL_f1,
        "BLEU": bleu,
        "Exact_Match": exact,
        "Jaccard": jaccard,
        "Token_F1": token_f1
    }

# Apply metrics
metrics = df.apply(lambda row: compute_all_metrics(row["Clean_Summary"], row["Clean_Manual"]), axis=1)
metrics_df = pd.DataFrame(metrics.tolist())

# Merge and save
result_df = pd.concat([df[["Original", "Summary", "Manual_Ground_Truth"]], metrics_df], axis=1)
average_scores = metrics_df.mean()

# Save to CSV
result_df.to_csv("resultll.csv", index=False, encoding='utf-8')

# Print results
print("\nAverage Scores:\n", average_scores)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Average Scores:
 ROUGE-1        0.931653
ROUGE-L        0.947694
BLEU           0.574392
Exact_Match    0.905000
Jaccard        0.942500
Token_F1       0.931653
dtype: float64


5-AGENT2

In [None]:
!pip install requests
!pip install -q transformers==4.36.2
!pip install pycountry

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 4.1.0 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.36.2 which is incompatible.[0m[31m
[0mCollecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m101.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully insta

In [None]:
import csv
import requests
import re
from collections import defaultdict
from itertools import combinations

import pycountry
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# -------------------------------------
# 1. Setup: Country list (Arabic/English)
# -------------------------------------
def get_country_names():
    countries = set()
    for country in pycountry.countries:
        names = [country.name]
        if hasattr(country, 'official_name'):
            names.append(country.official_name)
        if hasattr(country, 'translations') and 'ar' in country.translations:
            names.append(country.translations['ar'])
        for name in names:
            countries.add(name)
    return countries

COUNTRIES = get_country_names()

# -------------------------------------
# 2. Entities of Interest
# -------------------------------------
DISEASE_SYNONYMS = {
    'كورونا': ['كورونا', 'فيروس كورونا', 'كوفيد', 'كوفيد19'],
    'إنفلونزا': ['إنفلونزا', 'انفلونزا', 'إنفلونزا الخنازير', 'إنفلونزا الطيور'],
    'جدري': ['جدري', 'الجدري'],
    'إيبولا': ['إيبولا', 'الإيبولا'],
    'السمنة': ['السمنة'],
    'الفطر': ['الفطر', 'الفطر الأسود'],
    'السيدا': ['السيدا', 'الإيدز', 'إيدز'],
    'سرطان': ['سرطان', 'سرطان الثدي', 'سرطان الرئة', 'سرطان القولون'],
    'القلب': ['القلب', 'أمراض القلب', 'ذبحة صدرية'],
    'الملاريا': ['الملاريا'],
    'الدرن': ['الدرن', 'السل'],
    'التهاب الكبد': ['التهاب الكبد'],
    'حمى الضنك': ['حمى الضنك']
}

VACCINE_KEYWORDS = [
    'لقاح كورونا', 'لقاح فايزر', 'لقاح موديرنا', 'لقاح سينوفارم',
    'فايزر', 'موديرنا', 'جونسون', 'أسترازينيكا', 'سبوتنيك', 'سينوفارم'
]

CONTINENTS = [
    'أفريقيا', 'إفريقيا', 'Africa',
    'آسيا', 'اسيا', 'Asia',
    'أوروبا', 'اوروبا', 'Europe',
    'أمريكا الشمالية', 'امريكا الشمالية', 'North America',
    'أمريكا الجنوبية', 'امريكا الجنوبية', 'South America',
    'أستراليا', 'استراليا', 'Australia',
    'أنتاركتيكا', 'Antarctica', 'القارة القطبية الجنوبية'
]

# Reverse mapping for diseases
alias_to_disease = {alias: canonical for canonical, aliases in DISEASE_SYNONYMS.items() for alias in aliases}

# -------------------------------------
# 3. Utility Functions
# -------------------------------------
def is_arabic(text):
    return bool(re.search(r'[\u0600-\u06FF]', text))

def get_country_from_location(location):
    url = "https://nominatim.openstreetmap.org/search"
    params = {"q": location, "format": "json", "addressdetails": 1, "accept-language": "ar"}
    try:
        r = requests.get(url, params=params, headers={"User-Agent": "entity-extractor"})
        data = r.json()
        if data and "address" in data[0] and "country" in data[0]["address"]:
            return data[0]["address"]["country"]
    except Exception:
        pass
    return None

def get_edge_type(t1, t2):
    types = sorted([t1, t2])
    return "-".join(types)

# -------------------------------------
# 4. Load NER model
# -------------------------------------
tokenizer = AutoTokenizer.from_pretrained("Davlan/bert-base-multilingual-cased-ner-hrl")
model = AutoModelForTokenClassification.from_pretrained("Davlan/bert-base-multilingual-cased-ner-hrl")
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# -------------------------------------
# 5. Process summaries & build graph
# -------------------------------------
nodes = {}  # label -> type
edges = defaultdict(int)
node_label_to_index = {}

with open('merged_summaries.csv', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        summary = row.get('Summary', '').strip()
        if not summary or summary.lower().startswith("empty"):
            continue

        entities = ner(summary)
        found_labels = set()

        for ent in entities:
            label = ent['word'].strip()
            if not is_arabic(label) or label.startswith("##"):
                continue

            # Generalize all vaccines to 'لقاح'
            if label in VACCINE_KEYWORDS:
                nodes['لقاح'] = 'vaccine'
                found_labels.add('لقاح')
                continue

            if label in COUNTRIES:
                nodes[label] = 'country'
                found_labels.add(label)
            elif label in CONTINENTS:
                nodes[label] = 'continent'
                found_labels.add(label)
            elif ent['entity_group'] == "LOC":
                country = get_country_from_location(label)
                if country and is_arabic(country) and country not in alias_to_disease and country not in CONTINENTS:
                    nodes[country] = 'country'
                    found_labels.add(country)

        for alias, canonical in alias_to_disease.items():
            if alias in summary:
                nodes[canonical] = 'disease'
                found_labels.add(canonical)

        for continent in CONTINENTS:
            if continent in summary:
                nodes[continent] = 'continent'
                found_labels.add(continent)

        for src, tgt in combinations(sorted(found_labels), 2):
            if src == tgt:
                continue
            t1, t2 = nodes.get(src), nodes.get(tgt)
            if not t1 or not t2:
                continue
            etype = get_edge_type(t1, t2)
            edges[(src, tgt, etype)] += 1

# -------------------------------------
# 6. Assign index and write CSVs
# -------------------------------------
with open('nodes.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'label', 'type'])
    for idx, (label, typ) in enumerate(nodes.items()):
        node_label_to_index[label] = idx
        writer.writerow([idx, label, typ])

with open('edges.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['source', 'target', 'type', 'weight'])
    for (src, tgt, etype), weight in edges.items():
        if src in node_label_to_index and tgt in node_label_to_index:
            writer.writerow([
                node_label_to_index[src],
                node_label_to_index[tgt],
                etype,
                weight
            ])


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

6-GCN LINKS

In [None]:
import pandas as pd
import numpy as np
import random
import torch
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from itertools import combinations
from collections import defaultdict
import re

# Reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Load data
nodes_df = pd.read_csv("nodes.csv")
edges_df = pd.read_csv("edges.csv")
summaries_df = pd.read_csv("merged_summaries.csv")
summaries_df['Summary'] = summaries_df['Summary'].fillna("")

# Create mappings
node_types = dict(zip(nodes_df['id'], nodes_df['type']))
label_to_id = dict(zip(nodes_df['label'], nodes_df['id']))
num_nodes = len(nodes_df)

# Build node-to-summary map
node_to_summaries = defaultdict(list)
for idx, row in summaries_df.iterrows():
    summary = row['Summary']
    for label, node_id in label_to_id.items():
        if isinstance(label, str) and re.search(rf'\b{re.escape(label)}\b', summary, re.UNICODE):
            node_to_summaries[node_id].append(idx)

# Edge list
edges = list(edges_df[['source', 'target']].itertuples(index=False, name=None))
edge_weights = edges_df['weight'].tolist()

# Define edge type encoding
def encode_edge_type(t: str) -> int:
    t = t.lower()
    if "country" in t and "disease" in t:
        return 1
    elif "country" in t and "vaccine" in t:
        return 2
    elif "continent" in t and ("disease" in t or "vaccine" in t):
        return 4
    elif "country" in t and "country" in t:
        return 3
    return 0

edge_types = [encode_edge_type(rel_type) for rel_type in edges_df['type']]

# Prepare adjacency and training edges
adj_edges = []
train_edges = []
train_weights = []
train_types = []

for i, (a, b) in enumerate(edges):
    t = edge_types[i]
    adj_edges.append((a, b))

    # Include country-disease, country-vaccine, and continent-disease links for prediction
    if (
        t in [1, 2,4]
    ):
        train_edges.append((a, b))
        train_weights.append(edge_weights[i])
        train_types.append(t)

print(f"Total edges in adjacency: {len(adj_edges)}")
print(f"Positive edges for training (types 1, 2, or 4): {len(train_edges)}")

# Generate negative edges
all_possible = list(combinations(range(num_nodes), 2))
existing_set = set(train_edges)
negative_edges = [e for e in all_possible if e not in existing_set]
neg_sample = random.sample(negative_edges, min(len(train_edges), len(negative_edges)))
neg_types = [0] * len(neg_sample)

# Combine pos + neg
all_edges = train_edges + neg_sample
all_labels = [1] * len(train_edges) + [0] * len(neg_sample)
all_types = train_types + neg_types

# GCN Model
class LinkPredictor(torch.nn.Module):
    def __init__(self, in_channels, hidden1, hidden2, dropout):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden1)
        self.conv2 = GCNConv(hidden1, hidden2)
        self.classifier = torch.nn.Linear(hidden2, 1)
        self.dropout = dropout

    def forward(self, data, edge_index):
        x = self.conv1(data.x, data.edge_index, edge_weight=data.edge_attr)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, data.edge_index, edge_weight=data.edge_attr)
        src, dst = edge_index
        edge_feats = x[src] * x[dst]
        return torch.sigmoid(self.classifier(edge_feats).squeeze())
# Training Loop
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
dropout = 0.4
learning_rate = 0.05
hidden_dims = (32, 16)
patience = 151

f1_scores, precisions, recalls, accuracies, aucs = [], [], [], [], []

for fold, (train_idx, test_idx) in enumerate(folds.split(all_edges, all_types)):
    print(f"\n=== Fold {fold+1} ===")

    train_edges_fold = [all_edges[i] for i in train_idx]
    train_labels = torch.tensor([all_labels[i] for i in train_idx], dtype=torch.float)
    test_edges = [all_edges[i] for i in test_idx]
    test_labels = torch.tensor([all_labels[i] for i in test_idx], dtype=torch.float)

    # TF-IDF feature extraction (fold-wise)
    train_summary_ids = set()
    for edge in train_edges_fold:
        for node in edge:
            train_summary_ids.update(node_to_summaries.get(node, []))
    train_summaries = [summaries_df['Summary'][i] for i in train_summary_ids]

    vectorizer = TfidfVectorizer(max_features=100)
    tfidf_matrix = vectorizer.fit_transform(train_summaries).toarray() if train_summaries else np.zeros((1, 100))
    summary_idx_map = {idx: i for i, idx in enumerate(train_summary_ids)}

    node_features = np.zeros((num_nodes, tfidf_matrix.shape[1]))
    for node_id in range(num_nodes):
        indices = [i for i in node_to_summaries[node_id] if i in summary_idx_map]
        if indices:
            vectors = [tfidf_matrix[summary_idx_map[i]] for i in indices]
            node_features[node_id] = np.mean(vectors, axis=0)

    node_features = torch.tensor(node_features, dtype=torch.float)
    node_features = torch.nan_to_num(node_features)

    edge_index_full = torch.tensor(adj_edges, dtype=torch.long).t()
    edge_attr_full = torch.tensor(edge_weights, dtype=torch.float)
    graph = Data(x=node_features, edge_index=edge_index_full, edge_attr=edge_attr_full)

    train_edge_index = torch.tensor(train_edges_fold, dtype=torch.long).t()
    test_edge_index = torch.tensor(test_edges, dtype=torch.long).t()

    model = LinkPredictor(node_features.shape[1], *hidden_dims, dropout)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    best_f1 = 0
    patience_counter = 0

    for epoch in range(400):
        model.train()
        optimizer.zero_grad()
        pred = model(graph, train_edge_index)
        loss = F.binary_cross_entropy(pred, train_labels)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_scores = model(graph, test_edge_index)
            preds = (val_scores > 0.5).float()
            precision = precision_score(test_labels, preds, zero_division=0)
            recall = recall_score(test_labels, preds, zero_division=0)
            f1 = f1_score(test_labels, preds, zero_division=0)
            acc = accuracy_score(test_labels, preds)
            auc = roc_auc_score(test_labels, val_scores)

        if f1 > best_f1:
            best_f1 = f1
            best_metrics = (precision, recall, acc, auc)
            best_preds = preds
            patience_counter = 0
        else:
            patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

    p, r, acc, auc = best_metrics
    tn, fp, fn, tp = confusion_matrix(test_labels, best_preds).ravel()
    precisions.append(p)
    recalls.append(r)
    f1_scores.append(best_f1)
    accuracies.append(acc)
    aucs.append(auc)

    print(f"Precision: {p:.4f} | Recall: {r:.4f} | F1: {best_f1:.4f} | Acc: {acc:.4f} | AUC: {auc:.4f}")
    print(f"Confusion: TP={tp}, FP={fp}, FN={fn}, TN={tn}")
    print(f"Fold {fold+1} - Test size: {len(test_labels)}")

# Final metrics
print("\n=== Cross-Validation Results ===")
print(f"Avg Precision: {np.mean(precisions):.4f}")
print(f"Avg Recall:    {np.mean(recalls):.4f}")
print(f"Avg F1 Score:  {np.mean(f1_scores):.4f}")
print(f"Avg Accuracy:  {np.mean(accuracies):.4f}")
print(f"Avg AUC-ROC:   {np.mean(aucs):.4f}")


Total edges in adjacency: 438
Positive edges for training (types 1, 2, or 4): 129

=== Fold 1 ===




Early stopping at epoch 311
Precision: 0.8125 | Recall: 1.0000 | F1: 0.8966 | Acc: 0.8846 | AUC: 0.9024
Confusion: TP=13, FP=3, FN=0, TN=10
Fold 1 - Test size: 26

=== Fold 2 ===
Early stopping at epoch 304
Precision: 0.8125 | Recall: 1.0000 | F1: 0.8966 | Acc: 0.8846 | AUC: 0.9142
Confusion: TP=13, FP=3, FN=0, TN=10
Fold 2 - Test size: 26

=== Fold 3 ===
Early stopping at epoch 335
Precision: 0.9231 | Recall: 0.9231 | F1: 0.9231 | Acc: 0.9231 | AUC: 0.8994
Confusion: TP=12, FP=1, FN=1, TN=12
Fold 3 - Test size: 26

=== Fold 4 ===
Early stopping at epoch 316
Precision: 0.8571 | Recall: 0.9231 | F1: 0.8889 | Acc: 0.8846 | AUC: 0.8994
Confusion: TP=12, FP=2, FN=1, TN=11
Fold 4 - Test size: 26

=== Fold 5 ===
Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 | Acc: 1.0000 | AUC: 1.0000
Confusion: TP=13, FP=0, FN=0, TN=13
Fold 5 - Test size: 26

=== Fold 6 ===
Early stopping at epoch 332
Precision: 1.0000 | Recall: 0.9231 | F1: 0.9600 | Acc: 0.9615 | AUC: 0.9822
Confusion: TP=12, FP=0, FN=1,



7-AGENT3

In [None]:
import pandas as pd
import time
import logging
from groq import Groq
from collections import Counter

# Setup logging
logging.basicConfig(
    filename='inverse_relation_generation.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# API Key
client = Groq(api_key="")

# Load edges with weight
df = pd.read_csv("edges.csv")  # columns: source, target, type, weight
logging.info(f"Loaded {len(df)} edges from edges.csv")

# Function that calls the LLM to generate the inverse relation
def generate_inverse_relation_en(source, target, rel_type, weight):
    prompt = f"""Given a relationship between two entities in the format: source,target,type,weight
The source and target are numeric IDs (not names). The weight is an integer.
If the relationship can be reversed, return the reversed relationship in the format: source,target,type (do NOT include the weight in the output).
Otherwise, return nothing.

Examples:
Input: 1,2,country-disease,5
Output: 2,1,disease-country

Input: 1,0,country-country,3
Output: 0,1,country-country

Input: 2,1,disease-country,2
Output: 1,2,country-disease

Input: 3,1,country-country,1
Output: 1,3,country-country

Now, given the following:
{source},{target},{rel_type},{weight}
Return only the reversed relationship in the format: source,target,type (do NOT include the weight).
"""
    try:
        response = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            max_tokens=60,
        )
        result = response.choices[0].message.content.strip()
        # Only keep the first line with three comma-separated parts
        for line in result.splitlines():
            parts = [x.strip() for x in line.split(",")]
            if len(parts) == 3 and all(parts[:2]) and all(part.isdigit() for part in parts[:2]):
                return ",".join(parts)
        logging.warning(f"Invalid format from LLM for input {source},{target},{rel_type},{weight}: '{result}'")
        return None
    except Exception as e:
        logging.error(f"LLM error for {source} → {target} ({rel_type}): {e}")
        return None


# Count reversed relations in the original file
edge_counter = Counter(
    (str(row["target"]), str(row["source"]), str(row["type"]))
    for _, row in df.iterrows()
)

# Generate reversed relations with weight preserved
inverse_edges = []
for idx, row in df.iterrows():
    source, target, rel_type, weight = row["source"], row["target"], row["type"], row["weight"]
    if rel_type in ["country-country", "continent-country", "disease-disease", "disease-vaccine"]:
        continue
    logging.info(f"Processing edge {idx}: {source} → {target} ({rel_type})")
    inverse = generate_inverse_relation_en(source, target, rel_type, weight)
    if inverse:
        parts = [x.strip() for x in inverse.split(",")]
        if len(parts) == 3 and all(parts):
            s2, t2, r2 = parts
            inverse_edges.append({
                "source": s2,
                "target": t2,
                "type": r2,
                "weight": weight
            })
            logging.info(f"Added inverse: {s2} → {t2} ({r2})")
    time.sleep(0.3)

# DataFrame of inverse edges
df_inverse = pd.DataFrame(inverse_edges)
df_inverse = df_inverse.dropna()
df_inverse = df_inverse[(df_inverse["source"] != "") & (df_inverse["target"] != "") & (df_inverse["type"] != "")]
logging.info(f"{len(df_inverse)} valid inverse edges generated.")

# Merge with original edges
df_total = pd.concat([df, df_inverse], ignore_index=True).drop_duplicates()

# Save
df_total.to_csv("edges_with_inverses.csv", index=False)
logging.info("Saved combined edges to edges_with_inverses.csv")
print("File 'edges_with_inverses.csv' generated with weights and inverse relations.")


File 'edges_with_inverses.csv' generated with weights and inverse relations.


8-GCN link

In [None]:
import pandas as pd
import numpy as np
import random
import torch
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from itertools import combinations
from collections import defaultdict
import re

# Reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Load data
nodes_df = pd.read_csv("nodes.csv")
edges_df = pd.read_csv("edges_with_inverses.csv", dtype={"source": int, "target": int})
summaries_df = pd.read_csv("merged_summaries.csv")
summaries_df['Summary'] = summaries_df['Summary'].fillna("")

# Create mappings
node_types = dict(zip(nodes_df['id'], nodes_df['type']))
label_to_id = dict(zip(nodes_df['label'], nodes_df['id']))
num_nodes = len(nodes_df)

# Build node-to-summary map
node_to_summaries = defaultdict(list)
for idx, row in summaries_df.iterrows():
    summary = row['Summary']
    for label, node_id in label_to_id.items():
        if isinstance(label, str) and re.search(rf'\b{re.escape(label)}\b', summary, re.UNICODE):
            node_to_summaries[node_id].append(idx)

# Edge list
edges = list(edges_df[['source', 'target']].itertuples(index=False, name=None))
edge_weights = edges_df['weight'].tolist()

# Define edge type encoding

def encode_edge_type(t: str) -> int:
    t = t.lower()
    if "country" in t and "disease" in t:
        return 1
    elif "country" in t and "vaccine" in t:
        return 2
    elif "continent" in t and ("disease" in t or "vaccine" in t):
        return 4
    elif "country" in t and "country" in t:
        return 3
    return 0

edge_types = [encode_edge_type(rel_type) for rel_type in edges_df['type']]

# Prepare adjacency and training edges
adj_edges = []
train_edges = []
train_weights = []
train_types = []

for i, (a, b) in enumerate(edges):
    t = edge_types[i]
    adj_edges.append((a, b))

    # Include country-disease, country-vaccine, and continent-disease links for prediction
    if t in [1, 2, 4]:
        train_edges.append((a, b))
        train_weights.append(edge_weights[i])
        train_types.append(t)

print(f"Total edges in adjacency: {len(adj_edges)}")
print(f"Positive edges for training (types 1, 2, or 4): {len(train_edges)}")

# Generate negative edges
all_possible = list(combinations(range(num_nodes), 2))
existing_set = set(train_edges)
negative_edges = [e for e in all_possible if e not in existing_set]
neg_sample = random.sample(negative_edges, min(len(train_edges), len(negative_edges)))
neg_types = [0] * len(neg_sample)

# Combine pos + neg
all_edges = train_edges + neg_sample
all_labels = [1] * len(train_edges) + [0] * len(neg_sample)
all_types = train_types + neg_types

# GCN Model
class LinkPredictor(torch.nn.Module):
    def __init__(self, in_channels, hidden1, hidden2, dropout):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden1)
        self.conv2 = GCNConv(hidden1, hidden2)
        self.classifier = torch.nn.Linear(hidden2, 1)
        self.dropout = dropout

    def forward(self, data, edge_index):
        x = self.conv1(data.x, data.edge_index, edge_weight=data.edge_attr)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, data.edge_index, edge_weight=data.edge_attr)
        src, dst = edge_index
        edge_feats = x[src] * x[dst]
        return torch.sigmoid(self.classifier(edge_feats).squeeze())
# Training Loop
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
dropout = 0.3
learning_rate = 0.05
hidden_dims = (128, 64)
patience = 150

f1_scores, precisions, recalls, accuracies, aucs = [], [], [], [], []

for fold, (train_idx, test_idx) in enumerate(folds.split(all_edges, all_types)):
    print(f"\n=== Fold {fold+1} ===")

    train_edges_fold = [all_edges[i] for i in train_idx]
    train_labels = torch.tensor([all_labels[i] for i in train_idx], dtype=torch.float)
    test_edges = [all_edges[i] for i in test_idx]
    test_labels = torch.tensor([all_labels[i] for i in test_idx], dtype=torch.float)

    # TF-IDF feature extraction (fold-wise)
    train_summary_ids = set()
    for edge in train_edges_fold:
        for node in edge:
            train_summary_ids.update(node_to_summaries.get(node, []))
    train_summaries = [summaries_df['Summary'][i] for i in train_summary_ids]

    vectorizer = TfidfVectorizer(max_features=100)
    tfidf_matrix = vectorizer.fit_transform(train_summaries).toarray() if train_summaries else np.zeros((1, 100))
    summary_idx_map = {idx: i for i, idx in enumerate(train_summary_ids)}

    node_features = np.zeros((num_nodes, tfidf_matrix.shape[1]))
    for node_id in range(num_nodes):
        indices = [i for i in node_to_summaries[node_id] if i in summary_idx_map]
        if indices:
            vectors = [tfidf_matrix[summary_idx_map[i]] for i in indices]
            node_features[node_id] = np.mean(vectors, axis=0)

    node_features = torch.tensor(node_features, dtype=torch.float)
    node_features = torch.nan_to_num(node_features)

    edge_index_full = torch.tensor(adj_edges, dtype=torch.long).t()
    edge_attr_full = torch.tensor(edge_weights, dtype=torch.float)
    graph = Data(x=node_features, edge_index=edge_index_full, edge_attr=edge_attr_full)

    train_edge_index = torch.tensor(train_edges_fold, dtype=torch.long).t()
    test_edge_index = torch.tensor(test_edges, dtype=torch.long).t()

    model = LinkPredictor(node_features.shape[1], *hidden_dims, dropout)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    best_f1 = 0
    patience_counter = 0

    for epoch in range(400):
        model.train()
        optimizer.zero_grad()
        pred = model(graph, train_edge_index)
        loss = F.binary_cross_entropy(pred, train_labels)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_scores = model(graph, test_edge_index)
            preds = (val_scores > 0.5).float()
            precision = precision_score(test_labels, preds, zero_division=0)
            recall = recall_score(test_labels, preds, zero_division=0)
            f1 = f1_score(test_labels, preds, zero_division=0)
            acc = accuracy_score(test_labels, preds)
            auc = roc_auc_score(test_labels, val_scores)

        if f1 > best_f1:
            best_f1 = f1
            best_metrics = (precision, recall, acc, auc)
            best_preds = preds
            patience_counter = 0
        else:
            patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

    p, r, acc, auc = best_metrics
    tn, fp, fn, tp = confusion_matrix(test_labels, best_preds).ravel()
    precisions.append(p)
    recalls.append(r)
    f1_scores.append(best_f1)
    accuracies.append(acc)
    aucs.append(auc)

    print(f"Precision: {p:.4f} | Recall: {r:.4f} | F1: {best_f1:.4f} | Acc: {acc:.4f} | AUC: {auc:.4f}")
    print(f"Confusion: TP={tp}, FP={fp}, FN={fn}, TN={tn}")
    print(f"Fold {fold+1} - Test size: {len(test_labels)}")

# Final metrics
print("\n=== Cross-Validation Results ===")
print(f"Avg Precision: {np.mean(precisions):.4f}")
print(f"Avg Recall:    {np.mean(recalls):.4f}")
print(f"Avg F1 Score:  {np.mean(f1_scores):.4f}")
print(f"Avg Accuracy:  {np.mean(accuracies):.4f}")
print(f"Avg AUC-ROC:   {np.mean(aucs):.4f}")


Total edges in adjacency: 567
Positive edges for training (types 1, 2, or 4): 258

=== Fold 1 ===
Early stopping at epoch 270
Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 | Acc: 1.0000 | AUC: 1.0000
Confusion: TP=26, FP=0, FN=0, TN=26
Fold 1 - Test size: 52

=== Fold 2 ===
Early stopping at epoch 223
Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 | Acc: 1.0000 | AUC: 1.0000
Confusion: TP=26, FP=0, FN=0, TN=26
Fold 2 - Test size: 52

=== Fold 3 ===
Early stopping at epoch 157
Precision: 0.9615 | Recall: 0.9615 | F1: 0.9615 | Acc: 0.9615 | AUC: 0.9896
Confusion: TP=25, FP=1, FN=1, TN=25
Fold 3 - Test size: 52

=== Fold 4 ===
Early stopping at epoch 231
Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 | Acc: 1.0000 | AUC: 1.0000
Confusion: TP=26, FP=0, FN=0, TN=26
Fold 4 - Test size: 52

=== Fold 5 ===
Early stopping at epoch 283
Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 | Acc: 1.0000 | AUC: 1.0000
Confusion: TP=26, FP=0, FN=0, TN=26
Fold 5 - Test size: 52

=== Fold 6 ===
Early s