In [3]:
import pandas as pd
import requests
from newspaper import Article
from tqdm import tqdm
import time

# Step 1: Load table
df = pd.read_csv('dataset.csv')  # or use read_excel

# Headers to mimic a real browser
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/122.0.0.0 Safari/537.36"
}

# Step 2: Scrape articles
df['Content'] = ''

for idx, row in tqdm(df.iterrows(), total=len(df)):
    url = row['Link']
    try:
        # Make request manually with headers
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print(f"Failed to access {url}: Status code {response.status_code}")
            df.at[idx, 'Content'] = ''
            continue

        # Pass HTML manually to newspaper
        article = Article(url)
        article.set_html(response.text)
        article.parse()

        df.at[idx, 'Content'] = article.text

    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        df.at[idx, 'Content'] = ''
    
    time.sleep(1)  # polite delay between requests

# Step 3: Save table
df.to_csv('updated_table_with_content.csv', index=False)
print("✅ Scraping complete and file saved as 'updated_table_with_content.csv'")


 20%|████████▌                                  | 12/60 [00:14<00:37,  1.29it/s]

Failed to access https://www.nationalreview.com/news/trump-zelensky-go-at-it-in-heated-oval-office-debate-youre-gambling-with-world-war-iii/: Status code 403
Failed to access https://www.wsj.com/world/zelensky-to-meet-trump-in-bid-to-salvage-u-s-support-2e656025: Status code 401


 38%|████████████████▍                          | 23/60 [00:30<00:47,  1.30s/it]

Failed to access https://thehill.com/newsletters/business-economy/5256455-trumps-tariff-problem/: Status code 403


 47%|████████████████████                       | 28/60 [00:36<00:42,  1.32s/it]

Failed to scrape https://www.newsmax.com/us/trade-war-tariffs/2025/04/23/id/1208015/: HTTPSConnectionPool(host='www.newsmax.com', port=443): Read timed out. (read timeout=10)


 65%|███████████████████████████▉               | 39/60 [01:02<00:36,  1.72s/it]

Failed to access https://www.reuters.com/sports/basketball/florida-beat-houston-claim-third-ncaa-mens-basketball-title-2025-04-08/: Status code 401


 68%|█████████████████████████████▍             | 41/60 [01:04<00:23,  1.24s/it]

Failed to access https://www.washingtontimes.com/news/2025/apr/7/florida-wins-ncaa-basketball-title-rallying-beat-houston-65-63/: Status code 403
Failed to scrape nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


100%|███████████████████████████████████████████| 60/60 [01:36<00:00,  1.61s/it]

✅ Scraping complete and file saved as 'updated_table_with_content.csv'





Step | Description
1. | Loads your scraped articles
2. | Extracts all PERSON entities (names)
3. | Groups similar names automatically
4. | Builds a normalization dictionary
5. | Replaces variations in articles with the canonical names
6. | Tracks everything nicely with tqdm progress bars
7. | Saves the new dataframe with a Content_Normalized column

In [9]:
import pandas as pd
import spacy
import re
from tqdm import tqdm
from collections import defaultdict, Counter

# Load Data and spaCy model
print("Loading data and model...")
df = pd.read_csv('updated_table_with_content.csv')
nlp = spacy.load('en_core_web_sm')

# Step 1: Extract Named Entities (PERSON)
def extract_named_entities(texts):
    persons = []
    print("Extracting named entities...")
    for doc in tqdm(nlp.pipe(texts, batch_size=20), total=len(texts)):
        persons.extend([ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"])
    return persons

all_persons = extract_named_entities(df['Content'].dropna().tolist())
print(f"Total PERSON entities extracted: {len(all_persons)}")

# Step 2: Clean and Filter NEs
def clean_persons(persons):
    cleaned = []
    for p in persons:
        if len(p.split()) <= 3 and all(word[0].isupper() for word in p.split() if word.isalpha()):
            cleaned.append(p)
    return cleaned

filtered_persons = clean_persons(all_persons)
print(f"PERSON entities after cleaning: {len(filtered_persons)}")

# Step 3: Build Normalization Dictionary (Smarter Grouping)
def build_normalization_dict(person_list):
    groups = defaultdict(list)
    
    for person in person_list:
        parts = person.replace(".", "").split()
        if parts:
            last_name = parts[-1].lower()
            groups[last_name].append(person)
    
    normalization_dict = {}
    for group, names in groups.items():
        name_counter = Counter(names)
        canonical_name = max(name_counter, key=name_counter.get)  # most frequent
        for name in names:
            if name != canonical_name:
                normalization_dict[name] = canonical_name
    
    return normalization_dict

normalization_dict = build_normalization_dict(filtered_persons)
print(f"Normalization dictionary size: {len(normalization_dict)}")

# Optional: See top mapped examples
print("\nSample normalization mappings:")
for i, (k, v) in enumerate(normalization_dict.items()):
    if i >= 10:
        break
    print(f"  {k}  -->  {v}")

# Step 4: Normalize Content
def normalize_content(text, normalization_dict):
    if pd.isna(text) or text.strip() == '':
        return text
    for pattern, replacement in normalization_dict.items():
        text = re.sub(rf'\b{re.escape(pattern)}\b', replacement, text, flags=re.IGNORECASE)
    return text

print("\nNormalizing articles...")
tqdm.pandas()
df['Content_Normalized'] = df['Content'].progress_apply(lambda x: normalize_content(x, normalization_dict))

# Step 5: Save the final file
df.to_csv('final_normalized_articles.csv', index=False)
print("\n✅ Final normalized articles saved as 'final_normalized_articles.csv'")


Loading data and model...
Extracting named entities...


100%|███████████████████████████████████████████| 52/52 [00:07<00:00,  7.01it/s]


Total PERSON entities extracted: 966
PERSON entities after cleaning: 957
Normalization dictionary size: 66

Sample normalization mappings:
  Donald Trump  -->  Trump
  Eric Trump  -->  Trump
  JD Vance  -->  Vance
  J. D. Vance  -->  Vance
  VP Vance  -->  Vance
  Vladimir Putin  -->  Putin
  Volodymyr Zelenskyy  -->  Zelenskyy
  Hunter Biden  -->  Joe Biden
  Biden  -->  Joe Biden
  Clinton  -->  Hillary Clinton

Normalizing articles...


100%|██████████████████████████████████████████| 60/60 [00:00<00:00, 293.46it/s]


✅ Final normalized articles saved as 'final_normalized_articles.csv'





In [11]:
import pandas as pd
import spacy
from neo4j import GraphDatabase
from tqdm import tqdm

# 1. Neo4j Connection
URI = "neo4j+s://da0d5023.databases.neo4j.io"
AUTH = ("neo4j", "")
driver = GraphDatabase.driver(URI, auth=AUTH)

# 2. Load Data and spaCy Model
df = pd.read_csv('final_normalized_articles.csv')
nlp = spacy.load("en_core_web_sm")

# 3. Extract Entities and Relationships
def extract_entities_relations(text):
    doc = nlp(text)
    persons = [ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"]
    relations = []
    for sent in doc.sents:
        persons_in_sent = [ent.text.strip() for ent in sent.ents if ent.label_ == "PERSON"]
        if len(persons_in_sent) >= 2:
            for i in range(len(persons_in_sent) - 1):
                relations.append((persons_in_sent[i], "RELATED_TO", persons_in_sent[i+1]))
    return set(persons), set(relations)

all_nodes = set()
all_edges = set()

print("Extracting entities and relations...")
for text in tqdm(df['Content_Normalized'].dropna().tolist()):
    nodes, edges = extract_entities_relations(text)
    all_nodes.update(nodes)
    all_edges.update(edges)

print(f"Total unique PERSON nodes: {len(all_nodes)}")
print(f"Total unique relationships: {len(all_edges)}")

# 4. Helper function to run Cypher safely
def run_query(tx, query, parameters=None):
    tx.run(query, parameters or {})

# 5. Insert into Neo4j
with driver.session() as session:
    print("Creating nodes...")
    for person in tqdm(all_nodes):
        query = """
        MERGE (p:Person {name: $name})
        """
        session.execute_write(run_query, query, {"name": person})

    print("Creating relationships...")
    for source, relation, target in tqdm(all_edges):
        query = """
        MATCH (a:Person {name: $source})
        MATCH (b:Person {name: $target})
        MERGE (a)-[r:RELATED_TO]->(b)
        """
        session.execute_write(run_query, query, {"source": source, "target": target})

print("✅ Graph populated successfully into your Neo4j Aura database!")


Extracting entities and relations...


100%|███████████████████████████████████████████| 52/52 [00:08<00:00,  6.19it/s]


Total unique PERSON nodes: 324
Total unique relationships: 198
Creating nodes...


100%|█████████████████████████████████████████| 324/324 [00:45<00:00,  7.19it/s]


Creating relationships...


100%|█████████████████████████████████████████| 198/198 [00:26<00:00,  7.48it/s]

✅ Graph populated successfully into your Neo4j Aura database!



