In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pyodbc
from urllib.parse import urljoin
from datetime import datetime, timedelta
import re


In [2]:
# base_url = "https://royanews.tv"

headers = {'User-Agent': 'Mozilla/5.0'}

sections = [
    "https://www.jordannews.jo/Category/5/Business",
    "https://www.jordannews.jo/Section-33/Trade-Industry"
    # ,"https://www.jordannews.jo/Section-30/Entrepreneurship"
]


# response = requests.get(page_url, headers=headers)
# soup = BeautifulSoup(response.content, "html.parser")



In [5]:
article_links = []

for page_url in sections:
    try:
        response = requests.get(page_url, headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")

        for a in soup.find_all("a", href=True):
            href = a["href"].strip()

            # روابط Jordan News (Economy, Business, Trade-Industry, All فقط)
            if page_url.startswith("https://www.jordannews.jo") and any(kw in href for kw in ["Economy", "Business", "Trade-Industry", "All"]):
                full_link = urljoin(page_url, href)
                if full_link not in article_links:
                    article_links.append(full_link)


    except Exception as e:
        print(f"❌ Error while scraping {page_url}: {e}")

print(f"🔗 Total extracted links: {len(article_links)}")


🔗 Total extracted links: 82


In [7]:
valid_links = []
now = datetime.now()

In [9]:
for link in article_links:
    try:
        res = requests.get(link, headers=headers)
        soup = BeautifulSoup(res.content, "html.parser")

        pub_text = ""
        pub_time = None

        pub_div = soup.find("div", class_="pup_date_news")
        if pub_div:
            pub_text = pub_div.get_text(strip=True).replace("نشر :", "").replace("Published:", "").strip()
            pub_text = pub_text.replace("|", "").strip()

        if not pub_text:
            main_date = soup.select_one("div.story span.DateTime")
            if main_date:
                pub_text = main_date.get_text(strip=True)
            else:
                 time_byline = soup.select_one("p.byline span.TimeDate")
                 if time_byline:
                       pub_text = time_byline.get_text(strip=True)


        # ⏰ تحويل الوقت
        if "منذ" in pub_text or "ago" in pub_text:
            if "ساعتين" in pub_text or "two hours" in pub_text:
                pub_time = now - timedelta(hours=2)
            elif "منذ ساعة" in pub_text or "an hour" in pub_text or "hour ago" in pub_text:
                pub_time = now - timedelta(hours=1)    
            elif "ساعات" in pub_text or "ساعة" in pub_text or "hour" in pub_text:
                hours = int(re.search(r"\d+", pub_text).group())
                pub_time = now - timedelta(hours=hours)
            elif "دقيقتين" in pub_text or "two minutes" in pub_text:
                pub_time = now - timedelta(minutes=2)
            elif "منذ دقيقة" in pub_text or "a minute" in pub_text:
                pub_time = now - timedelta(minutes=1)    
            elif "دقائق" in pub_text or "دقيقة" in pub_text or "minute" in pub_text:
                minutes = int(re.search(r"\d+", pub_text).group())
                pub_time = now - timedelta(minutes=minutes)
            elif "m ago" in pub_text:
                match = re.search(r"(\d+)\s*m", pub_text)
                if match:
                     minutes = int(match.group(1))
                     pub_time = now - timedelta(minutes=minutes)                  
            elif "h ago" in pub_text:
                match = re.search(r"(\d+)\s*h", pub_text)
                if match:
                    hours = int(match.group(1))
                    pub_time = now - timedelta(hours=hours)     
        elif "Published :" in pub_text and re.search(r"\d{2}-\d{2}-\d{4} \d{2}:\d{2}", pub_text):
            pub_text = pub_text.replace("Published :", "").strip()
            pub_time = datetime.strptime(pub_text, "%d-%m-%Y %H:%M")

        # ✅ الفلترة
        if pub_time and (now - pub_time).total_seconds() <= 86400:
            valid_links.append((link, pub_text))
            print(f"✅ Added ({pub_text}) → {link}")
        else:
            print(f"🕒 Ignored ({pub_text}) → {link}")
    except Exception as e:
        print(f"❌ Error while checking {link}: {e}")

# 📌 عرض النتائج النهائية
print("\n📌 Articles within the last 24 hours:")
for i, (link, pub_text) in enumerate(valid_links, 1):
    print(f"{i}. {link} 🕒 {pub_text}")


🕒 Ignored () → https://www.jordannews.jo/Category/5/Business
🕒 Ignored () → https://www.jordannews.jo/Section-112/Economy
🕒 Ignored () → https://www.jordannews.jo/Section-33/Trade-Industry
🕒 Ignored () → https://www.jordannews.jo/Section-111/All
🕒 Ignored () → https://www.jordannews.jo/Section-126/All
✅ Added (52m ago) → https://www.jordannews.jo/Section-113/All/Housing-Bank-Holds-its-52nd-Annual-General-Assembly-Approval-of-2024-Financial-Statements-and-30-Cash-Dividends-Distribution-JOD-150-3-Million-Net-Profits-for-2024-and-Major-Achievements-across-Key-P-41430
🕒 Ignored () → https://www.jordannews.jo/Section-113/All
✅ Added (58m ago) → https://www.jordannews.jo/Section-113/All/Applied-Science-Private-University-Leads-Jordanian-Universities-and-Achieves-a-Major-Asian-Milestone-in-the-2025-Times-Ranking-41429
✅ Added (18 h ago) → https://www.jordannews.jo/Section-112/Economy/Energy-Regulatory-Authority-100-000-Plastic-Gas-Cylinders-to-Be-Available-in-Markets-by-End-of-June-41403
✅ Ad

In [13]:
final_links = set()

for item in valid_links:
    if isinstance(item, tuple) and len(item) >= 1:
        link = item[0]
        if link and link.startswith("http"):
            final_links.add(link)

print("\n📌 روابط المقالات خلال آخر 24 ساعة (final_links):")
for i, link in enumerate(final_links, 1):
    print(f"{i}. {link}")



📌 روابط المقالات خلال آخر 24 ساعة (final_links):
1. https://www.jordannews.jo/Section-113/All/Housing-Bank-Holds-its-52nd-Annual-General-Assembly-Approval-of-2024-Financial-Statements-and-30-Cash-Dividends-Distribution-JOD-150-3-Million-Net-Profits-for-2024-and-Major-Achievements-across-Key-P-41430
2. https://www.jordannews.jo/Section-112/Economy/JEBA-President-The-European-Market-Represents-a-Strong-Opportunity-to-Expand-and-Diversify-Jordan-s-Exports-41395
3. https://www.jordannews.jo/Section-113/All/Applied-Science-Private-University-Leads-Jordanian-Universities-and-Achieves-a-Major-Asian-Milestone-in-the-2025-Times-Ranking-41429
4. https://www.jordannews.jo/Section-113/All/European-Markets-Close-Higher-Across-the-Board-41405
5. https://www.jordannews.jo/Section-112/Economy/Chinese-and-Japanese-Stocks-Continue-to-Gain-41404
6. https://www.jordannews.jo/Section-113/All/Slight-Increase-in-Global-Oil-Prices-41406
7. https://www.jordannews.jo/Section-112/Economy/21-Carat-Gold-Priced-at

In [15]:
import os
# images_folder = "images"
os.makedirs('news_images', exist_ok=True)


In [17]:
data = []

for link in final_links:
    try:
        res = requests.get(link, headers=headers)
        soup = BeautifulSoup(res.content, "html.parser")

        # العنوان
        title_tag = soup.find("h1", class_="details-news-title") or soup.find("div", class_="news_main_title_mob")
        title = title_tag.get_text(strip=True) if title_tag else "No Title"

        # المحتوى
        content_div = soup.find("div", id="atricle-text") or soup.find("div", class_="Newsbody")

        if content_div:
    # احذف كل وسوم الإعلانات <addadscommand> لو فيه
            for ad in content_div.find_all('addadscommand'):
                ad.decompose()

    # احذف قسم "Read More" لو موجود
            read_more_section = content_div.find('div', class_="news-items")
            if read_more_section:
                 read_more_section.decompose()

    # بعد التنظيف، خذ كل النص المكتوب
            content = content_div.get_text(separator="\n", strip=True)
        else:
            content = "No Content"

        # الصورة
        img_tag = soup.find("img")
        img_url = urljoin(link, img_tag["src"]) if img_tag and img_tag.get("src") else None

        # حفظ الصورة
        clean_title = re.sub(r'[\\/:*?"<>|]', '', title)
        if img_url:
            img_data = requests.get(img_url).content
            img_path = os.path.join("news_images", f"{clean_title}.jpg")
            with open(img_path, "wb") as f:
                f.write(img_data)

        data.append({
            "Title": title,
            "Content": content,
            "Image": img_url,
            "Link": link
        })
        print(f"✅ Saved: {title}")
    except Exception as e:
        print(f"❌ Error in {link}: {e}")


✅ Saved: Housing Bank Holds its 52nd Annual General Assembly Approval of 2024 Financial Statements and 30% Cash Dividends Distribution JOD 150.3 Million Net Profits for 2024 and Major Achievements across Key P
✅ Saved: JEBA President: The European Market Represents a Strong Opportunity to Expand and Diversify Jordan's Exports
✅ Saved: Applied Science Private University Leads Jordanian Universities and Achieves a Major Asian Milestone in the 2025 Times Ranking
✅ Saved: European Markets Close Higher Across the Board
✅ Saved: Chinese and Japanese Stocks Continue to Gain
✅ Saved: Slight Increase in Global Oil Prices
✅ Saved: 21-Carat Gold Priced at 68 Dinars per Gram in the Local Market
✅ Saved: Energy Regulatory Authority: 100,000 Plastic Gas Cylinders to Be Available in Markets by End of June


In [19]:
df = pd.DataFrame(data)
df

Unnamed: 0,Title,Content,Image,Link
0,Housing Bank Holds its 52nd Annual General Ass...,The Housing Bank for Trade and\nFinance (HBTF)...,https://www.jordannews.jo/App_images/logo.png,https://www.jordannews.jo/Section-113/All/Hous...
1,JEBA President: The European Market Represents...,The President of the Jordanian-European Busine...,https://www.jordannews.jo/App_images/logo.png,https://www.jordannews.jo/Section-112/Economy/...
2,Applied Science Private University Leads Jorda...,Applied Science Private University (ASU) conti...,https://www.jordannews.jo/App_images/logo.png,https://www.jordannews.jo/Section-113/All/Appl...
3,European Markets Close Higher Across the Board,European stock markets closed higher across th...,https://www.jordannews.jo/App_images/logo.png,https://www.jordannews.jo/Section-113/All/Euro...
4,Chinese and Japanese Stocks Continue to Gain,Stocks in China and Hong Kong rose slightly on...,https://www.jordannews.jo/App_images/logo.png,https://www.jordannews.jo/Section-112/Economy/...
5,Slight Increase in Global Oil Prices,Global oil prices rose slightly on Saturday at...,https://www.jordannews.jo/App_images/logo.png,https://www.jordannews.jo/Section-113/All/Slig...
6,21-Carat Gold Priced at 68 Dinars per Gram in ...,"Gold prices for 21-carat, the most in-demand a...",https://www.jordannews.jo/App_images/logo.png,https://www.jordannews.jo/Section-112/Economy/...
7,"Energy Regulatory Authority: 100,000 Plastic G...",The Chairman of the Board of Commissioners of ...,https://www.jordannews.jo/App_images/logo.png,https://www.jordannews.jo/Section-112/Economy/...


In [21]:
df.to_excel("Jordan_news_articles.xlsx", index=False, engine='openpyxl')



In [23]:
articles_df = pd.read_excel("Jordan_news_articles.xlsx", engine='openpyxl')
# articles_df

In [25]:
import pyodbc,  sys 

conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};'

                      'SERVER=.\\SQLEXPRESS;'

                      'DATABASE=Scraping;'

                      'Trusted_Connection=yes;'
                      
                      'Encrypt=no')
cursor = conn.cursor()


In [27]:
# cursor.execute("DELETE FROM Roya_Articles")
# conn.commit()
cursor.execute("TRUNCATE TABLE Jordan_news_articles")
conn.commit()

In [29]:
for index, row in df.iterrows():
    cursor.execute(
        "INSERT INTO Jordan_news_articles (title, content, image, link) VALUES (?, ?, ?, ?)",
        (row['Title'], row['Content'], row['Image'], row['Link'])
    )

conn.commit()
