In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pyodbc
from urllib.parse import urljoin
from datetime import datetime, timedelta
import re


In [2]:
# base_url = "https://royanews.tv"

headers = {'User-Agent': 'Mozilla/5.0'}

sections = [
    "https://royanews.tv/section/12",
    "https://en.royanews.tv/section/8"
]


# response = requests.get(page_url, headers=headers)
# soup = BeautifulSoup(response.content, "html.parser")



In [5]:
article_links = []

for page_url in sections:
    try:
        response = requests.get(page_url, headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")

        for a in soup.find_all("a", href=True):
            href = a["href"].strip()

            # روابط رؤيا
            if page_url.startswith("https://royanews.tv") and href.startswith("/news/"):
                full_link = urljoin(page_url, href)
                if full_link not in article_links:
                    article_links.append(full_link)


            # روابط رؤيا إنجليزي
            elif page_url.startswith("https://en.royanews.tv") and href.startswith("/news/"):
                full_link = urljoin(page_url, href)
                if full_link not in article_links:
                    article_links.append(full_link)

    except Exception as e:
        print(f"❌ Error while scraping {page_url}: {e}")

print(f"🔗 Total extracted links: {len(article_links)}")


🔗 Total extracted links: 42


In [7]:
valid_links = []
now = datetime.now()

In [9]:
for link in article_links:
    try:
        res = requests.get(link, headers=headers)
        soup = BeautifulSoup(res.content, "html.parser")

        pub_text = ""
        pub_time = None

        # رؤيا
        pub_div = soup.find("div", class_="pup_date_news")
        if pub_div:
            pub_text = pub_div.get_text(strip=True).replace("نشر :", "").replace("Published:", "").strip()
            pub_text = pub_text.replace("|", "").strip()

        # Jordan News
        if not pub_text:
            main_date = soup.select_one("div.story span.DateTime")
            if main_date:
                pub_text = main_date.get_text(strip=True)
            else:
                 time_byline = soup.select_one("p.byline span.TimeDate")
                 if time_byline:
                       pub_text = time_byline.get_text(strip=True)


        # ⏰ تحويل الوقت
        if "منذ" in pub_text or "ago" in pub_text:
            if "ساعتين" in pub_text or "two hours" in pub_text:
                pub_time = now - timedelta(hours=2)
            elif "منذ ساعة" in pub_text or "an hour" in pub_text or "hour ago" in pub_text:
                pub_time = now - timedelta(hours=1)    
            elif "ساعات" in pub_text or "ساعة" in pub_text or "hour" in pub_text:
                hours = int(re.search(r"\d+", pub_text).group())
                pub_time = now - timedelta(hours=hours)
            elif "دقيقتين" in pub_text or "two minutes" in pub_text:
                pub_time = now - timedelta(minutes=2)
            elif "منذ دقيقة" in pub_text or "a minute" in pub_text:
                pub_time = now - timedelta(minutes=1)    
            elif "دقائق" in pub_text or "دقيقة" in pub_text or "minute" in pub_text:
                minutes = int(re.search(r"\d+", pub_text).group())
                pub_time = now - timedelta(minutes=minutes)
            elif "m ago" in pub_text:
                match = re.search(r"(\d+)\s*m", pub_text)
                if match:
                     minutes = int(match.group(1))
                     pub_time = now - timedelta(minutes=minutes)                  
            elif "h ago" in pub_text:
                match = re.search(r"(\d+)\s*h", pub_text)
                if match:
                    hours = int(match.group(1))
                    pub_time = now - timedelta(hours=hours)     
        elif "Published :" in pub_text and re.search(r"\d{2}-\d{2}-\d{4} \d{2}:\d{2}", pub_text):
            pub_text = pub_text.replace("Published :", "").strip()
            pub_time = datetime.strptime(pub_text, "%d-%m-%Y %H:%M")

        # ✅ الفلترة
        if pub_time and (now - pub_time).total_seconds() <= 86400:
            valid_links.append((link, pub_text))
            print(f"✅ Added ({pub_text}) → {link}")
        else:
            print(f"🕒 Ignored ({pub_text}) → {link}")
    except Exception as e:
        print(f"❌ Error while checking {link}: {e}")

# 📌 عرض النتائج النهائية
print("\n📌 Articles within the last 24 hours:")
for i, (link, pub_text) in enumerate(valid_links, 1):
    print(f"{i}. {link} 🕒 {pub_text}")


✅ Added (منذ 9 ساعات) → https://royanews.tv/news/347841
✅ Added (منذ 3 ساعات) → https://royanews.tv/news/347846
✅ Added (منذ 12 ساعة) → https://royanews.tv/news/347835
🕒 Ignored (06:21 2025-04-26) → https://royanews.tv/news/347776
🕒 Ignored (13:22 2025-04-25) → https://royanews.tv/news/347728
🕒 Ignored (10:45 2025-04-25) → https://royanews.tv/news/347724
🕒 Ignored (08:13 2025-04-25) → https://royanews.tv/news/347716
🕒 Ignored (08:01 2025-04-25) → https://royanews.tv/news/347715
🕒 Ignored (21:15 2025-04-24) → https://royanews.tv/news/347695
🕒 Ignored (19:43 2025-04-24) → https://royanews.tv/news/347690
🕒 Ignored (09:33 2025-04-24) → https://royanews.tv/news/347658
🕒 Ignored (08:52 2025-04-24) → https://royanews.tv/news/347655
🕒 Ignored (03:50 2025-04-24) → https://royanews.tv/news/347646
🕒 Ignored (17:22 2025-04-23) → https://royanews.tv/news/347606
🕒 Ignored (11:20 2025-04-23) → https://royanews.tv/news/347580
🕒 Ignored (10:53 2025-04-23) → https://royanews.tv/news/347577
🕒 Ignored (08

In [10]:
final_links = set()

for item in valid_links:
    if isinstance(item, tuple) and len(item) >= 1:
        link = item[0]
        if link and link.startswith("http"):
            final_links.add(link)

print("\n📌 روابط المقالات خلال آخر 24 ساعة (final_links):")
for i, link in enumerate(final_links, 1):
    print(f"{i}. {link}")



📌 روابط المقالات خلال آخر 24 ساعة (final_links):
1. https://royanews.tv/news/347846
2. https://royanews.tv/news/347841
3. https://en.royanews.tv/news/59188
4. https://royanews.tv/news/347835


In [13]:
import os
# images_folder = "images"
os.makedirs('news_images', exist_ok=True)


In [15]:
data = []
for link in final_links:
    try:
        res = requests.get(link, headers=headers)
        soup = BeautifulSoup(res.content, "html.parser")

        # العنوان
        title_tag = soup.find("h1", class_="details-news-title") or soup.find("div", class_="news_main_title_mob")
        title = title_tag.get_text(strip=True) if title_tag else "No Title"

        # المحتوى
        content_div = soup.find("div", class_="Newsbody") or soup.find("div", id="atricle-text")

        if content_div:
    # حذف الإعلانات
            for ad_div in content_div.find_all("div", class_="custom-statement"):
                ad_div.decompose()

    # حذف "اقرأ أيضاً"
            for extra in content_div.find_all("p", class_="extra_title"):
                extra.decompose()

    # حذف الوسوم (tags)
            tags_section = content_div.find("div", class_="tags")
            if tags_section:
               tags_section.decompose()

            elements = content_div.find_all(["p", "li", "table"])
            content_parts = []

            for elem in elements:
                if elem.name == "table":
                   content_parts.append(str(elem))  # نخزن الجدول كـ HTML String
                else:
                    text = elem.get_text(strip=True)
                    if text:
                         content_parts.append(text)

            content = "\n\n".join(content_parts) if content_parts else "No Content"
        else:
            content = "No Content"

        # الصورة
        img_tag = soup.find("img")
        img_url = urljoin(link, img_tag["src"]) if img_tag and img_tag.get("src") else None

        # حفظ الصورة
        clean_title = re.sub(r'[\\/:*?"<>|]', '', title)
        if img_url:
            img_data = requests.get(img_url).content
            img_path = os.path.join("news_images", f"{clean_title}.jpg")
            with open(img_path, "wb") as f:
                f.write(img_data)

        data.append({
            "Title": title,
            "Content": content,
            "Image": img_url,
            "Link": link
        })
        print(f"✅ Saved: {title}")
    except Exception as e:
        print(f"❌ Error in {link}: {e}")


✅ Saved: استقرار نسبي في أسعار النفط عالميا مع ترقب قرارات أوبك+
✅ Saved: ترمب يطلب السماح للسفن الأمريكية بالمرور مجانا عبر قناتي السويس وبنما
✅ Saved: Gold prices in Jordan Sunday, April 27
✅ Saved: مختصون يبحثون بالتحديات التي يواجهها القطاع الزراعي في الأردن - فيديو


In [17]:
df = pd.DataFrame(data)
df

Unnamed: 0,Title,Content,Image,Link
0,استقرار نسبي في أسعار النفط عالميا مع ترقب قرا...,سجل سعر خام برنت القياسي حوالي 66.87 دولارًا ل...,https://royanews.tv/assets/images/icon-languag...,https://royanews.tv/news/347846
1,ترمب يطلب السماح للسفن الأمريكية بالمرور مجانا...,ترمب: قناتا السويس وبنما ما كانتا لتوجدا لولا ...,https://royanews.tv/assets/images/icon-languag...,https://royanews.tv/news/347841
2,"Gold prices in Jordan Sunday, April 27","Sunday, gold prices in Jordan stabilized, acco...",https://en.royanews.tv/assets/images/icon-lang...,https://en.royanews.tv/news/59188
3,مختصون يبحثون بالتحديات التي يواجهها القطاع ال...,الحياري: القطاع الزراعي يسجل نمواً بنسبة 8.4% ...,https://royanews.tv/assets/images/icon-languag...,https://royanews.tv/news/347835


In [21]:
df.to_excel("Roya_scraping_articles.xlsx", index=False, engine='openpyxl')



In [23]:
articles_df = pd.read_excel("Roya_scraping_articles.xlsx", engine='openpyxl')
# articles_df

In [25]:
import pyodbc,  sys 

conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};'

                      'SERVER=.\\SQLEXPRESS;'

                      'DATABASE=Scraping;'

                      'Trusted_Connection=yes;'
                      
                      'Encrypt=no')
cursor = conn.cursor()


In [27]:
# cursor.execute("DELETE FROM Roya_Articles")
# conn.commit()
cursor.execute("TRUNCATE TABLE Roya_scraping_articles")
conn.commit()

In [29]:
for index, row in df.iterrows():
    cursor.execute(
        "INSERT INTO Roya_scraping_articles (title, content, image, link) VALUES (?, ?, ?, ?)",
        (row['Title'], row['Content'], row['Image'], row['Link'])
    )

conn.commit()
