<a href="https://colab.research.google.com/github/Rawan806/2025_GP_18/blob/main/DataScience.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Processing and Cleaning

In [None]:
# Required Libraries Installation
import subprocess, sys, os, re
from datetime import datetime

def install_packages(packages=None):
    """Install any third-party packages that cannot be imported yet.

    Args:
        packages: Optional iterable of pip distribution names to check. When
            omitted, the set of packages this notebook imports is used.

    Returns:
        list: The distribution names that were missing and therefore installed
        (empty when everything was already importable).

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a
            non-zero status (``check=True``).
    """
    # pip distribution name -> importable module name, where the two differ.
    import_names = {"beautifulsoup4": "bs4"}
    if packages is None:
        packages = ["requests", "beautifulsoup4", "pandas", "lxml", "urllib3", "certifi"]

    installed = []
    for package in packages:
        try:
            __import__(import_names.get(package, package))
        except ImportError:
            # Use the running interpreter's pip so the package lands in this kernel's environment.
            subprocess.run([sys.executable, "-m", "pip", "install", package], check=True)
            installed.append(package)
    return installed

# Run before the third-party imports below so they resolve on a fresh runtime.
install_packages()

# Imports
import requests, pandas as pd
import certifi
from bs4 import BeautifulSoup
from google.colab import files
from pathlib import Path
import urllib3

# Silence urllib3's InsecureRequestWarning for the whole session; it would
# otherwise be emitted whenever a request is made with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Page to scrape and the browser-like User-Agent sent with the request.
url = "https://laws.boe.gov.sa/boelaws/laws/lawdetails/b7cfae89-828e-4994-b167-adaa00e37188/1"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
}

# Local artefacts: the raw HTML snapshot and the extracted-articles CSV both
# live under raw_data/, which is created on first run.
output_dir = Path("raw_data")
html_file = output_dir.joinpath("pdpl_raw_html.html")
csv_file = output_dir.joinpath("pdpl_articles_enhanced.csv")
output_dir.mkdir(exist_ok=True)

print("Starting enhanced data collection with article status detection...")

def fetch_with_fallback(url, headers, timeout=30, allow_insecure=True):
    """Download ``url``, relaxing TLS verification only when verification itself fails.

    Attempts, in order:
      1. requests' default certificate verification   -> mode "secure-default"
      2. the certifi CA bundle passed explicitly      -> mode "secure-certifi"
      3. verification disabled (``verify=False``)     -> mode "insecure-fallback"
         (only when ``allow_insecure`` is true; NOT for production use)

    The next attempt is made only after a ``requests.exceptions.SSLError``.
    Any other failure (timeout, DNS error, refused connection, HTTP 4xx/5xx)
    is raised immediately: changing how certificates are checked cannot fix
    it, and retrying would only end in an unverified request.

    Args:
        url: Address to fetch.
        headers: HTTP headers sent with every attempt.
        timeout: Per-request timeout in seconds.
        allow_insecure: Whether the final ``verify=False`` attempt is permitted.

    Returns:
        tuple: ``(response, mode)`` where ``mode`` is one of the labels above.

    Raises:
        requests.exceptions.SSLError: Every permitted attempt failed TLS
            verification.
        requests.exceptions.RequestException: Any non-TLS transport failure,
            or an HTTP error status (via ``raise_for_status``).
    """
    # (verify setting, mode label) in decreasing order of trust. ``None`` means
    # "do not pass verify at all", i.e. keep requests' default behaviour.
    attempts = [(None, "secure-default"), (certifi.where(), "secure-certifi")]
    if allow_insecure:
        attempts.append((False, "insecure-fallback"))

    last_ssl_error = None
    for verify, mode in attempts:
        extra = {} if verify is None else {"verify": verify}
        try:
            r = requests.get(url, headers=headers, timeout=timeout, **extra)
        except requests.exceptions.SSLError as exc:
            # Certificate problem: the only case where the next attempt can help.
            last_ssl_error = exc
            continue
        r.raise_for_status()
        return r, mode

    # Every permitted attempt failed certificate verification.
    raise last_ssl_error

# ---------------------------------------------------------------------------
# Helpers for the collection run below. The HTML parser, the plain-text
# fallback parser and the DataFrame clean-up each live in one place so the
# rules are not duplicated.
# ---------------------------------------------------------------------------
_ARTICLE_COLUMNS = ["article_title", "article_text", "article_status", "modification_text"]
_STATUS_MODIFIED = "معدلة"
_STATUS_ORIGINAL = "أصلية"
_STATUS_UNKNOWN = "غير محدد"
_NO_MODIFICATION = "لا يوجد نص معدل"
# Texts of this many characters or fewer are treated as empty/noise.
_MIN_TEXT_LEN = 10
# Titles that denote article 1. The trailing lookahead stops "المادة ١" from
# also matching articles 10-19 written with Arabic-Indic digits ("المادة ١٠"),
# which would otherwise be dropped as "duplicate first articles".
_FIRST_ARTICLE_PATTERN = r"المادة الأولى|المادة \(1\)|المادة ١(?![٠-٩])"


def _popup_modification_text(div):
    """Return the amendment note from an article's popup, or the placeholder if none is usable."""
    popup_div = div.find("div", class_="article_item_popup")
    if popup_div is None:
        return _NO_MODIFICATION
    # Prefer the popup's inner HTMLContainer; otherwise use the whole popup.
    popup_content = popup_div.find("div", class_="HTMLContainer")
    source = popup_content if popup_content is not None else popup_div
    popup_text = source.get_text(separator=" ", strip=True)
    return popup_text if len(popup_text) > _MIN_TEXT_LEN else _NO_MODIFICATION


def _extract_html_articles(soup):
    """Build article records from the page's ``div.article_item`` elements (empty list if none)."""
    article_divs = soup.find_all("div", class_="article_item")
    if not article_divs:
        return []
    print(f"Found {len(article_divs)} article divs in HTML structure")

    records = []
    for div in article_divs:
        # The status is read from the CSS classes on the article container.
        classes = div.get("class", [])
        if "changed-article" in classes:
            article_status = _STATUS_MODIFIED
        elif "no_alternate" in classes:
            article_status = _STATUS_ORIGINAL
        else:
            article_status = _STATUS_UNKNOWN

        title_element = div.find("h3")
        article_title = title_element.get_text(strip=True) if title_element is not None else "عنوان غير محدد"

        content_div = div.find("div", class_="HTMLContainer")
        if content_div is None:
            continue
        article_text = content_div.get_text(separator=" ", strip=True)
        if len(article_text) <= _MIN_TEXT_LEN:
            continue

        # Only articles flagged as modified are searched for an amendment popup.
        if article_status == _STATUS_MODIFIED:
            modification_text = _popup_modification_text(div)
        else:
            modification_text = _NO_MODIFICATION

        records.append({
            "article_title": article_title,
            "article_text": article_text,
            "article_status": article_status,
            "modification_text": modification_text
        })
    return records


def _classify_text_article(article_text):
    """Guess ``(status, modification_text)`` for an article recovered from plain text."""
    indicators = ["عُدلت هذه المادة", "المرسوم الملكي", "بموجب", "معدلة"]
    if not any(ind in article_text for ind in indicators):
        return _STATUS_ORIGINAL, _NO_MODIFICATION

    patterns = [
        r'عُدلت هذه المادة.*?"(.*?)"',
        r'بموجب.*?المرسوم الملكي.*?(.*?)(?:\.|$)',
        r'لتكون بالنص الآتي.*?"(.*?)"'
    ]
    modification_text = _NO_MODIFICATION
    for pattern in patterns:
        m = re.search(pattern, article_text, re.DOTALL)
        if m:
            modification_text = m.group(1).strip()
            break
    if modification_text == _NO_MODIFICATION:
        # No pattern matched: keep a 200-character excerpt starting at the amendment phrase.
        mod_start = article_text.find("عُدلت هذه المادة")
        if mod_start != -1:
            modification_text = article_text[mod_start:mod_start + 200] + "..."
    return _STATUS_MODIFIED, modification_text


def _text_article_record(title, content_lines):
    """Turn a heading plus its collected body lines into a record, or ``None`` if unusable."""
    if not title or not content_lines:
        return None
    article_text = " ".join(content_lines).strip()
    if len(article_text) <= _MIN_TEXT_LEN:
        return None
    article_status, modification_text = _classify_text_article(article_text)
    return {
        "article_title": title,
        "article_text": article_text,
        "article_status": article_status,
        "modification_text": modification_text
    }


def _is_admin_line(line):
    """Return True for metadata lines that should not become part of an article body."""
    if line.startswith(("تاريخ", "رقم", "الجريدة الرسمية", "*", "-")):
        return True
    admin_only = ["تعديلات المادة", "مادة معدلة", "مادة ملغية"]
    # Lower-casing is kept from the original logic; it does not alter Arabic text.
    line_lower = line.lower()
    # Only short, single-sentence lines count as pure admin notes.
    return (len(line) < 150 and line.count('.') <= 1
            and any(pat in line_lower for pat in admin_only))


def _extract_text_articles(soup):
    """Fallback parser: split the page's plain text on lines that start with "المادة"."""
    records = []
    current_article, current_content = None, []

    for line in soup.get_text(separator="\n", strip=True).split("\n"):
        line = line.strip()
        if not line:
            continue
        if re.match(r"^المادة\s+", line):
            # A new heading closes the article collected so far.
            record = _text_article_record(current_article, current_content)
            if record is not None:
                records.append(record)
            current_article, current_content = line, []
        elif current_article and len(line) > 3 and not _is_admin_line(line):
            current_content.append(line)

    # Flush the last article, which has no following heading to close it.
    record = _text_article_record(current_article, current_content)
    if record is not None:
        records.append(record)
    return records


def _clean_articles_frame(df):
    """Drop too-short rows and duplicate titles, and collapse repeated first articles into one."""
    if df.empty:
        return df
    df = df[df["article_text"].astype(str).str.len() > _MIN_TEXT_LEN]
    df = df.drop_duplicates(subset=["article_title"])
    is_first = df["article_title"].astype(str).str.contains(
        _FIRST_ARTICLE_PATTERN, case=False, na=False, regex=True)
    first_articles = df[is_first]
    if len(first_articles) > 1:
        print("Detected duplicate first article - fixing...")
        # Keep the variant with the longest text and put it at the top.
        best_first = first_articles.loc[first_articles["article_text"].str.len().idxmax()]
        df = pd.concat([pd.DataFrame([best_first]), df[~is_first]]).reset_index(drop=True)
    return df


try:
    response, mode = fetch_with_fallback(url, headers)
    print(f"HTTP GET OK via mode: {mode} | status={response.status_code}")

    # Save raw HTML for inspection
    html_file.write_text(response.text, encoding="utf-8")
    print(f"Raw HTML saved: {html_file}")

    soup = BeautifulSoup(response.text, "lxml")

    # Try structured extraction first
    articles_from_html = _extract_html_articles(soup)

    # Fallback to text parsing if needed
    articles_from_text = []
    if not articles_from_html:
        print("No structured HTML found, falling back to text extraction...")
        articles_from_text = _extract_text_articles(soup)

    # Prefer HTML-based; else text-based; else empty
    articles = articles_from_html if articles_from_html else articles_from_text

    # Build DataFrame (even if empty, but with columns), then apply basic cleaning
    cols = list(_ARTICLE_COLUMNS)
    df = _clean_articles_frame(pd.DataFrame(articles, columns=cols))

    # Always write a CSV so Cell 2 never fails with FileNotFound
    df.to_csv(csv_file, index=False, encoding="utf-8-sig")
    print(f"Successfully extracted {len(df)} articles with status information!")
    print(f"Enhanced data saved to: {csv_file}")

    # Quick stats
    print("CWD:", Path.cwd())
    print("raw_data contents:", [p.name for p in output_dir.iterdir()])

    if not df.empty:
        print("\n📊 Article Status Summary:")
        print("-" * 40)
        for status, count in df['article_status'].value_counts().items():
            print(f"• {status}: {count} مادة")

except Exception as e:
    # As a last resort, create an empty CSV with columns so Cell 2 won't crash.
    # A CSV left over from an earlier run is kept as-is.
    print(f"❌ Error in data collection: {e}")
    if not csv_file.exists():
        pd.DataFrame(columns=["article_title", "article_text", "article_status", "modification_text"]).to_csv(csv_file, index=False, encoding="utf-8-sig")
        print("Created empty CSV due to error, to keep the pipeline alive.")

print("\n📁 Files saved in directory:", output_dir)
print("✅ Enhanced data collection completed (with fallbacks).")
# Offer the artefacts as browser downloads. This is best-effort: a failure
# here must not abort the cell, but it is reported instead of silently
# swallowed, and Ctrl-C / kernel interrupts are no longer caught.
try:
    for artefact in (html_file, csv_file):
        # The HTML snapshot does not exist when the fetch itself failed;
        # skip it so the CSV can still be downloaded.
        if artefact.exists():
            files.download(str(artefact))
except Exception as download_error:
    print("📥 Files ready for download in the output directory")
    print(f"   (browser download skipped: {download_error})")


Starting enhanced data collection with article status detection...
HTTP GET OK via mode: insecure-fallback | status=200
Raw HTML saved: raw_data/pdpl_raw_html.html
Found 43 article divs in HTML structure
Successfully extracted 43 articles with status information!
Enhanced data saved to: raw_data/pdpl_articles_enhanced.csv
CWD: /content
raw_data contents: ['pdpl_raw_html.html', 'pdpl_articles_enhanced.csv']

📊 Article Status Summary:
----------------------------------------
• معدلة: 24 مادة
• أصلية: 19 مادة

📁 Files saved in directory: raw_data
✅ Enhanced data collection completed (with fallbacks).


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from pathlib import Path
import pandas as pd

# Locate the CSV written by the collection cell.
base = Path("raw_data")
target = base / "pdpl_articles_enhanced.csv"

# Show where we are and what the folder holds, to make path problems obvious.
print("CWD:", Path.cwd())
print("raw_data contents:", [entry.name for entry in base.glob("*")])

# Stop with an actionable message rather than a pandas read error.
if not target.exists():
    raise FileNotFoundError("CSV not found. Re-run Cell 1 successfully first.")

# Read with utf-8-sig, the same encoding the file was saved with.
df = pd.read_csv(target, encoding="utf-8-sig")
print("Loaded file:", target)
print("Initial shape:", df.shape)
df.info()

# Optional: show the first two rows as a sanity check.
display(df.head(2))


CWD: /content
raw_data contents: ['pdpl_raw_html.html', 'pdpl_articles_enhanced.csv']
Loaded file: raw_data/pdpl_articles_enhanced.csv
Initial shape: (43, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   article_title      43 non-null     object
 1   article_text       43 non-null     object
 2   article_status     43 non-null     object
 3   modification_text  43 non-null     object
dtypes: object(4)
memory usage: 1.5+ KB


Unnamed: 0,article_title,article_text,article_status,modification_text
0,المادة الأولى,لغرض تطبيق هذا النظام، يُقصد بالألفاظ والعبارا...,معدلة,- صدر المرسوم الملكي رقم (م/148) وتاريخ 1444/9...
1,المادة الثانية,يُطبق النظام على أي عملية مُعالجة لبيانات شخصي...,أصلية,لا يوجد نص معدل


In [None]:
# Missing values: number of nulls in each column.
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Duplicates: rows whose title repeats the title of an earlier row.
duplicates = df["article_title"].duplicated().sum()
print(f"\nDuplicate articles found: {duplicates}")

# Preview the first rows (rendered as the cell's output).
df.head()



Missing values per column:
article_title        0
article_text         0
article_status       0
modification_text    0
dtype: int64

Duplicate articles found: 0


Unnamed: 0,article_title,article_text,article_status,modification_text
0,المادة الأولى,لغرض تطبيق هذا النظام، يُقصد بالألفاظ والعبارا...,معدلة,- صدر المرسوم الملكي رقم (م/148) وتاريخ 1444/9...
1,المادة الثانية,يُطبق النظام على أي عملية مُعالجة لبيانات شخصي...,أصلية,لا يوجد نص معدل
2,المادة الثالثة,لا تخل الأحكام والإجراءات المنصوص عليها في الن...,أصلية,لا يوجد نص معدل
3,المادة الرابعة,يكون لصاحب البيانات الشخصية -وفقاً للأحكام الو...,معدلة,- عُدلت هذه المادة وذلك بموجب المرسوم الملكي ر...
4,المادة الخامسة,فيما عدا الأحوال المنصوص عليها في النظام، لا ت...,معدلة,"- إحلال كلمة ""صريحة"" محل كلمة ""كتابية"" الواردة..."


In [None]:
import re

def clean_text(text):
    """Reduce ``text`` to Arabic script plus basic punctuation, single-spaced.

    Keeps characters in the Arabic Unicode block (U+0600-U+06FF, which
    includes Arabic-Indic digits), whitespace, and the punctuation
    ``. , ؛ : ،``. Everything else is removed, including Latin letters,
    ASCII digits, brackets, quotes, dashes and slashes.

    Symbols are stripped *before* whitespace is collapsed, so deleting a
    stand-alone symbol (for example the dash in "أ - ب") does not leave a
    double space behind.

    Args:
        text: The value to clean. ``None`` and float NaN (how pandas
            represents an empty CSV cell) are treated as empty text; any
            other non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text with no leading or trailing whitespace.
    """
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        return ""
    text = str(text)
    # Remove non-Arabic or special symbols if any
    text = re.sub(r'[^\u0600-\u06FF\s.,؛:،]', '', text)
    # Collapse extra spaces and newlines last, so gaps left by removed symbols are merged too
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Run the same cleaner over both free-text columns.
for text_column in ("article_text", "modification_text"):
    df[text_column] = df[text_column].apply(clean_text)


In [None]:
# Map the Arabic status labels produced by the scraper to English ones.
status_labels = {
    "معدلة": "Modified",
    "أصلية": "Original",
    "غير محدد": "Unknown",
}
df["article_status"] = df["article_status"].replace(status_labels)


In [None]:
# Write the cleaned frame next to the raw artefacts, as UTF-8 with BOM
# (utf-8-sig) and without the index column.
cleaned_file = "raw_data/pdpl_cleaned.csv"
df.to_csv(path_or_buf=cleaned_file, encoding="utf-8-sig", index=False)

print(f"✅ Cleaned dataset saved to: {cleaned_file}")


✅ Cleaned dataset saved to: raw_data/pdpl_cleaned.csv
