In [None]:
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_bayut_page(page_url, timeout=30):
    """Collect the property listings found on a single results page.

    Args:
        page_url: URL of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        A list of dicts, one per listing card, with the keys
        ``PropertyType``, ``Link``, ``Title``, ``Price``, ``Location``,
        ``Area``, ``Bedrooms``, ``Bathrooms`` and ``Down_Payment``. Each
        value is the stripped text of the matching element, or ``None``
        when the element is absent. An empty list is returned when the page
        cannot be downloaded or parsed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        response = requests.get(page_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
    except Exception as e:
        # Best effort: report the failure and let the caller carry on.
        print(f"خطأ في تحميل الصفحة {page_url}: {e}")
        return []

    def text_or_none(selector, parent):
        """Return the stripped text of the first match, or None if no match."""
        el = parent.select_one(selector)
        return el.get_text(strip=True) if el else None

    # Select the <li> cards inside the <ul> (avoid iterating the <ul> itself).
    # NOTE(review): the hashed class names look auto-generated and may change
    # when the site is redeployed -- confirm the selectors still match.
    property_cards = soup.select("ul._172b35d1 li")
    properties = []

    for card in property_cards:
        try:
            anchor = card.select_one("a._8969fafd")
            href = anchor.get('href') if anchor else None
            # urljoin copes with relative hrefs (with or without a leading
            # slash) as well as hrefs that are already absolute.
            link = urljoin("https://www.bayut.eg", href) if href else None

            # Try the specific price selector first, then the looser one.
            price = (
                text_or_none("h4.afdad5da._71366de7 span.eff033a6", card)
                or text_or_none("span.eff033a6", card)
            )
            title = text_or_none("h2._34c51035", card)

            # The first three matching spans are read, in order, as
            # type / bedrooms / bathrooms.
            spans = card.select("span._3002c6fb")
            type_ = spans[0].get_text(strip=True) if len(spans) > 0 else None
            bedrooms = spans[1].get_text(strip=True) if len(spans) > 1 else None
            bathrooms = spans[2].get_text(strip=True) if len(spans) > 2 else None

            location = text_or_none("h3._51c6b1ca", card)
            d = text_or_none("span.fd7ade6e", card)

            area_raw = text_or_none("h4._60820635._07b5f28e", card) or text_or_none("h4", card)
            # Drops the last six characters -- presumably a unit suffix;
            # TODO confirm against the live markup.
            area = area_raw[:-6] if area_raw and len(area_raw) > 6 else area_raw

            properties.append({
                'PropertyType': type_,
                'Link': link,
                'Title': title,
                'Price': price,
                'Location': location,
                'Area': area,
                'Bedrooms': bedrooms,
                'Bathrooms': bathrooms,
                'Down_Payment': d,
            })
        except Exception as e:
            # One malformed card must not abort the rest of the page.
            print(f"خطأ في معالجة كارد: {e}")
            continue

    return properties

def scrape_all_pages(base_url, max_pages=20, delay=1):
    """Collect listings from the first results page and its follow-up pages.

    Page 1 is fetched from ``base_url`` itself; pages 2..``max_pages`` are
    fetched from ``<base_url>/page-<n>/``. Crawling stops early at the first
    follow-up page that yields no listings.

    Args:
        base_url: URL of the first results page.
        max_pages: Highest page number to request (inclusive).
        delay: Seconds to pause before each follow-up request, to reduce
            the risk of the IP being blocked.

    Returns:
        A list with the listing dicts of every page scraped, in page order.
    """
    all_properties = []

    # First page
    print("جاري جمع البيانات من الصفحة 1...")
    page1_properties = scrape_bayut_page(base_url)
    all_properties.extend(page1_properties)
    print(f"تم جمع {len(page1_properties)} عقار من الصفحة 1")

    # Follow-up pages
    for page_num in range(2, max_pages + 1):
        # Pause *before* the request so that consecutive requests are always
        # spaced out (including page 1 -> page 2) and no time is wasted
        # sleeping after the final page.
        time.sleep(delay)

        page_url = f"{base_url.rstrip('/')}/page-{page_num}/"
        print(f"جاري جمع البيانات من الصفحة {page_num}...")

        properties = scrape_bayut_page(page_url)

        # No listings on this page: stop crawling.
        if not properties:
            print(f"لم يتم العثور على عقارات في الصفحة {page_num}. التوقف...")
            break

        all_properties.extend(properties)
        print(f"تم جمع {len(properties)} عقار من الصفحة {page_num}")

    return all_properties

# Run the scraper.
base_url = "https://www.bayut.eg/en/alexandria/properties-for-sale/"
# max_pages can be raised to crawl more result pages.
all_properties = scrape_all_pages(base_url=base_url, max_pages=20)

# Tabulate the scraped listing dicts.
df = pd.DataFrame(data=all_properties)
def clean_data(df_clean):
    """Split the raw ``Location`` text into ``Location`` and ``State`` columns.

    The ``Location`` column is split on ``','``. The first part becomes the
    new ``Location`` column and the second part becomes ``State``; both are
    placed where the original column was. Any further parts are discarded.
    A few known spelling variants are then normalised. The split parts are
    not whitespace-stripped here.

    Args:
        df_clean: DataFrame containing a text column named ``Location``.
            It is not modified.

    Returns:
        A new DataFrame with ``Location`` replaced by ``Location`` and
        ``State``. ``State`` is always present; it is missing-valued for
        rows whose location text has no second part.

    Raises:
        KeyError: If ``df_clean`` has no ``Location`` column.
    """
    loc_0 = df_clean.columns.get_loc('Location')
    parts = df_clean['Location'].str.split(pat=',', expand=True)

    # The number of columns produced by the split depends on the data, so
    # make sure the two parts we need exist and ignore everything else.
    for idx in (0, 1):
        if idx not in parts.columns:
            parts[idx] = None
    parts = parts[[0, 1]]
    parts.columns = ['Location', 'State']

    # Re-assemble: columns before the old 'Location', the two new columns,
    # then the columns after it.
    df_clean = pd.concat(
        [df_clean.iloc[:, :loc_0], parts, df_clean.iloc[:, loc_0 + 1:]],
        axis=1,
    )

    # Normalise spelling variants. These must operate on the local frame:
    # touching a module-level frame here would leave the returned data
    # uncorrected.
    df_clean['State'] = df_clean['State'].str.replace(
        "Saba Pasha", "Saba Basha", case=False, regex=False)
    df_clean['State'] = df_clean['State'].str.replace(
        "Borg al-Arab", "Borg El Arab", case=False, regex=False)
    df_clean['Location'] = df_clean['Location'].str.replace(
        "Smoha", "Smouha", case=False, regex=False)
    return df_clean

# Path of the accumulated dataset. A raw string keeps the backslashes
# literal; in a normal string "\P", "\R" and "\F" are invalid escape
# sequences, which newer Python versions warn about.
FINAL_CSV = r"E:\PY\Real Estate\Final.csv"

df_clean_1 = clean_data(df.copy())
df1 = df_clean_1.copy()
# A missing down payment is recorded as 0 so that the dropna() below does
# not discard the row for that reason.
df1['Down_Payment'] = df1['Down_Payment'].fillna(0)
df1.dropna(inplace=True)
df1.reset_index(drop=True, inplace=True)

df_clean = df1.copy()
df_clean['State'] = df_clean['State'].str.strip()
# Where the State text mentions Alexandria, use the row's Location value
# as the State instead.
mask = df_clean['State'].str.contains('Alexandria', case=False, na=False)
df_clean.loc[mask, 'State'] = df_clean.loc[mask, 'Location']
df_clean['State'] = df_clean['State'].fillna(df_clean['Location'])
df_clean['State'] = df_clean['State'].str.strip()
df_clean['Location'] = df_clean['Location'].str.strip()
# Any down payment other than "0" is labelled Installments, the rest Cash.
df_clean['Payment_Method'] = ""
df_clean.loc[df_clean['Down_Payment'].astype(str).str.strip() != "0", 'Payment_Method'] = "Installments"
df_clean.loc[df_clean['Payment_Method'] == "", 'Payment_Method'] = "Cash"

# Append the new rows to the accumulated dataset. On the very first run the
# file does not exist yet, so start from an empty frame instead of crashing.
try:
    df_final = pd.read_csv(FINAL_CSV)
except FileNotFoundError:
    df_final = pd.DataFrame()
df_combined = pd.concat([df_final, df_clean], ignore_index=True)
df_combined.to_csv(FINAL_CSV, index=False)
print("Datasets combined and saved successfully.")

# NOTE(review): the file is deliberately re-read before de-duplicating.
# Presumably this is so old and new rows are parsed the same way and compare
# equal -- confirm before collapsing this into the step above.
df11 = pd.read_csv(FINAL_CSV)
df11.drop_duplicates(inplace=True)
df11.to_csv(FINAL_CSV, index=False)

جاري جمع البيانات من الصفحة 1...
تم جمع 26 عقار من الصفحة 1
جاري جمع البيانات من الصفحة 2...
تم جمع 26 عقار من الصفحة 2
تنظيف State مكتمل!
أمثلة بعد التنظيف:
         Location  State
0  Orouba Skyline  Smoha
1         The One  Smoha
2       Alex West  Agami
3       Alex West  Agami
4  Orouba Skyline  Smoha


Unnamed: 0,PropertyType,Link,Title,Price,Location,State,Area,Bedrooms,Bathrooms,Down_Payment,Payment_Method
0,Apartment,https://www.bayut.eg/en/property/details-50299...,"Apartment for sale, 149 m in Skyline delivery ...",6055000,Orouba Skyline,Smoha,149,3,3,"EGP 4,800,000",Installments
1,Apartment,https://www.bayut.eg/en/property/details-50299...,Apartment for sale 179 m The One compound Smou...,11258026,The One,Smoha,179,3,2,"EGP 3,089,090",Installments
2,Apartment,https://www.bayut.eg/en/property/details-50299...,Apartment for sale 153 m with a view directly ...,5500000,Alex West,Agami,153,3,2,0,Cash
3,Townhouse,https://www.bayut.eg/en/property/details-50289...,Receive your townhouse with an open lake view ...,13000000,Alex West,Agami,180,3,2,0,Cash
4,Apartment,https://www.bayut.eg/en/property/details-50274...,Receive your hotel-finished apartment immediat...,10000000,Orouba Skyline,Smoha,206,3,3,0,Cash
5,Apartment,https://www.bayut.eg/en/property/details-50288...,Own your apartment with a direct club view in ...,8600000,Murooj,Smoha,207,3,3,0,Cash
6,Villa,https://www.bayut.eg/en/property/details-50299...,"Villa for sale 1,030 sqm in Alex West immediat...",35000000,Alex West,Agami,1030,6,6,0,Cash
7,Apartment,https://www.bayut.eg/en/property/details-50299...,Apartment for sale 175 sqm Sawary with install...,7454240,Sawari,Moharam Bik,175,3,3,"EGP 5,250,000",Installments
8,Apartment,https://www.bayut.eg/en/property/details-50299...,Apartment 134 m for sale in Smouha Albert elaw...,8440000,The One,Smoha,134,3,2,"EGP 2,100,000",Installments
9,Apartment,https://www.bayut.eg/en/property/details-50299...,Apartment for sale 180 m in Smouha The One com...,11529000,The One,Smoha,180,4,2,"EGP 2,445,581",Installments
