# Import Relevant Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup

## Scrape the data

In [5]:
def scrape_political_reviews(page_number, flag, page_size=20, timeout=30):
    """Scrape one page of Ahram Online search results for the query 'egypt'.

    Parameters
    ----------
    page_number : int
        Value sent as the ``StartRowIndex`` query parameter. Despite the
        name this appears to be a row offset rather than a page index (the
        notebook calls it with multiples of 20) -- TODO confirm against the site.
    flag : bool
        Selects which field is extracted from each result row:
        truthy -> the headline text (``hyplnkTitle`` elements);
        falsy  -> the first space-separated token of the creation-date
        label (``lblCreationDate`` elements).
    page_size : int, optional
        Maximum number of result rows to read from the page (default 20,
        the value that was previously hard-coded).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list[str]
        The extracted strings in page order. The list is shorter than
        ``page_size`` when the page contains fewer rows.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when ``timeout`` is exceeded.
    """
    url_params = {'Text' : 'egypt', 'StartRowIndex' : page_number, 'PID' : 1, 'CID' : 64}
    url = 'https://english.ahram.org.eg/UI/Front/Search.aspx'
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url=url, params=url_params, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as if it were results.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The element-id prefix depends only on `flag`, so build it once.
    field = 'hyplnkTitle' if flag else 'lblCreationDate'
    scraped_data = []
    for i in range(page_size):
        element = soup.find(id=f'ContentPlaceHolder1_dlNewsContent_{field}_{i}')
        if element is None:
            # Rows are numbered consecutively from 0, so the first missing id
            # means the page has no further results (e.g. a short last page).
            break
        text = element.get_text()
        # For dates keep only the token before the first space.
        scraped_data.append(text if flag else text.split(' ')[0])
    return scraped_data

In [6]:
# Accumulate headlines and their dates across the search results.
# The offset advances by 20 on every iteration, from 0 up to (not including) 10000.
# NOTE: every offset is requested twice -- once for the headlines, once for the dates.
reviews = []
dates = []
for start_row in range(0, 10000, 20):
    reviews.extend(scrape_political_reviews(start_row, True))
    dates.extend(scrape_political_reviews(start_row, False))

In [11]:
# Sanity check: print the number of headlines, then the number of dates.
for collected in (reviews, dates):
    print(len(collected))

10000
10000


## Removing Unnecessary Whitespace Characters

In [13]:
# Collapse every run of whitespace in a headline into a single space.
# The pattern is a raw string: in a plain string literal `\s` is an invalid
# escape sequence, which newer Python versions warn about.
reviews = [re.sub(r'\s+', ' ', review) for review in reviews]

In [14]:
# Pair each headline with its date: one row per scraped search result.
df = pd.DataFrame(
    {
        'political_review': reviews,
        'date': dates,
    }
)
df

Unnamed: 0,political_review,date
0,Parliament approves law setting up Egyptian He...,2/9/2022
1,New head of Egypt's Supreme Constitutional Cou...,2/9/2022
2,Egypt’s HR council hails as 'historic step' th...,2/9/2022
3,Sisi names first Christian as president of Egy...,2/8/2022
4,Egyptian parliament rejects draft law imposing...,2/8/2022
...,...,...
9995,Egypt's President Sisi arrives in Saudi Arabia...,4/14/2018
9996,Egypt concerned over military escalation in Sy...,4/14/2018
9997,"Egyptian Army says 27 'takfiris' killed, 114 s...",4/14/2018
9998,Egypt's Coptic Pope Tawadros II receives Portu...,4/13/2018


## Convert date column to pandas datetime object

In [15]:
# The scraped dates are month/day/year strings without zero padding
# (e.g. '2/9/2022', '4/14/2018'). Passing the format explicitly avoids
# relying on pandas' format inference for such ambiguous values and makes
# any value that does not match raise instead of being guessed at.
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   political_review  10000 non-null  object        
 1   date              10000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 156.4+ KB


In [16]:
# Display the frame again to inspect the `date` column after the conversion.
df

Unnamed: 0,political_review,date
0,Parliament approves law setting up Egyptian He...,2022-02-09
1,New head of Egypt's Supreme Constitutional Cou...,2022-02-09
2,Egypt’s HR council hails as 'historic step' th...,2022-02-09
3,Sisi names first Christian as president of Egy...,2022-02-08
4,Egyptian parliament rejects draft law imposing...,2022-02-08
...,...,...
9995,Egypt's President Sisi arrives in Saudi Arabia...,2018-04-14
9996,Egypt concerned over military escalation in Sy...,2018-04-14
9997,"Egyptian Army says 27 'takfiris' killed, 114 s...",2018-04-14
9998,Egypt's Coptic Pope Tawadros II receives Portu...,2018-04-13


In [17]:
# Persist the scraped data set as CSV in the current working directory.
# The row index is written as well (pandas' default), as an unnamed first column.
df.to_csv(path_or_buf='ahram_political_reviews.csv')