In [92]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time

# Data Scraping

In [105]:
class scrapingData:
    
    def __init__(self, driver, chrome_driver_path):
        self.driver = driver
        self.chrome_driver_path = chrome_driver_path
        self.article_links={'economy':[],'culture':[],'politics':[],'tamazight':[],'sport':[]}
        self.titles=[]
        self.contents=[]
        self.categories=[]
        self.keyWords=[]
        self.sources=[]
        self.date=[]
        
    def scrapingArticles(self):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        for category in self.article_links : 
            for link in self.article_links[category] :
                response = requests.get(link, headers=headers)
                soup = BeautifulSoup(response.text, 'html.parser')
                self.titles.append(soup.find('h1', class_='post-title').text)
                
                self.contents.append( soup.find('div', class_ = 'article-content').text)
                
                ol = soup.find('ol', class_ = 'breadcrumb')
                li = ol.find_all('li', class_='breadcrumb-item')[1].text
                self.categories.append(li)
    
                self.keyWords.append([i.text for i in soup.find_all('a', class_= 'tag_post_tag')])
                source = soup.find('span', class_= 'author')
                if source == None:
                    self.sources.append('')
                else:
                    self.sources.append(source.text)
                self.date.append(soup.find('span', class_='date-post').text)
        
    def economicArticles(self) :
        self.driver.get('https://www.hespress.com/economie')
        while(len(self.article_links['economy']) <1000 ):    
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            self.article_links['economy']= [ link.get_attribute('href')  for link in self.driver.find_elements(By.CLASS_NAME, 'stretched-link')]
            
    def cultureArticles(self) :
        self.driver.get('https://www.hespress.com/art-et-culture')
        while(len(self.article_links['culture']) <1000 ):    
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            self.article_links['culture']= [ link.get_attribute('href')  for link in self.driver.find_elements(By.CLASS_NAME, 'stretched-link')]
            
    def politiqueArticles(self) :
        self.driver.get('https://www.hespress.com/politique')
        while(len(self.article_links['politics']) <1000 ):    
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            self.article_links['politics']= [ link.get_attribute('href')  for link in self.driver.find_elements(By.CLASS_NAME, 'stretched-link')]
            
    def sportArticles(self) :
        self.driver.get('https://www.hespress.com/sport')
        while(len(self.article_links['sport']) <1000 ):    
            self.driver.execute_script("window.scrollTo(0, {});".format(100) ) 
            wait = WebDriverWait(self.driver, 10)  
            element = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn.show_more')))
            element.click()       
            self.article_links['sport']= [ link.get_attribute('href')  for div in  self.driver.find_elements(By.CLASS_NAME, 'card-img-top') for link in div.find_elements(By.CLASS_NAME, 'stretched-link')]
    

In [106]:
sd=scrapingData(webdriver.Chrome() ,'c:/chromedriver')

In [107]:
sd.sportArticles()

In [109]:
sd.economicArticles()

In [110]:
sd.cultureArticles()

In [111]:
sd.politiqueArticles()

In [108]:
len(sd.article_links['sport'])

1008

In [112]:
len(sd.article_links['culture'])

1104

In [113]:
len(sd.article_links['economy'])

1164

In [114]:
len(sd.article_links['politics'])

1032

In [119]:
sd.scrapingArticles()

In [122]:
len(sd.titles)

7984

# Data storing

In [123]:
data = {
    'titles': [],
    'categories': [],
    'contents': [],
    'keyWords': [],
    'sources': [],
    'date': []
}

for title, category, content, keywords, source, date in zip(sd.titles, sd.categories,sd.contents, sd.keyWords, sd.sources,sd.date):
        data['titles'].append(title.strip())
        data['categories'].append(category.strip())
        data['contents'].append(content.strip())
        data['keyWords'].append([keyword.strip() for keyword in keywords if keyword.strip()!=''])
        data['sources'].append(source.strip())
        data['date'].append(date.strip())

In [66]:
len(data['titles'])

3994

## MongoDB DataBase

In [69]:
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['ArticlesDataBase']
collection = db['articles']

articles = []
for title, category, content, keywords, source, date in zip(data['titles'], data['categories'],data['contents'], data['keyWords'], data['sources'],data['date']):
    article = {
        'title': title,
        'category': category,
        'content':content,
        'keywords': keywords,
        'source': source,
        'date': date
    }
    articles.append(article)

collection.insert_many(articles)

InsertManyResult([ObjectId('661f026aa45ea4850f04e048'), ObjectId('661f026aa45ea4850f04e049'), ObjectId('661f026aa45ea4850f04e04a'), ObjectId('661f026aa45ea4850f04e04b'), ObjectId('661f026aa45ea4850f04e04c'), ObjectId('661f026aa45ea4850f04e04d'), ObjectId('661f026aa45ea4850f04e04e'), ObjectId('661f026aa45ea4850f04e04f'), ObjectId('661f026aa45ea4850f04e050'), ObjectId('661f026aa45ea4850f04e051'), ObjectId('661f026aa45ea4850f04e052'), ObjectId('661f026aa45ea4850f04e053'), ObjectId('661f026aa45ea4850f04e054'), ObjectId('661f026aa45ea4850f04e055'), ObjectId('661f026aa45ea4850f04e056'), ObjectId('661f026aa45ea4850f04e057'), ObjectId('661f026aa45ea4850f04e058'), ObjectId('661f026aa45ea4850f04e059'), ObjectId('661f026aa45ea4850f04e05a'), ObjectId('661f026aa45ea4850f04e05b'), ObjectId('661f026aa45ea4850f04e05c'), ObjectId('661f026aa45ea4850f04e05d'), ObjectId('661f026aa45ea4850f04e05e'), ObjectId('661f026aa45ea4850f04e05f'), ObjectId('661f026aa45ea4850f04e060'), ObjectId('661f026aa45ea4850f04e0

## Json File

In [124]:
import json

with open('Data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False)
