In [55]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Data Scraping

In [62]:
class scrapingData:
    
    def __init__(self, driver, chrome_driver_path):
        self.driver = driver
        self.chrome_driver_path = chrome_driver_path
        self.article_links=[]
        self.titles=[]
        self.contents=[]
        self.categories=[]
        self.keyWords=[]
        self.sources=[]
        
    def scrapingLinks(self):
        self.scroll_down()
        self.article_links=[ link.get_attribute('href')  for link in self.driver.find_elements(By.CLASS_NAME, 'stretched-link')]
        
    def scrapingArticles(self):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        for link in self.article_links :
            response = requests.get(link, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            self.titles.append(soup.find('h1', class_='post-title').text)
            
            self.contents.append( soup.find('div', class_ = 'article-content').text)
            
            ol = soup.find('ol', class_ = 'breadcrumb')
            li = ol.find_all('li', class_='breadcrumb-item')[1].text
            self.categories.append(li)

            self.keyWords.append([i.text for i in soup.find_all('a', class_= 'tag_post_tag')])

            self.sources.append(soup.find('span', class_= 'author').text)
            
        
    def scroll_down(self) :
        self.driver.get('https://en.hespress.com/all')
        last_height = self.driver.execute_script("return window.innerHeight")
        for i in range(500):            
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            new_height = self.driver.execute_script("return window.innerHeight")
            last_height = new_height


In [63]:
sd=scrapingData(webdriver.Chrome() ,'c:/chromedriver')
sd.scrapingLinks()
sd.scrapingArticles()

#### Number of articles scraped :

In [77]:
print(len(sd.titles))

2544


# Data cleaning

### Removing duplicates, None values | Normalizing Data

In [74]:
cleaned_data = {
    'titles': [],
    'categories': [],
    'contents': [],
    'keyWords': [],
    'sources': []
}

for title, category, content, keywords, source in zip(sd.titles, sd.categories,sd.contents, sd.keyWords, sd.sources):
    if (title not in cleaned_data['titles']) and (title.strip() != '' and category.strip() != '' and content.strip() != '' and source.strip() != ''):
        cleaned_data['titles'].append(title.lower().strip())
        cleaned_data['categories'].append(category.lower().strip())
        cleaned_data['contents'].append(content.lower().strip())
        cleaned_data['keyWords'].append([keyword.lower().strip() for keyword in keywords if keyword.strip()!=''])
        cleaned_data['sources'].append(source.lower().strip())

#### Number of articles (after cleaning)

In [78]:
len(cleaned_data['titles'])

1879

# Data storing

### MongoDB DataBase

In [81]:
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['ArticlesDataBase']
collection = db['articles']

articles = []
for title, category, content, keywords, source in zip(cleaned_data['titles'], cleaned_data['categories'],cleaned_data['contents'], cleaned_data['keyWords'], cleaned_data['sources']):
    article = {
        'title': title,
        'category': category,
        'content':content,
        'keywords': keywords,
        'source': source
    }
    articles.append(article)

collection.insert_many(articles)


InsertManyResult([ObjectId('66140cfca20ed84a5b4a2ec6'), ObjectId('66140cfca20ed84a5b4a2ec7'), ObjectId('66140cfca20ed84a5b4a2ec8'), ObjectId('66140cfca20ed84a5b4a2ec9'), ObjectId('66140cfca20ed84a5b4a2eca'), ObjectId('66140cfca20ed84a5b4a2ecb'), ObjectId('66140cfca20ed84a5b4a2ecc'), ObjectId('66140cfca20ed84a5b4a2ecd'), ObjectId('66140cfca20ed84a5b4a2ece'), ObjectId('66140cfca20ed84a5b4a2ecf'), ObjectId('66140cfca20ed84a5b4a2ed0'), ObjectId('66140cfca20ed84a5b4a2ed1'), ObjectId('66140cfca20ed84a5b4a2ed2'), ObjectId('66140cfca20ed84a5b4a2ed3'), ObjectId('66140cfca20ed84a5b4a2ed4'), ObjectId('66140cfca20ed84a5b4a2ed5'), ObjectId('66140cfca20ed84a5b4a2ed6'), ObjectId('66140cfca20ed84a5b4a2ed7'), ObjectId('66140cfca20ed84a5b4a2ed8'), ObjectId('66140cfca20ed84a5b4a2ed9'), ObjectId('66140cfca20ed84a5b4a2eda'), ObjectId('66140cfca20ed84a5b4a2edb'), ObjectId('66140cfca20ed84a5b4a2edc'), ObjectId('66140cfca20ed84a5b4a2edd'), ObjectId('66140cfca20ed84a5b4a2ede'), ObjectId('66140cfca20ed84a5b4a2e

### Json File

In [82]:
import json

json_file_path = 'Data.json'

with open(json_file_path, 'w') as json_file:
    json.dump(cleaned_data, json_file, indent=5)
