# In [130]:
import os, csv, re, time
from datetime import datetime
import ujson as json

import requests
from bs4 import BeautifulSoup
import numpy as np

def get_page(url, delay=5, timeout=30):
    '''Get the page content

    Waits ``delay`` seconds before every request so consecutive calls do not
    hit the server back to back, then issues a GET with a browser-like
    user-agent header.

    Args:
        url (str): url to scrape
        delay (float): seconds to sleep before sending the request
        timeout (float): seconds to wait for the server before giving up

    Returns:
        response (obj): object of web content when the status code is 200,
            otherwise None (a message is printed in that case)
    '''
    time.sleep(delay)
    headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0'}
    try:
        # Without a timeout a stalled connection would block the scraper forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        # Network failures are reported the same way as bad status codes:
        # print a message and hand None back to the caller.
        print(f'Something went wrong when scraping {url}: {error}')
        return None

    if response.status_code == 200:
        return response

    print(f'Something went wrong when scraping {url}')
    return None

def copy_content(response):
    '''Dump the body of a fetched page to ``check.html`` for inspection

    The file is opened for binary writing relative to the current working
    directory and is overwritten on every call.

    Args:
        response (obj): object of web content; its ``content`` attribute
            is what gets written
    '''
    target = 'check.html'
    with open(target, mode='wb') as html_dump:
        raw_body = response.content
        html_dump.write(raw_body)
        
def save_data(news_title, news_contents):
    '''Save news_title and news_contents in data file

    Appends one row to ``data/train.csv``. The ``data`` directory is created
    when missing, and the header row is written whenever the file does not
    exist yet or is still empty.

    Args:
        news_title (str): news title
        news_contents (str): news content
    '''
    file_name = 'train.csv'
    file = os.path.join('data', file_name)
    # The original crashed with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(file), exist_ok=True)

    # A missing *or* zero-byte file still needs its header row.
    needs_header = not os.path.exists(file) or os.path.getsize(file) == 0

    fieldnames = ['Title', 'Contents']
    # Explicit UTF-8 so non-ASCII titles do not depend on the platform's
    # default encoding. NOTE(review): an existing train.csv written under a
    # different default encoding would end up with mixed encodings — confirm.
    with open(file, mode='a', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        if needs_header:
            csv_writer.writeheader()
        csv_writer.writerow({'Title': news_title, 'Contents': news_contents})

    print(f'{news_title} written in successfully')


# In [162]:
class ScrapeLiberty():
    """Scraper for the breaking-news listings served by news.ltn.com.tw.

    For every entry in ``categories`` the first listing page is requested from
    the ajax endpoint, every article linked from it is downloaded and parsed,
    and one record ``[date, title, content, image_path, category]`` is built
    per article whose title is not already known.

    Args:
        data: optional 2-D array (e.g. ``numpy.ndarray``) of previously
            collected rows. ``data[:, 2]`` is read as the titles to skip.

    Attributes:
        url_base (str): prefix of the listing endpoint
        categories (list): category names appended to ``url_base``
        data: the array passed to the constructor, stored unchanged
        news_titles (set): titles that ``process_page`` treats as duplicates
    """

    def __init__(self, data=None):
        self.url_base = 'https://news.ltn.com.tw/ajax/breakingnews/'
        self.categories = ['entertainment', 'politics', 'sports', 'society']
        self.data = data
        # ``if data`` raises ValueError for arrays with more than one element,
        # so emptiness is tested explicitly.
        if data is not None and len(data) > 0:
            # NOTE(review): column 2 is kept from the original code, but the
            # records built by process_page store the title at index 1 —
            # confirm the column layout of ``data``.
            self.news_titles = set(data[:, 2])
        else:
            self.news_titles = set()

    def scrape(self):
        """Scrape the first listing page of every category.

        Returns:
            list: one ``[date, title, content, image_path, category]`` record
                per newly seen article. The accumulated list is also printed
                after each category.
        """
        data = []
        index = 1  # only the first listing page of each category is requested
        for category in self.categories:
            # url_base already ends with '/', avoid a double slash in the path.
            url = f"{self.url_base.rstrip('/')}/{category}/{index}"
            listing = get_page(url)
            if listing is None:
                # get_page already reported the failure; try the next category.
                continue
            news_list = json.loads(listing.text)['data']

            for news in news_list:
                datum = self.process_page(news['url'], category, news.get('photo_S'))
                if datum:
                    data.append(datum)
            print(data)
        return data

    def process_page(self, url, category, image_url=None):
        """Download one article page and turn it into a record.

        Args:
            url (str): url of the article page
            category (str): category the article was listed under
            image_url (str): url of the article picture, or None to skip
                the picture download

        Returns:
            list: ``[date, title, content, image_path, category]``, or None
                when the page could not be fetched, the article node or its
                title is missing, or the title is already in ``news_titles``.
        """
        response = get_page(url)
        if response is None:
            return None

        soup = BeautifulSoup(response.text, 'lxml')
        article = self.get_article(soup, category)
        if article is None:
            return None

        news_title = self.get_title(article)
        if news_title is None or news_title in self.news_titles:
            return None

        date = self.get_date(article, category)
        news_content = self.get_content(article, category)
        image_path = self.get_image(image_url)

        return [date, news_title, news_content, image_path, category]

    def get_article(self, soup, category):
        """Return the node holding the article, or None when it is absent.

        Entertainment pages are looked up by ``div.content``; every other
        category by ``itemprop="articleBody"``.
        """
        if category in ['entertainment']:
            return soup.find('div', class_="content")
        return soup.find(itemprop="articleBody")

    def get_date(self, article, category):
        """Extract the first date found in the article text.

        Entertainment and sports pages are searched for ``YYYY/MM/DD``,
        the other categories for ``YYYY-MM-DD``.

        Returns:
            datetime.date: the parsed date, or None when no date is found
        """
        if category in ['entertainment', 'sports']:
            pattern, date_format = r'\d{4}/\d{2}/\d{2}', '%Y/%m/%d'
        else:
            pattern, date_format = r'\d{4}-\d{2}-\d{2}', '%Y-%m-%d'

        match = re.search(pattern, article.text)
        if match is None:
            return None
        return datetime.strptime(match.group(), date_format).date()

    def get_title(self, article):
        """Return the text of the article's first ``<h1>``, or None without one."""
        heading = article.h1
        return heading.text if heading is not None else None

    def get_content(self, article, category=None):
        """Join the text of the article's body paragraphs.

        Args:
            article: node returned by ``get_article``
            category (str): category of the article; only 'entertainment'
                selects the entertainment page layout

        Returns:
            str: concatenated paragraph text ('' when the expected
                container is missing)
        """
        if category in ['entertainment']:
            paragraphs = article.find_all('p', class_='')
            # Paragraphs that contain a <span> are left out of the body text.
            return ''.join([p.text for p in paragraphs if not p.span])

        container = article.find('div', class_='text boxTitle boxText')
        if container is None:
            return ''
        paragraphs = container.find_all('p', class_='', recursive=False)
        return ''.join([p.text for p in paragraphs])

    def get_image(self, url):
        """Download a picture into ``data/pictures/liberty``.

        Args:
            url (str): url of the picture; a falsy value skips the download

        Returns:
            str: relative path of the saved file, or None when there is no
                url or the download failed
        """
        if not url:
            return None

        image_name = url.split('/')[-1]
        image_dir = os.path.join('data', 'pictures', 'liberty')
        image_path = os.path.join(image_dir, image_name)

        image = get_page(url)
        if image is None:
            return None

        # The target directory is not guaranteed to exist on a fresh checkout.
        os.makedirs(image_dir, exist_ok=True)
        with open(image_path, 'wb') as f:
            f.write(image.content)
        return image_path
        
# return news_title, news_contents