In [None]:
import requests
from bs4 import BeautifulSoup
from models import bart as model
import sys
import dns.resolver

def is_domain_active(url):
    """Return True when the host named in *url* has a DNS ``A`` record.

    The host is extracted with ``urllib.parse.urlsplit`` so that ports,
    ``user:pass@`` prefixes and ``//`` sequences appearing later in the
    path or query string do not end up in the name sent to the resolver.

    Args:
        url: Absolute URL, or a bare ``host/path`` string without a scheme.

    Returns:
        True if the ``A`` lookup succeeds. False if no host can be
        extracted or if the lookup fails for any reason.
    """
    from urllib.parse import urlsplit

    try:
        parts = urlsplit(url)
        if not parts.netloc:
            # Without a leading "//" urlsplit treats the host as part of
            # the path; re-parse so scheme-less input still yields a host.
            parts = urlsplit("//" + url)
        domain = parts.hostname
    except ValueError:
        # urlsplit rejects malformed authorities (e.g. unbalanced brackets).
        return False

    if not domain:
        return False

    try:
        dns.resolver.resolve(domain, 'A')
        return True
    except (dns.resolver.NXDOMAIN, dns.resolver.Timeout, dns.resolver.NoAnswer):
        return False
    except Exception as e:
        # Unexpected resolver failure: report it, but still treat the
        # domain as inactive rather than aborting the caller.
        print(e)
        return False
    
def check_redirect(url):
    """Report whether *url* ends up on a different host after redirects.

    Issues a HEAD request that follows redirects, with a 10 second timeout.

    Args:
        url: URL to probe.

    Returns:
        The final URL (``response.url``) when at least one redirect occurred
        and the final host differs from the requested host; otherwise False.

    Exceptions raised by ``requests.head`` are not caught here.
    """
    response = requests.head(url, allow_redirects=True, timeout=10)
    if not response.history:
        return False

    original_host = requests.utils.urlparse(url).netloc.lower()
    # response.history only contains the intermediate responses that issued
    # a redirect; for a single hop its last entry is the original URL itself.
    # The destination actually reached is response.url, so compare that.
    final_host = requests.utils.urlparse(response.url).netloc.lower()

    if original_host == final_host:
        return False
    return response.url

def get_html(url, debug=False, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Page to fetch with a GET request.
        debug: When true, also write the HTML to ``temp.html`` in the
            current working directory.
        timeout: Seconds passed to ``requests.get`` as its timeout, so an
            unresponsive server cannot block the call forever.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.HTTPError: via ``raise_for_status`` for bad
            responses.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raises an HTTPError for bad responses
    html = response.text

    if debug:
        # Explicit UTF-8 so the dump does not depend on the locale encoding.
        with open('temp.html', 'w', encoding='utf-8') as file:
            file.write(html)

    return html

def extract_text_from_html(html, debug=False):
    """Return the text content of *html* with all link text removed.

    Args:
        html: HTML markup to parse with BeautifulSoup's ``html.parser``.
        debug: When true, also write the extracted text to ``temp.txt`` in
            the current working directory.

    Returns:
        The remaining text, with a newline between text fragments.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Ignore link text: drop every <a> element together with its contents.
    for anchor in soup.find_all('a'):
        anchor.decompose()

    text = soup.get_text(separator='\n')

    if debug:
        # Explicit UTF-8 so scraped text with non-ASCII characters can be
        # written regardless of the locale encoding.
        with open('temp.txt', 'w', encoding='utf-8') as file:
            file.write(text)

    return text

def get_summary_from_website(url, debug=False):
    """Fetch *url* and return a summary of its text, or a status message.

    Steps: check the domain with ``is_domain_active``, detect a cross-host
    redirect with ``check_redirect``, download the page with ``get_html``,
    strip it to text with ``extract_text_from_html`` and pass that text to
    ``model.summarize``.

    Args:
        url: Website address to summarise.
        debug: Forwarded to ``get_html`` and ``extract_text_from_html``.

    Returns:
        The summary string, prefixed with ``(Redirected to ...)`` and a
        newline when a redirect was detected. On failure, a parenthesised
        status message, or ``str(e)`` for an unexpected exception. This
        function does not raise for exceptions derived from ``Exception``.
    """
    try:
        if not is_domain_active(url):
            return "(Domain inactive)"

        summary = ''
        redirect_url = check_redirect(url)
        if redirect_url:
            summary = f"(Redirected to {redirect_url})\n"

        html = get_html(url, debug)
        if html:
            txt = extract_text_from_html(html, debug)
            summary += model.summarize(txt)
        return summary

    except requests.exceptions.Timeout:
        return "(Website not responding)"

    except requests.exceptions.HTTPError as e:
        code = e.response.status_code
        if code == 404:
            return "(Website not found)"
        if code in (403, 406):
            return "(Website scraping blocked)"
        if 502 <= code < 600:
            return "(Website not working)"

        # Any other HTTP status: log it and surface the error text.
        print(e, file=sys.stderr)
        return f"(Could not scrape website: {e})"

    except Exception as e:
        # Boundary handler: one failing site must not abort a batch run.
        print(e, file=sys.stderr)
        return str(e)


In [None]:
import csv

def load_urls_from_csv(file_path, column_name, encoding="utf-8-sig"):
    """Read one column of a CSV file that has a header row.

    Args:
        file_path: Path of the CSV file.
        column_name: Header of the column whose values are returned.
        encoding: Text encoding used to open the file. The default
            ``utf-8-sig`` reads plain UTF-8 and also strips a leading
            byte-order mark, which would otherwise become part of the
            first header name and make that column impossible to find.

    Returns:
        A list with one entry per data row, in file order. The list is
        empty when the column does not exist.
    """
    # newline="" lets the csv module do its own line-ending handling, as
    # the csv documentation requires for file objects.
    with open(file_path, mode="r", newline="", encoding=encoding) as file:
        csv_reader = csv.DictReader(file)
        return [row[column_name] for row in csv_reader if column_name in row]

def get_summaries_from_websites(urls, debug=False):
    """Summarise each URL in *urls*, keeping results aligned with the input.

    Args:
        urls: Iterable of URL strings; falsy entries are not fetched.
        debug: When true, print ``url -> summary`` for every fetched URL.

    Returns:
        A list with one entry per input item: the result of
        ``get_summary_from_website`` for a non-empty URL, or ``''`` for a
        falsy one.
    """
    collected = []
    for url in urls:
        # Blank entries keep their slot so indices still match the input.
        if not url:
            collected.append('')
            continue

        result = get_summary_from_website(url)
        if debug:
            print(url + " -> " + result)
        collected.append(result)

    return collected

# Summarise every homepage listed in the organizations CSV, printing each
# "url -> summary" pair as it is produced.
urls = load_urls_from_csv('data/organizations.csv', 'homepage_url')
get_summaries_from_websites(urls, debug=True)