# In [None]:
import requests
import urllib.request
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import json
import pandas as pd

# Input CSV files: one is read through its 'url' column, the other through
# its 'Domain' column (see main()).
PHISH_CSV_URLS = r"C:\Users\ohad\Downloads\verified_online.csv"
REAL_CSV_URLS = r"C:\Users\ohad\Downloads\majestic_million.csv"

# Output files that receive the JSON-serialised crawl results.
REAL_URL_JSON_FILE = r"C:\Users\ohad\Downloads\my_json_real_url"
PHISH_URL_JSON_FILE = r"C:\Users\ohad\Downloads\my_json_phish_url"

# Scheme and separator used to turn a bare domain into a URL.
# NOTE: despite its name, HTTP holds the "https" scheme.
HTTP = "https"
HTTP_SEFIX = "://"
# Upper bound on how many URLs of each list are processed.
MAX_NUM_OF_URLS = 20

def read_links_from_csv(csv_file, col_name):
    """
    Returns every value of column `col_name` in `csv_file` as a Python list.

    Only the requested column is loaded, and whitespace that follows a
    delimiter is skipped while parsing.
    """
    frame = pd.read_csv(csv_file, usecols=[col_name], skipinitialspace=True)
    wanted_column = frame[col_name]
    return wanted_column.to_list()


def is_valid(url):
    """
    Tells whether `url` carries both a scheme and a network location.
    """
    parts = urlparse(url)
    if not parts.netloc:
        # no host part at all
        return False
    return bool(parts.scheme)


def get_all_website_links(url, curr_internal, curr_external, timeout=10):
    """
    Collects the links found in the `<a>` tags of the page at `url`.

    Each link is resolved against `url`, stripped of its query string and
    fragment, and appended (without duplicates) to one of the caller's lists:
    `curr_internal` when its host is the host of `url` or a sub-domain of it,
    `curr_external` otherwise. Both lists are modified in place and nothing
    is returned.

    params:
        url (str): page to download and scan.
        curr_internal (list): receives links that stay on the same website.
        curr_external (list): receives links that point to other websites.
        timeout (float): seconds to wait for the server before giving up.

    Any exception raised by `requests.get` (connection error, timeout, ...)
    propagates to the caller.
    """
    # host of the URL without the protocol, lower-cased for comparison
    domain_name = urlparse(url).netloc.lower()
    # a timeout keeps one unresponsive server from blocking the whole run
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # missing or empty href
            continue
        # join the URL if it's relative (not absolute link)
        parsed_href = urlparse(urljoin(url, href))
        if not (parsed_href.scheme and parsed_href.netloc):
            # Links such as "mailto:" or "javascript:" have no host. They must
            # be rejected BEFORE the URL is rebuilt below, because gluing
            # "://" in front of their path would fabricate a host.
            continue
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + HTTP_SEFIX + parsed_href.netloc + parsed_href.path
        if href in curr_internal:
            # already recorded
            continue
        # Compare hosts rather than searching the whole URL, so that a foreign
        # URL merely mentioning our domain in its path is not taken as internal.
        link_host = parsed_href.netloc.lower()
        if link_host == domain_name or link_host.endswith("." + domain_name):
            curr_internal.append(href)
        elif href not in curr_external:
            # external link seen for the first time
            curr_external.append(href)


def crawl(url, curr_internal, curr_external):
    """
    Extracts the links of a single web page.

    Delegates to `get_all_website_links`, which appends the links it finds
    to the caller-supplied lists.
    params:
        url (str): page to scan.
        curr_internal (list): filled in place with links to the same website.
        curr_external (list): filled in place with links to other websites.

    Nothing is returned; the results are delivered through the two lists.
    """
    get_all_website_links(url, curr_internal, curr_external)
        

def check_if_website_is_online(url, timeout=10):
    """
    Opens `url` and reports whether the response status code is 200.

    params:
        url (str): address to open.
        timeout (float): seconds to wait for the server before giving up.

    Returns True when the status code equals 200 and False otherwise.
    Exceptions raised by `urllib.request.urlopen` (malformed URL, HTTP error
    status, unreachable host, timeout, ...) are not caught here and propagate
    to the caller.
    """
    # The context manager closes the response, and with it the underlying
    # connection, as soon as the status code has been read.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.getcode() == 200


def read_data(list_of_urls):
    """
    Crawls every URL in `list_of_urls` and gathers its links.

    Returns a tuple `(external_urls, internal_urls)`. Each is a list holding
    one list of links per input URL, in the same order as `list_of_urls`, so
    the results can be zipped safely with the input. A URL that is offline or
    whose crawl fails contributes a pair of empty lists.
    """
    internal_urls = []
    external_urls = []
    for cur_url in list_of_urls:
        curr_internal = []
        curr_external = []
        try:
            if check_if_website_is_online(cur_url):
                crawl(cur_url, curr_internal, curr_external)
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still
            # stop the run.
            print(f"{cur_url} don't work")
            # drop anything collected before the failure
            curr_internal, curr_external = [], []
        # Always append, even when nothing was collected: this keeps position
        # i of the results tied to list_of_urls[i].
        internal_urls.append(curr_internal)
        external_urls.append(curr_external)
    return external_urls, internal_urls


def remove_empty_list_of_urls_from_all_lists_and_save_to_file(all_urls, internal_urls_list, external_urls_list, json_file_name):
    """
    Keeps the entries whose internal AND external link lists are both
    non-empty, then writes the survivors to `json_file_name`.

    The three input lists are walked in lockstep. The output file holds three
    JSON documents separated by newlines: the kept URLs, their external link
    lists, and their internal link lists, in that order.
    """
    kept_rows = [
        (site, outbound, inbound)
        for outbound, inbound, site in zip(external_urls_list, internal_urls_list, all_urls)
        if outbound != [] and inbound != []
    ]
    kept_sites = [row[0] for row in kept_rows]
    kept_external = [row[1] for row in kept_rows]
    kept_internal = [row[2] for row in kept_rows]

    with open(json_file_name, "w") as out_file:
        for position, part in enumerate((kept_sites, kept_external, kept_internal)):
            if position:
                # newline between documents, none after the last one
                out_file.write('\n')
            out_file.write(json.dumps(part))



def main():
    """
    Runs the whole pipeline: loads both URL lists from their CSV files,
    crawls at most MAX_NUM_OF_URLS entries of each, and stores the non-empty
    results in the two JSON output files.
    """
    # The 'url' column is used as-is; the 'Domain' column gets a scheme prefix.
    phishing_candidates = read_links_from_csv(PHISH_CSV_URLS, 'url')
    legit_domains = read_links_from_csv(REAL_CSV_URLS, 'Domain')
    scheme_prefix = HTTP + HTTP_SEFIX
    legit_candidates = [scheme_prefix + domain for domain in legit_domains]

    legit_sample = legit_candidates[:MAX_NUM_OF_URLS]
    phish_sample = phishing_candidates[:MAX_NUM_OF_URLS]

    # read_data returns (external, internal), in that order
    legit_external, legit_internal = read_data(legit_sample)
    phish_external, phish_internal = read_data(phish_sample)

    remove_empty_list_of_urls_from_all_lists_and_save_to_file(
        phish_sample, phish_internal, phish_external, PHISH_URL_JSON_FILE)
    remove_empty_list_of_urls_from_all_lists_and_save_to_file(
        legit_sample, legit_internal, legit_external, REAL_URL_JSON_FILE)


# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()