In [17]:
import json
import os
import time
from random import randint
from urllib.parse import parse_qs, quote_plus, unquote, urlparse

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [19]:
class SearchEngine:
    """Scrape result URLs from DuckDuckGo's HTML endpoint with Selenium.

    Every method is static, so the class acts as a namespace; calling the
    methods through an instance works as well.
    """

    @staticmethod
    def _build_search_url(query):
        """Return the DuckDuckGo HTML search URL for ``query``.

        Whitespace runs are collapsed to a single separator and the query is
        percent-encoded, so characters such as '&', '#', '+' or '?' inside the
        query cannot break or truncate the ``q`` parameter.
        """
        normalized = ' '.join(query.split())
        return 'https://html.duckduckgo.com/html/?q=' + quote_plus(normalized)

    @staticmethod
    def _resolve_result_url(href):
        """Turn a result anchor's ``href`` into the destination URL.

        Hrefs that start with ``//duckduckgo.com/l/?uddg=`` are redirect links
        whose ``uddg`` query parameter carries the real target; any other href
        is returned unchanged.
        """
        if not href.startswith('//duckduckgo.com/l/?uddg='):
            return href
        full_url = 'https:' + href
        targets = parse_qs(urlparse(full_url).query).get('uddg')
        # parse_qs already percent-decodes the value. Decoding a second time
        # would corrupt targets that legitimately contain escapes (e.g. %2520).
        return targets[0] if targets else full_url

    @staticmethod
    def search(query, sleep=True):
        """Run ``query`` on DuckDuckGo and return the scraped result URLs.

        Args:
            query: The search text.
            sleep: When true, wait a random 10-100 seconds before issuing the
                request.

        Returns:
            The list produced by ``scrape_search_result`` (at most 10 URLs).
        """
        if sleep:
            time.sleep(randint(10, 100))
        url = SearchEngine._build_search_url(query)

        driver = webdriver.Safari()
        try:
            # Park the browser window off-screen while it works.
            driver.set_window_position(-10000, 0)
            driver.get(url)
            time.sleep(2)
            return SearchEngine.scrape_search_result(driver)
        finally:
            # Always release the browser session, even when loading or
            # scraping raises.
            driver.quit()

    @staticmethod
    def scrape_search_result(driver, max_results=10, max_pages=10):
        """Collect unique result URLs from the page loaded in ``driver``.

        Follows the "Next" link until ``max_results`` URLs were gathered,
        ``max_pages`` page turns were made, or no "Next" link is present.

        Args:
            driver: Selenium WebDriver already showing a results page.
            max_results: Maximum number of URLs to return.
            max_pages: Maximum number of "Next" clicks.

        Returns:
            A list of up to ``max_results`` distinct URLs in page order.
        """
        results = []
        pages_visited = 0

        while len(results) < max_results and pages_visited < max_pages:
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            for div in soup.find_all('div', class_='result'):
                a = div.find('a', href=True)
                if not a:
                    continue
                actual_url = SearchEngine._resolve_result_url(a['href'])
                if actual_url not in results:
                    results.append(actual_url)
                if len(results) >= max_results:
                    break

            if len(results) >= max_results:
                break

            # find_elements returns an empty list when nothing matches, whereas
            # find_element raises NoSuchElementException on the last page.
            next_buttons = driver.find_elements(By.XPATH, '//a[text()="Next"]')
            if not next_buttons:
                break
            next_buttons[0].click()
            pages_visited += 1
            time.sleep(2)

        return results[:max_results]

def read_queries(filename):
    """Return the lines of the UTF-8 text file ``filename`` as a list.

    Line terminators are stripped; all other whitespace is left untouched.
    """
    with open(filename, encoding='utf-8') as handle:
        contents = handle.read()
    return contents.splitlines()

def find_matches(ddg, google):
    """Pair up the ranks of URLs that appear in both result sets.

    Args:
        ddg: Mapping of query -> ordered list of URLs.
        google: Mapping of query -> ordered list of URLs; it is looked up with
            the keys of ``ddg``.

    Returns:
        One list per query of ``ddg`` (in iteration order). Each entry is a
        ``[google_rank, ddg_rank]`` pair of 1-based positions for a URL found
        in both lists.
    """
    per_query = []
    for query, ddg_urls in ddg.items():
        # google[query] is only touched while iterating ddg_urls, so a query
        # with no ddg URLs never triggers a lookup in ``google``.
        shared = [
            [google[query].index(link) + 1, ddg_rank]
            for ddg_rank, link in enumerate(ddg_urls, start=1)
            if link in google[query]
        ]
        per_query.append(shared)
    return per_query

def spearman_coefficient(data, max_results=10):
    """Compute per-query overlap statistics and Spearman coefficients.

    Args:
        data: Sized sequence with one entry per query. Each entry is a list of
            rank pairs; only the difference ``pair[0] - pair[1]`` is used.
        max_results: Number of results per query used as the denominator of
            the overlap percentage (defaults to 10).

    Returns:
        A 6-tuple ``(overlap_list, overlap_percent_list,
        spearman_coefficient_list, avg_overlap, avg_overlap_percent,
        avg_spearman_coefficient)``. For empty ``data`` the three lists are
        empty and the three averages are ``0.0``.
    """
    overlap_list = []
    overlap_percent_list = []
    spearman_coefficient_list = []

    for matches in data:
        n = len(matches)
        overlap_list.append(n)
        overlap_percent_list.append(round(n / max_results * 100.0, 1))

        if n == 0:
            # No shared results: the coefficient is reported as 0.
            rho = 0
        elif n == 1:
            # The formula below divides by n * (n^2 - 1), which is 0 for n == 1;
            # a single match scores 1 when both ranks agree and 0 otherwise.
            rho = 1 if matches[0][0] == matches[0][1] else 0
        else:
            d2_sum = sum((match[0] - match[1]) ** 2 for match in matches)
            rho = 1 - (6 * d2_sum) / (n * (n ** 2 - 1))
        spearman_coefficient_list.append(rho)

    query_count = len(data)
    if query_count == 0:
        # Nothing to average; avoid dividing by zero.
        return overlap_list, overlap_percent_list, spearman_coefficient_list, 0.0, 0.0, 0.0

    avg_overlap = sum(overlap_list) / query_count
    avg_overlap_percent = sum(overlap_percent_list) / query_count
    avg_spearman_coefficient = sum(spearman_coefficient_list) / query_count
    return overlap_list, overlap_percent_list, spearman_coefficient_list, avg_overlap, avg_overlap_percent, avg_spearman_coefficient

if __name__ == '__main__':
    # Input and output locations, relative to the working directory.
    query_file = "./100QueriesSet4.txt"
    Google_file = "./Google_Result4.json"
    hw1_json = "./hw1.json"
    hw1_csv = "./hw1.csv"

    DuckDuckGo = SearchEngine()
    results = {}

    # Skip blank lines: an empty query would be searched and stored under the
    # key '' and then fail the lookup against the Google results later on.
    queries = [line.rstrip() for line in read_queries(query_file) if line.strip()]
    for idx, query in enumerate(queries):
        print(f"Processing Query {idx + 1}: {query}")
        results[query] = DuckDuckGo.search(query)
        print("Results:")
        for url in results[query]:
            print(url)
        print("-" * 40)
    with open(hw1_json, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    # Read both result sets back inside `with` blocks so the file handles are
    # closed deterministically.
    with open(hw1_json, encoding='utf-8') as f:
        ddg_result = json.load(f)
    with open(Google_file, encoding='utf-8') as f:
        google_result = json.load(f)
    overlap = find_matches(ddg_result, google_result)

    overlap_list, overlap_percent_list, spearman_coefficient_list, avg_overlap, avg_overlap_percent, avg_spearman_coefficient = spearman_coefficient(overlap)

    # One shared format for the header, the per-query rows and the averages.
    row_format = '{:>6}, {:>24}, {:>16}, {:>22}'
    rows = [row_format.format('Query', 'No. Overlapping Results', 'Percent Overlap', 'Spearman Coefficient')]
    for i in range(len(overlap_list)):
        rows.append(row_format.format(
            f'Query {i + 1}',
            overlap_list[i],
            overlap_percent_list[i],
            round(spearman_coefficient_list[i], 4)
        ))
    rows.append(row_format.format(
        'Averages',
        round(avg_overlap, 2),
        round(avg_overlap_percent, 2),
        round(avg_spearman_coefficient, 4)
    ))
    # Rows are newline-separated with no trailing newline after the averages.
    result_str = '\n'.join(rows)
    print(result_str)
    with open(hw1_csv, 'w', encoding='utf-8') as f:
        f.write(result_str)

Processing Query 1: A two dollar bill from 1953 is worth what
Results:
https://www.coinvaluechecker.com/how-much-is-1953-2-dollar-bill-worth/
https://oldcurrencyvalues.com/1953_red_seal_two_dollar/
https://www.vipartfairs.com/1953-2-dollar-bill-value/
https://www.silverrecyclers.com/blog/1953-2-dollar-bill.aspx
https://www.uscurrencyauctions.com/$2-us-currency-value-price-guide.html
https://www.coinvaluelookup.com/1953-2-dollar-bill-value/
https://errorcoins.org/1953-2-dollar-bill-value-a-b-c-mint-mark/
https://nedluddpdx.com/rare-1953-2-dollar-bill-value/
https://www.youtube.com/watch?v=nCUTKX5FIMs
https://treasurepursuits.com/1953-2-dollar-bill-value-whats-it-worth/
----------------------------------------
Processing Query 2: What is franky jonas 's favorite color
Results:
https://facts.net/celebrity/43-facts-about-frankie-jonas/
http://itsajonaslife.weebly.com/facts-about-frankie.html
https://en.wikipedia.org/wiki/Frankie_Jonas
https://healthyceleb.com/frankie-jonas/
https://people.