In [1]:
import concurrent.futures
import csv
import json
import os
import random
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

In [2]:
def initialize_driver():
    """Create a headless, private-mode Firefox WebDriver with a randomly chosen proxy argument.

    Configuration can be overridden through environment variables so the
    notebook does not have to be edited per machine:

    * ``GECKODRIVER_PATH`` - path to the GeckoDriver executable. Falls back to
      the original hard-coded path when unset or empty.
    * ``SCRAPER_PROXIES`` - comma-separated ``user:password@host:port`` entries.
      Falls back to the built-in list when unset or empty.

    Returns:
        The ``webdriver.Firefox`` instance. The caller owns it and must call
        ``quit()`` when done.
    """
    # SECURITY: these entries embed proxy credentials in source. Prefer
    # supplying them through SCRAPER_PROXIES and rotating the ones below.
    default_proxies = [
        "ioqzyjgy:urcueqpitm55@188.68.1.50:5919",
        "ioqzyjgy:urcueqpitm55@154.95.0.93:6346",
        "ioqzyjgy:urcueqpitm55@103.47.52.122:8164",
        "ioqzyjgy:urcueqpitm55@184.174.46.174:5803",
        "ioqzyjgy:urcueqpitm55@45.41.162.165:6802",
        "ioqzyjgy:urcueqpitm55@161.123.93.76:5806",
        "ioqzyjgy:urcueqpitm55@107.175.119.245:6773",
        "ioqzyjgy:urcueqpitm55@192.186.186.162:6204",
        "ioqzyjgy:urcueqpitm55@64.137.57.35:6044",
        "ioqzyjgy:urcueqpitm55@45.41.162.16:6653",
    ]
    configured_proxies = [
        entry.strip()
        for entry in os.environ.get("SCRAPER_PROXIES", "").split(",")
        if entry.strip()
    ]
    proxy = random.choice(configured_proxies or default_proxies)

    firefox_driver_path = (
        os.environ.get("GECKODRIVER_PATH")
        or "/Users/reembeniluz/Downloads/geckodriver"
    )

    options = Options()
    options.add_argument('-headless')
    options.add_argument('-private')
    # NOTE(review): `--proxy-server` is a Chromium command-line switch; Firefox
    # may ignore it, in which case no proxy is actually used. Firefox proxy
    # preferences do not carry credentials, so this is left unchanged — confirm
    # the intended proxy setup.
    options.add_argument(f"--proxy-server={proxy}")

    driver = webdriver.Firefox(service=Service(firefox_driver_path), options=options)
    print("driver was created")
    return driver

In [3]:
def scrape_page(url):
    """Open one listing page in a fresh driver and return the project links found on it.

    Args:
        url: Address of the listing page to load.

    Returns:
        A new list holding whatever ``extract_project_links`` appended for
        this page.

    Raises:
        Any exception raised while navigating or extracting propagates to the
        caller. The driver is shut down first, so a failed page does not leave
        a browser process behind.
    """
    driver = initialize_driver()
    try:
        driver.get(url)
        # Fixed pause before querying the DOM (presumably to let the page render).
        time.sleep(2)
        links = []
        extract_project_links(driver, links)
        return links
    finally:
        driver.quit()

In [4]:
def extract_project_links(driver, links):
    """Append the project hrefs found on the driver's current page to ``links``.

    The in-page script selects anchors matching the CSS selector below and
    skips every href containing ``ref=recommendation-no-result-discoverpage``.
    ``links`` is extended in place; nothing is returned.
    """
    collect_hrefs_js = """
        const elements = document.querySelectorAll('div.relative.self-start a[href].block.img-placeholder.w100p');
        const filteredLinks = [];
        elements.forEach(element => {
            const href = element.getAttribute('href');
            if (!href.includes('ref=recommendation-no-result-discoverpage')) {
                filteredLinks.push(href);
            }
        });
        return filteredLinks;
    """
    links.extend(driver.execute_script(collect_hrefs_js))

In [5]:

def projects_links_list_from_website(url, start_page, end_page, seed_rounds=3, initial_seed=2808845):
    """Scrape listing pages ``start_page``..``end_page`` for several seeds and return every link.

    For each round the ``seed=<initial_seed>`` text in ``url`` is replaced by
    the round's seed (``initial_seed``, ``initial_seed + 1``, ...), and
    ``page=1`` is replaced by each requested page number. The pages of one
    round are scraped concurrently through ``scrape_page``.

    Args:
        url: Template URL containing ``page=1`` and ``seed=<initial_seed>``.
        start_page: First page number to scrape (inclusive).
        end_page: Last page number to scrape (inclusive).
        seed_rounds: Number of consecutive seeds to scrape. The default of 3
            matches the number of rounds the previous implementation ran for
            ``start_page=1``.
        initial_seed: Seed value present in ``url``.

    Returns:
        A list with the links of all rounds, in completion order. Duplicates
        are not removed. The list is empty when there is nothing to scrape.

    Raises:
        Whatever ``scrape_page`` raises for any page.
    """
    # Accumulate across rounds; resetting this inside the loop would discard
    # every round but the last.
    links = []

    # The number of rounds is independent of start_page, so starting at a
    # later page still scrapes every requested page for every seed.
    for round_index in range(seed_rounds):
        seeded_url = url.replace(f'seed={initial_seed}', f'seed={initial_seed + round_index}')
        page_urls = [
            seeded_url.replace('page=1', f'page={page}')
            for page in range(start_page, end_page + 1)
        ]

        # Leaving the `with` block waits for all submitted work, so each round
        # finishes before the next one starts.
        with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor:
            futures = [executor.submit(scrape_page, page_url) for page_url in page_urls]
            for future in concurrent.futures.as_completed(futures):
                links.extend(future.result())

    return links


In [6]:
def scrape_project_data(link):
    """Load one project page in a fresh driver and return its fields as a dict.

    Most fields come from the JSON stored in the ``data-initial`` attribute of
    the ``react-project-header`` element. Image and video counts, the
    days-left text and the risks paragraph length are read from the page DOM.

    Args:
        link: URL of the project page.

    Returns:
        A dict keyed by the column names used for the CSV export.
        ``"Parent Category"`` is ``'none'`` when the category has no usable
        parent, and ``"Risk Desc Count"`` is ``0`` when the risks paragraph
        cannot be read.

    Raises:
        Any exception from navigation, the two required element lookups, JSON
        parsing or a missing required key propagates to the caller. The driver
        is shut down first, so a failed project does not leave a browser
        process behind.
    """
    driver = initialize_driver()
    try:
        driver.get(link)
        # Fixed pause before querying the DOM (presumably to let the page render).
        time.sleep(5)

        header_element = driver.find_element(By.XPATH, '//*[@id="react-project-header"]')
        days_left_element = driver.find_element(By.XPATH, '/html/body/main/div/div/div[1]/div/div[1]/div[1]/div[2]/div[2]/div[3]/div/div/span[1]')

        project = json.loads(header_element.get_attribute("data-initial"))['project']
        category = project['category']

        # The parent category is optional: it can be absent or null.
        try:
            parent_category = category['parentCategory']['name']
        except (KeyError, TypeError):
            parent_category = 'none'

        video_count = len(driver.find_elements(By.TAG_NAME, "video"))
        image_count = len(driver.find_elements(By.TAG_NAME, "img"))

        # The risks section is best-effort. `Exception` (not a bare except)
        # keeps KeyboardInterrupt/SystemExit from being swallowed.
        try:
            risk_field = driver.find_element(By.XPATH, '//*[@id="risks-and-challenges"]/p')
            risk_desc_length = len(risk_field.text)
        except Exception:
            risk_desc_length = 0

        return {
            "Project Link": link,
            "Project Name": project['name'],
            "Currency": project['currency'],
            "Location": project['location']['displayableName'],
            "Parent Category": parent_category,
            "Category Name": category['name'],
            "Is Project We Love": project['isProjectWeLove'],
            "Percent Funded": project['percentFunded'],
            "Goal Amount": project['goal']['amount'],
            "Pledged Amount": project['pledged']['amount'],
            "Duration": project['duration'],
            "Description Length": len(project['description']),
            "Image Count": image_count,
            "Video Count": video_count,
            "Risk Desc Count": risk_desc_length,
            "Days Left": days_left_element.text,
        }
    finally:
        driver.quit()

In [7]:
def projects_data_from_links(links):
    """Scrape every project link concurrently and return the collected records.

    Each link is handed to ``scrape_project_data``, which creates and shuts
    down its own driver. No extra driver is started here, because a second
    one per link would never be used.

    Args:
        links: Iterable of project page URLs.

    Returns:
        A list of the dicts returned by ``scrape_project_data``, in completion
        order (not input order). Empty when ``links`` is empty.

    Raises:
        Whatever ``scrape_project_data`` raises for any link.
    """
    project_data_dic = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor:
        futures = [executor.submit(scrape_project_data, link) for link in links]

        # Number finished projects from 1 for the progress message.
        for counter, future in enumerate(concurrent.futures.as_completed(futures), start=1):
            project_data_dic.append(future.result())
            print('finished with project:', counter)

    return project_data_dic


In [8]:
def create_csv_table(data, filename):
    """Write scraped project records to ``filename`` as a CSV file with a header row.

    Args:
        data: Iterable of dicts keyed by the column names listed below. Keys
            missing from a row are written as empty cells. Keys that are not
            columns make ``csv.DictWriter`` raise ``ValueError``.
        filename: Path of the CSV file to create or overwrite.

    The file is written as UTF-8 regardless of the platform default encoding,
    so project names and locations with non-ASCII characters are stored
    consistently.
    """
    fieldnames = [
        "Project Name",
        "Project Link",
        "Parent Category",
        "Category Name",
        "Location",
        "Currency",
        "Goal Amount",
        "Pledged Amount",
        "Percent Funded",
        "Duration",
        "Days Left",
        "Description Length",
        "Risk Desc Count",
        "Is Project We Love",
        "Image Count",
        "Video Count"
    ]
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

In [None]:
# Template listing URL: projects_links_list_from_website rewrites its
# `page=` and `seed=` query values for the pages it visits.
url = "https://www.kickstarter.com/discover/advanced?woe_id=0&sort=magic&ref=discovery_overlay&seed=2808845&page=1"
# Collect project links from listing pages 1 through 200 (inclusive).
links = projects_links_list_from_website(url,1,200)
# Visit every collected link and build one record (dict) per project.
project_data_dic = projects_data_from_links(links)
# Write the records to a CSV file (relative path, so it is created in the current working directory).
create_csv_table(project_data_dic,'sample_projects_table_data.csv')
print('table was created successfuly')