In [None]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
import time
import json
import csv
import concurrent.futures
import random
import psutil
from selenium.webdriver.common.proxy import Proxy, ProxyType


In [None]:
def initialize_driver(proxy="p.webshare.io:9999",
                      driver_path="/Users/reembeniluz/Downloads/geckodriver"):
    """Start a headless, private-mode Firefox session routed through a proxy.

    The proxy capability is attached to this session's own ``Options`` object
    instead of being written into the shared
    ``webdriver.DesiredCapabilities.FIREFOX`` dict, so creating a driver no
    longer alters process-wide Selenium state. This matters because the
    function is invoked from thread-pool workers elsewhere in this notebook.

    Args:
        proxy: ``host:port`` string used for both HTTP and SSL traffic.
        driver_path: Path to the GeckoDriver executable. The default is a
            machine-specific location; pass your own path on other machines.

    Returns:
        The newly created ``webdriver.Firefox`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    options = Options()
    options.add_argument('-headless')
    options.add_argument("--private")
    # Same capability payload as before, but scoped to this driver only.
    options.set_capability("proxy", {
        "httpProxy": proxy,
        "sslProxy": proxy,
        "proxyType": "MANUAL",
    })
    driver = webdriver.Firefox(service=Service(driver_path), options=options)
    print("Driver was created")
    return driver

In [None]:
def scrape_page(url):
    """Collect the project links shown on a single listing page.

    A fresh driver is created for the page and is always shut down, even when
    navigation or link extraction raises, so a failed page does not leave a
    browser process behind.

    Args:
        url: Address of the listing page to open.

    Returns:
        List of hrefs gathered by ``extract_project_links`` for this page.

    Raises:
        Whatever the driver raises while loading or querying the page; the
        driver is quit before the exception propagates.
    """
    driver = initialize_driver()
    try:
        driver.get(url)
        # Random 2-4 s pause between navigation and reading the DOM
        # (presumably to let the page finish rendering).
        time.sleep(random.randint(2, 4))
        links = []
        extract_project_links(driver, links)
        return links
    finally:
        driver.quit()

In [None]:
def extract_project_links(driver, links):
    """Append the project hrefs found on the driver's current page to ``links``.

    A JavaScript snippet is run inside the page. It selects the anchors that
    match the CSS selector below and keeps each ``href`` attribute unless it
    contains ``ref=recommendation-no-result-discoverpage``.

    Args:
        driver: Object exposing ``execute_script``; it must already be on the
            page that should be scanned.
        links: Collection extended in place with the hrefs the script returns.

    Returns:
        None. The result is delivered through ``links``.
    """
    collect_hrefs_js = """
        const elements = document.querySelectorAll('div.relative.self-start a[href].block.img-placeholder.w100p');
        const filteredLinks = [];
        elements.forEach(element => {
            const href = element.getAttribute('href');
            if (!href.includes('ref=recommendation-no-result-discoverpage')) {
                filteredLinks.push(href);
            }
        });
        return filteredLinks;
    """
    found_hrefs = driver.execute_script(collect_hrefs_js)
    links.extend(found_hrefs)

In [None]:

def projects_links_list_from_website(url, start_page, end_page, links, seed,
                                     seed_rounds=4, max_workers=12):
    """Scrape project links from a range of listing pages over several seeds.

    For each round, every page from ``start_page`` to ``end_page`` (inclusive)
    is scraped concurrently with ``scrape_page`` and the hrefs found are
    appended to ``links``. After a round, the ``seed=<n>`` fragment of ``url``
    is bumped by one and the pages are scraped again.

    A page whose scrape raises is reported and skipped rather than aborting
    the whole crawl, so one flaky page load no longer discards the remaining
    pages and rounds.

    Args:
        url: Listing URL. It must literally contain ``page=<start_page>`` and
            ``seed=<seed>``, because both values are rewritten by plain
            string replacement.
        start_page: First page number to scrape.
        end_page: Last page number to scrape (inclusive).
        links: List extended in place with every href collected.
        seed: Seed value present in ``url``; incremented after each round.
        seed_rounds: Number of consecutive seed values to crawl.
        max_workers: Size of the thread pool used for each round.

    Returns:
        None. Results are delivered through ``links``.
    """
    page_marker = f'page={start_page}'
    page_seed = seed
    for _ in range(seed_rounds):
        page_urls = [url.replace(page_marker, f'page={page}')
                     for page in range(start_page, end_page + 1)]

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Remember which URL each future belongs to so failures can be reported.
            future_to_url = {executor.submit(scrape_page, page_url): page_url
                             for page_url in page_urls}

            for future in concurrent.futures.as_completed(future_to_url):
                try:
                    result = future.result()
                except Exception as exc:
                    print("failed to scrape page", future_to_url[future], exc)
                    continue
                print(result)
                links.extend(result)

        # Move on to the next seed for the following round.
        url = url.replace(f'seed={page_seed}', f'seed={page_seed + 1}')
        page_seed += 1
    return


In [None]:
def scrape_project_data(link, driver, page_load_wait=10):
    """Open a project page and extract one record describing the project.

    Most fields come from the JSON stored in the ``data-initial`` attribute of
    the ``#react-project-header`` element. The remaining ones are read from
    the rendered page: the "days left" text, the number of ``<video>`` and
    ``<img>`` elements, and the text length of the first paragraph under
    ``#risks-and-challenges``.

    Previously a failure printed a message and then hit ``return
    project_info`` with the name unbound, so callers only ever saw an
    ``UnboundLocalError``. The message is still printed, but the original
    exception is now re-raised so the real cause is visible.

    Args:
        link: URL of the project page.
        driver: Selenium driver used to load and query the page.
        page_load_wait: Seconds to sleep after navigation before querying.

    Returns:
        Dict keyed by the column names used for the CSV table
        ("Project Link", "Project Name", ..., "Days Left").

    Raises:
        Exception: Any error raised while locating the required elements or
            reading the project JSON, re-raised after logging the link.
    """
    driver.get(link)
    time.sleep(page_load_wait)
    try:
        header = driver.find_element(By.XPATH, '//*[@id="react-project-header"]')
        # Absolute XPath: tied to the exact page layout, so a markup change
        # makes this lookup (and therefore the whole record) fail.
        days_left_element = driver.find_element(By.XPATH, '/html/body/main/div/div/div[1]/div/div[1]/div[1]/div[2]/div[2]/div[3]/div/div/span[1]')
        project = json.loads(header.get_attribute("data-initial"))['project']
        category = project['category']

        try:
            parent_category = category['parentCategory']['name']
        except (KeyError, TypeError):
            # Missing key or a null parent category: record a placeholder.
            parent_category = 'none'

        video_count = len(driver.find_elements(By.TAG_NAME, "video"))
        image_count = len(driver.find_elements(By.TAG_NAME, "img"))
        days_left = days_left_element.text

        try:
            risk_field = driver.find_element(By.XPATH, '//*[@id="risks-and-challenges"]/p')
            risk_desc_length = len(risk_field.text)
        except Exception:
            # Best effort: a page without a readable risks section counts as 0.
            risk_desc_length = 0

        return {
            "Project Link": link,
            "Project Name": project['name'],
            "Currency": project['currency'],
            "Location": project['location']['displayableName'],
            "Parent Category": parent_category,
            "Category Name": category['name'],
            "Is Project We Love": project['isProjectWeLove'],
            "Percent Funded": project['percentFunded'],
            "Goal Amount": project['goal']['amount'],
            "Pledged Amount": project['pledged']['amount'],
            "Duration": project['duration'],
            "Description Length": len(project['description']),
            "Image Count": image_count,
            "Video Count": video_count,
            "Risk Desc Count": risk_desc_length,
            "Days Left": days_left,
        }
    except Exception:
        print("problem with link", link)
        raise

In [None]:
def projects_data_from_links(links, project_data_dic):
    """Scrape every project link concurrently and collect the records.

    Each link gets its own driver, which is always quit afterwards. Links
    whose scrape raises are reported together with the error and skipped.
    Only ``Exception`` subclasses are treated as a skippable failure, so
    ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed.

    Args:
        links: Iterable of project page URLs.
        project_data_dic: List (despite the name) that each successfully
            scraped record is appended to, in completion order.

    Returns:
        None. Results are delivered through ``project_data_dic``.
    """
    def scrape_project_data_wrapper(link):
        # One driver per link; the finally block guarantees it is shut down.
        driver = initialize_driver()
        try:
            return scrape_project_data(link, driver)
        finally:
            print("Driver Quited")
            driver.quit()
            # Fixed pause before this worker picks up its next link.
            time.sleep(20)

    finished = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        # Map each future back to its link so a failure can name the link.
        future_to_link = {executor.submit(scrape_project_data_wrapper, link): link
                          for link in links}
        for future in concurrent.futures.as_completed(future_to_link):
            try:
                result = future.result()
            except Exception as exc:
                print('skipping link', future_to_link[future], exc)
                continue
            project_data_dic.append(result)
            finished += 1
            print('finished with project:', finished)

    return


In [None]:
def create_csv_table(data, filename):
    """Write project records to ``filename`` as a CSV file with a header row.

    Args:
        data: Iterable of dicts whose keys are the column names listed below.
        filename: Destination path; an existing file is overwritten.

    Returns:
        None.
    """
    # Column order of the output table.
    columns = [
        "Project Name",
        "Project Link",
        "Parent Category",
        "Category Name",
        "Location",
        "Currency",
        "Goal Amount",
        "Pledged Amount",
        "Percent Funded",
        "Duration",
        "Days Left",
        "Description Length",
        "Risk Desc Count",
        "Is Project We Love",
        "Image Count",
        "Video Count"
    ]
    with open(filename, 'w', newline='') as out_file:
        table = csv.DictWriter(out_file, fieldnames=columns)
        table.writeheader()
        for record in data:
            table.writerow(record)

In [None]:
def load_list(filename="ProjectsLinks.txt"):
    """Read links from a text file that stores one link per line.

    Surrounding whitespace is stripped from every line, and lines that are
    empty after stripping are dropped so that stray blank lines do not turn
    into empty links.

    Args:
        filename: Path of the file to read. Defaults to ``ProjectsLinks.txt``
            in the current working directory.

    Returns:
        List of non-empty, stripped lines in file order.
    """
    with open(filename, "r") as file:
        stripped_lines = (line.strip() for line in file)
        return [link for link in stripped_lines if link]

In [None]:
def save_urls_to_file(url_list, filename):
    """Write each URL on its own line to ``filename`` and print a confirmation.

    Args:
        url_list: Iterable of URL strings.
        filename: Destination path; an existing file is overwritten.

    Returns:
        None.
    """
    with open(filename, 'w') as out_file:
        out_file.writelines(url + '\n' for url in url_list)
    print(f"URLs saved to '{filename}' successfully.")


In [None]:
def load_csv_into_dict(filename):
    """Load a CSV file with a header row into a list of row dicts.

    The file is opened with ``newline=''`` as the ``csv`` module requires, so
    line breaks embedded in quoted fields are returned unchanged instead of
    being rewritten by universal-newline translation.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        List with one dict per data row, keyed by the header names. All
        values are strings, as produced by ``csv.DictReader``.
    """
    with open(filename, 'r', newline='') as csvfile:
        return list(csv.DictReader(csvfile))

In [None]:
def remove_duplicate_dicts(data):
    """Keep only the first row seen for each distinct ``'Project Link'`` value.

    Args:
        data: Iterable of dicts, each containing a ``'Project Link'`` key.

    Returns:
        New list holding the retained rows in their original order.
    """
    # Dicts preserve insertion order, and setdefault ignores later rows that
    # repeat a link, so the first occurrence of each link wins.
    first_row_by_link = {}
    for record in data:
        first_row_by_link.setdefault(record['Project Link'], record)
    return list(first_row_by_link.values())

In [None]:
# Load rows from an existing CSV, drop rows that repeat a "Project Link",
# and print how many rows remain.
# NOTE(review): no cell visible in this notebook writes
# 'projects_table_data_final1.csv' -- confirm where this input comes from.
dict2 = load_csv_into_dict('projects_table_data_final1.csv')
unique_data = remove_duplicate_dicts(dict2)
print(len(unique_data))

In [None]:
# Write the de-duplicated rows held in `unique_data` to the final CSV table.
create_csv_table(unique_data,'projects_table_data_final.csv')


In [None]:
# Accumulators that the scraping functions fill in place.
links=[]
project_data_dic = []
# The page and seed values embedded in this URL must match the start page (1)
# and seed (2809206) passed below, because the function rewrites them by
# plain string replacement.
url = "https://www.kickstarter.com/discover/advanced?woe_id=Earth&sort=end_date&seed=2809206&page=1"
# Collect project links from listing pages 1 through 200 into `links`.
projects_links_list_from_website(url,1,200,links,2809206)

In [None]:
# Visit every collected link and append the scraped records to `project_data_dic`.
projects_data_from_links(links,project_data_dic)

In [None]:
# Save the scraped project records as a CSV table and confirm on stdout.
create_csv_table(project_data_dic,'projects_table_data600.csv')
print('table was created successfuly')