# Choose a Data Set

You can choose to analyze any data that you would like! Remember, you need 1000 rows of non-null data in order to get 5 points for the "Data" criteria of my [rubric](https://docs.google.com/document/d/1s3wllcF3LLnytxwD8mZ-BCypXKnfaahnizWGNojT-B4/edit?usp=sharing). Consider looking at [Kaggle](https://www.kaggle.com/datasets) or [free APIs](https://free-apis.github.io/#/browse) for datasets of this size. Alternatively, you can scrape the web to make your own dataset! :D

Once you have chosen your dataset, please read your data into a dataframe and call `.info()` below. If you don't call `info` I will give you 0 points for the first criteria described on the [rubric](https://docs.google.com/document/d/1s3wllcF3LLnytxwD8mZ-BCypXKnfaahnizWGNojT-B4/edit?usp=sharing).

In [4]:
# Read data into a dataframe and call info()
    # Example call:
    # df = pd.DataFrame({"A":[1, 2, 3], "B":[4, 5, 6]})
    # df.info()

In [2]:
import pandas as pd
import seaborn as sb
import requests
import urllib.parse
import json
import time
import random as rand
from bs4 import BeautifulSoup

samples = 1000

In [None]:
# Web Crawler

reserve = []
data = {
    'Article': [],
    'Image Count': [],
    'Emissions': []
}
baseURL = 'http://www.scholarpedia.org'

def getHTMLData(url):
    page = None
    
    while page == None:
        try:
            page = requests.get(url)
        except Exception as e:
            print(f' [ WEB CRAWLER ERROR ] - EXCEPTION WHEN GETTING HTML: {e}')
            url = pullNextReserve()
            if url == None:
                print(f'    [X] Reserve exhausted, terminating')
                break
            else:
                print(f'    [-] Moving to {url}')
                url = baseURL + url

        if page != None and page.status_code == 429:
            print(f'[ WEB CRAWLER DEBUG ] - Rate Limited, waiting 5 seconds...')
            time.sleep(5)
            page = None
        elif page != None and page.status_code != 200: # Handle exceptions not raised
            print(f'[ WEB CRAWLER ERROR ] - {url} data failed, code {page.status_code}.')
            url = pullNextReserve()
            if url == None:
                print(f'    [X] Reserve exhausted, terminating')
                break
            else:
                print(f'    [-] Moving to {url}')
                url = baseURL + url
            page = None

    return page

def getArticleData(url):
    # Get html 
    # ...requests the page, if denied then terminate the crawler
    # ...if failed, return error
    page = getHTMLData(url)

    if page == None:
        return None
    
    soup = BeautifulSoup(page.text, 'html.parser')

    # Scrape for links
    # ... isolate article links
    # ... check for compatibility (>3 Links)

    links = [
        link.get('href') for link in set(soup.find_all('a')) # Using the set method to avoid duplicates
        if link.get('href') 
        and '/article/' in link.get('href')
        and '#' not in link.get('href')
        and ':' not in link.get('href')
        and link.get('href') != '/wiki/Main_Page'
    ]

    # Scrape for images

    imageCount = len(soup.find_all('img'))

    return {
        'Article': url,
        'Image Count': imageCount,
        'Links': links
    }



    return emissionData['statistics']['co2']['grid']['grams'], emissionReq.status_code

def pullNextReserve():
    if len(reserve) == 0:
        return None
    
    retVal = reserve[0]
    del reserve[0]
    return retVal


# The Loop
nextLink = '/article/Main_Page' # the url which will be examined next

#print(getHTMLData("http://httpbin.org/status/401"))

#print(getEmissions('https://en.wikipedia.org/wiki/2022_UCI_Mountain_Bike_season'))

for i in range(samples):
    # Get the data from the site
    article_data = getArticleData(baseURL + nextLink)
    
    if article_data == None:
        break

    # Store data

    data['Article'].append(nextLink)
    data['Image Count'].append(article_data['Image Count'])
    data['Emissions'].append(-999)

    # Get the next available article
    # ... if none found, pull from reserve, exhaustion results in termination
    # ... if found, use the next link and then store the rest in reserve
    # ... if the link has already been used, pull from reserve

    if article_data['Links'][0] != None:
        chosenIndex = rand.randint(0, len(article_data['Links'])-1)
        nextLink = article_data['Links'][chosenIndex]

        while (nextLink in data['Article']):
            print(f'[ WEB CRAWLER DEBUG ] - {nextLink} used already, pulling from reserve')
            nextLink = pullNextReserve()

            if nextLink == None:
                print(f'    [X] Reserve exhausted, terminating')
                break
            else:
                print(f'    [-] Moving to {nextLink}')

        if nextLink == None: break


        if len(article_data['Links']) > 1:
            del article_data['Links'][chosenIndex]
            reserve.extend([article for article in article_data['Links'] if article not in reserve])

    else: # There's no links that was found in the article data
        print(f'[ WEB CRAWLER DEBUG ] - No links found for {nextLink}, pulling from reserve')
        nextLink = pullNextReserve()

        if nextLink == None:
            print(f'    [X] Reserve exhausted, terminating')
            break
        else:
            print(f'    [-] Moving to {nextLink}')

    print(f'Progress: {i+1}/{samples}')

data = pd.DataFrame(data)

data


Progress: 1/1000
Progress: 2/1000
[ WEB CRAWLER DEBUG ] - /article/Main_Page used already, pulling from reserve
    [-] Moving to /article/Real-time_data_analysis_in_particle_physics
Progress: 3/1000
Progress: 4/1000
[ WEB CRAWLER DEBUG ] - /article/Main_Page used already, pulling from reserve
    [-] Moving to /article/SuperSymmetry_and_the_LHC_Run_2
[ WEB CRAWLER DEBUG ] - /article/SuperSymmetry_and_the_LHC_Run_2 used already, pulling from reserve
    [-] Moving to /article/The_Top_quark_discovery
Progress: 5/1000
[ WEB CRAWLER DEBUG ] - /article/Main_Page used already, pulling from reserve
    [-] Moving to /article/SuperSymmetry_and_the_LHC_Run_2
[ WEB CRAWLER DEBUG ] - /article/SuperSymmetry_and_the_LHC_Run_2 used already, pulling from reserve
    [-] Moving to /article/Main_Page
[ WEB CRAWLER DEBUG ] - /article/Main_Page used already, pulling from reserve
    [-] Moving to /article/Real-time_data_analysis_in_particle_physics
[ WEB CRAWLER DEBUG ] - /article/Real-time_data_analysi

Unnamed: 0,Article,Image Count,Emissions
0,/article/Main_Page,38,-999
1,/article/SuperSymmetry_and_the_LHC_Run_2,26,-999
2,/article/Special_relativity,7,-999
3,/article/Real-time_data_analysis_in_particle_p...,14,-999
4,/article/Resonance,7,-999
...,...,...,...
995,/article/Brain_connectivity,10,-999
996,/article/Brain_Connectivity,10,-999
997,/article/Maintenance_of_synaptic_plasticity,14,-999
998,/article/Genesis,12,-999


In [None]:
# Get the emissions of each website
import threading
from concurrent.futures import ThreadPoolExecutor

def getEmissions(url, maxTries):
    # Get emission data

    emissionReq = None
    emissionData = None
    tries = 0

    while (emissionReq == None):
        try:
            emissionReq = requests.get(f'https://api.websitecarbon.com/site?url={urllib.parse.quote(url, safe="")}')
            emissionData = json.loads(emissionReq.text)['statistics']['co2']['grid']['grams']
        except Exception as e:
            print(f'[ EMISSIONS ERROR ] - Failed to get data for {url}: {e}')

        if emissionReq.status_code == 429: # Rate Limit
            print('[ EMISSIONS DEBUG ] - Rate Limited... waiting 5 seconds')
            time.sleep(5)
            emissionReq = None
        elif emissionReq.status_code != 200:
            print(f'[ EMISSIONS ERROR ] - Failed to get data for {url}, status code: {emissionReq.status_code}')
        
            if tries == maxTries:
                print(f'[ EMISSIONS FATAL ERROR ] - Tries exhausted for {url}')
                break
            else:
                emissionReq = None
                tries+=1


    return emissionData
    
# Function to handle each row of data
def process_row(row, baseURL, maxTries, data):
    article = data.loc[row, 'Article']
    emissions = getEmissions(baseURL + article, maxTries)
    if emissions is not None:
        data.loc[row, 'Emissions'] = emissions

    print(f"Finished Row: {row + 1}/{len(data)}")


data = pd.read_csv('finalish-data.csv')
baseURL = 'http://www.scholarpedia.org'

data['Emissions'] = data['Emissions'].astype(float)
articles = data['Article']


# Set the maximum number of threads to use
max_threads = 10
maxTries = 5

# Use ThreadPoolExecutor for multithreading
with ThreadPoolExecutor(max_workers=max_threads) as executor:
    # Submit tasks for each row (URL) to the thread pool
    for row in range(len(data)):
        executor.submit(process_row, row, baseURL, maxTries, data)

# Wait for all threads to complete
print("All threads completed.")

data



Finished Row: 8/1000
Finished Row: 7/1000
Finished Row: 3/1000
Finished Row: 1/1000
Finished Row: 4/1000
Finished Row: 2/1000
Finished Row: 5/1000
Finished Row: 6/1000
[ EMISSIONS ERROR ] - Failed to get data for http://www.scholarpedia.org/article/The_Top_quark_discovery,: 'statistics'
[ EMISSIONS ERROR ] - Failed to get data for http://www.scholarpedia.org/article/The_Top_quark_discovery,, status code: 422
Finished Row: 14/1000
[ EMISSIONS ERROR ] - Failed to get data for http://www.scholarpedia.org/article/Searching_for_Long-Lived_BSM_Particles_at_the_LHC,: 'statistics'
[ EMISSIONS ERROR ] - Failed to get data for http://www.scholarpedia.org/article/Searching_for_Long-Lived_BSM_Particles_at_the_LHC,, status code: 422
Finished Row: 12/1000
Finished Row: 13/1000
[ EMISSIONS ERROR ] - Failed to get data for http://www.scholarpedia.org/article/The_Top_quark_discovery,: 'statistics'
[ EMISSIONS ERROR ] - Failed to get data for http://www.scholarpedia.org/article/The_Top_quark_discovery,,

# My Question

### Write your question here.

# My Analysis

In [5]:
# Analyze here

# My Answer

### Write your answer here.