# Choose a Data Set

You can choose to analyze any data that you would like! Remember, you need 1000 rows of non-null data in order to get 5 points for the "Data" criterion of my [rubric](https://docs.google.com/document/d/1s3wllcF3LLnytxwD8mZ-BCypXKnfaahnizWGNojT-B4/edit?usp=sharing). Consider looking at [Kaggle](https://www.kaggle.com/datasets) or [free APIs](https://free-apis.github.io/#/browse) for datasets of this size. Alternatively, you can scrape the web to make your own dataset! :D

Once you have chosen your dataset, please read your data into a dataframe and call `.info()` below. If you don't call `info`, I will give you 0 points for the first criterion described in the [rubric](https://docs.google.com/document/d/1s3wllcF3LLnytxwD8mZ-BCypXKnfaahnizWGNojT-B4/edit?usp=sharing).

In [4]:
# Read data into a dataframe and call info()
    # Example call:
    # df = pd.DataFrame({"A":[1, 2, 3], "B":[4, 5, 6]})
    # df.info()

In [4]:
import pandas as pd
import seaborn as sb
import requests
import urllib.parse
import json
import time
from bs4 import BeautifulSoup

# Number of crawl-loop iterations to run; iterations that hit an error or a
# rate limit still count, so this is an upper bound on the rows collected.
samples = 100

In [6]:
# Web Crawler

# Queue of discovered article links that have not been followed yet.
reserve = list()

# Column-oriented results: one (initially empty) list per output column.
data = {column: [] for column in ('Article', 'Image Count', 'Emissions')}

def getArticleData(url):
    """Fetch a page and extract its image count and outgoing article links.

    Args:
        url: Absolute URL of the page to request.

    Returns:
        A ``(payload, status)`` tuple on every path, so callers can always
        unpack two values:

        * ``(dict, 200)`` on success, where the dict has the keys
          ``'Article'`` (the requested url), ``'Image Count'`` (number of
          ``<img>`` tags) and ``'Links'`` (unique article hrefs in document
          order).
        * ``('ERR_R', 429)`` when the server rate-limits the request.
        * ``('ERR', status)`` for any other non-200 response.
        * ``('ERR_C', None)`` when the page has 3 or fewer article links.
          The status is ``None`` (not 200) so that callers checking
          ``status != 200`` treat the page as a failure.
    """
    # Get html
    # ...requests the page, if denied or failed, return an error marker
    page = requests.get(url)

    if page.status_code == 429:  # Rate Limit
        return 'ERR_R', page.status_code
    elif page.status_code != 200:
        return 'ERR', page.status_code

    soup = BeautifulSoup(page.text, 'html.parser')

    # Scrape for links
    # ... isolate article links
    # ... check for compatibility (>3 Links)
    hrefs = (anchor.get('href') for anchor in soup.find_all('a'))

    # dict.fromkeys de-duplicates by href while keeping document order; a set
    # of tag objects would neither merge anchors that share an href but differ
    # in markup, nor give a stable ordering.
    links = list(dict.fromkeys(
        href for href in hrefs
        if href
        and '/article/' in href
        and '#' not in href
        and ':' not in href
        # NOTE(review): this can never match, since the href must contain
        # '/article/' -- possibly meant '/article/Main_Page'; confirm intent.
        and href != '/wiki/Main_Page'
    ))

    if len(links) <= 3:
        # Return a 2-tuple like every other path; a bare string here would
        # break the caller's two-value unpacking.
        return 'ERR_C', None

    # Scrape for images
    imageCount = len(soup.find_all('img'))

    return {
        'Article': url,
        'Image Count': imageCount,
        'Links': links
    }, 200

def getEmissions(url):
    """Query the websitecarbon API for the CO2 estimate of ``url``.

    Args:
        url: The page URL to look up; it is percent-encoded before being
            placed in the query string.

    Returns:
        A ``(payload, status)`` tuple: ``('ERR_R', 429)`` when rate limited,
        ``('ERR', status)`` for any other non-200 response, otherwise the
        value found at ``statistics.co2.grid.grams`` in the JSON response
        together with the status code.
    """
    response = requests.get(f'https://api.websitecarbon.com/site?url={urllib.parse.quote(url, safe="")}')
    code = response.status_code

    if code != 200:
        # 429 (rate limit) gets its own marker so callers can tell it apart
        # from other failures.
        marker = 'ERR_R' if code == 429 else 'ERR'
        return marker, code

    payload = json.loads(response.text)
    grams = payload['statistics']['co2']['grid']['grams']
    return grams, code

def pullNextReserve():
    """Remove and return the first link in ``reserve``, or None when it is empty."""
    if reserve:
        return reserve.pop(0)
    return None

# The Loop
#
# Starting from the main page, visit one article per iteration, record its
# image count, then follow its first article link. The remaining links are
# queued in `reserve` and used whenever the current path fails or loops back
# onto an article that has already been recorded.

def _pullReserveLogged():
    """Pull the next reserve link, log the outcome, and return it (None if exhausted)."""
    link = pullNextReserve()

    if link is None:
        print(f'    [X] Reserve exhausted, terminating')
    else:
        print(f'    [-] Moving to {link}')

    return link


nextLink = '/article/Main_Page' # the url which will be examined next
baseURL = 'http://www.scholarpedia.org'

#print(getEmissions('https://en.wikipedia.org/wiki/2022_UCI_Mountain_Bike_season'))

for i in range(samples):
    # Get the data from the site
    # ... if ratelimited, wait and retry the same link on the next iteration
    # ... if failed, pull the next link from reserve and skip the iteration
    # ... if the reserve is exhausted, terminate
    article_data, status = getArticleData(baseURL + nextLink)

    if article_data == 'ERR_R':
        print(f'[ WEB CRAWLER DEBUG ] - Rate Limited, waiting 5 seconds...')
        time.sleep(5)
        continue

    if status != 200:
        print(f'[ WEB CRAWLER ERROR ] - {nextLink} data failed, code {status}.')

        nextLink = _pullReserveLogged()
        if nextLink is None:
            break

        continue

    # Store data

    data['Article'].append(nextLink)
    data['Image Count'].append(article_data['Image Count'])
    data['Emissions'].append(-999) # placeholder; getEmissions is not called in this loop

    # Get the next available article
    # ... if none found, pull from reserve, exhaustion results in termination
    # ... if found, use the first link and store the rest in reserve
    # ... if the link has already been used, pull from reserve
    links = article_data['Links']

    if links: # truthiness check is safe on an empty list, unlike indexing [0]
        nextLink = links[0]

        # Queue every remaining link, including the last one, before looking
        # for an unused link, so this page's links can serve as fallbacks.
        reserve.extend(links[1:])

        while nextLink in data['Article']:
            print(f'[ WEB CRAWLER DEBUG ] - {nextLink} used already, pulling from reserve')
            nextLink = _pullReserveLogged()

            if nextLink is None:
                break

        if nextLink is None:
            break

    else: # No links were found in the article data
        print(f'[ WEB CRAWLER DEBUG ] - No links found for {nextLink}, pulling from reserve')

        nextLink = _pullReserveLogged()
        if nextLink is None:
            break

    print(f'Progress: {i+1}/{samples}')

data = pd.DataFrame(data)

data


Progress: 1/100
[ WEB CRAWLER DEBUG ] - /article/Main_Page used already, pulling from reserve
    [-] Moving to /article/Scholarpedia_of_Touch
Progress: 2/100
[ WEB CRAWLER DEBUG ] - /article/Scholarpedia_of_Touch used already, pulling from reserve
    [-] Moving to /article/The_Top_quark_discovery,
Progress: 3/100
[ WEB CRAWLER ERROR ] - /article/The_Top_quark_discovery, data failed, code 404.
    [-] Moving to /article/Real-time_data_analysis_in_particle_physics
[ WEB CRAWLER DEBUG ] - /article/Main_Page used already, pulling from reserve
    [-] Moving to /article/Main_Page
[ WEB CRAWLER DEBUG ] - /article/Main_Page used already, pulling from reserve
    [-] Moving to /article/Searching_for_Long-Lived_BSM_Particles_at_the_LHC
Progress: 5/100
Progress: 6/100
[ WEB CRAWLER DEBUG ] - /article/The_CMS_experiment used already, pulling from reserve
    [-] Moving to /article/Main_Page
[ WEB CRAWLER DEBUG ] - /article/Main_Page used already, pulling from reserve
    [-] Moving to /article/

Unnamed: 0,Article,Image Count,Emissions
0,/article/Main_Page,38,-999
1,/article/The_Top_quark_discovery,24,-999
2,/article/Scholarpedia_of_Touch,8,-999
3,/article/Real-time_data_analysis_in_particle_p...,14,-999
4,/article/Searching_for_Long-Lived_BSM_Particle...,26,-999
...,...,...,...
93,/article/Hamiltonian_systems,18,-999
94,/article/Celestial_mechanics,14,-999
95,/article/Color_charge,8,-999
96,/article/Axial_anomaly,8,-999


In [None]:
# Disabled emissions step, kept as a bare string literal so that it does not run.
# NOTE(review): the snippet tests for 'ERR_T', but getEmissions returns 'ERR_R'
# or 'ERR' as its error markers -- confirm the intended marker before re-enabling.
'''# Get Emissions
    # ... if ratelimited or denied, terminate crawler
    # ... if failed then pull from reserve and skip iteration
    emissions, eStatus = getEmissions(baseURL + nextLink)

    if emissions == 'ERR_T':
        print(f'[ WEB CRAWLER FATAL ERROR ] - Emissions Data Denied, terminating... ({eStatus}))')
        break

    if eStatus != 200:
        print(f'[ WEB CRAWLER ERROR ] - {nextLink} Emissions data failed, code {eStatus}.')
        
        nextLink = pullNextReserve()

        if nextLink == None:
            print(f'    [X] Reserve exhausted, terminating')
            break
        else:
            print(f'    [-] Moving to {nextLink}')

        continue'''

# My Question

### Write your question here.

# My Analysis

In [5]:
# Analyze here

# My Answer

### Write your answer here.