# Title

In [1]:
r"""markdown
    TITLE   :
    AUTHOR  :
    PROJECT :
""";

# Notebook metadata placeholders; both are left as empty strings here.
__author__ = ''
__version__ = ''

<span style='font-size:30px;font-weight:650'>
    About
</span>


<br><br>

- - - 
- - - 

<br>

# Prepare

## Imports

In [15]:
## General
# import requests
# from bs4 import BeautifulSoup
# import flask
import pandas as pd

## Custom
from IPython.display import HTML

## Project-Specific

## Functions

In [266]:
def scrape_table(inputurl, columns=[], rename={}, sort=None, parse=True,
                 # dates
                 date_col='date',
                 to_next_date=False, is_start_date=True,
                 # saving
                 save=False, combine_to_master=True,
                ):
    """Scrape the wikitable(s) on a page into one DataFrame with date columns.

    Parameters
    ----------
    inputurl : str
        Page passed to ``pd.read_html``; also stored in the ``source`` column.
    columns : list of str
        Columns to keep (original names, before renaming).
        The default list is never mutated.
    rename : dict
        ``{old: new}`` column renames applied after the selection.
        The default dict is never mutated.
    sort : str or list of str, optional
        Column(s) to sort the concatenated tables by; ``None`` keeps page order.
    parse : bool
        When true, ``parse_dates`` is applied to ``date_col``.
    date_col : str
        Name of the date column *after* renaming.
    to_next_date : bool
        Only used when ``is_start_date`` is truthy. If truthy, each row ends
        where the next row starts and the last row gets ``end_date`` 0;
        otherwise ``end_date`` equals ``start_date``.
    is_start_date : bool
        If truthy, ``start_date`` / ``end_date`` columns are added from
        ``date_col``. The falsy case is not implemented: no columns are added.
    save, combine_to_master
        Accepted for forward compatibility but currently ignored; nothing is
        written to disk by this function.

    Returns
    -------
    pandas.DataFrame
    """
    # get tables from html
    dflist = pd.read_html(inputurl, attrs={"class": ["wikitable","wikitable sortable"]})
    # concatenate and sort
    df = concatenate_tables(*dflist, sort=sort)

    # Select and rename in one non-in-place chain: the result is a fresh frame,
    # so the column assignments below do not raise SettingWithCopyWarning.
    df = df[columns].rename(columns=rename)

    # add source
    df['source'] = inputurl

    # parse date
    if parse:
        df = parse_dates(df, date_col=date_col)

    # deal with start and end times
    if is_start_date:
        df['start_date'] = df[date_col]
        if to_next_date:
            # date range: each row ends at the next row's date, the last at 0
            df['end_date'] = [*df[date_col].iloc[1:], 0]
        else:
            # single date
            df['end_date'] = df[date_col]
    # TODO: when `date_col` is not a start date, derive start/end around it.

    # TODO: saving is disabled. The intended behaviour was to write `df` to
    # `save` when it is a path string, and to call add_to_master_table(df)
    # when `combine_to_master` is set.
    return df


def concatenate_tables(*dfs, sort=None):
    """Stack the given DataFrames into one frame with a fresh 0..n-1 index.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames to stack, in order.
    sort : str or list of str, optional
        Column(s) to sort the stacked frame by; ``None`` keeps input order.

    Returns
    -------
    pandas.DataFrame
    """
    combined = pd.concat([*dfs])
    if sort is not None:
        combined = combined.sort_values(sort)
    return combined.reset_index(drop=True)


def add_to_master_table(df):
    """Append `df` to the master CSV, creating the file if it does not exist.

    The master table lives at ``../data/master_table.csv``. If it can be read,
    `df` is concatenated onto it, sorted by ``'Date'`` and written back;
    if the file is missing or empty, `df` alone is written.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The input `df`, unchanged.
    """
    master_path = '../data/master_table.csv'
    try:
        mdf = pd.read_csv(master_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Only a missing/empty master starts a new file. Any other read error
        # propagates instead of silently overwriting the existing table.
        df.to_csv(master_path, index=False)
    else:
        # NOTE(review): the scrape cells in this notebook rename the column to
        # lower-case 'date'; confirm 'Date' is the right sort key here.
        new_df = concatenate_tables(mdf, df, sort='Date')
        new_df.to_csv(master_path, index=False)
    return df


<br><br>

- - - 
- - - 

<br>

# Code

In [267]:
# Scrape the evolutionary-history timeline, keeping the Date and Event columns.
# NOTE: with parse=True, scrape_table calls parse_dates, which is defined in a
# later cell of this notebook; on a fresh kernel that cell must be run first.
# NOTE: `save` is passed here, but scrape_table's saving code is commented out,
# so no file is written.
df = scrape_table("https://en.wikipedia.org/wiki/Timeline_of_the_evolutionary_history_of_life",
                  columns=['Date', 'Event'],
                  rename={'Date': 'date'},
                  date_col='date',
                  save='../data/wikipedia_evolutionary_history_of_life.csv',
                  parse=True
)

df

Unnamed: 0,date,Event,source,start_date,end_date
0,4600000000,The planet Earth forms from the accretion disc...,https://en.wikipedia.org/wiki/Timeline_of_the_...,4600000000,4600000000
1,4500000000,"According to the giant impact hypothesis, the ...",https://en.wikipedia.org/wiki/Timeline_of_the_...,4500000000,4500000000
2,4404000000,First appearance of liquid water on Earth.,https://en.wikipedia.org/wiki/Timeline_of_the_...,4404000000,4404000000
3,4280000000,Earliest possible appearance of life on Earth....,https://en.wikipedia.org/wiki/Timeline_of_the_...,4280000000,4280000000
4,4000000000,Formation of a greenstone belt of the Acasta G...,https://en.wikipedia.org/wiki/Timeline_of_the_...,4000000000,4000000000
...,...,...,...,...,...
142,105,"Martha, last known passenger pigeon, dies",https://en.wikipedia.org/wiki/Timeline_of_the_...,105,105
143,83,"The thylacine goes extinct in a Tasmanian zoo,...",https://en.wikipedia.org/wiki/Timeline_of_the_...,83,83
144,67,The Caribbean monk seal goes extinct[78],https://en.wikipedia.org/wiki/Timeline_of_the_...,67,67
145,11,"The baiji, the Yangtze river dolphin, becomes ...",https://en.wikipedia.org/wiki/Timeline_of_the_...,11,11


In [268]:
def parse_dates(df, date_col='date'):
    """Convert the textual dates in ``df[date_col]`` to floats, in years.

    Each string is lower-cased, then processed in this order:

    1. a leading ``'by '`` or ``'c.'`` is removed;
    2. anything from the first ``'('`` onwards is dropped;
    3. a trailing unit is removed and turned into a multiplier
       (``ka`` = 1e3, ``ma`` / ``ma-`` = 1e6, ``bya`` = 1e9, ``ya`` = 1;
       no recognised unit = 1);
    4. a range written with ``-`` or ``–`` is replaced by the mean of its
       first two numbers.

    Values that are not strings (e.g. already-parsed numbers) are passed
    through ``float`` unchanged, so calling this twice is harmless.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place: ``date_col`` is overwritten with the parsed values.
    date_col : str
        Name of the column holding the dates.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.

    Raises
    ------
    ValueError
        If what remains of a string after cleaning is not a valid number.
    """
    # (suffix, years per unit), checked in order: 'bya' must come before 'ya'
    # because every 'bya' string also ends with 'ya'.
    unit_suffixes = (
        ('ka', 1e3),
        ('ma', 1e6),
        ('ma-', 1e6),
        ('bya', 1e9),
        ('ya', 1),
    )

    def strip_prefix(text):
        """Remove one leading 'by ' or 'c.' qualifier, if present."""
        text = text.strip()
        for prefix in ('by ', 'c.'):
            if text.startswith(prefix):
                return text[len(prefix):]
        return text

    def strip_parenthetical(text):
        """Keep only the text before the first '('."""
        return text.split('(')[0]

    def split_unit(text):
        """Return (text without unit suffix, years per unit)."""
        text = text.strip()
        for suffix, years in unit_suffixes:
            if text.endswith(suffix):
                return text[:-len(suffix)], years
        return text, 1

    def range_midpoint(text):
        """Return the number in `text`, or the mean of the first two if a range."""
        text = text.strip()
        for dash in ('-', '–'):  # ASCII hyphen first, then en dash
            if dash in text:
                low, high = text.split(dash)[:2]
                return (float(low) + float(high)) / 2
        return float(text)

    def to_years(raw):
        """Parse one cell of the date column into a float number of years."""
        if not isinstance(raw, str):
            return float(raw)
        text = strip_parenthetical(strip_prefix(raw.lower()))
        text, years_per_unit = split_unit(text)
        return range_midpoint(text) * years_per_unit

    # Map over the values rather than writing by position, so the result does
    # not depend on the frame having a 0..n-1 index.
    df[date_col] = df[date_col].map(to_years)
    return df

In [270]:
# Scrape the natural-history timeline: 'Started' becomes the date column and
# 'Period' the event. to_next_date=True makes each row's end_date the next
# row's date (the last row gets 0). `parse` is left at its default (True), so
# parse_dates must already be defined when this cell runs.
df = scrape_table("https://en.wikipedia.org/wiki/Timeline_of_natural_history",
                  columns=['Started', 'Period'],
                  rename={'Started': 'date', 'Period': 'event'},
                  date_col='date',
                  save=False,
                  to_next_date=True
#                   save='../data/wikipedia_evolutionary_history_of_life.csv',
#                   parse=True
)

df

Unnamed: 0,date,event,source,start_date,end_date
0,2500000000,Siderian,https://en.wikipedia.org/wiki/Timeline_of_natu...,2500000000,2300000000
1,2300000000,Rhyacian,https://en.wikipedia.org/wiki/Timeline_of_natu...,2300000000,2050000000
2,2050000000,Orosirian,https://en.wikipedia.org/wiki/Timeline_of_natu...,2050000000,1800000000
3,1800000000,Statherian,https://en.wikipedia.org/wiki/Timeline_of_natu...,1800000000,1600000000
4,1600000000,Calymmian,https://en.wikipedia.org/wiki/Timeline_of_natu...,1600000000,1400000000
5,1400000000,Ectasian,https://en.wikipedia.org/wiki/Timeline_of_natu...,1400000000,1200000000
6,1200000000,Stenian,https://en.wikipedia.org/wiki/Timeline_of_natu...,1200000000,1000000000
7,1000000000,Tonian,https://en.wikipedia.org/wiki/Timeline_of_natu...,1000000000,720000000
8,720000000,Cryogenian,https://en.wikipedia.org/wiki/Timeline_of_natu...,720000000,635000000
9,635000000,Ediacaran,https://en.wikipedia.org/wiki/Timeline_of_natu...,635000000,541000000


<br><br>

- - - 
- - - 

<span style='font-size:40px;font-weight:650'>
    END
</span>