# Title

In [1]:
r"""markdown
    TITLE   :
    AUTHOR  :
    PROJECT :
""";

__author__ = 'Nathaniel Starkmana'
__version__ = '1.0.0'

<span style='font-size:30px;font-weight:650'>
    About
</span>


<br><br>

- - - 
- - - 

<br>

# Prepare

## Imports

In [2]:
## General
# import requests
# from bs4 import BeautifulSoup
# import flask
import numpy as np
import pandas as pd

## Custom
from IPython.display import HTML

## Project-Specific

## Functions

In [3]:
def scrape_table(inputurl, columns=[], rename={}, sort=None, parse=True,
                 make_time_negative=False,
                 row_select=slice(None),
                 # dates
                 date_col='date',
                 to_next_date=False, is_start_date=True,
                 # saving
                 save=False, combine_to_master=True,
                 # debug
                 _table_index=slice(None)
                ):

    # get get from html
    dflist = pd.read_html(inputurl, attrs={"class": ["wikitable","wikitable sortable"]})
    # concatenate and sort
    dflist = dflist[_table_index]
    if not isinstance(dflist, list):
        dflist = [dflist, ]
    df = concatenate_tables(*dflist, sort=sort)
    
    # get right columns
    df = df.loc[row_select, columns]
    df.rename(columns=rename, inplace=True)

    # add source
    df['source'] = inputurl
    
    # catch Nans
    df = df[~df['date'].isna()]

    # parse date
    if parse:
        df = parse_dates(df, date_col=date_col)
        df = parse_events(df, event_col='event')
        
    if make_time_negative:
        df[date_col] = -df[date_col]

    # deal with start and end times
    if is_start_date is True:
        if to_next_date is False:  # single date
            df['start_date'] = df[date_col]
            df['end_date'] = df[date_col]
        else: # date range
            df['start_date'] = df[date_col]
            df['end_date'] = [*df[date_col][1:], 0]
            df['date'] = df['start_date'] + df['end_date'] / 2
    else:
        pass
#         df['start_date'] = df[date_col] - 
#         df['end_date'] = df[date_col] +
    
    if isinstance(save, str):
        df.to_csv(save, index=False)
    if combine_to_master:
        add_to_master_table(df, sort=sort)
    return df
# /def


def concatenate_tables(*dfs, sort=None):
    cdf = pd.concat(list(dfs))
    if sort is not None:
        cdf.sort_values(sort, inplace=True)
    cdf.reset_index(drop=True, inplace=True)
    return cdf
# /def


def add_to_master_table(df, sort):
    try:
        mdf = pd.read_csv('../data/master_table.csv', sep=';')
    except:
        df.to_csv('../data/master_table.csv', index=False, sep=';')
    else:
        new_df = concatenate_tables(mdf, df, sort='date')
        new_df.to_csv('../data/master_table.csv', index=False, sep=';')
    return df
# /def

In [4]:
def parse_dates(df, date_col='date'):

    def parse_start(date):
        """recursively parse start to strip out"""
        date = date.strip()
        if date.startswith('by '):
            return parse_start(date[2:])
        elif date.startswith('c.'):
            return parse_start(date[2:])
        elif date.startswith('>'):
            return parse_start(date[1:])
        elif date.startswith('~'):
            return parse_start(date[1:])
        else:
            return date

    def parse_parenthesis(date):
        date = date.strip()
        if '(' in date:
            date = date.split('(')[0]
        if '[' in date:
            date = date.split('[')[0]
        return date
    
    def parse_unit(date):
        """return unit in years"""    
        date = date.strip()
        if date.endswith('ka'):
            return date[:-2], 1e3
        # mega
        elif date.endswith('ma'):
            return date[:-2], 1e6
        elif date.endswith('ma-'):
            return date[:-3], 1e6
        elif date.endswith('million'):
            return date[:-7], 1e6
        elif date.endswith('million+'):
            return date[:-8], 1e6
        # giga
        elif date.endswith('bya'):
            return date[:-3], 1e9
        elif date.endswith('billion'):
            return date[:-7], 1e6
        elif date.endswith('ya'):  # after bya b/c ya in bya
            return date[:-2], 1
        else:
            return date, 1
    # /def

    def parse_number(number):
        if isinstance(number, (int, float)):
            return float(number)
        else:
            return float(number.replace(',', ''))

    def parse_range(date):
        date = date.strip()
        if ('-' in date):
            s = date.split('-')[0]
            return np.mean([parse_number(s[0]), parse_number(s[1])])
        elif ('–' in date):
            s = date.split('–')
            return np.mean([parse_number(s[0]), parse_number(s[1])])
        elif (' to ' in date):
            s = date.split(' to ')
            return 10**np.mean([np.log10(parse_number(s[0])), np.log10(parse_number(s[1]))])
        else:
            return date
    
    dates = df[date_col]
    
    for i, date in dates.iteritems():

        if isinstance(date, str):
            date = date.lower().strip()
            # parse unit
            date = parse_start(date)
            date = parse_parenthesis(date)
            date, unit = parse_unit(date)
            # parse range
            date = parse_range(date)

            # assign
            dates.iloc[i] = parse_number(date) * unit

    # reassigning
    df[date_col] = dates
    return df
# /def

In [5]:
def parse_events(df, event_col='event'):
    
    def strip_citation(event):
        for i in range(1000):
            event = event.strip(f'[{i}]')
        return event
    
    events = df[event_col]
    
    for i, event in events.iteritems():
        if isinstance(event, str):
            
            event = event.strip()
            event = strip_citation(event)        
    
    df[event_col] = events
    
    
    return df
            
            
            

<br><br>

- - - 
- - - 

<br>

# Code

In [6]:
df = scrape_table("https://en.wikipedia.org/wiki/Timeline_of_the_evolutionary_history_of_life",
                  columns=['Date', 'Event'],
                  rename={'Date': 'date', 'Event': 'event'},
                  date_col='date',
                  save='../data/timeline/wikipedia_evolutionary_history_of_life.csv',
                  sort='Date',
                  parse=True,
                  combine_to_master=True,
                  make_time_negative=True,
)

df

Unnamed: 0,date,event,source,start_date,end_date
0,-1000000,First coyotes,https://en.wikipedia.org/wiki/Timeline_of_the_...,-1000000,-1000000
1,-1000000000,The first non-marine eukaryotes move onto land...,https://en.wikipedia.org/wiki/Timeline_of_the_...,-1000000000,-1000000000
2,-1200000,Evolution of Homo antecessor. The last members...,https://en.wikipedia.org/wiki/Timeline_of_the_...,-1200000,-1200000
3,-1700000,Extinction of australopithecines,https://en.wikipedia.org/wiki/Timeline_of_the_...,-1700000,-1700000
4,-10000000,"Grasslands and savannas are established, diver...",https://en.wikipedia.org/wiki/Timeline_of_the_...,-10000000,-10000000
...,...,...,...,...,...
142,-90000000,Extinction of ichthyosaurs. Earliest snakes an...,https://en.wikipedia.org/wiki/Timeline_of_the_...,-90000000,-90000000
143,-95000000,First crocodilians evolve,https://en.wikipedia.org/wiki/Timeline_of_the_...,-95000000,-95000000
144,-1200000000,Meiosis and sexual reproduction are present in...,https://en.wikipedia.org/wiki/Timeline_of_the_...,-1200000000,-1200000000
145,-1850000000,Eukaryotic cells appear. Eukaryotes contain me...,https://en.wikipedia.org/wiki/Timeline_of_the_...,-1850000000,-1850000000


In [7]:
df = scrape_table("https://en.wikipedia.org/wiki/Timeline_of_natural_history",
                  columns=['Started', 'Period'],
                  rename={'Started': 'date', 'Period': 'event'},
                  date_col='date',
                  to_next_date=True,
                  save='../data/timeline/wikipedia_timeline_of_natural_history.csv',
                  parse=True,
                  combine_to_master=True,
                  make_time_negative=True
)

df

Unnamed: 0,date,event,source,start_date,end_date
0,-3650000000.0,Siderian,https://en.wikipedia.org/wiki/Timeline_of_natu...,-2500000000,-2300000000
1,-3325000000.0,Rhyacian,https://en.wikipedia.org/wiki/Timeline_of_natu...,-2300000000,-2050000000
2,-2950000000.0,Orosirian,https://en.wikipedia.org/wiki/Timeline_of_natu...,-2050000000,-1800000000
3,-2600000000.0,Statherian,https://en.wikipedia.org/wiki/Timeline_of_natu...,-1800000000,-1600000000
4,-2300000000.0,Calymmian,https://en.wikipedia.org/wiki/Timeline_of_natu...,-1600000000,-1400000000
5,-2000000000.0,Ectasian,https://en.wikipedia.org/wiki/Timeline_of_natu...,-1400000000,-1200000000
6,-1700000000.0,Stenian,https://en.wikipedia.org/wiki/Timeline_of_natu...,-1200000000,-1000000000
7,-1360000000.0,Tonian,https://en.wikipedia.org/wiki/Timeline_of_natu...,-1000000000,-720000000
8,-1037500000.0,Cryogenian,https://en.wikipedia.org/wiki/Timeline_of_natu...,-720000000,-635000000
9,-905500000.0,Ediacaran,https://en.wikipedia.org/wiki/Timeline_of_natu...,-635000000,-541000000


In [8]:
df = scrape_table("https://en.wikipedia.org/wiki/Timeline_of_the_far_future",
                  columns=['Years from now', 'Event'],
                  rename={'Years from now': 'date', 'Event': 'event'},
                  date_col='date',
                  to_next_date=False,
                  save='../data/timeline/wikipedia_timeline_of_the_far_future.csv',
                  parse=True,
                  combine_to_master=True,
                  sort=None,
                  row_select=range(72),
                  _table_index=1,
                  make_time_negative=False
)

df

Unnamed: 0,date,event,source,start_date,end_date
0,10000,"If a failure of the Wilkes Subglacial Basin ""i...",https://en.wikipedia.org/wiki/Timeline_of_the_...,10000,10000
1,10000,The red supergiant star Antares will likely ha...,https://en.wikipedia.org/wiki/Timeline_of_the_...,10000,10000
2,15000,"According to the Sahara pump theory, the prece...",https://en.wikipedia.org/wiki/Timeline_of_the_...,15000,15000
3,25000,The northern Martian polar ice cap could reced...,https://en.wikipedia.org/wiki/Timeline_of_the_...,25000,25000
4,36000,The small red dwarf Ross 248 will pass within ...,https://en.wikipedia.org/wiki/Timeline_of_the_...,36000,36000
...,...,...,...,...,...
67,4500000,Mars reaches the same solar flux the Earth did...,https://en.wikipedia.org/wiki/Timeline_of_the_...,4500000,4500000
68,5400000,With the hydrogen supply exhausted at its core...,https://en.wikipedia.org/wiki/Timeline_of_the_...,5400000,5400000
69,6500000,Mars reaches the same solar radiation flux as ...,https://en.wikipedia.org/wiki/Timeline_of_the_...,6500000,6500000
70,7500000,Earth and Mars may become tidally locked with ...,https://en.wikipedia.org/wiki/Timeline_of_the_...,7500000,7500000


<br><br>

- - - 
- - - 

<br>

## Split Table

In [10]:
df = pd.read_csv('../data/master_table.csv', sep=';')

In [11]:
rangedf = df.loc[df['start_date'] != df['end_date']]

rangedf.to_csv('../data/master_range_table.csv', sep=';')

In [12]:
eventdf = df.loc[df['start_date'] == df['end_date']]

eventdf.to_csv('../data/master_event_table.csv', sep=';')

<br><br>

- - - 
- - - 

<span style='font-size:40px;font-weight:650'>
    END
</span>