In [1]:
from datetime import datetime
import lxml
from lxml import html
import requests
import numpy as np
import pandas as pd

In [2]:
def get_page(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the ``requests`` response.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed straight to
        ``requests.get``. Without a timeout a stalled connection would block
        the notebook kernel indefinitely.

    Returns
    -------
    requests.Response
        The response object; the HTTP status is not checked here.
    """
    return requests.get(url, timeout=timeout)

In [3]:
def parse_rows(table_rows, max_missing_cells=3):
    """Convert scraped table-row elements into a DataFrame of cell text.

    Every row element is queried with ``./div`` for its cells, and every cell
    with ``.//span/text()[1]`` for its text. A cell that does not yield exactly
    one text node is recorded as NaN.

    Parameters
    ----------
    table_rows : iterable
        Elements exposing an lxml-style ``xpath`` method, one per table row.
    max_missing_cells : int, optional
        Rows with more than this many NaN cells are dropped. The default of 3
        keeps the original rule (fewer than 4 missing cells).

    Returns
    -------
    pandas.DataFrame
        One row per kept table row, one column per cell position. Empty when
        no row qualifies.
    """
    parsed_rows = []

    for table_row in table_rows:
        parsed_row = []
        missing_cells = 0

        for cell in table_row.xpath("./div"):
            texts = cell.xpath('.//span/text()[1]')
            if len(texts) == 1:
                parsed_row.append(texts[0])
            else:
                # Zero or several matches: treat the cell as missing.
                # np.nan is used because the np.NaN alias no longer exists
                # in NumPy 2.0.
                parsed_row.append(np.nan)
                missing_cells += 1

        if missing_cells <= max_missing_cells:
            parsed_rows.append(parsed_row)

    return pd.DataFrame(parsed_rows)

In [4]:
def clean_data(df):
    """Reshape a parsed statement table and convert its figures to floats.

    The input has one row per line item with the item label in column ``0``.
    The frame is transposed so that each label becomes a column, the first
    column is renamed ``'Date'``, and every other column has its thousands
    separators stripped and is converted to ``float64``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with integer column labels, column ``0`` holding the row labels.

    Returns
    -------
    pandas.DataFrame
        Transposed frame whose first column is ``'Date'`` (left as parsed) and
        whose remaining columns are ``float64``.

    Raises
    ------
    ValueError
        If a value in a numeric column cannot be parsed as a float once the
        commas are removed.
    """
    transposed = df.set_index(0).transpose()

    cols = list(transposed.columns)
    cols[0] = 'Date'

    # Columns are handled by position because line-item labels are not
    # guaranteed to be unique.
    converted = [transposed.iloc[:, 0]]
    for column_index in range(1, len(cols)):
        column = transposed.iloc[:, column_index]
        converted.append(
            column.str.replace(',', '', regex=False).astype(np.float64)
        )

    # Build a new frame rather than assigning through ``.iloc``: recent pandas
    # may write such assignments into the existing object-dtype column, so the
    # float dtype would not stick. Assigning ``columns`` directly also avoids
    # ``set_axis(..., inplace=...)``, which pandas 2.0 no longer accepts.
    cleaned = pd.DataFrame(dict(enumerate(converted)))
    cleaned.columns = cols
    return cleaned

In [4]:
def scrape_table(url):
    """Download a statement page and return its table as a cleaned DataFrame.

    The page is fetched with ``get_page``, the row elements are selected with
    the XPath ``//div[contains(@class, 'D(tbr)')]``, and the result is passed
    through ``parse_rows`` and then ``clean_data``.

    Parameters
    ----------
    url : str
        Address of the page containing the table.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``clean_data``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    page = get_page(url)
    # Fail fast on an error response instead of parsing an error page and
    # failing later with an unrelated exception.
    page.raise_for_status()

    tree = html.fromstring(page.content)
    table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")
    return clean_data(parse_rows(table_rows))

In [5]:
# Ticker used by the single-statement example cells.
symbol = 'AAPL'
df_balance_sheet = scrape_table(
    f'https://finance.yahoo.com/quote/{symbol}/balance-sheet?p={symbol}'
)

NameError: name 'clean_data' is not defined

In [None]:
# Show the scraped balance-sheet frame as this cell's output.
df_balance_sheet

In [None]:
# Income statement: served from the "financials" page for the ticker.
df_income_statement = scrape_table(
    f'https://finance.yahoo.com/quote/{symbol}/financials?p={symbol}'
)


In [None]:
# Show the scraped income-statement frame as this cell's output.
df_income_statement

In [None]:
# Cash-flow statement for the same ticker.
df_cashflow_statement = scrape_table(
    f'https://finance.yahoo.com/quote/{symbol}/cash-flow?p={symbol}'
)


In [None]:
# Show the scraped cash-flow frame as this cell's output.
df_cashflow_statement

### Define one function that scrapes everything and puts it in a single DataFrame for a given ticker

In [None]:
def scrape(symbol):
    """Scrape the three financial statements for ``symbol`` into one frame.

    The balance-sheet, financials and cash-flow pages are fetched in that
    order through ``scrape_table``, each indexed by ``'Date'``, and outer-joined
    on ``'Date'``. Overlapping column names coming from the right-hand frame
    receive the suffix ``' - Income Statement'`` or ``' - Cash Flow'``.
    Columns that are entirely NaN are dropped, the index is reset, and a
    ``'Symbol'`` column is inserted at position 1.

    Progress messages are printed before and after the scrape.
    """
    print('Attempting to scrape data for ' + symbol)

    quote_url = 'https://finance.yahoo.com/quote/' + symbol
    query = '?p=' + symbol

    def fetch_statement(page_name):
        # One statement page, indexed by its 'Date' column.
        statement = scrape_table(quote_url + '/' + page_name + query)
        return statement.set_index('Date')

    balance_sheet = fetch_statement('balance-sheet')
    income_statement = fetch_statement('financials')
    cash_flow = fetch_statement('cash-flow')

    with_income = balance_sheet.join(
        income_statement, on='Date', how='outer', rsuffix=' - Income Statement'
    )
    with_cash_flow = with_income.join(
        cash_flow, on='Date', how='outer', rsuffix=' - Cash Flow'
    )
    df_joined = with_cash_flow.dropna(axis=1, how='all').reset_index()

    df_joined.insert(1, 'Symbol', symbol)
    print('Successfully scraped data for ' + symbol)
    return df_joined
    

In [None]:
# Run the combined scrape for the symbol 'RELIANCE.NS'.
financial_data_reliance = scrape('RELIANCE.NS')

In [None]:
# Show the combined frame; the notebook only ever assigns
# `financial_data_reliance`, so that is the name to display.
financial_data_reliance

### Scrape for a list of symbols

In [None]:
def scrape_multi_symbols(symbols):
    """Scrape several symbols and stack the results into one DataFrame.

    Parameters
    ----------
    symbols : iterable of str, or str
        Ticker symbols to scrape. A single string is treated as one symbol
        rather than being iterated character by character.

    Returns
    -------
    pandas.DataFrame
        The per-symbol frames from ``scrape`` concatenated row-wise with
        ``sort=False``; each frame keeps its own row index. An empty
        DataFrame is returned when ``symbols`` is empty.
    """
    if isinstance(symbols, str):
        symbols = [symbols]

    frames = [scrape(symbol) for symbol in symbols]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()

    return pd.concat(frames, sort=False)

In [None]:
# Scrape MSFT and TSLA; the combined frame is this cell's output.
scrape_multi_symbols(['MSFT','TSLA'])