# Importing stock data

from: https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781787123137/15/ch15lvl1sec123/obtaining-and-organizing-stock-data-from-google

In [2]:
import numpy as np # linear algebra
import pandas as pd # pandas for dataframe based data processing and CSV file I/O
import requests # for http requests
from bs4 import BeautifulSoup # for html parsing and scraping
import bs4
from fastnumbers import isfloat 
from fastnumbers import fast_float
from multiprocessing.dummy import Pool as ThreadPool 

import matplotlib.pyplot as plt
import seaborn as sns
import json
from tidylib import tidy_document # for tidying incorrect html

sns.set_style('whitegrid')
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


String to float conversion

In [3]:
def ffloat(string):
    if string is None:
        return np.nan
    if type(string)==float or type(string)==np.float64:
        return string
    if type(string)==int or type(string)==np.int64:
        return string
    return fast_float(string.split(" ")[0].replace(',','').replace('%',''),
                      default=np.nan)

# Another way to do it

# def ffloat_list(string_list):
#    return list(map(ffloat,string_list))

maintain consistency

In [4]:
def remove_multiple_spaces(string):
    if type(string)==str:
        return ' '.join(string.split())
    return string

In [15]:
response = requests.get("https://www.investing.com/", timeout=240)
response.status_code
response.content

403

b'\n<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n<html>\n  <head>\n    <title>403 You are banned from this site.  Please contact via a different client configuration if you believe that this is a mistake.</title>\n  </head>\n  <body>\n    <h1>Error 403 You are banned from this site.  Please contact via a different client configuration if you believe that this is a mistake.</h1>\n    <p>You are banned from this site.  Please contact via a different client configuration if you believe that this is a mistake.</p>\n    <h3>Guru Meditation:</h3>\n    <p>XID: 3274123960</p>\n    <hr>\n    <p>Varnish cache server</p>\n  </body>\n</html>\n'

In [19]:
!vcgencmd measure_temp

temp=37.6'C


In [17]:
url = "https://www.investing.com/"
response = requests.get(url, timeout=240)
response.status_code
response.json()

content = response.json()
content.keys()

403

JSONDecodeError: Expecting value: line 2 column 1 (char 1)

Parsing & Traversing HTML

In [7]:
from IPython.core.display import HTML
HTML("<b>Rendered HTML</b>")

In [9]:
response = requests.get("https://www.moneycontrol.com/india/stockpricequote/auto-2-3-wheelers/heromotocorp/HHM", timeout=240)
page_content = BeautifulSoup(response.content, "html.parser")
HTML(str(page_content.find("h3")))

In [9]:
response = requests.get("https://www.moneycontrol.com/india/stockpricequote/auto-2-3-wheelers/heromotocorp/HHM", timeout=240)
content = BeautifulSoup(response.content, "html.parser")
price_div = content.find("div",attrs={"id":'b_changetext'})
HTML(str(price_div))

In [14]:
response = requests.get("https://www.investing.com", timeout=240)
content = BeautifulSoup(response.content, "html.parser")
price_div = content.find("table",attrs={"id":'lastNum pid-8839-last greenBg'})
HTML(str(price_div))

#<td class="lastNum pid-8839-last greenBg" id="sb_last_8839">2,824.88</td>

In [19]:
list(price_div.children)


[' ', <span class="gr_15 uparw_pc"><strong>15.80</strong></span>, ' (+0.61%)']

In [18]:
def get_children(html_content):
    return [item for item in html_content.children if type(item)==bs4.element.Tag or len(str(item).replace("\n","").strip())>0]

Step 1: Initialise final row_data as empty list.
Step 2: Get all rows in a list
Step 3: For each row in the list of rows
        - Initialise current_row_data as empty list
        - Get a list of cells in the row.
        - For each cell get its text content
          # if no text content present skip to next cell 
          # else put the text content into current_row_data
        - Put current_row_data into row_data
Step 4: return row_data

In [12]:
html = '''
<table>
    <tr>
        <td>Month</td>
        <td>Price</td>
    </tr>
    <tr>
        <td>July</td>
        <td>2</td>
    </tr>
    <tr>
        <td>August</td>
        <td>4</td>
    </tr>
    <tr>
        <td>September</td>
        <td>3</td>
    </tr>
    <tr>
        <td>October</td>
        <td>2</td>
    </tr>
</table>
'''
HTML(html)

0,1
Month,Price
July,2
August,4
September,3
October,2


In [13]:
def get_table_simple(table,is_table_tag=True):
    elems = table.find_all('tr') if is_table_tag else get_children(table)
    table_data = list()
    for row in elems:
        row_data = list()
        row_elems = get_children(row)
        for elem in row_elems:
            text = elem.text.strip().replace("\n","")
            text = remove_multiple_spaces(text)
            if len(text)==0:
                continue
            row_data.append(text)
        table_data.append(row_data)
    return table_data

In [14]:
html = BeautifulSoup(html,"html.parser")
get_table_simple(html)

[['Month', 'Price'],
 ['July', '2'],
 ['August', '4'],
 ['September', '3'],
 ['October', '2']]

In [15]:
html = '''
<html>
<body>
<div id="table" class="FL" style="width:210px; padding-right:10px">
    <div class="PA7 brdb">
        <div class="FL gL_10 UC">MARKET CAP (Rs Cr)</div>
        <div class="FR gD_12">63,783.84</div>
        <div class="CL"></div>
    </div>
    <div class="PA7 brdb">
        <div class="FL gL_10 UC">P/E</div>
        <div class="FR gD_12">17.27</div>
        <div class="CL"></div>
    </div>
    <div class="PA7 brdb">
        <div class="FL gL_10 UC">BOOK VALUE (Rs)</div>
        <div class="FR gD_12">589.29</div>
        <div class="CL"></div>
    </div>
    <div class="PA7 brdb">
        <div class="FL gL_10 UC">DIV (%)</div>
        <div class="FR gD_12">4750.00%</div>
        <div class="CL"></div>
    </div>
    <div class="PA7 brdb">
        <div class="FL gL_10 UC">Market Lot</div>
        <div class="FR gD_12">1</div>
        <div class="CL"></div>
    </div>
    <div class="PA7 brdb">
        <div class="FL gL_10 UC">INDUSTRY P/E</div>
        <div class="FR gD_12">19.99</div>
        <div class="CL"></div>
    </div>
</div>
</body>
</html>
'''
HTML(html)
content = BeautifulSoup(html,"html.parser")
get_table_simple(content.find("div",attrs={"id":"table"}),is_table_tag=False)

[['MARKET CAP (Rs Cr)', '63,783.84'],
 ['P/E', '17.27'],
 ['BOOK VALUE (Rs)', '589.29'],
 ['DIV (%)', '4750.00%'],
 ['Market Lot', '1'],
 ['INDUSTRY P/E', '19.99']]

In [40]:
def get_scrip_info(url):
    original_url = url
    key_val_pairs = {}
    
    page_response = requests.get(url, timeout=240)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    price = ffloat(page_content.find('div',attrs={'id':'Nse_Prc_tick_div'}).text)
    name = page_content.find('h1',attrs={'class':'company_name'}).text

    yearly_high = page_content.find('span',attrs={'id':'n_52high'}).text.strip()
    yearly_low = page_content.find('span',attrs={'id':'n_52low'}).text.strip()
    html_data_content = page_content.find('div', attrs={'id': 'mktdet_1'})

    petable = get_table_simple(get_children(html_data_content)[0],is_table_tag=False)
    pbtable = get_table_simple(get_children(html_data_content)[0],is_table_tag=False)
    volume = ffloat(page_content.find('span',attrs={'id':'nse_volume'}).text)


    data_table = list()
    data_table.extend(petable)
    data_table.extend(pbtable)

    collector = {row[0]:ffloat(row[1]) if len(row)==2 else None for row in data_table}


    key_val_pairs["pe"] = collector['P/E']
    key_val_pairs["book_value"] = collector['BOOK VALUE (Rs)']
    key_val_pairs["deliverables"] = collector['DIV (%)']
    if 'MARKET CAP (Rs Cr)' in collector:
        key_val_pairs["market_cap"] = collector['MARKET CAP (Rs Cr)']
    elif '**MARKET CAP (Rs Cr)' in collector:
        key_val_pairs["market_cap"] = collector['**MARKET CAP (Rs Cr)']
    key_val_pairs["pb"] = collector['BOOK VALUE (Rs)']
    key_val_pairs['price'] = price
    key_val_pairs['volume'] = volume
    key_val_pairs["yearly_low"] = ffloat(yearly_low)
    key_val_pairs["yearly_high"] = ffloat(yearly_high)
    return key_val_pairs

get_scrip_info("https://www.moneycontrol.com/india/stockpricequote/auto-2-3-wheelers/heromotocorp/HHM")

{'book_value': 589.25,
 'deliverables': 4750.0,
 'market_cap': 51849.1,
 'pb': 589.25,
 'pe': 14.31,
 'price': 2574.45,
 'volume': 555742.0,
 'yearly_high': 3825.0,
 'yearly_low': 2537.05}