In [1]:
# importing required libraries
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import sqlite3

In [2]:
# Open (or create) the SQLite database file and get a cursor object to execute SQL queries
conn = sqlite3.connect("python_doc.db")        # The database file is named 'python_doc.db'; you can change it.
c = conn.cursor()

In [3]:
# A few module pages picked from the official Python documentation.
urls = [
    "https://docs.python.org/3/library/fractions.html",
    "https://docs.python.org/3/library/random.html",
    "https://docs.python.org/3/library/statistics.html",
]

# Names and source links of the modules whose table was actually written;
# both lists are appended to in lockstep so they can be zipped into one table.
modules = []
git_modules = []

for url in urls:
    # Download and parse the page. The context manager closes the HTTP
    # response once BeautifulSoup has consumed it.
    with urllib.request.urlopen(url) as page:
        soup = bs(page, "lxml")  # 'lxml' parser; omit the argument to use BeautifulSoup's default parser

    # Module name: first whitespace-separated token of the page's <h1>.
    heading = soup.body.find("h1").text
    module = heading.split()[0]
    # Heading without its last character, used in the log messages.
    # NOTE(review): presumably strips the trailing permalink marker - confirm.
    title = heading[:-1]

    # First "reference external" link on the page.
    # NOTE(review): assumed to be the link to the module's source code - confirm
    # for pages laid out differently. Pages without such a link record None
    # instead of aborting the whole run with an IndexError.
    external_links = soup.body.find_all("a", class_="reference external")
    source = external_links[0]["href"] if external_links else None

    # All documented names: ids of the form "<module>.<dotted.name>" found in
    # the <dt> elements. The module name and the dot are matched literally and
    # dots are allowed in the rest, so "fractions.Fraction.from_float" is kept
    # whole instead of being cut down to "fractions.Fraction".
    names = soup.body.find_all("dt")
    name_pattern = r'id="({}\.[\w.]+)'.format(re.escape(module))
    function_names = tuple(re.findall(name_pattern, str(names)))

    # All the corresponding usage texts: every <dd>, flattened onto one line.
    description = soup.body.find_all("dd")
    function_usage = tuple(item.text.replace("\n", " ") for item in description)

    print(module, "function_names length: ", len(function_names))
    print(module, "function_usages length: ", len(function_usage))

    # Names and usages are paired purely by position, so a page is only usable
    # when both sequences have the same length.
    # NOTE(review): equal lengths do not prove that every <dd> belongs to the
    # <dt> at the same position - verify the pairing if accuracy matters.
    if len(function_names) != len(function_usage):
        print("\nParser detected that function_names and function_usage of '" + title + "' webpage does not have corresponding entries for each other, hence skipped that page\n")
        print("-" * 90)
        continue

    # One table per module, named after the module; an existing table is replaced.
    dataframe = pd.DataFrame({"function_name": function_names, "function_usage": function_usage})
    with conn:
        dataframe.to_sql(module, conn, if_exists="replace", index=False)

    # Record the module only after its table has been written.
    modules.append(module)
    git_modules.append(source)
    print("\n'" + title + "' data-table created successfully\n")
    print("-" * 90)

fractions function_names length:  10
fractions function_usages length:  11

Parser detected that function_names and function_usage of 'fractions — Rational numbers' webpage does not have corresponding entries for each other, hence skipped that page

------------------------------------------------------------------------------------------
random function_names length:  25
random function_usages length:  25

'random — Generate pseudo-random numbers' data-table created successfully

------------------------------------------------------------------------------------------
statistics function_names length:  30
statistics function_usages length:  30

'statistics — Mathematical statistics functions' data-table created successfully

------------------------------------------------------------------------------------------


In [4]:
# Build the table that lists every stored module next to its source-code link,
# write it to the database and commit all the changes.
module_columns = {"module": modules, "source_code": git_modules}
mod_frame = pd.DataFrame(module_columns)

with conn:
    # Replace any existing "module_list" table; the DataFrame index is not stored.
    mod_frame.to_sql("module_list", conn, if_exists="replace", index=False)
    conn.commit()