In [1]:
import time
import re
from selenium import webdriver

## Set Options

In [2]:
options = webdriver.FirefoxOptions()
options.add_argument('--headless')
webpage =  webdriver.Firefox(options=options, executable_path='./drivers/geckodriver')

## Set Url

In [3]:
# index url
urlpage = 'https://docs.oracle.com/en/solutions/index.html?type=reference-architectures&page=0&is=true&sort=0' 
# get web page
webpage.get(urlpage)

## Search for webpages links

In [4]:
# execute script to scroll down the page
for i in range(10):
    webpage.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    # sleep for 10s
    time.sleep(3)

In [5]:
all_solution_links = webpage.find_elements_by_class_name("solutionurl")
print(f"Total links founded: {len(all_solution_links)}")

Total links founded: 79


In [6]:
url_architecture = []
for href in all_solution_links:
    url_architecture.append(href.get_attribute('href'))

## Single URL Loop - Simple
note: You can change ':' or '\n' delimiter

## Single URL Loop - regex version

In [7]:
json_out = {}

for link in url_architecture:
    
    try:
        # url
        single_url_arch = link
        # get web page
        webpage.get(single_url_arch)
    
        # execute script to scroll down the page
        for i in range(10):
            
            webpage.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
            
            # sleep for 10s
            time.sleep(1)
            
        #title
        title = webpage.find_elements_by_tag_name("h1")[0].text
            
        #sections
        sections = webpage.find_elements_by_tag_name("div.sect2")
        
        # in most cases [0] first section from all div.sect2 refers to architecture content.
        architecture_content = sections[0]
        
        # get first image
        image_link = architecture_content.find_elements_by_tag_name('img')[0].get_attribute('src')
            
        # get first content unordered list
        ul = architecture_content.find_elements_by_tag_name('ul')[0]
        # get all li elements
        all_li = ul.find_elements_by_tag_name('li')
        
        # create conten list
        content_list = [re.search("(.)+(:|\n)", li.text)[0] for li in all_li]
            
        # combine_all
        architecture_json = {'title':title, 'content': content_list, 'img_link':image_link,}
        
        #output
        json_out[str(link)] = architecture_json
    
    except:
        print(link)

https://docs.oracle.com/en/solutions/cobol-mainframe-to-oci/index.html
https://docs.oracle.com/en/solutions/mainframe-to-oke/index.html
https://docs.oracle.com/en/solutions/wls-on-oke-marketplace/index.html
https://docs.oracle.com/en/solutions/peoplesoft-fortinet-oci/index.html
https://docs.oracle.com/en/solutions/deploy-websphere-on-oci/index.html
https://docs.oracle.com/en/solutions/hybrid-dns/index.html
https://docs.oracle.com/en/solutions/ocvs-oci/index.html
https://docs.oracle.com/en/solutions/ofsaa-oci/index.html
https://docs.oracle.com/en/solutions/db-sharding-single-region/index.html
https://docs.oracle.com/en/solutions/xregion-pvt-connectivity-oci/index.html
https://docs.oracle.com/en/solutions/flexcube-oci/index.html
https://docs.oracle.com/en/solutions/oci-ellucian/index.html


In [8]:
webpage.close

<bound method WebDriver.close of <selenium.webdriver.firefox.webdriver.WebDriver (session="958e7c18-23ed-43f3-b65f-0452115bd246")>>

## Save json

In [9]:
import json

In [12]:
a_file = open("architecture_links.json", "w")

json.dump(json_out, a_file)

a_file.close()

## Save csv

In [75]:
import pandas as pd

In [141]:
df = pd.read_json('architecture_links.json')

In [142]:
df = df.T

In [143]:
df = df.reset_index()

In [144]:
ext_list = []
for cont in df.content:
    int_list = []
    for item in cont:
        int_list.append(item[:-1])
    ext_list.append(int_list)

In [145]:
df.content = ext_list

In [146]:
df.head()

Unnamed: 0,index,title,content,img_link
0,https://docs.oracle.com/en/solutions/implement...,Implement a custom error page for a load balan...,"[Region, Availability domains, Virtual cloud n...",https://docs.oracle.com/en/solutions/implement...
1,https://docs.oracle.com/en/solutions/migrate-t...,Migrate an on-premises Oracle Database deploym...,"[On-premises deployment, Region, Availability ...",https://docs.oracle.com/en/solutions/migrate-t...
2,https://docs.oracle.com/en/solutions/migrate-t...,Migrate an on-premises Oracle Database deploym...,"[On-premises deployment, Region, Availability ...",https://docs.oracle.com/en/solutions/migrate-t...
3,https://docs.oracle.com/en/solutions/migrate-e...,Migrate an on-premises Oracle Database deploym...,"[On-premises deployment, Region, Availability ...",https://docs.oracle.com/en/solutions/migrate-e...
4,https://docs.oracle.com/en/solutions/private-d...,Use private DNS in your VCN,"[Region, Availability domains, Virtual cloud n...",https://docs.oracle.com/en/solutions/private-d...


In [147]:
df = df.explode('content')

In [149]:
df.to_csv('architecture_links.csv', index_label='pandas_ind')

## Deep learning Data Frame

In [169]:
df = df[['img_link', 'content']]

In [170]:
df['exist'] = 1

In [177]:
df = df.pivot_table(index = 'img_link', columns='content', values='exist', aggfunc='sum', fill_value=0)

In [185]:
df.to_csv('architecture_deep_learning.csv')