In [1]:
!pip install pdfplumber



In [2]:
import io
import re
import time
import requests
import pdfplumber
import numpy as np
import pandas as pd
import urllib.request
from io import BytesIO
from bs4 import BeautifulSoup

In [3]:
# Open website home page

URL = 'https://explore-datascience.net/'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

# Extract all urls from the website page

def get_urls(soup):
    """
    Function returns a list of all url links from a BeautifulSoup object.
    """
    urls = []
    for link in soup.find_all('a'):
        if link.get('href') != None:
            urls.append(link.get('href'))
    urls = [x for x in urls if 'https' in x]
    urls = list(set(urls))
    return urls

urls_list = get_urls(soup)

In [4]:
def website_links(urls_list):
    urls = [x for x in urls_list if '.pdf' not in x and 'facebook' not in x and 'twitter' not in x and 'linkedin' not in x]
    pdfs = [x for x in urls_list if '.pdf' in x]
    new_urls = []
    for i in range(len(urls)):
        page = requests.get(urls[i])
        soup = BeautifulSoup(page.content, 'html.parser')
        pg_urls = get_urls(soup)
        if len(pg_urls) > 0:
            pdfs.extend([x for x in pg_urls if '.pdf' in x and x not in pdfs])
            new_urls.extend([x for x in pg_urls if x not in urls 
                             and x not in new_urls 
                             and x not in pdfs 
                             and 'https://explore-datascience.net' in x])
    urls.extend(new_urls)
    return urls, len(new_urls), pdfs

In [5]:
# Extract all urls and pdfs from the website.

urls, new_urls, pdfs = website_links(urls_list)
if new_urls > 0:
    urls, new_urls, pdfs = website_links(urls_list)

In [6]:
urls

['https://explore-datascience.net/course/info/1/3',
 'https://explore-datascience.net/alumni/hire-explorer',
 'https://explore-datascience.net/course/info/40/5/online',
 'https://explore-datascience.net/course/info/4/3',
 'https://explore-datascience.net/course/info/53/5/online',
 'https://explore-datascience.net/course/info/4/3/1',
 'https://explore-datascience.net/',
 'https://explore-datascience.net/course/info/39/5/online',
 'https://explore-datascience.net/enterprise/summary',
 'https://explore-datascience.net/course/info/41/5/online',
 'https://explore-datascience.net/course/info/42/5/online',
 'https://explore-datascience.net/about-us/our-values',
 'https://explore-datascience.net/course/info/1/3/1',
 'https://explore-datascience.net/course/info/34/3/online',
 'https://explore-datascience.net/course/info/35/4',
 'https://explore-datascience.net/course/info/43/5/online',
 'https://explore-datascience.net/course/info/4/3/online',
 'https://explore-datascience.net/course/info/3/3/1

In [7]:
# Scrape text data from the website (this excludes the pdfs)

for i in range(len(urls)):
    page = requests.get(urls[i])
    soup = BeautifulSoup(page.content, 'html.parser')
    name = str(urls[i]).replace('https://explore-datascience.net', 'explore').replace('/', '_')
    f = open("{}.txt".format(name), "a+", encoding="utf-8")
    f.write('{}\n'.format(str(urls[i])))
    for items in soup.find_all():
        all_text = [item.text for item in items.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'a', 'p'])]
        for j in all_text:
            f.write('{}\n'.format(j))
    f.close()
    print('Completed {0}: {1} of {2}'.format(name, i+1, len(urls)))

Completed explore_course_info_1_3: 1 of 40
Completed explore_alumni_hire-explorer: 2 of 40
Completed explore_course_info_40_5_online: 3 of 40
Completed explore_course_info_4_3: 4 of 40
Completed explore_course_info_53_5_online: 5 of 40
Completed explore_course_info_4_3_1: 6 of 40
Completed explore_: 7 of 40
Completed explore_course_info_39_5_online: 8 of 40
Completed explore_enterprise_summary: 9 of 40
Completed explore_course_info_41_5_online: 10 of 40
Completed explore_course_info_42_5_online: 11 of 40
Completed explore_about-us_our-values: 12 of 40
Completed explore_course_info_1_3_1: 13 of 40
Completed explore_course_info_34_3_online: 14 of 40
Completed explore_course_info_35_4: 15 of 40
Completed explore_course_info_43_5_online: 16 of 40
Completed explore_course_info_4_3_online: 17 of 40
Completed explore_course_info_3_3_1: 18 of 40
Completed explore_contact: 19 of 40
Completed explore_course_info_1_3_online: 20 of 40
Completed explore_course_info_34_3: 21 of 40
Completed explore_

In [8]:
for i in pdfs:
    print(i)

https://explore-datascience.net/pdf/Data Science 2021 (1).pdf
https://explore-datascience.net/pdf/Visualisation Prospectus 2020.pdf
https://explore-datascience.net/pdf/Data Analytics 2021 (1).pdf
https://explore-datascience.net/pdf/Introduction to Data Science Prospectus 2021 [v1.0].pdf
https://explore-datascience.net/pdf/Python for Data Science Prospectus 2020.pdf
https://explore-datascience.net/pdf/Business Intelligence 2021 (1).pdf
https://explore-datascience.net/pdf/Data Visualisation Prospectus 2020.pdf
https://explore-datascience.net/pdf/Data Engineering 2021 (1).pdf
https://explore-datascience.net/pdf/SQL Prospectus 2020.pdf
https://explore-datascience.net/pdf/Machine Learning Prospectus 2020.pdf


In [9]:
pdfs = ['https://explore-datascience.net/pdf/EDSA_Course_Outline.pdf?12.4',
 'https://explore-datascience.net/pdf/Data_Analytics.pdf',
 'https://explore-datascience.net/pdf/EDSA_Course_Outline.pdf',
 'https://explore-datascience.net/pdf/Data_Science.pdf',
 'https://explore-datascience.net/pdf/Full_Syllbus_DS_for_Executives_Mar_2020.pdf',
 'https://explore-datascience.net/pdf/Machine_Learning_Analysts_Short.pdf',
 'https://explore-datascience.net/pdf/careers/Senior_Data_Scientist.pdf',
 'https://explore-datascience.net/pdf/careers/Senior_Data_Engineer.pdf',
 'https://explore-datascience.net/pdf/Machine_Learning_for_Analysts.pdf',
 'https://explore-datascience.net/pdf/Data_Engineering.pdf',
 'https://explore-datascience.net/pdf/Full_Syllbus_DS_for_Managers_Mar_2020.pdf',
 'https://explore-datascience.net/pdf/Data_Science_Managers_Short.pdf',
 'https://explore-datascience.net/pdf/Advanced_Python_Short.pdf',
 'https://explore-datascience.net/pdf/Explore_Course_Catalogue.pdf',
 'https://explore-datascience.net/pdf/Advanced_Visualisation_Short.pdf',
 'https://explore-datascience.net/pdf/Data_Science_High_School_Short.pdf',
 #'https://explore-datascience.net/pdf/aws_cloud_practitioner_short.pdf',
 'https://explore-datascience.net/pdf/Deep_Learning_AI.pdf',
 'https://explore-datascience.net/pdf/Insights_Led_Organisation.pdf',
 'https://explore-datascience.net/pdf/How to Structure Your Data Science Capability.pdf',
 'https://explore-datascience.net/pdf/Investing_in_LandD.pdf',
 'https://explore-datascience.net/pdf/A Data Science Team.pdf',
 'https://explore-datascience.net/pdf/Ogranization_Data_Maturity.pdf',
 'https://explore-datascience.net/pdf/Machine_Learning_for_Analysts_Short.pdf']

In [10]:
# Scrape text data from the PDFs
for i in pdfs:
    rq = requests.get(i)
    pdf = pdfplumber.open(BytesIO(rq.content))
    name = "".join(re.findall(r'pdf/(.*?).pdf', str(i)))
    name = name.replace('careers/', '')
    myfile = io.open(name + ".txt", "w", encoding="utf-8")
    for i in range(len(pdf.pages)):
        p = pdf.pages[i]
        text = p.extract_text()
        myfile.write(str(text)+"\n ")
    myfile.close()
    print(name)

EDSA_Course_Outline
Data_Analytics
EDSA_Course_Outline
Data_Science
Full_Syllbus_DS_for_Executives_Mar_2020
Machine_Learning_Analysts_Short
Senior_Data_Scientist
Senior_Data_Engineer
Machine_Learning_for_Analysts
Data_Engineering
Full_Syllbus_DS_for_Managers_Mar_2020
Data_Science_Managers_Short
Advanced_Python_Short
Explore_Course_Catalogue
Advanced_Visualisation_Short
Data_Science_High_School_Short
Deep_Learning_AI
Insights_Led_Organisation
How to Structure Your Data Science Capability
Investing_in_LandD
A Data Science Team
Ogranization_Data_Maturity
Machine_Learning_for_Analysts_Short
