# Scrape the DataQuest Data Scientist path page & get the curriculum

### The page to scrape is as below:
<img src="dashboard.png">

* First, we need to log in to DataQuest to access the dashboard
* Then click on each step, and expand the dropdown of each course, to scrape the names of the missions in each step
* The login and clicking on steps will be automated using Chromedriver and Selenium

In [1]:
from selenium import webdriver

In [7]:
# Open the Data Scientist path dashboard in Chrome and log in.
#
# Credentials are deliberately NOT stored in this notebook. They are read from
# the DATAQUEST_EMAIL / DATAQUEST_PASSWORD environment variables, and prompted
# for interactively when those variables are unset or empty.
import os
from getpass import getpass

url = 'https://app.dataquest.io/dashboard'

login_email = os.environ.get('DATAQUEST_EMAIL') or input('Dataquest email: ')
login_password = os.environ.get('DATAQUEST_PASSWORD') or getpass('Dataquest password: ')

# The chromedriver location can be overridden with CHROMEDRIVER_PATH; the
# default is the path this notebook was originally written against.
chromedriver_path = os.environ.get('CHROMEDRIVER_PATH', '/Users/veratsien/tools/chromedriver')
driver = webdriver.Chrome(chromedriver_path)

# Login with email & password
driver.get(url)
email = driver.find_element_by_name('email')
email.clear()
email.send_keys(login_email)

pswd = driver.find_element_by_name('password')
pswd.clear()
pswd.send_keys(login_password)

driver.find_element_by_css_selector('button[type=submit]').click()

In [30]:
# Locate the step tabs of the path: every element on the page carrying the
# 'PathStepTab__root-compact' class (the result is a list, one entry per match)
paths = driver.find_elements_by_class_name('PathStepTab__root-compact')

In [31]:
# Nested result: {step name: {course name: [mission names]}}
curriculum = {}

# Visit every step tab, expand each course dropdown in it and read its missions
for position, step_tab in enumerate(paths):
    # The first tab is the starting point; it is not clicked, to avoid a
    # stale element error
    if position > 0:
        step_tab.click()

    step_name = step_tab.text

    # All course dropdown toggles currently on the page
    toggles = driver.find_elements_by_class_name('DqCollapse__left')

    # Missions of this step, keyed by course name
    step_courses = {}

    for toggle in toggles:
        # Expand the course and read its title from the link inside the toggle
        toggle.click()
        title = toggle.find_element_by_tag_name('a').text

        # First 'dq-list-none' element on the page; its <li> items are the missions
        mission_container = driver.find_element_by_class_name('dq-list-none')
        items = mission_container.find_elements_by_tag_name('li')
        step_courses[title] = [item.text for item in items]

        # Collapse the course again before moving on to the next one
        toggle.click()

    curriculum[step_name] = step_courses

In [32]:
# Display the scraped {step: {course: [missions]}} dictionary
curriculum

{'Step 1': {'Python for Data Science: Fundamentals': ['Programming in Python',
   'Variables and Data Types',
   'Lists and For Loops',
   'Conditional Statements',
   'Dictionaries and Frequency Tables',
   'Functions: Fundamentals',
   'Functions: Intermediate',
   'Project: Learn and Install Jupyter Notebook',
   'Guided Project: Profitable App Profiles for the App Store and Google Play Markets'],
  'Python for Data Science: Intermediate': ['Cleaning and Preparing Data in Python',
   'Python Data Analysis Basics',
   'Object-Oriented Python',
   'Working with Dates and Times in Python',
   'Guided Project: Exploring Hacker News Posts']},
 'Step 2': {'Pandas and NumPy Fundamentals': ['Introduction to NumPy',
   'Boolean Indexing with NumPy',
   'Introduction to pandas',
   'Exploring Data with pandas: Fundamentals',
   'Exploring Data with pandas: Intermediate',
   'Data Cleaning Basics',
   'Guided Project: Exploring Ebay Car Sales Data'],
  'Exploratory Data Visualization': ['Line 

In [96]:
# Flatten the nested curriculum into three parallel lists with one entry per
# mission: the step it belongs to, the course it belongs to, and its name
step_level = []
course_level = []
all_missions = []
for step, courses in curriculum.items():
    for course, missions in courses.items():
        # Repeat the step and course labels once for every mission of the course
        count = len(missions)
        step_level.extend([step] * count)
        course_level.extend([course] * count)
        all_missions.extend(missions)

In [92]:
# Sanity check: the two index levels should have the same length, since both
# get exactly one entry per mission
len(step_level), len(course_level)

(165, 165)

In [94]:
import pandas as pd

In [95]:
# Create multi-index: pair each row's step with its course, then build a
# two-level (step, course) index from those tuples
idx_tuple = list(zip(step_level, course_level))
idx = pd.MultiIndex.from_tuples(idx_tuple)

In [99]:
# Build the curriculum table: one row per mission in a single 'mission'
# column, indexed by (step, course)
dq_ds_curriculum = pd.DataFrame(
    data=all_missions,
    index=idx,
    columns=['mission'],
)

In [103]:
# Preview the first 15 rows of the table
dq_ds_curriculum.head(15)

Unnamed: 0,Unnamed: 1,mission
Step 1,Python for Data Science: Fundamentals,Programming in Python
Step 1,Python for Data Science: Fundamentals,Variables and Data Types
Step 1,Python for Data Science: Fundamentals,Lists and For Loops
Step 1,Python for Data Science: Fundamentals,Conditional Statements
Step 1,Python for Data Science: Fundamentals,Dictionaries and Frequency Tables
Step 1,Python for Data Science: Fundamentals,Functions: Fundamentals
Step 1,Python for Data Science: Fundamentals,Functions: Intermediate
Step 1,Python for Data Science: Fundamentals,Project: Learn and Install Jupyter Notebook
Step 1,Python for Data Science: Fundamentals,Guided Project: Profitable App Profiles for th...
Step 1,Python for Data Science: Intermediate,Cleaning and Preparing Data in Python


In [104]:
# Write the table to 'dq_ds_curriculum.csv' (a relative path)
dq_ds_curriculum.to_csv('dq_ds_curriculum.csv')