# Data Collection: Building an Instagram Crawler

In this walkthrough we will see how we can crawl data from Instagram using the Selenium WebDriver.

# Step 1: Set up the WebDriver

The first thing we need is a web driver. A web driver is basically a browser that can be controlled programatically.

In [68]:
# Set up a function to start the webdriver
from selenium import webdriver
def start_webdriver():
    chromedriver_path = 'helpers/chromedriver'
    driver = webdriver.Chrome(chromedriver_path)
    return driver

In [94]:
# Execute the webdriver
driver = start_webdriver()

# Step 2: Navigate to Instagram and Login
In order to log in we need to mimic the same user flow that we would use to log on manually. To do so, we use the Google Developer Tools to find the selectors / clases of the respective buttons and form fields.

In [70]:
driver.get("https://www.instagram.com/")

In [71]:
# Import packages that we need
from random import randint
import time
import json

In [95]:
# Define Login Helper Function
def login():
  try:
    # the login_info.json file contains the login information for five different Instagram accounts. We randomly pick one of the five accounts and save the username and password in respective variables.
    with open('helpers/login_info.json') as f:
        login_info = json.load(f)
    random_number = randint(0, len(login_info['accounts']) - 1)
    user = login_info['accounts'][random_number]['username']
    pw = login_info['accounts'][random_number]['pw']
    # We go to the Instagram home page
    driver.get("https://www.instagram.com/")
    # We wait five seconds in order to make sure the page has fully loaded
    time.sleep(5)
    # We programmatically close the cookie notice
    try:
      driver.find_element_by_css_selector('.aOOlW').click()
    except:
      pass
    time.sleep(3)
    # We find the username and password fields and make sure they are empty
    username = driver.find_element_by_css_selector("input[name='username']")
    password = driver.find_element_by_css_selector("input[name='password']")
    username.clear()
    password.clear()
    # We enter the login credentials into the form and then programmatically click the login button.
    username.send_keys(user)
    password.send_keys(pw)
    time.sleep(randint(3,5))
    login = driver.find_element_by_css_selector("button[type='submit']").click()
  except:
    print("Could not log in / already logged in")

In [73]:
# Define Logout Helper Function
def logout():
  try:
    # We go to the Instagram home page
    driver.get("https://www.instagram.com/")
    time.sleep(5)
    # We open the dropdown menu
    menu = driver.find_element_by_class_name("_47KiJ")
    menu.find_element_by_class_name("_2dbep").click()
    time.sleep(3)
    # We click on the logout button
    menu.find_element_by_css_selector('.-qQT3:last-child').click()
    time.sleep(1)
    # We delete all cookies so that Instagram does not 
    # modify the login process when we want to log in again
    driver.delete_all_cookies()
  except:
    print("Could not log out / already logged out")

In [96]:
# Execute Login Helper Function
login()

In [81]:
# Execute Logout Helper Function
logout()

# Step 3: Decide which Accounts / Companies to Crawl and Create a Folder for Each

In [83]:
# For this tutorial we take three big FMCG brands as an example
companies = ['nestle', 'unilever', 'proctergamble']

In [102]:
# Import the os package
import os
# Define function that checks if a folder for the company exists within the example data folder. If not, it creates one.
def create_company_folder(company):
  path = os.getcwd() + "/example_data/" + company + "/"
  if not os.path.exists(path):
      os.makedirs(path)
  return path
# Execute the function
for company in companies:
  create_company_folder(company)

In [85]:
# Check out one of the accounts to plan the next steps
driver.get("https://www.instagram.com/" + companies[0])

# Step 4: Get Post URLs
In order to crawl the content of all Instagram posts of the selected accounts, we need their respective URLs. We get these by programatically scrolling to the bottom of the page and saving the URLs (href-attribute of the pictures) after every scroll action.

For this tutorial we limit the number of scrolled posts to 100 per company.

In [109]:
#check if hrefs.txt file already exists in the folder
from pathlib import Path
def find_hrefs(company):
    # Define the file in which all URLs will be saved in the end
    href_file = os.getcwd() + "/example_data/" + company + "/hrefs.txt"
    # Check if the file already exists
    href_file_exists = Path(href_file).is_file()
    # if the file does not exist, start scraping the URLs
    if not href_file_exists:
        print("no URLs found for " + company + ", starting to scrape them")
        time.sleep(randint(3, 5))
        # Go to the company's Instagram account
        driver.get("https://www.instagram.com/" + company)
        # Scrape all URLs, then scroll down and scrape the newly loaded posts until the end of the page is reached
        # For this tutorial we also limit the number of scrolled posts to 100 per company
        hrefs = []
        scrolldown = 0
        match = False
        while (match == False and len(hrefs) < 100):
            last_count = scrolldown
            # Find all links that are currently displayed on the page
            links = driver.find_elements_by_tag_name('a')
            time.sleep(randint(3, 4))
            for link in links:
                try:
                    href = link.get_attribute('href')
                    # only take links that include "/p/", indicating that it is a post link
                    if '/p/' in href:
                        # only add the post to the list of URLs if it is not in the list yet (prevent duplicates)
                        if href not in hrefs:
                            if len(hrefs) < 100:
                                hrefs.append(href)
                except:
                    pass
            # scroll to the bottom of the page to load new posts
            scrolldown = driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);var scrolldown=document.body.scrollHeight;return scrolldown;")
            # stop the process when the end of the page is reached or we have collected at least 100 URLs
            if last_count == scrolldown and len(hrefs) == 100:
                match = True
        print("saving " + str(len(hrefs)) + " URLs to file for " + company)
        with open(href_file, 'w') as file:
            file.write(str(hrefs))
    else:
        print("URLs file discovered for " + company)
        file = open(href_file, 'r')
        hrefs = eval(file.read())
    return hrefs, href_file_exists

In [110]:
# Execute the Scraping Function, iterating over the companies
for company in companies:
  find_hrefs(company)

no URLs found for nestle, starting to scrape them
saving 100 URLs to file for nestle
URLs file discovered for unilever
URLs file discovered for proctergamble


# Step 5: Iterate over the URLs and save post data as raw .json-files
Fortunately, we can leverage the Instagram GraphQL API to retrieve posts as structured data (otherwise we would have to go through all the different HTML elements that contain the information we need and scrape it from there). To retrieve a post as structured data we just add "?__a=1" to the URL. This yields a .json (Javascript Object Notation) file that we can save to our harddrive.

There is a caveat: images are only saved as URLs. However, Instagram periodically changes image URLs. Therefore, we need to save the images on our own harddrive or some other service (see next steps).

In [115]:
# define function that iterates over URLs and save json files
def save_post_data(company):
    href_file = os.getcwd() + "/example_data/" + company + "/hrefs.txt"
    file = open(href_file, 'r')
    hrefs = eval(file.read())
    counter = 0
    while counter < 100:
        href = hrefs[counter]
        # get the post ID from the URL
        post_id = os.path.basename(os.path.normpath(href))
        # define the file name
        json_file = os.getcwd() + "/example_data/" + company + "/" + post_id + ".json"
        # navigate to the structured data of the post and save it as json file
        if not Path(json_file).is_file():
            time.sleep(randint(2, 5))
            href_url = href + "?__a=1"
            driver.get(href_url)
            post_data = json.loads(driver.find_element_by_tag_name('body').text)
            with open(json_file, 'w', encoding="utf-8") as file:
                json.dump(post_data, file)
            counter = counter + 1
        else:
            counter = counter + 1

In [120]:
# execute the function
for company in companies:
  save_post_data(company)

# Step 6: Aggregate the data into a CSV file

In [121]:
import numpy as np
import pandas as pd

In [122]:
def get_data_from_json(path):
    with open(path) as f:
        post_data = json.load(f)
    
    post = post_data['graphql']['shortcode_media']
    
    try:
        post_description = post["edge_media_to_caption"]["edges"][0]["node"]["text"]
    except:
        post_description = ""
        
    try:
        accessibility_caption = post["edge_media_to_caption"]["accessibility_caption"]
    except:
        accessibility_caption = None

    try:
        location = post["location"]["name"]
    except:
        location = ""
    
    try:
        comments = post["edge_media_to_parent_comment"]["edges"]
    except:
        comments = []

    try:
        carousel = post["edge_sidecar_to_children"]["edges"]
        images = []
        for image in carousel:
            images.append({
                "url": image["node"]["display_url"]
            })
    except:
        images = [
            {
            "url": post["display_url"]
            }
        ]
    
    posting = {
        "shortcode": post["shortcode"],
        "images": images,
        "company": post["owner"]["username"],
        "post_id": post["id"],
        "date": post["edge_media_preview_like"]["count"],
        "post_description": post_description,
        "accessibility_caption": accessibility_caption,
        "count_likes": post["edge_media_preview_like"]["count"],
        "count_comments": post["edge_media_to_parent_comment"]["count"],
        "location": location,
        "comments_disabled": post["comments_disabled"],
        "is_ad": post["is_ad"],
        "is_video": post["is_video"] 
    }
    
    comments = []
    answers = []
    
    comments_raw = post["edge_media_to_parent_comment"]["edges"]
    for comment in comments_raw:
        comment = comment["node"]
        comments.append({
            "id": comment["id"],
            "text": comment["text"],
            "author": comment["owner"]["username"],
            "created_at": comment["created_at"],
            "count_likes": comment["edge_liked_by"]["count"],
            "count_answers": comment["edge_threaded_comments"]["count"],
            "report_as_spam": comment["did_report_as_spam"],
            "post": post["shortcode"]
        })
        
        answers_raw = comment["edge_threaded_comments"]["edges"]
        for answer in answers_raw:
            answer = answer["node"]
            answers.append({
                "id": answer["id"],
                "text": answer["text"],
                "author": answer["owner"]["username"],
                "created_at": answer["created_at"],
                "count_likes": answer["edge_liked_by"]["count"],
                "report_as_spam": answer["did_report_as_spam"],
                "comment": comment["id"],
                "comment_text": comment["text"],
                "post": post["shortcode"]
            })
        
    
    return posting, comments, answers

In [128]:
def iterate_over_jsons():
    postings = []
    comments = []
    answers = []

    for company in companies:
        folder_path = os.getcwd() + "/example_data/" + company + "/"
        files = [f for f in os.listdir(folder_path)]
        files.remove('hrefs.txt')
        for file in files:
            file_path = folder_path + file
            p, c, a = get_data_from_json(file_path)
            postings.append(p)
            comments = comments + c
            answers = answers + a
    return postings, comments, answers

In [132]:
postings, comments, answers = iterate_over_jsons()

In [138]:
postings_df = pd.DataFrame(postings).set_index("shortcode")
postings_df.head(3)

Unnamed: 0_level_0,images,company,post_id,date,post_description,accessibility_caption,count_likes,count_comments,location,comments_disabled,is_ad,is_video
shortcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CMhBzBMAaJY,[{'url': 'https://scontent-zrh1-1.cdninstagram...,nestle,2531312369573929560,344,2020 highlighted the strength of the human-pet...,,344,82,,False,False,True
CMSLVR1AiB0,[{'url': 'https://scontent-zrh1-1.cdninstagram...,nestle,2527132181671846004,936,"In Pakistan, we’re helping over 1,500 women ca...",,936,2499,,False,False,False
CTwRr8lMW4g,[{'url': 'https://scontent-zrh1-1.cdninstagram...,nestle,2661705166552657440,398,Do you know the 5 innovations our experts have...,,398,2,,False,False,False


In [134]:
comments_df = pd.DataFrame(comments).set_index("id")
comments_df.head(3)

Unnamed: 0_level_0,text,author,created_at,count_likes,count_answers,report_as_spam,post
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
18200756470003696,"Please, don't sponsor belarusian dictator🤍❤️🤍",lee_sarko,1616013475,5,0,False,CMhBzBMAaJY
17876003864289688,"Nestlé, please stop sponsoring the Lukashenka ...",ogo.by,1616013519,8,0,False,CMhBzBMAaJY
17905891759742854,Please stop ignoring comments!,pro.otdyh,1616014005,10,0,False,CMhBzBMAaJY
17886434645021066,"You steal water, use slavery and child labour,...",asymptotax,1616014415,2,0,False,CMhBzBMAaJY
18199387912033928,"As I see Nestle on Lukashenko's TV, no more Ne...",maks.zakharau,1616018307,3,0,False,CMhBzBMAaJY


In [137]:
answers_df = pd.DataFrame(answers).set_index("id")
answers_df.head(3)

Unnamed: 0_level_0,text,author,created_at,count_likes,report_as_spam,comment,comment_text,post
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
17871699620348519,"@igs_wijaya Hi, please know that tackling plas...",nestle,1621234593,1,False,17925167515608531,Biggest pollutant,COx11GkLDUB
17963515711414212,@nestle @greenpeace @greenpeaceid,igs_wijaya,1621241505,0,False,17925167515608531,Biggest pollutant,COx11GkLDUB
17900352358979841,@nestle and that's why you guys continue to pu...,cherry_jubilee33,1621353219,2,False,17925167515608531,Biggest pollutant,COx11GkLDUB


In [136]:
postings_df.to_csv("postings.csv")
comments_df.to_csv("comments.csv")
answers_df.to_csv("answers.csv")

# Step 7: Export the data to Airtable for manual coding of images

In [118]:
from Naked.toolshed.shell import execute, execute_js, muterun_js
for company in companies:
  execute('node airtable/airtable.js ' + company)

nestle
[]
recDemBQxeLqFYWzc
recQeX80OYJ21CXL7
reciDXel0yYaWf8Ro
recYT7vqs50xUHeNh
rec5z7K7hY17i4jNY
recrwtkBpXV2UHTii
recAng7QSx2RKsjeh
recy1qT9BsxgUxyOV
recZECMKnizJI4u6k
recXSj0hgrmJ7Pxdv
recFVZUKlUZMWno4p
recKoM6j5cRqc0Fix
rec0CQaeCvLgtypQX
reczIcXQNjFk7A7HB
recO2TatF3dIreULJ
recdUlq1Z0yoO0T8B
recG2FhKimPYwwfZ3
recJ8lPD8PJqthTsq
recQRpvbw7yCN7nZw
rechpfER1dRR1nXjf
recG8IVAyTpErpfok
recAshBRuZWuvZHtZ
recg4sPfQKN4Hl48H
rec1RRBPguPYd8zdK
recQ2POOGsK6PCPdX
recR38uXgaChPQSeE
recqOf1vFqQldDmUm
recPSd3xwTiZY9ftb
rec7dYrjR2Tv59Vb8
rec0R4Q3vXCL94xLs
recdGAogvqToClHHn
rec1iUrIRPTzaU18g
recnQWntVgzaK9CsL
recXvzPZyD7pn5kbg
recYviScHpn72n1O7
recxsYXxyRTbUK3Gp
rec6MDu9ps18XWPRn
recsp9IXphzEVZilA
recEzyHfgbHmZvoYB
recLbvn3t4BCyzSoi
recWl49l5dkkMKm8s
recoeU0QskFTeRmq3
recaCkdh7RNeGXWVU
recyyVIrxzbqNDKFI
recIG5q8JMoyFzunv
recaUjsjyZbVhe4Xk
recMIbannqsgb2TB3
rec5mERfShFN7oSOO
recMIXlX6PCGLYSc8
recdOtfqgMjYF9Plv
rec8hJHNzXOIxRQax
recxaVwdJoTFDliWz
recAyf2UGE8hGCO5N
rec3ZU7IpiukPoM3o
rec6W1I0cPX0jjbzf
