Web Crawling & Scraping (Instagram - Instaloader)

Ref: https://www.geeksforgeeks.org/introduction-to-instaloader-module-in-python/

Ref: https://instaloader.github.io/codesnippets.html

In [None]:
import instaloader

In [None]:
# Create the Instaloader instance that the login and profile lookups below go through
loader = instaloader.Instaloader()

In [None]:
# Instagram credentials passed to the login call in the next cell.
# Fill in your own username and password; both are empty placeholders here.
# NOTE(review): avoid saving or sharing the notebook with real credentials filled in.
USER = ''
PASSWORD = ''

In [None]:
# Log in through the loader with the USER / PASSWORD values defined above
loader.login(USER, PASSWORD)

In [None]:
# Build a Profile object for the 'justinbieber' account from the loader's context;
# the cells below read the account's metadata from this object
profile = instaloader.Profile.from_username(loader.context, 'justinbieber')

In [None]:
# List the account's followers, printing one entry per line
# NOTE(review): presumably needs the logged-in session from above and may take very long
# for an account with many followers - confirm against the Instaloader docs
followers = profile.get_followers()

for follower in followers:
    print(follower)

In [None]:
# List the account's followees, printing one entry per line
# NOTE(review): followees are presumably the accounts this profile follows, and the call
# presumably needs the logged-in session from above - confirm against the Instaloader docs
followees = profile.get_followees()

for followee in followees:
    print(followee)

In [None]:
# Read the profile's total post count (mediacount) and display it
media = profile.mediacount

print(media)

In [None]:
# Read the profile's IGTV post count (igtvcount) and display it
igtv = profile.igtvcount

print(igtv)

In [None]:
# Read the is_private flag, which tells whether the account is private or public, and display it
private = profile.is_private

print(private)

In [None]:
# Read the profile's biography and display it
bio = profile.biography

print(bio)

In [None]:
# Read the URL of the profile picture (profile_pic_url) and display it;
# this cell only prints the URL, it does not download the image
profile_pic = profile.profile_pic_url

print(profile_pic)

In [None]:
# Read the external URL listed on the profile (if any) and display it
ext_url = profile.external_url

print(ext_url)

In [None]:
# Read the is_business_account flag and display it
business = profile.is_business_account

print(business)

In [None]:
# Read the name of the account's business category (business_category_name) and display it
business_type = profile.business_category_name
 
print(business_type)

# Download Posts in a Specific Period
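To download only the Instagram pictures (and metadata) posted within a specific period, you can use dropwhile() and takewhile() from itertools on a generator that returns Posts in strict chronological order, such as Profile.get_posts(). The code below expects the posts to arrive newest first: dropwhile() skips every post newer than the start of the window, and takewhile() stops at the first post that is not newer than its end.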

In [None]:
from datetime import datetime
from itertools import dropwhile, takewhile

import instaloader

# One Instaloader instance serves both the profile lookup and the downloads
L = instaloader.Instaloader()

posts = instaloader.Profile.from_username(L.context, "justinbieber").get_posts()

# Window edges, walking backwards in time: SINCE is the newer edge, UNTIL the older one
SINCE = datetime(2022, 2, 22) # February 22, 2022
UNTIL = datetime(2022, 1, 22) # January 22, 2022

# Stage 1 (lazy): skip the leading posts that are newer than SINCE
from_since = dropwhile(lambda p: p.date > SINCE, posts)
# Stage 2 (lazy): keep yielding only while posts are still newer than UNTIL
within_period = takewhile(lambda p: p.date > UNTIL, from_since)

# Posts with UNTIL < date <= SINCE are printed and downloaded into "justinbieber"
for post in within_period:
    print(post.date)
    L.download_post(post, "justinbieber")

In [None]:
# Retrieve metadata from posts
# Credit to TA Thao, thank you!

import json
import lzma
import os
from datetime import datetime
from pathlib import Path

import pandas as pd

def parse_instafiles(path):
    """Collect post metadata from Instaloader JSON files into a DataFrame.

    Every file directly inside ``path`` whose name ends in ``UTC.json`` or
    ``UTC.json.xz`` (the xz-compressed variant) is read as one post. Files
    are processed in sorted name order, and files that cannot be read or
    parsed are skipped. The working directory of the process is not changed.

    Args:
        path: Directory holding the metadata files.

    Returns:
        A pandas DataFrame with one row per loaded post and the columns
        ``filename`` (file name without the ``.json`` / ``.json.xz`` ending),
        ``username``, ``time`` (naive local datetime built from
        ``taken_at_timestamp``), ``text`` (caption, ``""`` when there is
        none), ``likes``, ``post_id`` (``""`` when missing) and
        ``image_url``. When nothing matches, the frame has these columns and
        no rows.

    Raises:
        KeyError: If a file parses but lacks the ``node`` entry or one of the
            required fields (owner username, display URL, timestamp, likes).
    """
    columns = ['filename', 'username', 'time', 'text',
               'likes', 'post_id', 'image_url']
    folder = Path(path)
    # Sorting makes the row order reproducible; glob order depends on the OS.
    candidates = sorted(
        [*folder.glob('*UTC.json'), *folder.glob('*UTC.json.xz')]
    )

    rows = []
    seen = set()
    for json_path in candidates:
        name = json_path.name
        if name.endswith('.xz'):
            name = name[:-len('.xz')]
        filename = name[:-len('.json')]
        if filename in seen:
            # Same post already loaded from its other (plain/compressed) file.
            continue

        metadata = _read_metadata(json_path)
        if metadata is None:
            # Couldn't load this file; try the next one.
            continue
        seen.add(filename)

        node = metadata['node']

        # To also collect comments: node['edge_media_to_comment']['count']
        # holds the comment count, and the comment texts are expected in a
        # "<filename>_comments.json" file next to the metadata file.

        try:
            text = node['edge_media_to_caption']['edges'][0]['node']['text']
        except (KeyError, IndexError, TypeError):
            # Post without a caption.
            text = ""

        rows.append({
            'filename': filename,
            'username': node['owner']['username'],
            'time': datetime.fromtimestamp(int(node['taken_at_timestamp'])),
            'text': text,
            'likes': int(node['edge_media_preview_like']['count']),
            'post_id': node.get('id', ""),
            'image_url': node['display_url'],
        })

    # Build the frame once; DataFrame.append is gone in pandas 2.0.
    return pd.DataFrame(rows, columns=columns)


def _read_metadata(json_path):
    """Return the parsed JSON of one metadata file, or None if loading fails."""
    opener = lzma.open if json_path.suffix == '.xz' else open
    try:
        with opener(json_path, 'rt', encoding='utf-8') as handle:
            return json.load(handle)
    except (OSError, EOFError, ValueError, lzma.LZMAError):
        # ValueError covers both malformed JSON and undecodable bytes.
        return None

In [None]:
# Folder with the downloaded 'justinbieber' posts, inside the current working directory.
# os.path.join inserts the separator of the running OS, so the same line works on
# Windows, macOS and Linux.
df_instagram = parse_instafiles(os.path.join(os.getcwd(), "justinbieber"))

In [None]:
# Show the first few rows of the parsed metadata DataFrame
df_instagram.head()