# Instagram Scraper with GraphQL

Greetings! In this notebook I'm going to implement an Instagram scraper in order to extract the information I need from a specific Instagram page. I decided to implement it myself so that I tackle the problem first-hand and gain a clear understanding that I can draw on when explaining it in my thesis. To make progress on the thesis data-wise, I also need to save my data and features in a specific format so that my machine learning algorithm works flawlessly.

This scraper was implemented with the help of the Igscrapper source code from the realsirjoe GitHub account, which you can find here: [realsirjoe GitHub account](https://github.com/realsirjoe)

In [1]:
def get_id_from_code(code):
    """Convert a post shortcode into its integer id.

    The shortcode is read as a base-64 number, most significant character
    first, using the digits ``A-Z``, ``a-z``, ``0-9``, ``-`` and ``_``.

    :param code: the shortcode string (e.g. ``'BHaRdodBouH'``)
    :return: the decoded integer; ``0`` for an empty string
    :raises ValueError: if ``code`` contains a character outside the alphabet
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
    media_id = 0
    for symbol in code:
        # Shifting left by 6 bits multiplies the running value by 64.
        media_id = (media_id << 6) + alphabet.index(symbol)
    return media_id

In [2]:
# Sanity check: decode a sample shortcode and print the resulting integer id.
print(get_id_from_code('BHaRdodBouH'))

1286417446659853191


In [2]:
import time
import requests
import json
import hashlib
import re
from model.comment import Comment

In [45]:
class InstagramException(Exception):
    """Error raised when an Instagram request or page parse fails.

    NOTE(review): the Instagram class refers to this name, but this notebook
    neither defines nor imports it, so a minimal definition lives here to keep
    the error paths from failing with NameError. Swap in the project's own
    exception module if one exists.
    """

    def __init__(self, message='', code=None):
        super().__init__(message)
        # HTTP status code associated with the failure, when known.
        self.code = code

    @staticmethod
    def default(response_text, status_code):
        """Build an exception describing an unexpected HTTP response.

        :param response_text: body of the HTTP response
        :param status_code: HTTP status code of the response
        :return: an InstagramException instance (not raised here)
        """
        return InstagramException(
            f'Response code is {status_code}. Body: {response_text}',
            status_code)


class InstagramNotFoundException(InstagramException):
    """Error raised when a requested page answers with HTTP 404."""

    def __init__(self, message='', code=404):
        super().__init__(message, code)


class Instagram:
    """Small Instagram web client that pages through a post's comments.

    Requests go through a single ``requests`` session. No login is performed
    by this class; ``user_session`` stays ``None`` unless a caller assigns a
    cookie dict to it.
    """

    BASE_URL = 'https://www.instagram.com'
    GRAPHQL_URL = BASE_URL + '/graphql/query/'
    # GraphQL query hash used for the parent-comments query.
    COMMENTS_QUERY_HASH = '97b41c52301f77ce508f55e66d17620e'
    HTTP_NOT_FOUND = 404
    HTTP_OK = 200
    HTTP_FORBIDDEN = 403
    HTTP_BAD_REQUEST = 400
    MAX_COMMENTS_PER_REQUEST = 300
    MAX_LIKES_PER_REQUEST = 50
    # 30 mins time limit on operations that require multiple self.__req
    PAGING_TIME_LIMIT_SEC = 1800
    PAGING_DELAY_MINIMUM_MICROSEC = 1000000  # 1 sec min delay to simulate browser
    PAGING_DELAY_MAXIMUM_MICROSEC = 3000000  # 3 sec max delay to simulate browser
    instance_cache = None

    def __init__(self, sleep_between_requests=0):
        """
        :param sleep_between_requests: seconds to sleep before every HTTP
            request issued by this instance
        """
        self.__req = requests.session()
        self.paging_time_limit_sec = Instagram.PAGING_TIME_LIMIT_SEC
        self.paging_delay_minimum_microsec = Instagram.PAGING_DELAY_MINIMUM_MICROSEC
        self.paging_delay_maximum_microsec = Instagram.PAGING_DELAY_MAXIMUM_MICROSEC
        self.session_username = None
        self.session_password = None
        self.cookie = None
        self.user_session = None
        self.rhx_gis = None
        self.sleep_between_requests = sleep_between_requests
        self.user_agent = 'Instagram 126.0.0.25.121 Android (23/6.0.1; 320dpi; 720x1280; samsung; SM-A310F; a3xelte; samsungexynos7580; en_GB; 110937453)'

    def __generate_gis_token(self, variables):
        """
        :param variables: a dict (serialized to compact JSON) or an already
            serialized string used to generate the gis token
        :return: md5 hex digest of ``"<rhx_gis>:<variables>"``; the literal
            ``'NULL'`` stands in for a missing rhx_gis
        """
        # Look the value up once: when the page carries no rhx_gis the lookup
        # is not cached, so every extra call costs another page fetch.
        rhx_gis = self.__get_rhx_gis()
        if rhx_gis is None:
            rhx_gis = 'NULL'
        if isinstance(variables, dict):
            variables = json.dumps(variables, separators=(',', ':'))
        string_to_hash = ':'.join([rhx_gis, variables])
        return hashlib.md5(string_to_hash.encode('utf-8')).hexdigest()

    def __get_rhx_gis(self):
        """
        :return: the ``rhx_gis`` string from the page's shared data, or
            ``None`` when the page does not provide one
        :raises InstagramException: if the page could not be fetched or parsed
        """
        if self.rhx_gis is None:
            try:
                shared_data = self.__get_shared_data_from_page()
            except Exception as err:
                raise InstagramException(
                    'Could not extract gis from page') from err

            # The extractor returns None when the page has no _sharedData.
            if isinstance(shared_data, dict):
                self.rhx_gis = shared_data.get('rhx_gis')
        return self.rhx_gis

    def __get_shared_data_from_page(self, url='https://www.instagram.com'):
        """
        :param url: the requested url
        :return: a dict extracted from the page, or ``None`` if the page
            contains no ``_sharedData`` script
        :raises InstagramNotFoundException: on HTTP 404
        :raises InstagramException: on any other non-200 status
        """
        url = url.rstrip('/') + '/'
        time.sleep(self.sleep_between_requests)
        response = self.__req.get(url, headers=self.generate_headers(
            self.user_session))

        if response.status_code == Instagram.HTTP_NOT_FOUND:
            raise InstagramNotFoundException(f"Page {url} not found")

        if response.status_code != Instagram.HTTP_OK:
            raise InstagramException.default(response.text,
                                             response.status_code)

        return Instagram.extract_shared_data_from_body(response.text)

    @staticmethod
    def extract_shared_data_from_body(body):
        """
        :param body: html string from a page
        :return: the parsed ``_sharedData`` JSON, or ``None`` when the body
            has no ``_sharedData = ...;</script>`` assignment
        """
        matches = re.findall(r'_sharedData = .*?;</script>', body)
        if not matches:
            return None

        # Strip the assignment prefix and the trailing ";</script>".
        raw_json = matches[0][len("_sharedData ="):-len(";</script>")]
        return json.loads(raw_json)

    def generate_headers(self, session, gis_token=None):
        """
        :param session: user session dict (cookie name -> value) or ``None``
        :param gis_token: a token used to be verified by instagram in header
        :return: header dict
        """
        headers = {}
        if session is not None:
            cookies = ''.join(
                f"{key}={value}; " for key, value in session.items())

            # Prefer the 'csrftoken' cookie; fall back to 'x-csrftoken' when
            # it is missing or None.
            csrf = session.get('csrftoken')
            if csrf is None:
                csrf = session.get('x-csrftoken')

            headers = {
                'cookie': cookies,
                'referer': Instagram.BASE_URL + '/',
                'x-csrftoken': csrf
            }

        if self.user_agent is not None:
            headers['user-agent'] = self.user_agent

            # The gis header is only sent together with a user agent.
            if gis_token is not None:
                headers['x-instagram-gis'] = gis_token

        return headers

    def get_media_comments_by_code(self, code, count=10, max_id=''):
        """
        parameters:
            code: string , the code each post has, it's something like <BHaRdodBouH>, you can check it via 'copy link url' or post url after /p/ in instagram web app.
            count: int , default = 10, the number of comments you want to retrieve.
            max_id: it's used for pagination (cursor to continue after).
        return:
            dict with 'comments' (list of Comment objects for the post) and
            'next_page' (the last end cursor returned by the server).
        raises:
            InstagramException when a request does not answer with HTTP 200.
        """
        comments = []
        fetched = 0
        has_next = True

        while has_next and fetched < count:
            # Never ask for more than is still wanted, capped per request.
            page_size = min(count - fetched,
                            Instagram.MAX_COMMENTS_PER_REQUEST)

            # Serialize once so the string that is sent is exactly the string
            # that is hashed into the gis token.
            variables = json.dumps({
                "shortcode": str(code),
                "first": str(page_size),
                "after": "" if not max_id else max_id
            }, separators=(',', ':'))

            time.sleep(self.sleep_between_requests)
            # Passing params lets requests URL-encode the JSON (cursors may
            # contain quotes, '&' or other reserved characters).
            response = self.__req.get(
                Instagram.GRAPHQL_URL,
                params={
                    'query_hash': Instagram.COMMENTS_QUERY_HASH,
                    'variables': variables,
                },
                headers=self.generate_headers(
                    self.user_session,
                    self.__generate_gis_token(variables)))

            if response.status_code != Instagram.HTTP_OK:
                raise InstagramException(
                    f'response code is {response.status_code} and response text is {response.text}.',
                    response.status_code)

            comment_edge = response.json()['data']['shortcode_media']['edge_media_to_parent_comment']
            nodes = comment_edge['edges']

            comments.extend(Comment(item['node']) for item in nodes)
            fetched += len(nodes)

            page_info = comment_edge['page_info']
            has_next = page_info['has_next_page']
            max_id = page_info['end_cursor']

            # The post may have fewer comments than were asked for.
            count = min(count, comment_edge['count'])

            if not nodes:
                break

        return {'next_page': max_id, 'comments': comments}
            

In [46]:
# Demo: request up to 100 comments for one post by its shortcode, then print
# the text and owner of every comment that came back.
instagram = Instagram()
comments = instagram.get_media_comments_by_code('BZdomXBl3nO', count = 100)
# The result is a dict; the Comment objects live under the 'comments' key.
for comment in comments['comments']:
    print(comment.text)
    print(comment.owner)

ModuleNotFoundError: No module named 'model.account'

In [3]:
# Issue the same comments query directly, with the variables JSON already
# percent-encoded in the URL, to inspect the raw response structure.
r = requests.get('https://www.instagram.com/graphql/query/?query_hash=97b41c52301f77ce508f55e66d17620e&variables={%22shortcode%22:%22BZdomXBl3nO%22,%22first%22:%22100%22,%22after%22:%22%22}')
# Last expression of the cell: the notebook displays the parsed JSON.
r.json()

{'data': {'shortcode_media': {'edge_media_to_parent_comment': {'count': 19,
    'page_info': {'has_next_page': False, 'end_cursor': None},
    'edges': [{'node': {'id': '17891273890076184',
       'text': 'اگر همه عکس ها همزمان میذاشتید اونوقت مشخص بود کدام عکس بیشترین لایک جمع کرده تصویر های اخیر لایک های کمتر دارند چون دیر گذاشته شد.',
       'created_at': 1506340752,
       'did_report_as_spam': False,
       'owner': {'id': '4223882288',
        'is_verified': False,
        'profile_pic_url': 'https://scontent-bru2-1.cdninstagram.com/v/t51.2885-19/s150x150/103149811_1185601558471872_8142834131690009814_n.jpg?_nc_ht=scontent-bru2-1.cdninstagram.com&_nc_ohc=GaZpUPwDw4cAX-7We-9&tp=1&oh=6dad1b298cf03d60ee6d808a43e052b8&oe=5FE43CAD',
        'username': 'aycel_queen'},
       'viewer_has_liked': False,
       'edge_liked_by': {'count': 2},
       'edge_threaded_comments': {'count': 0,
        'page_info': {'has_next_page': False, 'end_cursor': None},
        'edges': []}}},
     {'node