In [17]:
import requests
import time
import re
import unicodedata
import pandas as pd
import json
import networkx as nx

In [18]:
def strip_accents(text):
   
    try:
        text = unicode(text, 'utf-8')
    except (TypeError, NameError):
        pass
    
    text = unicodedata.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    
    return str(text)

In [19]:
def text2tags(text, striptag=True):
    
    pattern = '#\S+'
    
    text = text.lower()
    
    text = strip_accents(text)
    
    matches = re.findall(pattern, text)
    
    if striptag :
        matches = [ match.replace('#','') for match in matches ]
    
    return matches

In [20]:
def json2posts(json_info, infilter=False):

    posts_list = json_info['graphql']['hashtag']['edge_hashtag_to_media']['edges']

    posts_dicts = []
    
    # a generic media post preffix (concat with media shortcode to view)
    posturl_prefix = 'https://www.instagram.com/p/'

    for post in posts_list:

        node = post['node']

        id_post = node['id']

        id_owner = node['owner']['id']

        shortcode = node['shortcode']

        edges = node['edge_media_to_caption']['edges']
        
        text = edges[0]['node']['text'].replace('\n','') if len(edges) else ''
        
        tags = text2tags(text)

        post_url = posturl_prefix + shortcode + '/'

        post_dict = {
            'id_post': id_post,
            'id_owner': id_owner,
            'shortcode': shortcode,
            'text': text,
            'post_url': post_url,
            'tags': tags
        }
        
        if infilter :
            if len(tags) :
                posts_dicts.append( post_dict )
        else:
            pass
    
    else:
        posts_dicts.append( post_dict )
    
    return posts_dicts

In [21]:
def snowball(url, deep=1, end_cursor='', count=0, showurl=False, 
             sleep=0, forever=False, progress=False, pause=60 ):

    # suffix to end cursor when requesting posts by tag
    tagurl_endcursor = '&max_id='

    request_url = url + tagurl_endcursor + end_cursor

    if showurl :
        print(request_url)
    else:
        if progress :
            print( count, end=' ' )
    
    while True :
        try :
            json_info = requests.get( request_url ).json()
            break
        except:
            if forever :
                print('Fail, retrying in ' + str(pause) + ' seconds')
                time.sleep(pause)
            else:
                print('Fail, ' + str(count) + ' requests done')
                return []
    
    end_cursor = json_info['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']

    posts = json2posts( json_info, True )

    time.sleep(sleep)
  
    count = count + 1

    if count < deep :
        posts += snowball(
            url=url, 
            deep=deep, 
            end_cursor=end_cursor, 
            count=count, 
            showurl=showurl, 
            sleep=sleep,
            forever=forever,
            progress=progress, 
            pause=pause)
    else:
        pass
    
    if showurl :
        pass
    else:
        if progress :
            if count == deep :
                print()

    return posts

In [22]:
def validate_tag(tag):
    
    """
    Checks if a tag is valid according to its contents and size
    """

    MAX_LEN = 25
    MIN_LEN = 1

    pattern = '^[a-zA-Z0-9]+$'
    
    if re.match(pattern, tag) and len(tag) < MAX_LEN and len(tag) > MIN_LEN :
        return True
    else :
        return False

In [23]:
# Instagram base url preffix
tagurl_prefix = 'https://www.instagram.com/explore/tags/'

# suffix to append to tag request url to retrieve data in JSON format
tagurl_suffix = '/?__a=1'
    
# target initial tags
tags = ['IPL', 'CSK', 'RCB']

# urls to initial tags
queries = [ tagurl_prefix + tag + tagurl_suffix for tag in tags ]

In [28]:
%%time

data = {}

depth = 4 # Number of pages to query per post.

for tag, query in zip( tags, queries ) :
    
    print( 'Querying ' + tag + 'for depth of '+ str(depth) + ' pages...' )
    
    posts = snowball(query, deep=depth, forever=True, sleep=1, pause=60, progress=True)
    
    print("Total Number of posts", len(posts))
    
    data[tag] = posts
    
    print('Finished querying for ' + tag )
    
    print('\nNow waiting for 30 seconds before querying for the next tag.')
    
    time.sleep(30)

Querying IPLfor depth of 4 pages...
0 1 2 3 
Total Number of posts 276
Finished querying for IPL

Now waiting for 30 seconds before querying for the next tag.
Querying CSKfor depth of 4 pages...
0 1 2 3 
Total Number of posts 284
Finished querying for CSK

Now waiting for 30 seconds before querying for the next tag.
Querying RCBfor depth of 4 pages...
0 1 2 3 
Total Number of posts 270
Finished querying for RCB

Now waiting for 30 seconds before querying for the next tag.
CPU times: user 424 ms, sys: 0 ns, total: 424 ms
Wall time: 1min 59s


In [29]:
# saving data to a JSON file
f = open('posts_data.json', 'w')
json.dump(data, f)
f.close()

In [30]:
#open/load the saved file.
file = open('posts_data.json')
data_json = json.load(file)

In [31]:
# trying a limitation in the number of posts
POSTS_MAX = 100

# this list contains just edges from initial target (keys) tags to related post tags
edges_list_keys = []

# this list contains all edges between pairs of tags from the same post
edges_list_all = []

# populating the lists of edges
for person, posts in data_json.items() :
    
    # traversing each post for each key tag
    for post in posts[:POSTS_MAX] :
        
        # list of tags in the post including trash tags
        post_tags = post['tags']
        
        # list of tags in the post after filtering
        post_tags = [tag for tag in post_tags if validate_tag(tag)]
        
        # list of tags without the key tag
        post_tags_drop_person = [tag for tag in post_tags if not tag == person]
        
        # creating edges between key tag and all others
        for tag in post_tags_drop_person :
            
            edge_keys = (person, tag)
            
            edges_list_keys.append( edge_keys )
        
        # creating the edges between all the tags
        for tag in post_tags :
            
            # index of the current tag in the list
            tag_index = post_tags.index(tag)
            
            # this slice is needed in order to connect all edges one and only on time
            post_tags_slice = post_tags[tag_index+1:]
            
            for stag in post_tags_slice :
                
                edge_all_pre = (tag, stag)
                
                # creating the edge element in alphabetical order
                edge_all = ( min(edge_all_pre) , max(edge_all_pre) )
                
                edges_list_all.append( edge_all )

In [32]:
print('Numbers of edges:')

print(len(edges_list_keys))

print(len(edges_list_all))

Numbers of edges:
5249
60946


In [33]:
edges_df = pd.DataFrame(edges_list_all, columns=['source', 'target'])
edges_df['tuple'] = pd.Series(zip(edges_df.source, edges_df.target))

# grouping the dataframe by tuple of source and target.
edges_grouped = edges_df.groupby('tuple').count()
edges_grouped.drop(columns='target', inplace=True, errors='ignore')
edges_grouped.columns=['weight']
edges_grouped.reset_index(inplace=True)

# Adding source and target to the data frame.
edges_grouped['source'] = edges_grouped.tuple.str[0]
edges_grouped['target'] = edges_grouped.tuple.str[1]
edges_grouped = edges_grouped.drop(columns='tuple')
edges_grouped.sample(5)

Unnamed: 0,weight,source,target
9902,1,england,indiancricket
17129,1,love,style
15669,2,justiceforsushant,viratmsdhoni7781mivscsk
12284,1,hardikpandya,viratkohlifanpage
6765,3,csk,uae


In [34]:
G = nx.from_pandas_edgelist(edges_grouped, edge_attr=True)

In [36]:
print("Grouped Nodes length -", len(G.nodes))
print(list(G.nodes)[:10])

Grouped Nodes length - 1287
['1000families100likes', 'csection', 'csk', 'funnyvideos', 'likeforfollow', 'manishgoplani', 'memesdaily', 'sdvtodosnahora', 'share', 'stayhome']


In [37]:
print("Grouped Edges length - ", len(G.edges))
print(list(G.edges(data=True))[:10])

Grouped Edges length -  21684
[('1000families100likes', 'csection', {'weight': 1}), ('1000families100likes', 'csk', {'weight': 1}), ('1000families100likes', 'funnyvideos', {'weight': 1}), ('1000families100likes', 'likeforfollow', {'weight': 1}), ('1000families100likes', 'manishgoplani', {'weight': 1}), ('1000families100likes', 'memesdaily', {'weight': 1}), ('1000families100likes', 'sdvtodosnahora', {'weight': 1}), ('1000families100likes', 'share', {'weight': 1}), ('1000families100likes', 'stayhome', {'weight': 1}), ('1000families100likes', 'taehyung', {'weight': 1})]


In [39]:
# limiting weights > 5
THRESHOLD = 5
print('Filtering out post with weights < 5')
mask_insignificant = edges_grouped.weight.apply(lambda x : x <= THRESHOLD)
edges_grouped_dropped = edges_grouped[~mask_insignificant]
print(edges_grouped_dropped.head())

Filtering out post with weights < 5
     weight        source       target
129       7          2020      cricket
269       6    abdeallien          csk
292       6  abdevilliers          csk
293       6  abdevilliers      cskvsmi
294       6  abdevilliers  davidwarner


In [40]:
g = nx.from_edgelist(edges_list_keys)
edges_df_keys = pd.DataFrame(edges_list_keys, columns=['source', 'target'])
edges_df_keys['tuple'] = pd.Series(zip(edges_df_keys.source, edges_df_keys.target))
edges_grouped_keys = edges_df_keys.groupby('tuple').count()
edges_grouped_keys.drop(columns='target', inplace=True, errors='ignore')
edges_grouped_keys.columns=['weight']
edges_grouped_keys.reset_index(inplace=True)
edges_grouped_keys['source'] = edges_grouped_keys.tuple.str[0]
edges_grouped_keys['target'] = edges_grouped_keys.tuple.str[1]
edges_grouped_keys = edges_grouped_keys.drop(columns='tuple')


In [41]:
# creating a dictionary of weights
node_weights = {}

# populating the dictionary
for person, posts in data_json.items() :
    
    for post in posts[:POSTS_MAX] :
        
        post_tags = post['tags']
        
        post_tags = [tag for tag in post_tags if validate_tag(tag)]
        
        for tag in post_tags :
            
            if tag in node_weights : 
                node_weights[tag] = node_weights[tag] + 1
            else :
                node_weights[tag] = 1

In [44]:
nx.set_node_attributes(G, node_weights, 'weight')
nx.write_graphml(G, "all_edges_nodes.graphml") #needed

In [42]:
# creating a new graph with dropped data
G_dropped = nx.from_pandas_edgelist(edges_grouped_dropped, edge_attr=True)
nx.set_node_attributes(G_dropped, node_weights, 'weight')
nx.write_graphml(G_dropped, "with_important_weights.graphml") #needed

In [43]:
# creating the edges counted keys graph.
g = nx.from_pandas_edgelist(edges_grouped_keys, edge_attr=True)
nx.set_node_attributes(g, node_weights, 'weight')
nx.write_graphml(g, "with_tags.graphml") # needed