In [20]:
import requests
import time
import re
import unicodedata

# Data Collection

Prepare for parsing data from Instagram

In [21]:
# Instagram base url preffix
tagurl_prefix = 'https://www.instagram.com/explore/tags/'

# suffix to append to tag request url to retrieve data in JSON format
tagurl_suffix = '/?__a=1'

# suffix to end cursor when requesting posts by tag
tagurl_endcursor = '&max_id='

# a generic media post preffix (concat with media shortcode to view)
posturl_prefix = 'https://www.instagram.com/p/'

In [22]:
import unicodedata

def strip_accents(text):
    
    """
    Strip accents from input String.

    :param text: The input string.
    :type text: String.

    :returns: The processed String.
    :rtype: String.
    """
    
    try:
        text = unicode(text, 'utf-8')
    except (TypeError, NameError):
        pass
    
    text = unicodedata.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    
    return str(text)

In [23]:
import re

def text2tags(text, striptag=True):
    
    pattern = '#\S+'
    
    text = text.lower()
    
    text = strip_accents(text)
    
    matches = re.findall(pattern, text)
    
    if striptag :
        matches = [ match.replace('#','') for match in matches ]
    
    return matches

In [24]:
def json2posts(json_info, infilter=False):

    posts_list = json_info['graphql']['hashtag']['edge_hashtag_to_media']['edges']

    posts_dicts = []

    for post in posts_list:

        node = post['node']

        id_post = node['id']

        id_owner = node['owner']['id']

        shortcode = node['shortcode']

        edges = node['edge_media_to_caption']['edges']
        
        text = edges[0]['node']['text'].replace('\n','') if len(edges) else ''
        
        tags = text2tags(text)

        post_url = posturl_prefix + shortcode + '/'

        post_dict = {
            'id_post': id_post,
            'id_owner': id_owner,
            'shortcode': shortcode,
            'text': text,
            'post_url': post_url,
            'tags': tags
        }
        
        if infilter :
            if len(tags) :
                posts_dicts.append( post_dict )
        else:
            pass
    
    else:
        posts_dicts.append( post_dict )
    
    return posts_dicts

In [25]:
import requests
import time

def snowball(url, deep=1, end_cursor='', count=0, showurl=False, 
             sleep=0, forever=False, progress=False, pause=60 ):

    request_url = url + tagurl_endcursor + end_cursor

    if showurl :
        print(request_url)
    else:
        if progress :
            print( count, end=' ' )
    
    while True :
        try :
            json_info = requests.get( request_url ).json()
            break
        except:
            if forever :
                print('Fail, retrying in ' + str(pause) + ' seconds')
                time.sleep(pause)
            else:
                print('Fail, ' + str(count) + ' requests done')
                return []
    
    end_cursor = json_info['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']

    posts = json2posts( json_info, True )

    time.sleep(sleep)
  
    count = count + 1

    if count < deep :
        posts += snowball(
            url=url, 
            deep=deep, 
            end_cursor=end_cursor, 
            count=count, 
            showurl=showurl, 
            sleep=sleep,
            forever=forever,
            progress=progress, 
            pause=pause)
    else:
        pass
    
    if showurl :
        pass
    else:
        if progress :
            if count == deep :
                print()

    return posts

Set hashtags to be parsed data for

In [26]:
# target initial tags
tags = ['palmtree', 'garden', 'beach', 'vacation', 'home']

In [27]:
# urls to initial tags
queries = [ tagurl_prefix + tag + tagurl_suffix for tag in tags ]

In [28]:
%%time

data = {}

for tag, query in zip( tags, queries ) :
    
    print( 'Querying ' + tag + '...' )
    
    posts = snowball(query, deep=40, forever=True, sleep=1, pause=60, progress=True)
    
    data[tag] = posts
    
    print( 'Done' )
    
    time.sleep(30)

Querying palmtree...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Done
Querying garden...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Done
Querying beach...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Done
Querying vacation...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Done
Querying home...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Done
Wall time: 10min 30s


In [29]:
# checking number of medias
for key, posts in data.items() :
    
    print(key, len(posts))

palmtree 2465
garden 2693
beach 2478
vacation 2377
home 2522


In [30]:
import json

# saving data to a JSON file
f = open('data.json', 'w')
json.dump(data, f)
f.close()

# Calculating and Storing Edges

In [None]:
# checking non-tagged medias

for key, posts in data.items():
    
    for post in posts:
        
        tags = post['tags']
        
        if len(tags) :
            pass
        else:
            print(post['post_url'], media['text'])

In [32]:
# define a function for tag validation
import re
    
def validate_tag(tag):
    
    """
    Checks if a tag is valid according to its contents and size
    """

    MAX_LEN = 25
    MIN_LEN = 1

    pattern = '^[a-zA-Z0-9]+$'
    
    if re.match(pattern, tag) and len(tag) < MAX_LEN and len(tag) > MIN_LEN :
        return True
    else :
        return False

In [33]:
# trying a limitation in the number of posts (too many posts for Gephi to handle otherwise)
POSTS_MAX = 100
# this list contains just edges from initial target (keys) tags to related post tags
edges_list_keys = []
# this list contains all edges between pairs of tags from the same post
edges_list_all = []
# populating the lists of edges
for person, posts in data.items() :

    # traversing each post for each key tag
    for post in posts[:POSTS_MAX] :
        
        # list of tags in the post including trash tags
        post_tags = post['tags']
        
        # list of tags in the post after filtering
        post_tags = [tag for tag in post_tags if validate_tag(tag)]
        
        # list of tags without the key tag
        post_tags_drop_person = [tag for tag in post_tags if not tag == person]
        
        # creating edges between key tag and all others
        for tag in post_tags_drop_person :
            
            edge_keys = (person, tag)
            
            edges_list_keys.append( edge_keys )
        
        # creating the edges between all the tags
        for tag in post_tags :
            
            # index of the current tag in the list
            tag_index = post_tags.index(tag)
            
            # this slice is needed in order to connect all edges one and only on time
            post_tags_slice = post_tags[tag_index+1:]
            
            for stag in post_tags_slice :
                
                edge_all_pre = (tag, stag)
                
                # creating the edge element in alphabetical order
                edge_all = ( min(edge_all_pre) , max(edge_all_pre) )
                
                edges_list_all.append( edge_all )

In [34]:
print('Numbers of edges:')

print(len(edges_list_keys))

print(len(edges_list_all))

Numbers of edges:
8120
102083


In [35]:
# checking a sample of edges
edges_list_all[:10]

[('mangroveswamp', 'palmtree'),
 ('mangroveswamp', 'wheelybin'),
 ('mangroveswamp', 'shy'),
 ('backwaters', 'mangroveswamp'),
 ('australia', 'mangroveswamp'),
 ('mangroveswamp', 'nswathome'),
 ('palmtree', 'wheelybin'),
 ('palmtree', 'shy'),
 ('backwaters', 'palmtree'),
 ('australia', 'palmtree')]

In [36]:
# checking a sample of edges
edges_list_keys[:10]

[('palmtree', 'mangroveswamp'),
 ('palmtree', 'wheelybin'),
 ('palmtree', 'shy'),
 ('palmtree', 'backwaters'),
 ('palmtree', 'australia'),
 ('palmtree', 'nswathome'),
 ('palmtree', 'latenightwalks'),
 ('palmtree', 'goodgirl'),
 ('palmtree', 'iseeyoupalmtree'),
 ('palmtree', 'california')]

Handling List of All Edges
Initial Graph

In [37]:
import networkx as nx

In [38]:
G = nx.from_edgelist(edges_list_all)

In [39]:
list(G.nodes)[:10]

['mangroveswamp',
 'palmtree',
 'wheelybin',
 'shy',
 'backwaters',
 'australia',
 'nswathome',
 'goodgirl',
 'latenightwalks',
 'iseeyoupalmtree']

In [40]:
list(G.edges())[:10]

[('mangroveswamp', 'palmtree'),
 ('mangroveswamp', 'wheelybin'),
 ('mangroveswamp', 'shy'),
 ('mangroveswamp', 'backwaters'),
 ('mangroveswamp', 'australia'),
 ('mangroveswamp', 'nswathome'),
 ('palmtree', 'wheelybin'),
 ('palmtree', 'shy'),
 ('palmtree', 'backwaters'),
 ('palmtree', 'australia')]

In [41]:
len(G.nodes)

3601

In [42]:
len(G.edges)

60568

In [43]:
# percentage from graph edges to list of edges
100 * len(G.edges)/len(edges_list_all)

59.33211210485585

In [44]:
import pandas as pd

In [45]:
edges_df = pd.DataFrame(edges_list_all, columns=['source', 'target'])

In [46]:
edges_df.head()

Unnamed: 0,source,target
0,mangroveswamp,palmtree
1,mangroveswamp,wheelybin
2,mangroveswamp,shy
3,backwaters,mangroveswamp
4,australia,mangroveswamp


In [47]:
edges_df.to_csv('edges_list_all.csv')

In [48]:
edges_df['tuple'] = pd.Series(zip(edges_df.source, edges_df.target))

In [49]:
edges_df.head()

Unnamed: 0,source,target,tuple
0,mangroveswamp,palmtree,"(mangroveswamp, palmtree)"
1,mangroveswamp,wheelybin,"(mangroveswamp, wheelybin)"
2,mangroveswamp,shy,"(mangroveswamp, shy)"
3,backwaters,mangroveswamp,"(backwaters, mangroveswamp)"
4,australia,mangroveswamp,"(australia, mangroveswamp)"


In [50]:
edges_grouped = edges_df.groupby('tuple').count()

In [51]:
edges_grouped.sample(5)

Unnamed: 0_level_0,source,target
tuple,Unnamed: 1_level_1,Unnamed: 2_level_1
"(florist, instagood)",5,5
"(midcenturymodern, midcenturymodernhome)",1,1
"(instravel, offduty)",1,1
"(garden, travelphotography)",3,3
"(goodnight, photo)",1,1


In [52]:
edges_grouped.drop(columns='target', inplace=True, errors='ignore')

In [53]:
edges_grouped.columns=['weight']

In [54]:
edges_grouped.reset_index(inplace=True)

In [55]:
edges_grouped.sample(5)

Unnamed: 0,tuple,weight
55835,"(record, traveler)",1
57274,"(sfrealestate, surfsf)",1
13837,"(canabismaconhabrasil, sp)",1
31539,"(girls, tflers)",3
1430,"(adventures, visiting)",1


In [56]:
edges_grouped.shape

(60568, 2)

In [57]:
edges_grouped['source'] = edges_grouped.tuple.str[0]

In [58]:
edges_grouped['target'] = edges_grouped.tuple.str[1]

In [59]:
edges_grouped = edges_grouped.drop(columns='tuple')

In [60]:
edges_grouped.sample(5)

Unnamed: 0,weight,source,target
7880,4,beautiful,traveltheworld
57299,1,share,watch
47956,1,miamilurkers,palmtrees
17891,2,costarica,shotoniphone
10579,1,blondegirl,photogtaphy


In [61]:
edges_grouped.to_csv('edges_counted.csv')

Creating New Graph

In [62]:
G = nx.from_pandas_edgelist(edges_grouped, edge_attr=True)

In [63]:
list(G.nodes)[:10]

['100likes',
 '50likes',
 'amazing',
 'beach',
 'beauty',
 'bestoftheday',
 'cool',
 'f4f',
 'fitness',
 'followforfollow']

In [64]:
list(G.edges(data=True))[:10]

[('100likes', '50likes', {'weight': 3}),
 ('100likes', 'amazing', {'weight': 3}),
 ('100likes', 'beach', {'weight': 3}),
 ('100likes', 'beauty', {'weight': 3}),
 ('100likes', 'bestoftheday', {'weight': 3}),
 ('100likes', 'cool', {'weight': 3}),
 ('100likes', 'f4f', {'weight': 3}),
 ('100likes', 'fitness', {'weight': 3}),
 ('100likes', 'followforfollow', {'weight': 3}),
 ('100likes', 'girls', {'weight': 3})]

In [65]:
len(G.nodes)

3601

In [66]:
len(G.edges)

60568

In [67]:
# the same percetual as before, but now with the grouped dataframe
100 * len(G.edges)/edges_grouped.shape[0]

100.0

In [68]:
nx.write_graphml(G, "edges_counted_" + str(POSTS_MAX) + ".graphml")

In [69]:
# inspect edges
edges_grouped.sort_values(by='weight', ascending=False).head(10)

Unnamed: 0,weight,source,target
42668,91,keralagodsowncountry,vintagestyle
53925,84,photoshooting,vintagestyle
42663,84,keralagodsowncountry,photoshooting
42514,63,kerala,photoshooting
45910,57,love,nature
49821,56,nature,photooftheday
42488,56,kerala,keralagodsowncountry
42534,56,kerala,vintagestyle
59596,55,travel,vacation
49963,54,nature,sunset


In [70]:
# defining masks to select data

mask_source_vacation = edges_grouped.source == 'vacation'
mask_source_travel = edges_grouped.source == 'travel'

mask_target_vacation = edges_grouped.target == 'vacation'
mask_target_travel = edges_grouped.target == 'travel'

In [71]:
edges_grouped[mask_source_vacation & mask_target_travel]

Unnamed: 0,weight,source,target


In [72]:
edges_grouped[mask_target_travel & mask_source_travel]

Unnamed: 0,weight,source,target
59545,6,travel,travel


Inspecting Weights

In [73]:
edges_grouped.weight.sort_values(ascending=False).sample(15)

58233    1
29578    1
50601    1
10287    1
10688    1
57472    1
39319    8
28910    1
35572    6
43840    1
38749    1
7577     1
13271    1
20756    1
31281    2
Name: weight, dtype: int64

In [74]:
weight_counts = edges_grouped.weight.value_counts().sort_index(ascending=False)

In [75]:
weight_counts.head(10)

91    1
84    2
63    1
57    1
56    3
55    1
54    1
53    1
52    4
49    1
Name: weight, dtype: int64

In [76]:
weight_counts.tail(15)

15       36
14       44
13       61
12      141
11       76
10      108
9       163
8       213
7       285
6       881
5      1108
4      1070
3      3208
2      5209
1     47635
Name: weight, dtype: int64

Dropping Insignificant Edges

In [77]:
TRESHOLD = 4

mask_insignificant = edges_grouped.weight.apply(lambda x : x <= TRESHOLD)

In [78]:
edges_grouped_dropped = edges_grouped[~mask_insignificant]

In [79]:
edges_grouped_dropped.weight.value_counts().sort_index(ascending=False).head(10)

91    1
84    2
63    1
57    1
56    3
55    1
54    1
53    1
52    4
49    1
Name: weight, dtype: int64

In [80]:
edges_grouped_dropped.weight.value_counts().sort_index(ascending=False).tail(15)

19      15
18      35
17      23
16      36
15      36
14      44
13      61
12     141
11      76
10     108
9      163
8      213
7      285
6      881
5     1108
Name: weight, dtype: int64

In [81]:
# creating a new graph with dropped data
G_dropped = nx.from_pandas_edgelist(edges_grouped_dropped, edge_attr=True)

Selfloop Edges
Dropped Graph

In [82]:
list(nx.selfloop_edges(G_dropped, data=True))[:10]

[('beach', 'beach', {'weight': 12}),
 ('beautiful', 'beautiful', {'weight': 6}),
 ('instagood', 'instagood', {'weight': 10}),
 ('love', 'love', {'weight': 6}),
 ('photooftheday', 'photooftheday', {'weight': 10}),
 ('sunset', 'sunset', {'weight': 18}),
 ('travel', 'travel', {'weight': 6}),
 ('thailand', 'thailand', {'weight': 8}),
 ('giardino', 'giardino', {'weight': 8}),
 ('instagram', 'instagram', {'weight': 6})]

In [83]:
len(list(nx.selfloop_edges(G_dropped, data=True)))

28

Complete Graph

In [84]:
list(nx.selfloop_edges(G, data=True))[:10]

[('beach', 'beach', {'weight': 12}),
 ('beauty', 'beauty', {'weight': 2}),
 ('instagram', 'instagram', {'weight': 6}),
 ('music', 'music', {'weight': 2}),
 ('photo', 'photo', {'weight': 4}),
 ('sunset', 'sunset', {'weight': 18}),
 ('home', 'home', {'weight': 2}),
 ('travel', 'travel', {'weight': 6}),
 ('photographer', 'photographer', {'weight': 6}),
 ('summer', 'summer', {'weight': 2})]

In [85]:
len(list(nx.selfloop_edges(G, data=True)))

96

In [86]:
nx.write_graphml(G_dropped, "edges_counted_" + str(POSTS_MAX) + "_dropped.graphml")

In [87]:
import matplotlib.pyplot as plt

In [88]:
%%time

# turn to False to disable a long time operation
if False :

    nx.draw(G)

    plt.show()

Wall time: 0 ns


Handling List of Key Edges
Creating Keys Graph

In [89]:
edges_list_keys[:10]

[('palmtree', 'mangroveswamp'),
 ('palmtree', 'wheelybin'),
 ('palmtree', 'shy'),
 ('palmtree', 'backwaters'),
 ('palmtree', 'australia'),
 ('palmtree', 'nswathome'),
 ('palmtree', 'latenightwalks'),
 ('palmtree', 'goodgirl'),
 ('palmtree', 'iseeyoupalmtree'),
 ('palmtree', 'california')]

In [90]:
g = nx.from_edgelist(edges_list_keys)

In [91]:
len(g.nodes)

3616

In [92]:
len(g.edges)

4386

In [93]:
# percentage from graph edges to list of edges
100 * len(g.edges)/len(edges_list_keys)

54.01477832512315

Grouping and Counting Keys Edges

In [94]:
edges_df_keys = pd.DataFrame(edges_list_keys, columns=['source', 'target'])

In [95]:
edges_df_keys.sample(5)

Unnamed: 0,source,target
193,palmtree,antalya
2499,garden,pnwplants
7946,home,milan
1243,palmtree,fun
1135,palmtree,california


In [96]:
edges_df_keys.to_csv('edges_list_keys.csv')

In [97]:
edges_df_keys['tuple'] = pd.Series(zip(edges_df_keys.source, edges_df_keys.target))

In [98]:
edges_df_keys.sample(5)

Unnamed: 0,source,target,tuple
7017,home,westernbayofplenty,"(home, westernbayofplenty)"
3913,beach,night,"(beach, night)"
1316,palmtree,palmeras,"(palmtree, palmeras)"
3931,beach,summer,"(beach, summer)"
1046,palmtree,alpaca,"(palmtree, alpaca)"


In [99]:
edges_grouped_keys = edges_df_keys.groupby('tuple').count()

In [100]:
edges_grouped_keys.sample(5)

Unnamed: 0_level_0,source,target
tuple,Unnamed: 1_level_1,Unnamed: 2_level_1
"(home, metas)",1,1
"(garden, mygarden)",5,5
"(home, kadoaniversarymurah)",6,6
"(home, gymlife)",1,1
"(home, sepreicantik)",6,6


In [101]:
edges_grouped_keys.drop(columns='target', inplace=True, errors='ignore')

In [102]:
edges_grouped_keys.columns=['weight']

In [103]:
edges_grouped_keys.reset_index(inplace=True)

In [104]:
edges_grouped_keys.sample(5)

Unnamed: 0,tuple,weight
2704,"(home, travelgram)",3
1787,"(home, bikelife)",1
2914,"(palmtree, city)",5
1929,"(home, desainrumah)",1
2779,"(palmtree, 80s)",1


In [105]:
edges_grouped_keys['source'] = edges_grouped_keys.tuple.str[0]

In [106]:
edges_grouped_keys['target'] = edges_grouped_keys.tuple.str[1]

In [107]:
edges_grouped_keys.shape

(4388, 4)

In [108]:
edges_grouped_keys = edges_grouped_keys.drop(columns='tuple')

In [109]:
edges_grouped_keys.sample(5)

Unnamed: 0,weight,source,target
4092,25,vacation,nature
2885,1,palmtree,building
3507,1,palmtree,takeaway
400,1,beach,mebel
4206,1,vacation,skygirls


In [110]:
edges_grouped_keys.to_csv('edges_counted_keys.csv')

Creating New Keys Graph

In [111]:
g = nx.from_pandas_edgelist(edges_grouped_keys, edge_attr=True)

In [112]:
list(g.nodes)[:10]

['beach',
 '100likes',
 '12',
 '141',
 '20likes',
 '50likes',
 '7t',
 'a7riiiswimmingpool',
 'acoustic',
 'adamsjuggler']

In [113]:
list(g.edges(data=True))[:10]

[('beach', '100likes', {'weight': 3}),
 ('beach', '12', {'weight': 1}),
 ('beach', '141', {'weight': 1}),
 ('beach', '20likes', {'weight': 1}),
 ('beach', '50likes', {'weight': 3}),
 ('beach', '7t', {'weight': 1}),
 ('beach', 'a7riiiswimmingpool', {'weight': 1}),
 ('beach', 'acoustic', {'weight': 1}),
 ('beach', 'adamsjuggler', {'weight': 1}),
 ('beach', 'adventure', {'weight': 2})]

In [114]:
len(g.nodes)

3616

In [115]:
len(g.edges)

4386

In [116]:
# the same percetual as before, but now with the grouped dataframe
100 * len(g.edges)/edges_grouped_keys.shape[0]

99.95442114858706

In [117]:
nx.write_graphml(g, "edges_counted_keys_" + str(POSTS_MAX) + ".graphml")

Inspecting Keys Edges

In [118]:
edges_grouped_keys.sample(10)

Unnamed: 0,weight,source,target
435,1,beach,nativeinstruments
2743,1,home,visitparnu
3612,1,palmtree,yoga
79,1,beach,bollywood
1173,1,garden,happycolour
3650,1,vacation,asalcalawak
2030,1,home,foto
3185,1,palmtree,lovehawaii
1600,1,garden,terrassendeko
3300,1,palmtree,palmisland


In [119]:
# checking if there are any null value
edges_grouped_keys.isnull().sum()

weight    0
source    0
target    0
dtype: int64

In [120]:
# checking different weight values
edges_grouped_keys.weight.sort_values().unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 27, 29, 30, 31, 32, 34, 36, 37, 44],
      dtype=int64)

In [121]:
# checking for empty tags
edges_grouped_keys.source.apply( lambda x : x is '' ).sum()

0

In [122]:
# checking for empty tags
edges_grouped_keys.target.apply( lambda x : x is '' ).sum()

0

In [123]:
# checking for self loop edges
list(nx.selfloop_edges(g))

[]

In [124]:
# checking for swapped key tags

key_tags = edges_grouped_keys.source.unique().tolist()

mask_key_tags = edges_grouped_keys.target.isin(key_tags)

edges_grouped_keys[mask_key_tags]

Unnamed: 0,weight,source,target
246,1,beach,garden
281,1,beach,home
739,3,beach,vacation
1192,6,garden,home
1643,1,garden,vacation
1767,1,home,beach
2841,16,palmtree,beach
3032,1,palmtree,garden
3085,7,palmtree,home
3572,9,palmtree,vacation


Plotting Keys Graph

In [125]:
import matplotlib.pyplot as plt

In [126]:
%%time

# just to disable a long time operation
if False :
    
    nx.draw(g)

    plt.show()

Wall time: 0 ns


Node Weights
Calculating Weights

In [127]:
# creating a dictionary of weights
node_weights = {}

# populating the dictionary
for person, posts in data.items() :
    
    for post in posts[:POSTS_MAX] :
        
        post_tags = post['tags']
        
        post_tags = [tag for tag in post_tags if validate_tag(tag)]
        
        for tag in post_tags :
            
            if tag in node_weights : 
                node_weights[tag] = node_weights[tag] + 1
            else :
                node_weights[tag] = 1

In [128]:
# checking the nodes before assign weights
list(G.nodes(data=True))[:10]

[('100likes', {}),
 ('50likes', {}),
 ('amazing', {}),
 ('beach', {}),
 ('beauty', {}),
 ('bestoftheday', {}),
 ('cool', {}),
 ('f4f', {}),
 ('fitness', {}),
 ('followforfollow', {})]

In [129]:
# checking the nodes before assign weights
list(g.nodes(data=True))[:10]

[('beach', {}),
 ('100likes', {}),
 ('12', {}),
 ('141', {}),
 ('20likes', {}),
 ('50likes', {}),
 ('7t', {}),
 ('a7riiiswimmingpool', {}),
 ('acoustic', {}),
 ('adamsjuggler', {})]

In [130]:
len(G.nodes)

3601

In [131]:
len(G_dropped.nodes)

408

In [132]:
len(g.nodes)

3616

Assigning Weights
All Edges Graph

In [133]:
nx.set_node_attributes(G, node_weights, 'weight')

In [134]:
list(G.nodes(data=True))[:10]

[('100likes', {'weight': 3}),
 ('50likes', {'weight': 3}),
 ('amazing', {'weight': 22}),
 ('beach', {'weight': 110}),
 ('beauty', {'weight': 54}),
 ('bestoftheday', {'weight': 19}),
 ('cool', {'weight': 20}),
 ('f4f', {'weight': 10}),
 ('fitness', {'weight': 19}),
 ('followforfollow', {'weight': 11})]

In [135]:
nx.write_graphml(G, "edges_counted_" + str(POSTS_MAX) + "_nw.graphml")

Dropped All Edges Graph

In [136]:
nx.set_node_attributes(G_dropped, node_weights, 'weight')

In [137]:
list(G_dropped.nodes(data=True))[:10]

[('adventure', {'weight': 26}),
 ('beach', {'weight': 110}),
 ('beautiful', {'weight': 62}),
 ('beautifuldestinations', {'weight': 9}),
 ('explore', {'weight': 23}),
 ('fun', {'weight': 22}),
 ('happy', {'weight': 31}),
 ('holiday', {'weight': 33}),
 ('igtravel', {'weight': 15}),
 ('instagood', {'weight': 66})]

In [138]:
nx.write_graphml(G_dropped, "edges_counted_" + str(POSTS_MAX) + "_dropped_nw.graphml")

Key Edges Graph

In [139]:
nx.set_node_attributes(g, node_weights, 'weight')

In [140]:
list(G.nodes(data=True))[:10]

[('100likes', {'weight': 3}),
 ('50likes', {'weight': 3}),
 ('amazing', {'weight': 22}),
 ('beach', {'weight': 110}),
 ('beauty', {'weight': 54}),
 ('bestoftheday', {'weight': 19}),
 ('cool', {'weight': 20}),
 ('f4f', {'weight': 10}),
 ('fitness', {'weight': 19}),
 ('followforfollow', {'weight': 11})]

In [141]:
nx.write_graphml(g, "edges_counted_keys_" + str(POSTS_MAX) + "_nw.graphml")