In [1]:
#!pip install py2neo

In [1]:
import os
import time

import bson
import pandas as pd
from py2neo import Graph, Node
from tabulate import tabulate

In [3]:
# Record the wall-clock time (seconds since the epoch) at which the run starts.
start_time = time.time()

In [4]:
# Read the whole BSON dump and decode it into a list of documents.
# The context manager closes the file handle even if reading or decoding raises.
with open('WDM1.bson', 'rb') as bson_file:
    data = bson.decode_all(bson_file.read())

In [4]:
# Connect to Neo4j. Connection settings can be overridden through the environment
# (NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD) so real credentials do not have to be
# committed with the notebook; the defaults are the values previously hard-coded here.
graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "twitterdb"),
    ),
)

In [6]:
# Collect the distinct hashtags, URLs, tweets and users found in the decoded documents.
# Tweets and users are stored as frozensets of (key, value) pairs so that they are
# hashable and exact duplicates collapse inside the sets.
unique_tags = set()
unique_urls = set()
unique_tweets = set()
unique_users = set()

for item in data:
    # Only the first included tweet and the first included user of a document are used.
    src_tweet = item['includes']['tweets'][0]
    src_user = item['includes']['users'][0]

    # 'entities' (and each list inside it) is optional; fall back to "nothing to add".
    entities = src_tweet.get('entities', {})

    # Hashtags are lower-cased so that differently-cased spellings count as one tag.
    for hashtag in entities.get('hashtags', []):
        if 'tag' in hashtag:
            unique_tags.add(hashtag['tag'].lower())

    for link in entities.get('urls', []):
        if 'expanded_url' in link:
            unique_urls.add(link['expanded_url'])

    # Tweet record. The string "nan" is the placeholder for a missing value.
    # A missing 'public_metrics' mapping or an empty 'referenced_tweets' list is
    # treated as "value missing" instead of raising KeyError / IndexError.
    tweet_metrics = src_tweet.get('public_metrics', {})
    referenced = src_tweet.get('referenced_tweets')
    tweet = {
        "id": src_tweet['id'],
        "created_at": src_tweet['created_at'],
        "author_id": src_tweet['author_id'],
        "reply_count": tweet_metrics.get('reply_count', "nan"),
        # Type of the first referenced tweet, when there is one.
        "type": referenced[0]['type'] if referenced else "nan",
    }
    unique_tweets.add(frozenset(tweet.items()))

    # User record, with the same "nan" placeholder convention.
    user_metrics = src_user.get('public_metrics', {})
    user = {
        "id": src_user['id'],
        "username": src_user['username'],
        "followers_count": user_metrics.get('followers_count', "nan"),
    }
    unique_users.add(frozenset(user.items()))



In [7]:
# Create one Hashtag node per distinct tag.
# merge() keyed on the "tag" property is used instead of create() so that re-running
# this cell does not add duplicate Hashtag nodes.
for tag in unique_tags:
    node = Node("Hashtag", tag=tag)
    graph.merge(node, "Hashtag", "tag")

In [8]:
# Create one Link node per distinct URL.
# merge() keyed on the "url" property is used instead of create() so that re-running
# this cell does not add duplicate Link nodes.
for url in unique_urls:
    node = Node("Link", url=url)
    graph.merge(node, "Link", "url")

In [9]:
# Materialise every distinct tweet record as a Tweet node.
for tweet_items in unique_tweets:
    # Each set member is a frozenset of (key, value) pairs; rebuild the mapping.
    props = dict(tweet_items)
    node = Node(
        "Tweet",
        id=props["id"],
        created_at=props["created_at"],
        reply_count=props["reply_count"],
        type=props["type"],
        author_id=props["author_id"],
    )
    graph.create(node)

In [10]:
# Upsert one User node per distinct user record, matching on label "User" and key "id".
# NOTE(review): no uniqueness constraint on User.id is created here — confirm whether
# one is set up elsewhere.
for user_items in unique_users:
    # Each set member is a frozenset of (key, value) pairs; rebuild the mapping.
    fields = dict(user_items)
    node = Node(
        "User",
        id=fields["id"],
        username=fields["username"],
        followers_count=fields["followers_count"],
    )
    graph.merge(node, "User", "id")

In [11]:
print("tweet begin")
# Link each user to the tweets they authored whose type is none of
# 'retweeted', 'quoted' or 'replied_to'. The adjacent literals form one query string.
graph.run(
    "MATCH (u: User),(t: Tweet) "
    "WHERE u.id = t.author_id and t.type<> 'retweeted' and t.type<> 'quoted' and t.type<> 'replied_to' "
    "MERGE (u)-[r:TWEETED]->(t) "
    "return count(r)"
)

tweet begin


count(r)
8477


In [12]:
print("retweet begin")
# Link each user to the tweets they authored whose type is 'retweeted'.
# The adjacent literals form one query string.
graph.run(
    "MATCH (u: User),(t: Tweet) "
    "WHERE u.id = t.author_id and t.type = 'retweeted' "
    "MERGE (u)-[r:RETWEETED]->(t) "
    "return count(r)"
)

retweet begin


count(r)
22887


In [13]:
print("quote begin")
# Link each user to the tweets they authored whose type is 'quoted'.
# The adjacent literals form one query string.
graph.run(
    "MATCH (u: User),(t: Tweet) "
    "WHERE u.id = t.author_id and t.type = 'quoted' "
    "MERGE (u)-[r:QUOTED]->(t) "
    "return count(r)"
)

quote begin


count(r)
749


In [14]:
print("reply begin")
# Link each user to the tweets they authored whose type is 'replied_to'.
# The adjacent literals form one query string.
graph.run(
    "MATCH (u: User),(t: Tweet) "
    "WHERE u.id = t.author_id and t.type = 'replied_to' "
    "MERGE (u)-[r:REPLIED_TO]->(t) "
    "return count(r)"
)

reply begin


count(r)
1110


In [15]:

# Walk the decoded documents again and create the relationships between tweets,
# users, hashtags and links, plus MENTIONS relationships between users.

def _has_usable_values(record):
    """Return True when every value in ``record`` is usable as a lookup key.

    A value is rejected when it is falsy (e.g. None or an empty string), is
    NaN, or is the string "nan".
    """
    return all(bool(v) and not pd.isna(v) and v != "nan" for v in record.values())


# Cypher templates; each MERGE creates the relationship only if it does not exist yet.
TWEET_HAS_HASHTAG = "MATCH (t: Tweet),(h: Hashtag) WHERE h.tag = $tag1 and t.id = $tweetId MERGE (t)-[r:HAS_HASHTAG]->(h) return count(r)"
USER_USED_HASHTAG = "MATCH (u: User),(h: Hashtag) WHERE h.tag = $tag1 and u.id = $userId MERGE (u)-[r:USED_HASHTAG]->(h) return count(r)"
TWEET_HAS_URL = "MATCH (t: Tweet),(l: Link) WHERE l.url = $url1 and t.id = $tweetId MERGE (t)-[r:HAS_URL]->(l) return count(r)"
USER_USED_URL = "MATCH (u: User),(l: Link) WHERE l.url = $url1 and u.id = $userId MERGE (u)-[r:USED_URL]->(l) return count(r)"
USER_COUNT_BY_ID = "MATCH (u:User {id: $id1}) return count(u.id)"
USER_MENTIONS_USER = "MATCH (u1:User),(u2:User) WHERE u1.id = $user1Id and u2.id = $user2Id MERGE (u1)-[r:MENTIONS]->(u2) return count(r)"

for item in data:
    # Only the first included tweet of a document is used.
    src_tweet = item['includes']['tweets'][0]
    tweet = {"id": src_tweet['id'], "author_id": src_tweet['author_id']}

    # Skip tweets whose id or author id is empty / NaN / "nan".
    if _has_usable_values(tweet):
        entities = src_tweet.get('entities', {})

        for hashtag in entities.get('hashtags', []):
            if 'tag' in hashtag:
                # Hashtag nodes are stored lower-cased, so match on the lower-cased tag.
                tag = hashtag['tag'].lower()
                graph.run(TWEET_HAS_HASHTAG, tag1=tag, tweetId=tweet["id"]).data()
                graph.run(USER_USED_HASHTAG, tag1=tag, userId=tweet["author_id"]).data()

        for link in entities.get('urls', []):
            if 'expanded_url' in link:
                url = link['expanded_url']
                graph.run(TWEET_HAS_URL, url1=url, tweetId=tweet["id"]).data()
                graph.run(USER_USED_URL, url1=url, userId=tweet["author_id"]).data()

    # Only the first included user of a document is used as the mentioning user.
    user = {"id": item['includes']['users'][0]['id']}

    # Skip users whose id is empty / NaN / "nan".
    if _has_usable_values(user):
        # Mentions are read from the document's 'data' section.
        mentions = item['data'].get('entities', {}).get('mentions', [])
        for mention in mentions:
            if 'id' not in mention:
                continue
            mentions_id = mention['id']
            mentions_username = mention['username']

            # Create the mentioned user (id and username only) when no User node with
            # that id exists yet; existing User nodes are left untouched.
            userExists = graph.run(USER_COUNT_BY_ID, id1=mentions_id).data()
            if userExists[0]["count(u.id)"] == 0:
                node = Node("User", id=mentions_id, username=mentions_username)
                graph.merge(node, "User", "id")

            graph.run(USER_MENTIONS_USER, user1Id=user["id"], user2Id=mentions_id).data()

In [16]:
# Report the wall-clock time, in seconds, that has passed since start_time was recorded.
end_time = time.time()  # set the end time
elapsed_time = end_time - start_time  # calculate the elapsed time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 7147.115245580673 seconds


# Questions & Answers

### 1. Get the total number of tweets

In [17]:
# Count all Tweet nodes. The adjacent literals form one query string.
query1 = graph.run(
    "MATCH (t:Tweet) "
    "RETURN count(t) AS total_tweets"
)
print(query1)

 total_tweets 
--------------
        33223 



### 3. Get the total number of hashtags (case insensitive)

In [18]:
# Count all Hashtag nodes. The adjacent literals form one query string.
query3 = graph.run(
    "MATCH (h:Hashtag) "
    "RETURN count(h) AS total_hashtags"
)
print(query3)

 total_hashtags 
----------------
          11404 



### 5. Get the 20 most popular URLs in descending order

### 7. Get the followers count of each user

In [11]:
# Fetch username and followers_count for every User that has a followers_count,
# highest count first.
query7 = graph.run("""
MATCH (u:User) 
WHERE u.followers_count is not null  
RETURN u.username AS username, u.followers_count AS followers_count
ORDER BY u.followers_count DESC
""")

# One [username, followers_count] row per returned record.
table = [[record['username'], record['followers_count']] for record in query7]

# Render the rows as an org-mode style text table.
print(tabulate(table, headers=['username', 'followers_count'], tablefmt='orgtbl'))

| username        |   followers_count |
|-----------------+-------------------|
| thekiranbedi    |          12185146 |
| UNHumanRights   |           4022080 |
| UNESCO          |           3675313 |
| UN_Women        |           2242549 |
| bsindia         |           2232593 |
| glamourmag      |           1302202 |
| ELLEINDIA       |           1235302 |
| CNBCTV18News    |           1089824 |
| BLACKPINKGLOBAL |           1000438 |
| khaleejtimes    |            974472 |
| UNGeneva        |            899576 |
| thebetterindia  |            796657 |
| iownjd          |            795479 |
| UN_News_Centre  |            709651 |
| OECD            |            694189 |
| MPTourism       |            642323 |
| threadreaderapp |            636172 |
| startupindia    |            590581 |
| unwomenindia    |            578258 |
| UNPeacekeeping  |            569547 |
| amitabhk87      |            566904 |
| skylarclouds    |            545827 |
| CarolNdosi      |            542696 |


### 9. Get the number of tweets & retweets per hour

In [19]:
# Count original tweets and retweets per hour of day in a single pass.
# Every TWEETED / RETWEETED relationship is bucketed by the hour of the tweet's
# created_at value and the two kinds are summed separately. Unlike matching the two
# patterns one after the other and joining on the hour, an hour that has only tweets
# or only retweets is still reported (with 0 in the other column).
query9 = graph.run("""
MATCH (u:User)-[r:TWEETED|RETWEETED]->(t:Tweet)
WHERE t.created_at IS NOT NULL
WITH datetime(t.created_at).hour AS hour, type(r) AS rel_type
RETURN hour,
       sum(CASE WHEN rel_type = 'TWEETED' THEN 1 ELSE 0 END) AS total_tweets,
       sum(CASE WHEN rel_type = 'RETWEETED' THEN 1 ELSE 0 END) AS total_retweets,
       count(*) AS total
ORDER BY hour
""")

# One [hour, total_tweets, total_retweets, total] row per returned record.
table = [
    [record['hour'], record['total_tweets'], record['total_retweets'], record['total']]
    for record in query9
]

# Render the rows as an org-mode style text table.
print(tabulate(table, headers=['hour', 'total_tweets', 'total_retweets', 'total'], tablefmt='orgtbl'))

|   hour |   total_tweets |   total_retweets |   total |
|--------+----------------+------------------+---------|
|      0 |            258 |              743 |    1001 |
|      1 |            192 |              673 |     865 |
|      2 |            193 |              588 |     781 |
|      3 |            274 |              732 |    1006 |
|      4 |            228 |              708 |     936 |
|      5 |            276 |              729 |    1005 |
|      6 |            303 |              838 |    1141 |
|      7 |            304 |              807 |    1111 |
|      8 |            362 |              924 |    1286 |
|      9 |            291 |              943 |    1234 |
|     10 |            356 |             1004 |    1360 |
|     11 |            370 |             1138 |    1508 |
|     12 |            434 |             1137 |    1571 |
|     13 |            470 |             1214 |    1684 |
|     14 |            550 |             1260 |    1810 |
|     15 |            612 |    

### 11. Get the user with the most replies

In [9]:
# Count REPLIED_TO relationships per user and keep only the user with the highest count.
top_replier_cypher = """
MATCH (u:User)-[r:REPLIED_TO]->(t:Tweet) 
RETURN u.username , count(r) as reply_count
ORDER BY count(r) DESC LIMIT 1
"""
query11 = graph.run(top_replier_cypher)

print(query11)

 u.username  | reply_count 
-------------|-------------
 FatmaHasimm |          54 



### 13. Get the top-20 hashtags that co-occur with the hashtag that has been used the most

### 15. Get the most “important” user in the dataset (use graph algorithms: PageRank, betweenness centrality, etc.). You will apply these algorithms to the mention network (which includes retweets)

In [22]:
# Drop any earlier projection with this name first so the cell can be re-run:
# projecting fails when a graph called 'mentionGraph3' already exists in the catalog.
# The second argument (failIfMissing = false) makes the drop a no-op when it does not.
graph.run("CALL gds.graph.drop('mentionGraph3', false) YIELD graphName RETURN graphName").data()

# Project the mention network: all User nodes, connected by MENTIONS relationships.
# NOTE(review): e.weight is projected as the relationship weight — confirm that
# MENTIONS relationships actually carry a 'weight' property.
query15_1 = graph.run("""
CALL gds.graph.project.cypher(
    'mentionGraph3',
    'MATCH (u:User) RETURN id(u) AS id',
    'MATCH (u:User)-[e:MENTIONS]->(m:User) 
     RETURN id(u) AS source, e.weight AS weight, id(m) AS target'
)
""")

### PageRank

In [80]:
def _print_pagerank_ranking(cursor, heading):
    """Print the ranking contained in a PageRank query result.

    cursor  -- iterable of records exposing a 'name' field, already ordered by
               descending score.
    heading -- text printed before the ranking (describes the parameters used).

    Returns the name of the top-ranked user, or None when the result is empty
    (instead of raising StopIteration).
    """
    names = [record['name'] for record in cursor]
    print(heading)
    if not names:
        print("No users returned by PageRank.")
        return None
    print(f"Most important user: {names[0]}")
    print("Next 4 most important users:")
    print(names[1:])
    return names[0]


# PageRank with the procedure's default settings (no configuration map passed).
query15_2 = graph.run("""
CALL gds.pageRank.stream('mentionGraph3') 
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).username AS name, score
ORDER BY score DESC LIMIT 5
""")

first_user = _print_pagerank_ranking(query15_2, "maxIterations = 20, dampingFactor = 0.85\n")


# Same ranking with different PageRank parameters.
query15_3 = graph.run("""
CALL gds.pageRank.stream('mentionGraph3', {
  maxIterations: 200,
  dampingFactor: 0.9
})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).username AS name, score
ORDER BY score DESC LIMIT 5
""")

first_user = _print_pagerank_ranking(query15_3, "\n\nmaxIterations = 200, dampingFactor = 0.9\n")


maxIterations = 20, dampingFactor = 0.85

Most important user: crazydollsnft
Next 4 most important users:
['cryptosanthoshK', 'CatherineAdenle', 'GirlsWhoCode', 'conscientious1o']


maxIterations = 200, dampingFactor = 0.9

Most important user: crazydollsnft
Next 4 most important users:
['CatherineAdenle', 'cryptosanthoshK', 'CareerEmporium', 'conscientious1o']


### 17. For the 5th most important user, get the list of hashtags and URLs that they have posted (if there are no hashtags or URLs, check another user, e.g. the 6th, 7th, etc.)

### 19. Get the user communities that have been created based on the users’ interactions and visualise them (Louvain algorithm)