In [1]:
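# Prerequisites (install once into the kernel's environment): %pip install py2neo pandas pymongo
# NOTE(review): `bson.decode_all` appears to come from the `bson` package bundled with PyMongo — confirm for this environment.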

In [2]:
import os
import time

import bson
import pandas as pd
from py2neo import Graph, Node

In [3]:
# Record the wall-clock start so the total run time can be reported in the last cell.
start_time = time.time()

In [4]:
# Read the whole BSON dump and decode it into a list of documents.
# The context manager closes the file even if reading or decoding raises.
with open('WDM1.bson', 'rb') as bson_file:
    data = bson.decode_all(bson_file.read())

In [6]:
# Connect to Neo4j over Bolt. The connection settings can be overridden through
# the NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD environment variables so that
# credentials do not have to be edited into the notebook; when a variable is
# unset, the original local development value is used as the fallback.
graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "twitterdb"),
    ),
)

In [7]:
# Collect the distinct hashtags, URLs, tweets and users found in the decoded
# BSON documents. Tweets and users are stored as frozensets of (key, value)
# pairs so that the records are hashable and can be de-duplicated by a set.
unique_tags = set()
unique_urls = set()
unique_tweets = set()
unique_users = set()

# `data` is the list of documents produced by bson.decode_all. Only the first
# entry of includes.tweets and of includes.users is read from each document.
for item in data:
    first_tweet = item['includes']['tweets'][0]
    first_user = item['includes']['users'][0]
    entities = first_tweet.get('entities', {})

    # Hashtags are lower-cased so differently-cased spellings collapse into one entry.
    for hashtag in entities.get('hashtags', []):
        if 'tag' in hashtag:
            tag = hashtag['tag'].lower()
            unique_tags.add(tag)

    # URLs are keyed by their expanded form.
    for url_entity in entities.get('urls', []):
        if 'expanded_url' in url_entity:
            unique_urls.add(url_entity['expanded_url'])

    # Tweet record. A missing reply count or missing/empty referenced_tweets
    # list is represented by the string placeholder "nan".
    tweet_metrics = first_tweet.get('public_metrics', {})
    referenced_tweets = first_tweet.get('referenced_tweets')
    tweet = {
        "id": first_tweet['id'],
        "created_at": first_tweet['created_at'],
        "author_id": first_tweet['author_id'],
        "reply_count": tweet_metrics.get('reply_count', "nan"),
        "type": referenced_tweets[0]['type'] if referenced_tweets else "nan",
    }
    unique_tweets.add(frozenset(tweet.items()))

    # User record, using the same "nan" placeholder for a missing follower count.
    user_metrics = first_user.get('public_metrics', {})
    user = {
        "id": first_user['id'],
        "username": first_user['username'],
        "followers_count": user_metrics.get('followers_count', "nan"),
    }
    unique_users.add(frozenset(user.items()))



In [8]:
# Persist every distinct hashtag as its own "Hashtag" node (one create call per tag).
for tag in unique_tags:
    node = Node("Hashtag", **{"tag": tag})
    graph.create(node)

In [9]:
# Persist every distinct expanded URL as its own "Link" node (one create call per URL).
for url in unique_urls:
    node = Node("Link", **{"url": url})
    graph.create(node)

In [10]:
# Persist every distinct tweet record as a "Tweet" node.
for tweet in unique_tweets:
    # Each entry is a frozenset of (key, value) pairs; rebuild the mapping from it.
    tweet_dict = dict(tweet)
    tweet_properties = {
        key: tweet_dict[key]
        for key in ("id", "created_at", "reply_count", "type", "author_id")
    }
    node = Node("Tweet", **tweet_properties)
    graph.create(node)

In [11]:
# Create (or update) a "User" node for each collected user record.

# Enforce uniqueness of User.id before merging. IF NOT EXISTS makes the
# statement safe to run again when the cell is re-executed against a database
# that already holds the constraint.
graph.run("CREATE CONSTRAINT constraint_user_id IF NOT EXISTS FOR (u:User) REQUIRE (u.id) IS UNIQUE")

for user in unique_users:
    # Each entry is a frozenset of (key, value) pairs; rebuild the mapping from it.
    user_dict = dict(user)
    node = Node(
        "User",
        id=user_dict["id"],
        username=user_dict["username"],
        followers_count=user_dict["followers_count"],
    )
    # Merge on (label "User", key "id") so a user id is never stored twice.
    graph.merge(node, "User", "id")

In [12]:
# Link each user to the tweets they authored whose type is none of
# 'retweeted', 'quoted' or 'replied_to', using a TWEETED relationship.
# The final expression is left bare so the notebook displays the returned count.
tweeted_query = "MATCH (u: User),(t: Tweet) WHERE u.id = t.author_id and t.type<> 'retweeted' and t.type<> 'quoted' and t.type<> 'replied_to' MERGE (u)-[r:TWEETED]->(t) return count(r)"
print("tweet begin")
graph.run(tweeted_query)

tweet begin


count(r)
8477


In [13]:
# Link each user to the tweets they authored whose type is 'retweeted',
# using a RETWEETED relationship.
# The final expression is left bare so the notebook displays the returned count.
retweeted_query = "MATCH (u: User),(t: Tweet) WHERE u.id = t.author_id and t.type = 'retweeted' MERGE (u)-[r:RETWEETED]->(t) return count(r)"
print("retweet begin")
graph.run(retweeted_query)

retweet begin


count(r)
22887


In [14]:
# Link each user to the tweets they authored whose type is 'quoted',
# using a QUOTED relationship.
# The final expression is left bare so the notebook displays the returned count.
quoted_query = "MATCH (u: User),(t: Tweet) WHERE u.id = t.author_id and t.type = 'quoted' MERGE (u)-[r:QUOTED]->(t) return count(r)"
print("quote begin")
graph.run(quoted_query)

quote begin


count(r)
749


In [15]:
# Link each user to the tweets they authored whose type is 'replied_to',
# using a REPLIED_TO relationship.
# The final expression is left bare so the notebook displays the returned count.
replied_to_query = "MATCH (u: User),(t: Tweet) WHERE u.id = t.author_id and t.type = 'replied_to' MERGE (u)-[r:REPLIED_TO]->(t) return count(r)"
print("reply begin")
graph.run(replied_to_query)

reply begin


count(r)
1110


In [16]:

# Walk the decoded documents again and connect the existing nodes:
#   Tweet -[HAS_HASHTAG]-> Hashtag,  User -[USED_HASHTAG]-> Hashtag
#   Tweet -[HAS_URL]-> Link,         User -[USED_URL]-> Link
#   User  -[MENTIONS]-> User (creating the mentioned user if it is missing)


def _is_present(value):
    """Return True when value is non-empty, not NaN/None and not the "nan" placeholder."""
    return bool(value) and not pd.isna(value) and value != "nan"


for item in data:
    first_tweet = item['includes']['tweets'][0]

    # Identifiers used to look up the Tweet node and its author's User node.
    tweet = {"id": first_tweet['id'], "author_id": first_tweet['author_id']}

    # Only link when both identifiers are usable.
    if all(_is_present(value) for value in tweet.values()):
        entities = first_tweet.get('entities', {})

        for hashtag in entities.get('hashtags', []):
            if 'tag' in hashtag:
                # Lower-cased to match the way Hashtag nodes were stored.
                tag = hashtag['tag'].lower()
                # .data() is kept so each result is read to completion, as before.
                graph.run("MATCH (t: Tweet),(h: Hashtag) WHERE h.tag = $tag1 and t.id = $tweetId MERGE (t)-[r:HAS_HASHTAG]->(h) return count(r)", tag1=tag, tweetId=tweet["id"]).data()
                graph.run("MATCH (u: User),(h: Hashtag) WHERE h.tag = $tag1 and u.id = $userId MERGE (u)-[r:USED_HASHTAG]->(h) return count(r)", tag1=tag, userId=tweet["author_id"]).data()

        for url_entity in entities.get('urls', []):
            if 'expanded_url' in url_entity:
                # Use this entity's own URL; the previous code passed a stale
                # global `url` left over from the Link-creation loop, so every
                # relationship pointed at the same Link node.
                url = url_entity['expanded_url']
                graph.run("MATCH (t: Tweet),(l: Link) WHERE l.url = $url1 and t.id = $tweetId MERGE (t)-[r:HAS_URL]->(l) return count(r)", url1=url, tweetId=tweet["id"]).data()
                graph.run("MATCH (u: User),(l: Link) WHERE l.url = $url1 and u.id = $userId MERGE (u)-[r:USED_URL]->(l) return count(r)", url1=url, userId=tweet["author_id"]).data()

    # The mentioning user is the first entry of includes.users.
    user = {"id": item['includes']['users'][0]['id']}

    if all(_is_present(value) for value in user.values()):
        mentions = item['data'].get('entities', {}).get('mentions', [])
        for mention in mentions:
            if 'id' in mention:
                mentions_id = mention['id']
                mentions_username = mention['username']

                # Create the mentioned user only when no User node has this id,
                # so an existing node's properties are left untouched.
                userExists = graph.run("MATCH (u:User {id: $id1}) return count(u.id)", id1=mentions_id).data()
                if userExists[0]["count(u.id)"] == 0:
                    node = Node("User", id=mentions_id, username=mentions_username)
                    graph.merge(node, "User", "id")
                graph.run("MATCH (u1:User),(u2:User) WHERE u1.id = $user1Id and u2.id = $user2Id MERGE (u1)-[r:MENTIONS]->(u2) return count(r)", user1Id=user["id"], user2Id=mentions_id).data()

In [17]:
# Report the wall-clock time elapsed since start_time was recorded in an earlier cell.
end_time = time.time()  # timestamp taken once all preceding cells have run
elapsed_time = end_time - start_time  # duration in seconds (difference of two time.time() values)
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 4755.683638811111 seconds
