In [1]:
# Dependency: py2neo. If it is missing, run `%pip install py2neo` once (%pip targets this kernel's environment).

In [2]:
import os
import time

import pandas as pd
from py2neo import Graph, Node

In [3]:
# Wall-clock start of the run; the last cell subtracts this from its own
# time.time() reading to report the elapsed time.
start_time = time.time()

In [4]:
# Neo4j connection settings. Each value is read from the environment when set;
# the literals are the previously hard-coded values, kept only as fallbacks so
# existing local setups keep working. Prefer exporting NEO4J_PASSWORD over
# relying on the fallback, so credentials stay out of the notebook.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "twitterdb")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [5]:
# Load the flattened tweet export and keep only its first MAX_ROWS rows.
MAX_ROWS = 1300
df = pd.read_csv("WDM1.csv", low_memory=False).head(MAX_ROWS)

In [6]:
# Number of rows that will be processed.
len(df)

1300

In [7]:
# Collect the distinct hashtags, links, tweets and users found in the rows.
# Tweets and users are stored as frozensets of (key, value) pairs: the per-row
# dicts are not hashable, the frozensets are, so exact duplicates collapse.
unique_tags = set()
unique_urls = set()
unique_tweets = set()
unique_users = set()

# Number of flattened hashtag / url columns probed per row.
MAX_HASHTAGS = 25
MAX_URLS = 3

for _, row in df.iterrows():

    # row.get returns None for an absent column and an empty cell is NaN (a
    # float), so keeping only str values drops both kinds of gap.
    raw_tags = (row.get(f"includes.tweets.0.entities.hashtags.{i}.tag") for i in range(MAX_HASHTAGS))
    raw_urls = (row.get(f"includes.tweets.0.entities.urls.{i}.expanded_url") for i in range(MAX_URLS))

    # Hashtags are lower-cased so that case variants share one entry.
    unique_tags.update(tag.lower() for tag in raw_tags if isinstance(tag, str))
    unique_urls.update(url for url in raw_urls if isinstance(url, str))

    # Tweet: skipped when the row carries no tweet id. Without this check
    # str(NaN) would yield the literal id "nan" and a placeholder tweet entry.
    tweet_id = row.get("includes.tweets.0.id")
    if tweet_id is not None and pd.notna(tweet_id):
        tweet = {
            "id": str(tweet_id),
            "created_at": row.get("includes.tweets.0.created_at"),
            "reply_count": row.get("includes.tweets.0.public_metrics.reply_count"),
            # str() is deliberate here: a tweet with no referenced tweet gets
            # the literal type "nan", and the TWEETED query later in this
            # notebook selects tweets by comparing t.type with <>.
            "type": str(row.get("includes.tweets.0.referenced_tweets.0.type")),
            "author_id": str(row.get("includes.tweets.0.author_id")),
        }
        unique_tweets.add(frozenset(tweet.items()))

    # User: same rule, a row without a user id contributes no user.
    user_id = row.get("includes.users.0.id")
    if user_id is not None and pd.notna(user_id):
        user = {
            "id": str(user_id),
            "username": str(row.get("includes.users.0.username")),
            "followers_count": row.get("includes.users.0.public_metrics.followers_count"),
        }
        unique_users.add(frozenset(user.items()))

In [8]:
# Ensure one Hashtag node exists per distinct tag.
# merge() keyed on `tag` (rather than create()) keeps this cell idempotent:
# re-running it against a populated database does not add a second copy of
# every hashtag, which would also multiply the matches of later queries.
for tag in unique_tags:
    graph.merge(Node("Hashtag", tag=tag), "Hashtag", "tag")

In [9]:
# Ensure one Link node exists per distinct url.
# merge() keyed on `url` (rather than create()) keeps this cell idempotent:
# re-running it against a populated database does not add a second copy of
# every link, which would also multiply the matches of later queries.
for url in unique_urls:
    graph.merge(Node("Link", url=url), "Link", "url")

In [10]:
# Insert every collected tweet as a Tweet node.
for tweet_items in unique_tweets:
    # Each entry is a frozenset of (key, value) pairs; rebuild the mapping.
    fields = dict(tweet_items)
    tweet_node = Node(
        "Tweet",
        id=fields["id"],
        created_at=fields["created_at"],
        reply_count=fields["reply_count"],
        type=fields["type"],
        author_id=str(fields["author_id"]),
    )
    graph.create(tweet_node)

In [11]:
# Make User.id unique before inserting users. IF NOT EXISTS turns the statement
# into a no-op when the constraint is already present, so re-running the
# notebook no longer fails on this cell.
graph.run("CREATE CONSTRAINT constraint_user_id IF NOT EXISTS FOR (u:User) REQUIRE (u.id) IS UNIQUE")

# Create (or update) one User node per collected user, keyed on `id`.
for user in unique_users:
    # Each entry is a frozenset of (key, value) pairs; rebuild the mapping.
    user_dict = dict(user)
    node = Node(
        "User",
        id=str(user_dict["id"]),
        username=user_dict["username"],
        followers_count=user_dict["followers_count"],
    )
    graph.merge(node, "User", "id")

In [12]:
print("tweet begin")
# Connect each user to the tweets they authored whose type is none of
# retweeted / quoted / replied_to, and show how many pairs were matched.
tweeted_query = (
    "MATCH (u: User),(t: Tweet) "
    "WHERE u.id = t.author_id and t.type<> 'retweeted' and t.type<> 'quoted' and t.type<> 'replied_to' "
    "MERGE (u)-[r:TWEETED]->(t) return count(r)"
)
graph.run(tweeted_query)

tweet begin


count(r)
311


In [13]:
print("retweet begin")
# Connect each user to the tweets they authored whose type is 'retweeted',
# and show how many pairs were matched.
retweeted_query = (
    "MATCH (u: User),(t: Tweet) "
    "WHERE u.id = t.author_id and t.type = 'retweeted' "
    "MERGE (u)-[r:RETWEETED]->(t) return count(r)"
)
graph.run(retweeted_query)

retweet begin


count(r)
941


In [14]:
print("quote begin")
# Connect each user to the tweets they authored whose type is 'quoted',
# and show how many pairs were matched.
quoted_query = (
    "MATCH (u: User),(t: Tweet) "
    "WHERE u.id = t.author_id and t.type = 'quoted' "
    "MERGE (u)-[r:QUOTED]->(t) return count(r)"
)
graph.run(quoted_query)

quote begin


count(r)
12


In [15]:
print("reply begin")
# Connect each user to the tweets they authored whose type is 'replied_to',
# and show how many pairs were matched.
replied_query = (
    "MATCH (u: User),(t: Tweet) "
    "WHERE u.id = t.author_id and t.type = 'replied_to' "
    "MERGE (u)-[r:REPLIED_TO]->(t) return count(r)"
)
graph.run(replied_query)

reply begin


count(r)
36


In [19]:

# Link tweets and users to their hashtags and links, and users to the accounts
# they mention. Every relationship is MERGEd, so repeated rows (or a re-run of
# this cell) do not duplicate edges.

# Number of flattened hashtag / url / mention columns probed per row.
MAX_HASHTAGS = 25
MAX_URLS = 3
MAX_MENTIONS = 6


def _text_or_none(value):
    """Return ``str(value)``, or None when the cell is missing (None or NaN).

    The missing-value test runs on the raw cell *before* stringifying; once a
    NaN has become the string "nan" (or None the string "None") it can no
    longer be told apart from real text.

    NOTE(review): if pandas parsed an id column as float64, str() may produce
    exponent notation -- confirm the id columns of the CSV load as int or str.
    """
    if value is None or pd.isna(value):
        return None
    return str(value)


for _, row in df.iterrows():

    tweet_id = _text_or_none(row.get("includes.tweets.0.id"))
    author_id = _text_or_none(row.get("includes.tweets.0.author_id"))

    # Hashtag / link edges need at least one endpoint id from the row; each
    # query is only issued when the id it matches on is actually present.
    if tweet_id is not None or author_id is not None:
        for j in range(MAX_HASHTAGS):
            tag = _text_or_none(row.get(f"includes.tweets.0.entities.hashtags.{j}.tag"))
            if tag is None:
                continue
            tag = tag.lower()  # Hashtag nodes are stored lower-cased
            if tweet_id is not None:
                graph.run("MATCH (t: Tweet),(h: Hashtag) WHERE h.tag = $tag1 and t.id = $tweetId MERGE (t)-[r:HAS_HASHTAG]->(h) return count(r)", tag1=tag, tweetId=tweet_id).data()
            if author_id is not None:
                graph.run("MATCH (u: User),(h: Hashtag) WHERE h.tag = $tag1 and u.id = $userId MERGE (u)-[r:USED_HASHTAG]->(h) return count(r)", tag1=tag, userId=author_id).data()

        for j in range(MAX_URLS):
            url = _text_or_none(row.get(f"includes.tweets.0.entities.urls.{j}.expanded_url"))
            if url is None:
                continue
            if tweet_id is not None:
                graph.run("MATCH (t: Tweet),(l: Link) WHERE l.url = $url1 and t.id = $tweetId MERGE (t)-[r:HAS_URL]->(l) return count(r)", url1=url, tweetId=tweet_id).data()
            if author_id is not None:
                graph.run("MATCH (u: User),(l: Link) WHERE l.url = $url1 and u.id = $userId MERGE (u)-[r:USED_URL]->(l) return count(r)", url1=url, userId=author_id).data()

    # Mentions: edges go from the row's user to every mentioned account.
    user_id = _text_or_none(row.get("includes.users.0.id"))
    if user_id is None:
        continue

    for j in range(MAX_MENTIONS):
        # row.get (not row[...]) so an absent mention column is treated as
        # "no mention" instead of raising KeyError.
        mention_id = _text_or_none(row.get(f"data.entities.mentions.{j}.id"))
        if mention_id is None:
            continue
        mention_username = _text_or_none(row.get(f"data.entities.mentions.{j}.username"))

        # Create the mentioned user only if it does not exist yet; an existing
        # node keeps its properties. A single MERGE replaces the former
        # "count, then create" pair of round-trips.
        graph.run("MERGE (u:User {id: $id1}) ON CREATE SET u.username = $username return count(u)", id1=mention_id, username=mention_username).data()
        graph.run("MATCH (u1:User),(u2:User) WHERE u1.id = $user1Id and u2.id = $user2Id MERGE (u1)-[r:MENTIONS]->(u2) return count(r)", user1Id=user_id, user2Id=mention_id).data()


In [17]:
# Report the wall-clock duration of the run, measured from the start_time
# recorded near the top of the notebook.
end_time = time.time()  # wall-clock reading at the end of the run
elapsed_time = end_time - start_time  # seconds, as a float
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 30.860256910324097 seconds
