In [1]:
import py2neo
import pandas as pd
import numpy as np

port = input("Enter Neo4j DB Bolt port: ")
user = input("Enter Neo4j DB Username: ")
pswd = input("Enter Neo4j DB Password: ")
# graph = Graph("bolt://localhost:7687", auth=("neo4j", "neo4j_auth")

graph = py2neo.Graph(port, auth=(user, pswd))

Get the total number of retweets

In [2]:
query1 = "MATCH p=()-[r:RETWEETED]->() RETURN COUNT(*)"
retweets = graph.run(query1).evaluate()
print("The total number of retweets is:", retweets)

The total number of retweets is: 24252


Get the 20 most popular hashtags (case insensitive) in descending order


In [3]:
query2 = """
    MATCH p=()-[r:HAS_HASHTAG]->(n:Hashtag)
    RETURN n.tag AS popular_hashtags, COUNT(*) AS frequency
    ORDER BY frequency DESC
    LIMIT 20
"""
pop_hashtags = graph.run(query2).to_data_frame()
print('The 20 most popular hashtags in descending order are:')
print(pop_hashtags)


The 20 most popular hashtags in descending order are:
      popular_hashtags  frequency
0          womenintech       4506
1     womenempowerment       3736
2       genderequality       3205
3        100daysofcode       1018
4         womenwhocode        990
5                women        931
6                   ai        877
7               coding        855
8             violence        815
9          datascience        745
10         womeninstem        717
11              python        690
12      womeninscience        689
13                tech        684
14         programming        611
15          javascript        562
16  womenentrepreneurs        523
17             bigdata        518
18          technology        503
19    blacktechtwitter        477


Get the total number of URLs (unique)


In [4]:
query3 = "MATCH (n:URL) RETURN COUNT(n)"
urls = graph.run(query3).evaluate()
print("The total number of urls is:", urls)

The total number of urls is: 9136


Get the 20 users with most followers in descending order


In [5]:
query4 = """
    MATCH (n:User)
    WHERE n.followers IS NOT NULL
    RETURN n.username AS username, n.followers as followers
    ORDER BY followers DESC
    LIMIT 20
"""
pop_users = graph.run(query4).to_data_frame()
print('The 20 users with most followers in descending order are:')
print(pop_users)

The 20 users with most followers in descending order are:
           username  followers
0          elonmusk  128282742
1      narendramodi   86387842
2               CNN   61110504
3        MileyCyrus   46930879
4             POTUS   29400524
5           FoxNews   23704890
6            Forbes   18710133
7               ICC   18401325
8              ndtv   17700505
9                UN   16213190
10      smritiirani   12724316
11  harbhajan_singh   11848884
12      PiyushGoyal   11685985
13         TimesNow   10325121
14          binance   10164724
15     TheDailyShow    9499257
16              ANI    7554914
17           snooki    5943793
18       Mike_Pence    5823821
19        Cobratate    4996182


Get the hour with the most tweets and retweets


In [6]:
query5 = """
    MATCH (u:User)-[:TWEETED]->(t:Tweet)
    WITH u, t, substring(t.created_at, 11, 2) AS hour
    RETURN u, t, toInteger(hour) AS tweet_hour
    UNION
    MATCH (u:User)-[:RETWEETED]->(t:Tweet)
    WITH u, t, substring(t.created_at, 11, 2) AS hour
    RETURN u, t, toInteger(hour) AS tweet_hour
"""
tweet_hour_df = graph.run(query5).to_data_frame()
hour_count = tweet_hour_df.groupby(['tweet_hour']).count()['u'].values
ind = np.argmax(hour_count)
print('The  hour with the most tweets and retweets is the', ind, 'th')


The  hour with the most tweets and retweets is the 15 th


In [9]:
Get the 20 users, in descending order, that have been mentioned the most


SyntaxError: invalid syntax (491170735.py, line 1)

In [11]:
query6 = """
    MATCH p=()-[r:MENTIONED]->(n:User)
    RETURN n.username AS username, COUNT(*) AS number_of_mentions
    ORDER BY number_of_mentions DESC
    LIMIT 20
"""
most_mentioned_users = graph.run(query6).to_data_frame()
print('The 20 users, in descending order, that have been mentioned the most are:')
print(most_mentioned_users)


The 20 users, in descending order, that have been mentioned the most are:
           username  number_of_mentions
0         Microsoft                6132
1     anthonyjdella                6124
2      Equal_Fights                 988
3        FightHaven                 745
4        see_fullen                 737
5      GirlsWhoCode                 423
6      ToofaniBaba1                 307
7         NCMIndiaa                 246
8    Khulood_Almani                 233
9   NorthernComd_IA                 197
10  WomensVoicesNow                 177
11   HarishKhuranna                 157
12        BJP4Delhi                 157
13  Virend_Sachdeva                 157
14  jindadilkashmir                 156
15   sanjeevchadha8                 156
16        TMGAwards                 132
17         UN_Women                 125
18    PointerSchool                 122
19     emily_gunton                 121


In [15]:
Get the top 20 tweets that have been retweeted the most and the persons that posted them


SyntaxError: invalid syntax (3186667878.py, line 1)

In [16]:
query7 = """
    MATCH (n:Tweet)
    RETURN n.id as tweet_id, n.author as author_id, n.retweets as retweets
    ORDER BY retweets DESC
    LIMIT 20
"""
most_retweeted = graph.run(query7).to_data_frame()
print('The top 20 tweets that have been retweeted the most and the persons that posted them are:')
print(most_retweeted)

The top 20 tweets that have been retweeted the most and the persons that posted them are:
               tweet_id            author_id  retweets
0   1617735335408017410  1162333473966891008     23351
1   1617723820604854272  1499144181822136320     23351
2   1614904562577715202   736244975944400896     21311
3   1616739692153802753  1578001920169750533     21311
4   1615338234250407936  1477971375100887042     21311
5   1615254039788400642  1571833730783940609     21311
6   1617351171341180928           3271065654     21311
7   1616741044154175491  1120982776843399169     21311
8   1616748922441199621  1488570069689204736     21311
9   1616812458999681025           3093825314     21311
10  1615752926869192711  1542755697087102976     21311
11  1617535902116642818  1061286182313635842     21311
12  1617895018127314944  1587502588957806599     21311
13  1615988984060207110  1605845393081712640     21311
14  1614904554247815170   736244975944400896     11731
15  1617351148654178305       

Run PageRank on the mention network


In [17]:
query8_1 = """
CALL gds.graph.project.cypher(
    'mentionGraph',
    'MATCH (u:User) RETURN id(u) AS id',
    'MATCH (u:User)-[r:MENTIONED]-(u1:User) RETURN id(u) AS source, id(u1) AS target')
"""
query8_2 = """
    CALL gds.pageRank.stream('mentionGraph') 
    YIELD nodeId, score
    RETURN gds.util.asNode(nodeId).username AS username, score
    ORDER BY score DESC LIMIT 10"""

mention_graph = graph.run(query8_1)
PR = graph.run(query8_2).to_data_frame()
print("The 10 highest PageRank scores for the 'Mentioned' network are:", PR)
print("The most important user according to the highest PageRank value is:",
      PR.iloc[0].values[0])


ClientError: [Procedure.ProcedureCallFailed] Failed to invoke procedure `gds.graph.project.cypher`: Caused by: java.lang.IllegalArgumentException: A graph with name 'mentionGraph' already exists.

Get the 20 users with most similar hashtags to the 6th important user(as the others used no hashtags)


In [None]:
def jaccard_sim(list1, list2):
    '''compute Jaccard similarity of two sets'''
    intersec = len(set(list1).intersection(set(list2)))
    union = len(set(list1).union(set(list2)))
    if union > 0:
        return intersec / union
    else:
        return 0

def get_hashtags(name):
    '''get the hashtags used by 6th important user
    '''

    query = """
        MATCH (u:User)-[r:USED_HASHTAG]->(h:Hashtag)
        WHERE u.username = $name
        RETURN DISTINCT h.tag 
    """
    tags = graph.run(query, name=name)
    hashtags = [t["h.tag"] for t in tags]

    return hashtags

def get_most_similar_user(name):
    '''get the 20 users with most similar hashtags
    to the 6th important user'''

    user_tags = get_hashtags(name)
    query9 = '''MATCH (u:User)-[r:USED_HASHTAG]->(h:Hashtag)
        WHERE u.username <> $name
        RETURN u.username, COLLECT(h.tag) AS hashtags'''
    result = graph.run(query9, name=name)
    tag_sim, users = list(), list()
    for r in result:
        other_user = r["u.username"]
        other_tags = r["hashtags"]
        if len(other_tags) > 0:
            sim = jaccard_sim(user_tags, other_tags)
            tag_sim.append(sim)
            users.append(other_user)
    tag_sim, users = zip(*sorted(zip(tag_sim, users)))

    sim_user = users[-20:]
    print('Τhe 20 users who used most similar hashtags to the 6th important user are:', sim_user)

get_most_similar_user('ToofaniBaba1')

Get the community that each user belongs to in the MENTION graph according to Louvain algorithm


In [None]:
query10 = """
    CALL gds.louvain.stream('mentionGraph') 
    YIELD  nodeId, communityId
    RETURN gds.util.asNode(nodeId).username AS username, communityId"""
communities = graph.run(query10).to_data_frame()
print(communities)

Get the top 10 most active users along with the number of posts they have made.


In [None]:
query11 = """
    MATCH (u:User)-[:TWEETED]->(t:Tweet)
    WITH u, COUNT(t) AS number_of_posts
    RETURN u.username AS username, number_of_posts
    ORDER BY number_of_posts DESC
    LIMIT 10
"""
active_users = graph.run(query11).to_data_frame()
print('The top 10 most active users along with the number of posts they have made:')
print(active_users)

Get the volumes of each type of tweets (where None is a tweet)


In [None]:
query12 = """
    MATCH (t:Tweet)
    RETURN t.type AS type, COUNT(t) AS volume
"""
types = graph.run(query12).to_data_frame()
print('The volumes of each type of tweets are:')
print(types)
