In [148]:
import json
from datetime import datetime
import numpy as np
import pandas as pd

In [127]:
class Tweet:
    """
    A single Tweet containing contextual attributes.

    Built from one decoded tweet dictionary. Mandatory keys are read directly
    (a missing one raises ``KeyError``); optional keys fall back to empty lists.
    """
    def __init__(self, tweet_data):
        """Constructor to set the tweet attributes.

        Args:
            tweet_data: dict with at least ``id``, ``author_id``, ``text``,
                ``created_at`` and a ``public_metrics`` dict holding the five
                counters read below. ``referenced_tweets`` and ``entities``
                are optional.

        Raises:
            KeyError: if a mandatory field is missing.
            ValueError: if ``created_at`` does not match the format used by
                ``parse_timestamp``.
        """
        # mandatory fields
        self.id = tweet_data['id']
        self.author_id = tweet_data['author_id']
        metrics = tweet_data['public_metrics']
        self.retweet_count = metrics['retweet_count']
        self.reply_count = metrics['reply_count']
        self.like_count = metrics['like_count']
        self.quote_count = metrics['quote_count']
        self.impression_count = metrics['impression_count']
        self.text = tweet_data['text']
        self.time_created = self.parse_timestamp(tweet_data['created_at'])
        # optional fields - use auxiliary get methods
        # Parse the references once and unpack, instead of once per attribute.
        self.tweets_retweeted, self.tweets_replied_to = self.get_referenced_tweets(tweet_data)
        self.mentions = self.get_mentions(tweet_data)
        self.hashtags = self.get_hashtags(tweet_data)
        self.expanded_urls = self.get_expanded_urls(tweet_data)

    def get_referenced_tweets(self, tweet_data):
        """Return a tuple of referenced retweets and replies.

        `tweets_retweeted`: tweets this tweet retweeted
        `tweets_replied_to`: tweets this tweet replied to

        Entries lacking a ``type`` or an ``id`` are skipped individually, so
        one malformed entry no longer discards the references that follow it.
        Both lists are empty when ``referenced_tweets`` is absent.
        """
        tweets_retweeted = []  # tweets this tweet retweeted
        tweets_replied_to = []  # tweets this tweet replied to

        if 'referenced_tweets' in tweet_data:
            for tweet_dict in tweet_data['referenced_tweets']:
                if 'id' not in tweet_dict:
                    continue
                ref_type = tweet_dict.get('type')
                if ref_type == 'retweeted':
                    tweets_retweeted.append(tweet_dict['id'])
                elif ref_type == 'replied_to':
                    tweets_replied_to.append(tweet_dict['id'])
        return (tweets_retweeted, tweets_replied_to)

    def _collect_entity_field(self, tweet_data, entity_kind, field):
        """Return `field` of every `tweet_data['entities'][entity_kind]` entry ([] if absent)."""
        entities = tweet_data.get('entities', {})
        return [entry[field] for entry in entities.get(entity_kind, [])]

    def get_mentions(self, tweet_data):
        """Returns a list of mentioned user ids in the tweet content."""
        return self._collect_entity_field(tweet_data, 'mentions', 'id')

    def get_hashtags(self, tweet_data):
        """Returns a list of hashtags used in the tweet content."""
        return self._collect_entity_field(tweet_data, 'hashtags', 'tag')

    def get_expanded_urls(self, tweet_data):
        """Returns a list of expanded urls used in the tweet content."""
        return self._collect_entity_field(tweet_data, 'urls', 'expanded_url')

    def parse_timestamp(self, timestamp_str):
        """Returns timestamp string as datetime object.

        The trailing ``Z`` is matched as a literal character, so the result is
        a naive ``datetime`` (no tzinfo attached).

        Raises:
            ValueError: if `timestamp_str` does not match the format.
        """
        return datetime.strptime(timestamp_str, "%Y-%m-%dT%H:%M:%S.%fZ")

    def calculate_mention_similarity(self, tweet_j):
        """
        returns the mention (people) similarity score between two tweets defined as:
        number of common mentioned people in tweets i and j, normalised by the number of
        people involved in both tweets:
             po(i,j) = |pi ∩ pj|/|pi ∪ pj|

        Returns 0 when `tweet_j` has the same id as this tweet (self comparison)
        and when neither tweet mentions anyone (empty union).
        """
        pi = set(self.mentions)
        pj = set(tweet_j.mentions)

        if self.id == tweet_j.id:  # when it is self comparison
            return 0

        all_p = pi | pj
        if not all_p:  # nobody mentioned in either tweet: avoid dividing by zero
            return 0

        return len(pi & pj) / len(all_p)

    def __str__(self):
        """String representation of the tweet.

        Note: `impression_count` is stored on the instance but is not part of
        this representation.
        """
        return (
            f"tweet_id: {self.id} \n\t"
            f"created_at: {self.time_created} \n\t"
            f"author_id: {self.author_id} \n\t"
            f"retweet_count: {self.retweet_count} \n\t"
            f"reply_count: {self.reply_count} \n\t"
            f"like_count: {self.like_count} \n\t"
            f"quote_count: {self.quote_count} \n\t"
            f"text: {self.text} \n\t"
            f"tweets retweeted: {self.tweets_retweeted} \n\t"
            f"tweets replied to: {self.tweets_replied_to} \n\t"
            f"mentioned users: {self.mentions} \n\t"
            f"hashtags: {self.hashtags} \n\t"
            f"urls: {self.expanded_urls} \n\t"
        )

In [128]:
class TweetColl:
    """Collection of tweets, keyed by tweet id."""

    def __init__(self):
        # maps tweet id -> the tweet object passed to add_tweet
        self.tweets = dict()

    def add_tweet(self, tweet):
        """Store `tweet` under its `id`; an existing entry with that id is replaced."""
        self.tweets[tweet.id] = tweet

    def get_tweet(self, tweet_id):
        """Look up a stored tweet by id (raises KeyError when the id is unknown)."""
        return self.tweets[tweet_id]

    def __iter__(self):
        """Iterate over the stored tweets as (id, tweet) tuples, in insertion order."""
        return iter(self.tweets.items())

    def compute_pairwise_mention_similarity(self):
        """Build the tweet-by-tweet mention-similarity table.

        Each pair is scored once through the row tweet's
        `calculate_mention_similarity` and mirrored across the diagonal.

        Returns:
            A pandas DataFrame whose row and column labels are the tweet ids.
        """
        ids = list(self.tweets)
        size = len(ids)
        scores = np.zeros((size, size))

        # Only walk the upper triangle (diagonal included); mirror each score.
        for row in range(size):
            row_tweet = self.tweets[ids[row]]
            for col in range(row, size):
                score = row_tweet.calculate_mention_similarity(self.tweets[ids[col]])
                scores[row, col] = score
                scores[col, row] = score

        # label rows and columns with tweet ids instead of positional indices
        return pd.DataFrame(scores, index=ids, columns=ids)

In [129]:
# read JSON data: each non-blank line of the file is decoded as one tweet object
dfile = 'sample_tweet.json'

tweetsColl = TweetColl()
# Explicit UTF-8 so non-ASCII tweet text decodes the same way on every platform
# (the default encoding of open() depends on the locale).
with open(dfile, "r", encoding="utf-8") as json_file:
    for line in json_file:
        if not line.strip():
            # tolerate blank/trailing lines instead of failing in json.loads
            continue
        tweet_data = json.loads(line)
        tweet = Tweet(tweet_data)
        tweetsColl.add_tweet(tweet)

In [130]:
# Print every stored tweet under its id to eyeball the parsed attributes.
for id_, tweet in tweetsColl.tweets.items():
    print(f"{id_}:\n\t{tweet}")

549724391505141760:
	tweet_id: 549724391505141760 
	created_at: 2014-12-30 00:31:22 
	author_id: 1325481396 
	retweet_count: 0 
	reply_count: 0 
	like_count: 0 
	quote_count: 0 
	text: @BarackObama 1: Hydrogen FuelCells for Transportation-Homes,Solar-Wind-Biogas for Water Desalination-Homes-Towns-Farms. GMO Labeling:Exports 
	tweets retweeted: [] 
	tweets replied to: ['549722444064653312'] 
	mentioned users: ['813286'] 
	hashtags: [] 
	urls: [] 
	
549232209048403969:
	tweet_id: 549232209048403969 
	created_at: 2014-12-28 15:55:37 
	author_id: 220189825 
	retweet_count: 0 
	reply_count: 0 
	like_count: 1 
	quote_count: 0 
	text: After a hiatus, ready to start the blog again in Jan with more on a hydrogen hot air balloon, responsible investing, and (new) coal exports. 
	tweets retweeted: [] 
	tweets replied to: [] 
	mentioned users: [] 
	hashtags: [] 
	urls: [] 
	
545345919747903488:
	tweet_id: 545345919747903488 
	created_at: 2014-12-17 22:32:53 
	author_id: 1325481396 
	retweet_count: 

### Test Tweet Similarity

In [131]:
# Build the symmetric tweet-by-tweet mention-similarity table
# (rows and columns are labelled with tweet ids).
mention_sim = tweetsColl.compute_pairwise_mention_similarity()

# Bare last expression so the notebook renders the DataFrame.
mention_sim

Unnamed: 0,549724391505141760,549232209048403969,545345919747903488,544586893065977857,538452154823217152,537370296760938497,537353118233927680,527247306823725058,521083320944586752,521075583648669696,...,486381680216014848,486378858061365251,486377539552546819,486375944546816001,484279339081826304,466380682965811200,465547403559329792,453654384526626816,435502638583779328,428936927832596480
549724391505141760,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
549232209048403969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
545345919747903488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
544586893065977857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538452154823217152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466380682965811200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
465547403559329792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
453654384526626816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
435502638583779328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [144]:
# Keep only the rows (tweets) whose mention similarity with tweet
# 521075583648669696 is greater than zero.
mention_sim.loc[mention_sim['521075583648669696'].gt(0)]

Unnamed: 0,549724391505141760,549232209048403969,545345919747903488,544586893065977857,538452154823217152,537370296760938497,537353118233927680,527247306823725058,521083320944586752,521075583648669696,...,486381680216014848,486378858061365251,486377539552546819,486375944546816001,484279339081826304,466380682965811200,465547403559329792,453654384526626816,435502638583779328,428936927832596480
527247306823725058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
521083320944586752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [145]:
# Inspect the tweet used as the filter column above.
print(tweetsColl.get_tweet('521075583648669696'))

tweet_id: 521075583648669696 
	created_at: 2014-10-11 23:11:14 
	author_id: 2599145570 
	retweet_count: 2 
	reply_count: 0 
	like_count: 1 
	quote_count: 0 
	text: Compact, safe #hydrogen storage. Clean energy exports from #ausbiz! Support us on @Indiegogo http://t.co/WTu2VLtdVi http://t.co/APY0LMAKx7 
	tweets retweeted: [] 
	tweets replied to: [] 
	mentioned users: ['34732474'] 
	hashtags: ['hydrogen', 'ausbiz'] 
	urls: ['http://igg.me/at/EnergyHproject/x', 'https://twitter.com/energyhproject/status/521075583648669696/photo/1'] 
	


In [147]:
# Inspect one of the tweets that shares a mention with 521075583648669696.
print(tweetsColl.get_tweet('527247306823725058'))

tweet_id: 527247306823725058 
	created_at: 2014-10-28 23:55:28 
	author_id: 1277133248 
	retweet_count: 2 
	reply_count: 0 
	like_count: 0 
	quote_count: 0 
	text: RT @energyhproject: Compact, safe #hydrogen storage. Clean energy exports from #ausbiz! Support us on @Indiegogo http://t.co/WTu2VLtdVi htt… 
	tweets retweeted: ['521075583648669696'] 
	tweets replied to: [] 
	mentioned users: ['2599145570', '34732474'] 
	hashtags: ['hydrogen', 'ausbiz'] 
	urls: ['http://igg.me/at/EnergyHproject/x'] 
	
