In [1]:
import sys
import time
import random

In [2]:
import numpy as np
import pandas as pd

In [4]:
# Load the genuine-account tables in one pass:
# user profiles first, then the tweets posted by those users.
gen_users, gen_tweets = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/users.csv', 'dataset/tweets.csv')
)

In [5]:
def create_digital_dna_from_tweets(tweets_df):
    '''Return one digital DNA string per user id, encoding posting behaviour.

    Every row of ``tweets_df`` is mapped to a single letter:

    * ``'C'`` - retweet: ``retweeted_status_id`` is not 0,
    * ``'T'`` - reply: ``in_reply_to_status_id`` is not 0 (and not a retweet),
    * ``'A'`` - plain tweet: both ids are 0.

    The letters of each user are then concatenated in ascending
    ``timestamp`` order.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Must contain the columns ``user_id``, ``timestamp``,
        ``retweeted_status_id`` and ``in_reply_to_status_id``.
        The frame is only read; no columns are added to it.

    Returns
    -------
    pandas.Series
        Series named ``'DNA'``, indexed by ``user_id``, holding one DNA
        string per user.

    Notes
    -----
    A missing (NaN) id compares unequal to 0 and is therefore counted as a
    retweet / reply. NOTE(review): confirm that is the intended meaning of
    missing ids in the input data.
    '''
    # Classify rows with stand-alone masks instead of writing helper columns
    # into the caller's DataFrame.
    is_retweet = tweets_df['retweeted_status_id'] != 0
    is_reply = tweets_df['in_reply_to_status_id'] != 0

    # Retweet wins over reply when a row carries both ids.
    letters = np.where(is_retweet, 'C', np.where(is_reply, 'T', 'A'))

    # Column selection with a list yields a new frame, so assign/sort below
    # never touch the input. A stable sort keeps the incoming row order for
    # tweets that share a timestamp, making the result deterministic.
    timeline = (
        tweets_df[['user_id', 'timestamp']]
        .assign(DNA=letters)
        .sort_values(by='timestamp', kind='mergesort')
    )

    # Glue each user's letters together in chronological order.
    return timeline.groupby('user_id')['DNA'].agg(''.join)

In [6]:
# Report the (rows, columns) of both loaded tables, one line per table.
print('\n'.join(
    f'{label} {frame.shape}'
    for label, frame in (('Users shape:', gen_users), ('Tweets shape:', gen_tweets))
))

Users shape: (100, 42)
Tweets shape: (248533, 25)


In [7]:
# Build the digital DNA of every genuine account: the result is indexed by
# user_id and holds one string of A/C/T letters per user.
gen_tweets_dna = create_digital_dna_from_tweets(gen_tweets)

In [8]:
# Show the DNA string of a single account, looked up by its user_id label.
print(gen_tweets_dna.loc[14418605])

CCCCCAACAATTAAAAAAACCCCCCCAATTACTCCCCCTCCATATCCTTCCCCTCCCCACCCAACCCCCCCTACCCCCCAATCCCCCCCCACCTTTTCCATCACCCTCCCTCACTCCCCCTACCCCCTCTCCTTACCCCCCCCCAACCACCCCACAAAATTACCCATTTCCCCCCCCCCCCTCCCCCTCCCCCCCCCCAACCCCCTTCCTCCACCCCCAAACCAACCCCCCCTTCTCCCCACAACCACCCTAACATCCCCCCCTTCTCCCCCCCCCACTCCACACACCCCCCCCTTCCCCCCTTCCTTCTTCTCCTCTCACTTTCCCCCCCTCTACCCCCCCCCCCTCCTCTCTTCACATCCCCCATCTCCTCCCTCTCCCCCCCTCCCCCCCCCCCTTCCTCCCCCCCCCCCCCCCCACCACTTCCCCAACCCAACTCCTTTACCCATTTCCATCTCCCCCCCCCCCTCCACACAACCATCCTTACCCCCCCCCCCCCCCACTCCCCCATCTCCACCCCCCCTCCCCCCACCCCCCAACCTCCCCCTAATCTCCACCCCCCACCACCCCCTCCTACCCCCTTCCCCCCCCCCCCCCCCCCCCCCCCTACCCCCCCCCCCCACCCATCCCCCCCCCCACTACCACACCCCCCCTCCCAAACTTCCCCCCCCCCCCCCCCCCCACCCCCCCCTCCATCACCCTCCCATCCCCACCCATCACCCACCCCCCCCCCCCCCCACCTCCTCCCCACCCCCCCCCCTCCCCCCACCCCTCCATCCTTTCCCCCCATCCTCCACCTCCCCTTCCCAATTCCCCACCCCCCAACCCCCCCCCCCCCCCCTTTCACCCCCCCCCACTCCCCCCCCCTCATTCCCCTACCCAAAACCCCACCCACCACCCAACCCCCCCCCCCTTTTCCCCCACCACCCCCCCCCACCCCCCCTTTTAATCTCTCCCCCCCCCCCCCATTCCCCACCCCCATCCCCCCCCCCCTCTCCCC