In [1]:
import sys
import time
import random
from datetime import datetime

In [2]:
import numpy as np
import pandas as pd

In [3]:
from tensorflow.keras.losses import binary_crossentropy

from keras import backend as K
import tensorflow as tf

In [4]:
tf.config.run_functions_eagerly(True)

***

In [5]:
# Genuine users and tweets
gen_users = pd.read_csv('dataset/users.csv')
gen_tweets = pd.read_csv('dataset/tweets.csv')

In [6]:
def create_digital_dna_from_profile(users_df):
    df = users_df
    
    # Applying necessary replacements
    # 1 - Description is available, 0 - Description not available
    # 1 - URL is available, 0 - URL is not available
    df['description'] = np.where(pd.isnull(users_df['description']) == True, 0, 1)
    df['url'] = np.where(pd.isnull(users_df['url']) == True, 0, 1)
    
    # Changed user data
    return df

def create_digital_dna_from_tweets(tweets_df):
    '''For each user id in tweets_df return a digital DNA string based on posting behaviour.'''
    
    # Add columns for counts of tweets, replies and retweets.
    tweets_df['num_retweets'] = np.where(tweets_df['retweeted_status_id'] == 0, 0, 1)
    tweets_df['num_replies'] = np.where(tweets_df['in_reply_to_status_id'] == 0, 0, 1)
    tweets_df['num_tweets'] = np.where((tweets_df['num_retweets'] == 0) & (tweets_df['num_replies'] == 0), 1, 0)
    
    tweets = tweets_df['num_tweets'] == 1
    retweets = tweets_df['num_retweets'] == 1
    replies = tweets_df['num_replies'] == 1

    # DNA alphabet for tweet (A), retweet (C) and reply (T).
    tweets_df.loc[:, 'DNA'] = np.where(retweets, ' C', np.where(replies, ' T', ' A'))

    # Sort tweets by timestamp.
    tweets_df = tweets_df[['user_id', 'timestamp', 'DNA']]
    tweets_df = tweets_df.sort_values(by=['timestamp'])

    # Create digital DNA string for account.
    dna = tweets_df.groupby(by=['user_id'])['DNA'].agg(lambda x: ''.join(x))
    
    return dna

In [7]:
# Assigning float valeus in fields where necessary
def time_to_number(time_str):
    return np.float32(datetime.fromisoformat(time_str).timestamp())

def number_to_float(number):
    return np.float32(number)

def time_to_float(df, column_name):
    for index, value in enumerate(df[column_name]):
        df[column_name].at[index] = time_to_number(value)

def to_float(df, column_name):
    for index, value in enumerate(df[column_name]):
        df[column_name].at[index] = number_to_float(value)

In [8]:
to_float(gen_users, "statuses_count")
to_float(gen_users, "followers_count")
to_float(gen_users, "friends_count")
to_float(gen_users, "favourites_count")
to_float(gen_users, "listed_count")

time_to_float(gen_users, "timestamp")
time_to_float(gen_users, "updated")

In [9]:
print('Users shape:', gen_users.shape)
print('Tweets shape:', gen_tweets.shape)

Users shape: (100, 42)
Tweets shape: (248533, 25)


***

# User Data - Section:

## Data Processing

In [10]:
# Filtering data which we need
filtered_user_data = gen_users.filter(["id", "statuses_count", "followers_count",
                                      "friends_count", "favourites_count", "listed_count",
                                      "url", "description", "timestamp", "updated"])

In [11]:
# Processing user data and tweets of users
processed_user_data = create_digital_dna_from_profile(filtered_user_data)
print('Users data shape:', processed_user_data.shape)

# This code can be used if we change the implementation
# and make a single model for both data types
# # Compile user data with dna tweets
# processed_user_data['tweets_dna'] = ""
# processed_tweets = create_digital_dna_from_tweets(gen_tweets)
# for i in range(processed_user_data.shape[0]):
#     user_id = processed_user_data['id'][i]
#     dna = processed_tweets[user_id]
#     processed_user_data['tweets_dna'].at[i] = dna

Users data shape: (100, 10)


In [12]:
twitter_account = processed_user_data.filter(["statuses_count", "followers_count", "friends_count", 
                                              "favourites_count", "listed_count", "url", "description",
                                              "timestamp", "updated"])

## Processed Data

In [13]:
twitter_account

Unnamed: 0,statuses_count,followers_count,friends_count,favourites_count,listed_count,url,description,timestamp,updated
0,60463,568,387,15599,1,1,0,1328857216.0,1458039808.0
1,135,208,263,43,2,0,1,1253338112.0,1458040064.0
2,283536,7785,424,1157,217,1,1,1241342208.0,1458039296.0
3,1770,179,132,1224,0,0,0,1376278528.0,1458039680.0
4,4790,3325,1327,235,5,1,1,1306796800.0,1458039680.0
...,...,...,...,...,...,...,...,...,...
95,8860,178,493,715,6,0,1,1256503680.0,1458039808.0
96,12502,478,353,28547,0,0,1,1374653440.0,1458040192.0
97,1189,369,337,250,0,0,1,1423388032.0,1458039808.0
98,103,26,189,40,1,0,0,1340321024.0,1458039808.0


***

## Parameters

In [14]:
input_train      = twitter_account.values.reshape(twitter_account.shape[0], twitter_account.shape[1])

batch_size       = 10
no_epochs        = 10
validation_split = 0.1
verbosity        = 1

latent_dim       = 9

In [15]:
input_train.shape

(100, 9)

***

## Model

In [16]:
"""
Dropping tweets data because this model will be trianed just on the user data.
Converting all the other fields to float64 then giving it the shape for input.
The final variable contains the numpy array made from the dataframe.
"""
# Needed in case when we have tweets with user data too
# training_data              = twitter_account.drop(['tweets_dna'], axis = 1)
training_data              = twitter_account
training_data              = training_data.apply(pd.to_numeric)
training_data              = training_data.values.reshape(training_data.shape[0], training_data.shape[1])
features_in_feature_vector = training_data.shape[1]
input_shape                = (features_in_feature_vector, )

In [17]:
mean               = np.mean(training_data, axis=0)
standard_deviation = np.std(training_data, axis=0)
training_data      = (training_data - mean) / standard_deviation

## Encoder

In [18]:
i       = tf.keras.layers.Input(shape=input_shape, name='encoder_input')
x       = tf.keras.layers.Dense(features_in_feature_vector, activation='relu')(i)
x       = tf.keras.layers.BatchNormalization(name='encoder_output')(x)
mu      = tf.keras.layers.Dense(latent_dim, name='latent_mu')(x)
sigma   = tf.keras.layers.Dense(latent_dim, name='latent_sigma')(x)

In [19]:
# Define sampling with reparameterization trick
def sample_z(args):
    mu, sigma = args
    batch     = K.shape(mu)[0]
    dim       = K.int_shape(mu)[1]
    eps       = K.random_normal(shape=(batch, dim))
    return mu + K.exp(sigma / 2) * eps

In [20]:
# Reparameterization for correct gradient
z = tf.keras.layers.Lambda(sample_z, output_shape=(latent_dim, ), name='z')([mu, sigma])

In [21]:
# Instantiate encoder
encoder = tf.keras.Model(i, [mu, sigma, z], name='encoder')
encoder.summary()

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, 9)]          0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 9)            90          encoder_input[0][0]              
__________________________________________________________________________________________________
encoder_output (BatchNormalizat (None, 9)            36          dense[0][0]                      
__________________________________________________________________________________________________
latent_mu (Dense)               (None, 9)            90          encoder_output[0][0]             
____________________________________________________________________________________________

## Decoder

In [22]:
d_i    = tf.keras.layers.Input(shape=(latent_dim, ), name='decoder_input')
x      = tf.keras.layers.Dense(features_in_feature_vector, activation='relu')(d_i)
x      = tf.keras.layers.Reshape(input_shape)(x)
o      = tf.keras.layers.BatchNormalization(name='decoder_output')(x)

In [23]:
# Instantiate decoder
decoder = tf.keras.Model(d_i, o, name='decoder')
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
decoder_input (InputLayer)   [(None, 9)]               0         
_________________________________________________________________
dense_1 (Dense)              (None, 9)                 90        
_________________________________________________________________
reshape (Reshape)            (None, 9)                 0         
_________________________________________________________________
decoder_output (BatchNormali (None, 9)                 36        
Total params: 126
Trainable params: 108
Non-trainable params: 18
_________________________________________________________________


# VAE - User Data

In [24]:
# Instantiate VAE
vae_outputs = decoder(encoder(i)[2])
vae         = tf.keras.Model(i, vae_outputs, name='vae')
vae.summary()

Model: "vae"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_input (InputLayer)   [(None, 9)]               0         
_________________________________________________________________
encoder (Functional)         [(None, 9), (None, 9), (N 306       
_________________________________________________________________
decoder (Functional)         (None, 9)                 126       
Total params: 432
Trainable params: 396
Non-trainable params: 36
_________________________________________________________________


## Loss Function

In [25]:
"""
We are calcuating the binary crossentropy here.
KL Divergence and Reconstruction are the two
metrics which we are using to make prediction.
"""
def kl_reconstruction_loss(true, pred):
    # Reconstruction loss
    reconstruction_loss = binary_crossentropy(K.flatten(true), K.flatten(pred)) * input_shape[0]
    # KL divergence loss
    kl_loss = -0.5 * (1 + sigma - K.square(mu) - K.exp(sigma))
    kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
    # Total loss = 50% rec + 50% KL divergence loss
    return (reconstruction_loss + kl_loss)

## Training

In [26]:
"""
We are using the adam optimizer from Keras in combination
of our own defined loss function which is the mean of
reconstruction loss and the KL divergence.
"""
vae.compile(optimizer='adam', loss=kl_reconstruction_loss)

In [27]:
vae.fit(x = training_data, y = training_data, 
        epochs = no_epochs, 
        batch_size = batch_size, 
        validation_split = validation_split)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1793285ae20>

In [28]:
print('Original Data')
pd.DataFrame(training_data)

Original Data


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.265273,-0.197579,-0.174506,1.577222,-0.218345,1.333333,-2.478479,0.136333,0.266865
1,-0.498134,-0.255395,-0.289660,-0.568039,-0.211684,-0.750000,0.403473,-1.220599,1.114057
2,7.785771,0.961468,-0.140145,-0.414412,1.220414,1.333333,0.403473,-1.436142,-1.427518
3,-0.450343,-0.260052,-0.411315,-0.405172,-0.225006,-0.750000,-2.478479,0.988402,-0.156730
4,-0.362067,0.245194,0.698440,-0.541561,-0.191701,1.333333,0.403473,-0.260050,-0.156730
...,...,...,...,...,...,...,...,...,...
95,-0.243100,-0.260213,-0.076067,-0.475366,-0.185040,-0.750000,0.403473,-1.163719,0.266865
96,-0.136643,-0.212033,-0.206080,3.362825,-0.225006,-0.750000,0.403473,0.959202,1.537653
97,-0.467326,-0.229538,-0.220939,-0.539493,-0.225006,-0.750000,0.403473,1.834868,0.266865
98,-0.499070,-0.284624,-0.358381,-0.568453,-0.218345,-0.750000,-2.478479,0.342315,0.266865


In [29]:
print('Predicted Data')
pd.DataFrame(vae.predict(training_data))

Predicted Data




Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.459924,-0.361592,-0.390097,0.306006,0.186672,-0.458823,-0.414342,-0.178994,-0.346786
1,-0.370724,0.613755,0.725240,-0.393276,-0.369279,0.077663,-0.162324,-0.327061,0.040184
2,2.549719,-0.361592,-0.390097,-0.393276,0.807741,4.555582,-0.116114,4.617607,-0.387958
3,-0.220392,-0.194841,0.912972,1.014233,-0.369279,-0.458823,0.421045,-0.327061,-0.364405
4,0.051455,-0.361592,1.665366,2.095130,0.165052,-0.458823,1.559290,1.801005,0.458394
...,...,...,...,...,...,...,...,...,...
95,1.780359,0.175638,0.145170,-0.393276,-0.369279,-0.458823,0.131665,-0.327061,0.382156
96,-0.320495,-0.361592,-0.390097,-0.393276,-0.171544,-0.458823,-0.414342,0.252296,0.684358
97,-0.459924,-0.313995,0.264632,0.272935,0.024236,-0.051141,0.078057,-0.327061,-0.387958
98,1.110853,-0.361592,0.045516,-0.393276,-0.027030,1.568549,0.043900,-0.327061,1.180165


***

# Tweets Section:

## Data Processing

In [30]:
tweets_dna = create_digital_dna_from_tweets(gen_tweets).to_numpy()
print('Tweets shape:', tweets_dna.shape)

Tweets shape: (100,)


## Term Vectors

In [31]:
"""
This part is being used to generate vocabulory which will be used to
convert the text into indices. Then this data can be converted to
matrices and fed to the learning models.
Sequence legth is set by checkcing the maximum lenth of string in the
dataset so that we don't have to lose any detail which help in
converting the representation back to the original text.
"""
VOCAB_SIZE = 5
MAX_SEQUENCE_LENGHT = 3500
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE, output_sequence_length=MAX_SEQUENCE_LENGHT)
encoder.adapt(tweets_dna)

In [32]:
"""
Checking if the generated tokens are right which
will be used as terms for strigs.
"""
vocab = np.array(encoder.get_vocabulary())
print(vocab)

['' '[UNK]' 'a' 'c' 't']


In [33]:
"""
Checking if the original and the processed data
because presence of unknown chunks would make it
impossible to convert data back to text.
"""
encoded_example = encoder(tweets_dna)[:3].numpy()

for n in range(1):
    print("Original: ", tweets_dna[n])
    print("Processed: ", " ".join(vocab[encoded_example[n]]))
    print()

Original:   A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A T A A A A A A T T A A A T A T A A A T A A A T A A T A T T A T T A A A A A A A A A A A A A A A A A A A A T A A A A A A T A A A A A A A A A A A A A A A A T A A T A T A A A A A A T A A A A A A A A A A A A A A A A A T A A T T A A A A A T T A A T T T A A A T A A A A A T A T A A A A A A A A A A T T A T A T T T A T A A A T A T A A A T A A A A A A A T A A A A A T T A A T A A A A T A A A A T A A A A T T A T T T T A T T T T A A A A T T A A A T A T T A T A A A A A T T A A A T T A A T T A T T T A T T A A T T A A T T T T T A T A A T T T A A T T A T T T T A A T T A A T A A T T T T T T A A A A T A T A T T A T A T T A A A A A A A T T T T A T A T A A T A A A A T T T T T T T T T T T T T T T T T T T A T A T T A T A T T T T T A T T A A A A T A A T T A A T A A T A A T A A A T A A A A A A T T A A A A A T T T A C A T C A A C A A A C T A T A C T C A T C T A T T A A A A A T A C T T C A A A C C A A C C C C A A T A A A T A A A A 

***

## Model

In [34]:
text_input_shape                 = (encoder(tweets_dna).shape[1], )
text_input_diensions             = len(encoder.get_vocabulary())
text_embedding_output_dimensions = 64

## Encoder

In [35]:
encoder_input       = tf.keras.layers.Input(shape=text_input_shape)
embedding_layer     = tf.keras.layers.Embedding(input_dim=text_input_diensions, output_dim=text_embedding_output_dimensions, mask_zero=True)(encoder_input)
LTSM_layer          = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(text_embedding_output_dimensions))(embedding_layer)
dense_layer         = tf.keras.layers.Dense(text_embedding_output_dimensions, activation='relu')(LTSM_layer)
dense_final_layer   = tf.keras.layers.Dense(1)(dense_layer)
text_mu             = tf.keras.layers.Dense(text_input_diensions, name='text_latent_mu')(dense_final_layer)
text_sigma          = tf.keras.layers.Dense(text_input_diensions, name='text_latent_sigma')(dense_final_layer)

In [36]:
# Define sampling with reparameterization trick
def sample_z(args):
    mu, sigma = args
    batch     = tf.keras.backend.shape(text_mu)[0]
    dim       = tf.keras.backend.int_shape(text_mu)[1]
    eps       = tf.keras.backend.random_normal(shape=(batch, dim))
    return text_mu + tf.keras.backend.exp(text_sigma / 2) * eps

z = tf.keras.layers.Lambda(sample_z, output_shape=(len(encoder.get_vocabulary()), ), name='z')([text_mu, text_sigma])

In [37]:
model = tf.keras.Model(encoder_input, [text_mu, text_sigma, z], name='Text Compression Model')

In [38]:
model.summary()

Model: "Text Compression Model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 3500)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 3500, 64)     320         input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 128)          66048       embedding[0][0]                  
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 64)           8256        bidirectional[0][0]              
_____________________________________________________________________________

## Decoder

# VAE - Tweets

## Loss Function

## Training