In [1]:
import sys
import time
import random

In [2]:
import keras
from keras.layers import Conv2D, Conv2DTranspose, Input, Flatten, Dense, Lambda, Reshape
from keras.layers import BatchNormalization
from keras.models import Model
from keras.datasets import mnist
from keras.losses import binary_crossentropy
from keras import backend as K

In [3]:
import numpy as np
import pandas as pd

In [4]:
import matplotlib.pyplot as plt

***

In [5]:
# Genuine users and tweets
gen_users = pd.read_csv('dataset/users.csv')
gen_tweets = pd.read_csv('dataset/tweets.csv')

In [6]:
gen_tweets

Unnamed: 0,id,text,source,user_id,truncated,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,retweeted_status_id,geo,...,favorited,retweeted,possibly_sensitive,num_hashtags,num_urls,num_mentions,created_at,timestamp,crawled_at,updated
0,594073273480130560,How Randolph Hodgson and Neals Yard Dairy gave...,"<a href=""http://www.apple.com"" rel=""nofollow"">...",887281,,0,0,,0,,...,,,,0,1,0,Fri May 01 09:38:00 +0000 2015,2015-05-01 11:38:00,2015-05-01 12:59:53,2015-05-01 12:59:53
1,594066507723833345,“Twitter’s multi-billion dollar mistake happen...,"<a href=""http://www.apple.com"" rel=""nofollow"">...",887281,,0,0,,0,,...,,,,0,1,1,Fri May 01 09:11:07 +0000 2015,2015-05-01 11:11:07,2015-05-01 12:59:53,2015-05-01 12:59:53
2,593739179655323649,The evolution of advertising in the legal sect...,"<a href=""http://twitter.com/download/iphone"" r...",887281,,0,0,,0,,...,,,,0,1,0,Thu Apr 30 11:30:26 +0000 2015,2015-04-30 13:30:26,2015-05-01 12:59:53,2015-05-01 12:59:53
3,593737857149345792,RT @rorysutherland: Plan Bee - http://t.co/030...,"<a href=""http://twitter.com/download/iphone"" r...",887281,,0,0,,593693743548649472,,...,,,,0,1,4,Thu Apr 30 11:25:10 +0000 2015,2015-04-30 13:25:10,2015-05-01 12:59:53,2015-05-01 12:59:53
4,593282967134466051,RT @davewiner: Some say the Other Internet is ...,"<a href=""http://twitter.com/download/iphone"" r...",887281,,0,0,,593213234737446912,,...,,,,0,1,1,Wed Apr 29 05:17:36 +0000 2015,2015-04-29 07:17:36,2015-05-01 12:59:53,2015-05-01 12:59:53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248528,573687545311162368,RT @FreestyIeRaps: They killed this hoe,"<a href=""http://twitter.com/download/iphone"" r...",3073742769,,0,0,,570307641035100161,,...,,,,0,0,1,Fri Mar 06 03:32:24 +0000 2015,2015-03-06 04:32:24,2015-05-02 01:41:44,2015-05-02 01:41:44
248529,573687532149424128,"RT @QuiNigga_: ""@SavageBars: HE KILLED IT AGAIN","<a href=""http://twitter.com/download/iphone"" r...",3073742769,,0,0,,572641966216884225,,...,,,,0,0,2,Fri Mar 06 03:32:20 +0000 2015,2015-03-06 04:32:20,2015-05-02 01:41:44,2015-05-02 01:41:44
248530,573687515342835712,RT @FreestyIeRaps: HE KILLED IT,"<a href=""http://twitter.com/download/iphone"" r...",3073742769,,0,0,,572622978447503360,,...,,,,0,0,2,Fri Mar 06 03:32:16 +0000 2015,2015-03-06 04:32:16,2015-05-02 01:41:44,2015-05-02 01:41:44
248531,573659786216079360,RT @DreecNation: S/o to my new follower @QuiNi...,"<a href=""http://twitter.com/download/iphone"" r...",3073742769,,0,0,,573651460103106560,,...,,,,0,0,2,Fri Mar 06 01:42:05 +0000 2015,2015-03-06 02:42:05,2015-05-02 01:41:44,2015-05-02 01:41:44


In [7]:
def create_digital_dna_from_profile(users_df):
    df = users_df
    
    # Applying necessary replacements
    # D - Description is available, E - Description not available
    # U - URL is available, V - URL is not available
    df['description'] = np.where(pd.isnull(users_df['description']) == True, "E", "D")
    df['url'] = np.where(pd.isnull(users_df['url']) == True, "V", "U")
    
    # Changed user data
    return df

def create_digital_dna_from_tweets(tweets_df):
    '''For each user id in tweets_df return a digital DNA string based on posting behaviour.'''
    
    # Add columns for counts of tweets, replies and retweets.
    tweets_df['num_retweets'] = np.where(tweets_df['retweeted_status_id'] == 0, 0, 1)
    tweets_df['num_replies'] = np.where(tweets_df['in_reply_to_status_id'] == 0, 0, 1)
    tweets_df['num_tweets'] = np.where((tweets_df['num_retweets'] == 0) & (tweets_df['num_replies'] == 0), 1, 0)

    # DNA alphabet for tweet (A), retweet (C) and reply (T).
    tweets = tweets_df['num_tweets'] == 1
    retweets = tweets_df['num_retweets'] == 1
    replies = tweets_df['num_replies'] == 1

    tweets_df.loc[:, 'DNA'] = np.where(retweets, 'C', np.where(replies, 'T', 'A'))

    # Sort tweets by timestamp.
    tweets_df = tweets_df[['user_id', 'timestamp', 'DNA']]
    tweets_df = tweets_df.sort_values(by=['timestamp'])

    # Create digital DNA string for account.
    dna = tweets_df.groupby(by=['user_id'])['DNA'].agg(lambda x: ''.join(x))
    
    return dna

In [8]:
print('Users shape:', gen_users.shape)
print('Tweets shape:', gen_tweets.shape)

Users shape: (100, 42)
Tweets shape: (248533, 25)


In [9]:
# Filtering data which we need
filtered_user_data = gen_users.filter(["id", "statuses_count", "followers_count",
                                      "friends_count", "favourites_count", "listed_count",
                                      "url", "description", "timestamp", "updated"])

In [10]:
# Processing user data and tweets of users
processed_user_data = create_digital_dna_from_profile(filtered_user_data)
processed_user_data['tweets_dna'] = ""
processed_tweets = create_digital_dna_from_tweets(gen_tweets)

print('(Processed) Users shape:', processed_user_data.shape)
print('(Processed) Tweets shape:', processed_tweets.shape)

# Compile user data with dna tweets
for i in range(processed_user_data.shape[0]):
    user_id = processed_user_data['id'][i]
    dna = processed_tweets[user_id]
    processed_user_data['tweets_dna'][i] = dna

(Processed) Users shape: (100, 11)
(Processed) Tweets shape: (100,)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_user_data['tweets_dna'][i] = dna


In [11]:
twitter_account = processed_user_data.filter(["statuses_count", "followers_count", "friends_count", 
                                              "favourites_count", "listed_count", "url", "description",
                                              "timestamp", "updated", "tweets_dna"])

In [12]:
input_train = twitter_account.values.reshape(twitter_account.shape[0], twitter_account.shape[1])
input_shape = (1, 10, )

batch_size = 20
no_epochs = 50
validation_split = 0.1
verbosity = 1

latent_dim = 1

In [13]:
input_train.shape

(100, 10)

## Filtered Data

In [14]:
twitter_account

Unnamed: 0,statuses_count,followers_count,friends_count,favourites_count,listed_count,url,description,timestamp,updated,tweets_dna
0,60463,568,387,15599,1,U,E,2012-02-10 11:59:44,2016-03-15 16:02:35,CCCCCCCACCCCCCAAAAAACACAAAACCCCCCCCCCACCCACCCC...
1,135,208,263,43,2,V,D,2009-09-19 10:28:16,2016-03-15 16:06:58,ACCAACACCAAAAAAAAAATTACACAAAATCTAAATAAAAAAAAAA...
2,283536,7785,424,1157,217,U,D,2009-05-03 14:17:01,2016-03-15 15:53:59,CTCCCCATACCCCTACAAAAAACACCAACAAACCCAAACTACCAAA...
3,1770,179,132,1224,0,V,E,2013-08-12 08:35:01,2016-03-15 16:02:01,CAAAACAAATAACAAAAAAAAAAACCACAAAATAAACAAAAAAAAA...
4,4790,3325,1327,235,5,U,D,2011-05-31 04:07:05,2016-03-15 16:01:50,ACAAAAAAAAAAACAAAAAAAAAAAAAAAAAAACACATCCATAAAA...
...,...,...,...,...,...,...,...,...,...,...
95,8860,178,493,715,6,V,D,2009-10-26 01:48:05,2016-03-15 16:03:12,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...
96,12502,478,353,28547,0,V,D,2013-07-24 13:11:27,2016-03-15 16:09:55,ATTCAAACTTATTCAAACAACAACCACCAATCCTCATATCATCCAT...
97,1189,369,337,250,0,V,D,2015-02-08 14:34:39,2016-03-15 16:04:18,ACCCAACCCACAACACACAACTCCAAATACTTAACCCAAAAAACAA...
98,103,26,189,40,1,V,E,2012-06-22 04:23:20,2016-03-15 16:03:16,TTTTTTTTTATTTTTTTTTTTTTATAATTATTTAATATAAAATTTT...


## Encoder

In [15]:
i       = Input(shape=input_shape, name='encoder_input')
x       = Dense(10, activation='relu')(i)
x       = BatchNormalization()(x)
mu      = Dense(latent_dim, name='latent_mu')(x)
sigma   = Dense(latent_dim, name='latent_sigma')(x)

In [16]:
# Define sampling with reparameterization trick
def sample_z(args):
  mu, sigma = args
  batch     = K.shape(mu)[0]
  dim       = K.int_shape(mu)[1]
  eps       = K.random_normal(shape=(batch, dim))
  return mu + K.exp(sigma / 2) * eps

In [17]:
# Use reparameterization trick to ensure correct gradient
z = Lambda(sample_z, output_shape=(latent_dim, ), name='z')([mu, sigma])

In [18]:
# Instantiate encoder
encoder = Model(i, [mu, sigma, z], name='encoder')
encoder.summary()

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, 1, 10)]      0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 1, 10)        110         encoder_input[0][0]              
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 1, 10)        40          dense[0][0]                      
__________________________________________________________________________________________________
latent_mu (Dense)               (None, 1, 1)         11          batch_normalization[0][0]        
____________________________________________________________________________________________

## Decoder

In [19]:
d_i   = Input(shape=(latent_dim, ), name='decoder_input')
x     = Dense(10, activation='relu')(d_i)
x     = Reshape(input_shape)(x)
o     = BatchNormalization(name='decoder_output')(x)

In [20]:
# Instantiate decoder
decoder = Model(d_i, o, name='decoder')
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
decoder_input (InputLayer)   [(None, 1)]               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                20        
_________________________________________________________________
reshape (Reshape)            (None, 1, 10)             0         
_________________________________________________________________
decoder_output (BatchNormali (None, 1, 10)             40        
Total params: 60
Trainable params: 40
Non-trainable params: 20
_________________________________________________________________


# VAE

In [21]:
# Instantiate VAE
vae_outputs = decoder(encoder(i)[2])
vae         = Model(i, vae_outputs, name='vae')
vae.summary()

Model: "vae"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_input (InputLayer)   [(None, 1, 10)]           0         
_________________________________________________________________
encoder (Functional)         [(None, 1, 1), (None, 1,  172       
_________________________________________________________________
decoder (Functional)         (None, 1, 10)             60        
Total params: 232
Trainable params: 192
Non-trainable params: 40
_________________________________________________________________


## Loss Function

In [22]:
"""
We are calcuating the binary crossentropy here.
KL Divergence and Reconstruction are the two
metrics which we are using to make prediction.
"""
def kl_reconstruction_loss(true, pred):
  # Reconstruction loss
  reconstruction_loss = binary_crossentropy(K.flatten(true), K.flatten(pred)) * input_shape[0]
  # KL divergence loss
  kl_loss = 1 + sigma - K.square(mu) - K.exp(sigma)
  kl_loss = K.sum(kl_loss, axis=-1)
  kl_loss *= -0.5
  # Total loss = 50% rec + 50% KL divergence loss
  return K.mean(reconstruction_loss + kl_loss)

## Training

In [23]:
"""
We are using the adam optimizer from Keras in combination
of our own defined loss function which is the mean of
reconstruction loss and the KL divergence.
"""
vae.compile(optimizer='adam', loss=kl_reconstruction_loss)

# Start Training
vae.fit(input_train, input_train, 
        epochs = no_epochs, 
        batch_size = batch_size, 
        validation_split = validation_split)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).