In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

In [2]:
def create_digital_dna_from_tweets(tweets_df):
    '''Build one digital DNA string per user id from posting behaviour.

    Every row of ``tweets_df`` is mapped to a letter: ``C`` for a retweet,
    ``T`` for a reply and ``A`` for a plain tweet (a row that is both a
    retweet and a reply counts as a retweet). The letters of each account are
    concatenated in timestamp order; every letter is preceded by one space,
    e.g. ``' A C T'``.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Must contain the columns ``user_id``, ``timestamp``,
        ``retweeted_status_id`` and ``in_reply_to_status_id``. A status id of
        ``0`` or a missing value (NaN) means "not a retweet" / "not a reply".
        The frame is left unmodified.

    Returns
    -------
    pandas.Series
        Series named ``DNA``, indexed by ``user_id``, holding the DNA string
        of each account.
    '''
    retweet_ids = tweets_df['retweeted_status_id']
    reply_ids = tweets_df['in_reply_to_status_id']

    # NaN != 0 evaluates to True, so missing ids must be excluded explicitly;
    # otherwise rows without an id would be classified as retweets/replies.
    is_retweet = retweet_ids.notna() & (retweet_ids != 0)
    is_reply = reply_ids.notna() & (reply_ids != 0)

    # Work on a copy so the caller's frame does not gain helper columns.
    dna_df = tweets_df[['user_id', 'timestamp']].copy()

    # DNA alphabet for tweet (A), retweet (C) and reply (T).
    dna_df['DNA'] = np.where(is_retweet, ' C', np.where(is_reply, ' T', ' A'))

    # Stable sort: tweets sharing a timestamp keep their input order, which
    # makes the resulting strings reproducible between runs.
    dna_df = dna_df.sort_values(by=['timestamp'], kind='mergesort')

    # Create digital DNA string for each account.
    return dna_df.groupby(by=['user_id'])['DNA'].agg(''.join)

In [3]:
# Load the raw tweets table from the path relative to the notebook.
tweets_csv_path = 'dataset/tweets.csv'
gen_tweets = pd.read_csv(tweets_csv_path)

In [4]:
# One DNA string per account, converted to a plain numpy array of strings.
dna_per_user = create_digital_dna_from_tweets(gen_tweets)
tweets_dna = dna_per_user.to_numpy()

In [5]:
"""
This part generates the vocabulary which will be used to
convert the text into indices. The data can then be converted to
matrices and fed to the learning models.
The sequence length is set by checking the maximum length of a string in the
dataset, so that we do not lose any detail that would help in
converting the representation back to the original text.
"""
# Vocabulary budget and fixed output length of the text vectorizer.
# NOTE: the constant name MAX_SEQUENCE_LENGHT is misspelled; it is kept as is
# because it is a module-level name.
VOCAB_SIZE, MAX_SEQUENCE_LENGHT = 5, 3500

vectorizer_options = dict(
    max_tokens=VOCAB_SIZE,
    output_sequence_length=MAX_SEQUENCE_LENGHT,
)
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(**vectorizer_options)

# Learn the vocabulary from the DNA strings.
encoder.adapt(tweets_dna)

In [6]:
"""
Check that the generated tokens, which will be used
as the terms for the strings, are correct.
"""
# Keep the learned vocabulary as an array so it can be indexed with token ids.
vocabulary_terms = encoder.get_vocabulary()
vocab = np.asarray(vocabulary_terms)
print(vocab)

['' '[UNK]' 'a' 'c' 't']


In [15]:
"""
Check that the original and the processed data match,
because the presence of unknown chunks would make it
impossible to convert the data back to text.
"""
# Vectorize only the rows that are inspected instead of the whole dataset.
# The encoder pads/truncates every row to its fixed output length, so the
# first three encoded rows are the same either way.
encoded_example = encoder(tweets_dna[:3]).numpy()

# Print the first original string next to its decoded token sequence.
for n in range(1):
    print("Original: ", tweets_dna[n])
    print("Processed: ", " ".join(vocab[encoded_example[n]]))
    print()

Original:   A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A T A A A A A A T T A A A T A T A A A T A A A T A A T A T T A T T A A A A A A A A A A A A A A A A A A A A T A A A A A A T A A A A A A A A A A A A A A A A T A A T A T A A A A A A T A A A A A A A A A A A A A A A A A T A A T T A A A A A T T A A T T T A A A T A A A A A T A T A A A A A A A A A A T T A T A T T T A T A A A T A T A A A T A A A A A A A T A A A A A T T A A T A A A A T A A A A T A A A A T T A T T T T A T T T T A A A A T T A A A T A T T A T A A A A A T T A A A T T A A T T A T T T A T T A A T T A A T T T T T A T A A T T T A A T T A T T T T A A T T A A T A A T T T T T T A A A A T A T A T T A T A T T A A A A A A A T T T T A T A T A A T A A A A T T T T T T T T T T T T T T T T T T T A T A T T A T A T T T T T A T T A A A A T A A T T A A T A A T A A T A A A T A A A A A A T T A A A A A T T T A C A T C A A C A A A C T A T A C T C A T C T A T T A A A A A T A C T T C A A A C C A A C C C C A A T A A A T A A A A 

In [16]:
# Encoder network: embedding -> bidirectional LSTM -> dense -> (mu, sigma).
# The sequence length is read from a single vectorized row rather than from
# the whole dataset, and the vocabulary size is looked up once.
sequence_length     = encoder(tweets_dna[:1]).shape[1]
vocab_size          = len(encoder.get_vocabulary())

encoder_input       = tf.keras.layers.Input(shape=(sequence_length, ))
# mask_zero=True: token id 0 is treated as padding by the downstream LSTM.
embedding_layer     = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64, mask_zero=True)(encoder_input)
LTSM_layer          = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(embedding_layer)
dense_layer         = tf.keras.layers.Dense(64, activation='relu')(LTSM_layer)
# The latent space has one dimension per vocabulary entry.
text_mu             = tf.keras.layers.Dense(vocab_size, name='text_latent_mu')(dense_layer)
text_sigma          = tf.keras.layers.Dense(vocab_size, name='text_latent_sigma')(dense_layer)

In [17]:
# Define sampling with the reparameterization trick
def sample_z(args):
    """Draw a latent sample ``mu + exp(sigma / 2) * eps``.

    ``eps`` is random normal noise with the same (batch, dim) shape as ``mu``.
    Because the code takes ``exp(sigma / 2)``, ``sigma`` is effectively
    treated as a log-variance.

    Parameters
    ----------
    args : sequence of two tensors
        ``(mu, sigma)`` as passed in by the Lambda layer.

    Returns
    -------
    Tensor with the same shape as ``mu``.
    """
    mu, sigma = args
    # Use the tensors handed to the layer, not the module-level
    # text_mu / text_sigma, so the function works for any inputs.
    batch = tf.keras.backend.shape(mu)[0]
    dim = tf.keras.backend.int_shape(mu)[1]
    eps = tf.keras.backend.random_normal(shape=(batch, dim))
    return mu + tf.keras.backend.exp(sigma / 2) * eps

# Wrap the sampling function in a layer and apply it to (mu, sigma).
latent_dim = len(encoder.get_vocabulary())
sampling_layer = tf.keras.layers.Lambda(sample_z, output_shape=(latent_dim, ), name='z')
z = sampling_layer([text_mu, text_sigma])

In [18]:
# Encoder model: maps an encoded sequence to mu, sigma and the sampled z.
encoder_outputs = [text_mu, text_sigma, z]
model = tf.keras.Model(
    inputs=encoder_input,
    outputs=encoder_outputs,
    name='Text Compression Model',
)

In [19]:
# Print the layer-by-layer summary of the encoder model.
model.summary()

Model: "Text Compression Model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 3500)]       0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 3500, 64)     320         input_5[0][0]                    
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) (None, 128)          66048       embedding_4[0][0]                
__________________________________________________________________________________________________
dense_8 (Dense)                 (None, 64)           8256        bidirectional_4[0][0]            
_____________________________________________________________________________

In [None]:
# Compile the encoder model.
# NOTE(review): `model` has three outputs (mu, sigma, z); confirm that a
# binary cross-entropy loss with an accuracy metric is what is intended here.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Sequential model that takes raw strings: the text vectorizer is its first
# layer, followed by embedding, bidirectional LSTM and two dense layers.
classifier_layers = [
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True,
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1),
]
model_2 = tf.keras.Sequential(classifier_layers)

In [None]:
# Print the layer-by-layer summary of the sequential model.
model_2.summary()

In [None]:
# Show, layer by layer, whether masking is supported.
masking_support = [layer.supports_masking for layer in model_2.layers]
print(masking_support)

In [None]:
# Predict on a single sample string, without any extra padding row.

sample_tweet = tweets_dna[0:1]

predictions = model_2.predict(sample_tweet)
first_prediction = predictions[0]
print(first_prediction)

In [None]:
# Predict on a sample text with padding: the same sample is batched together
# with a much longer string, so it gets padded inside the batch.

padding = "a " * 2000
# Use model_2: it contains the text-vectorization layer and therefore accepts
# raw strings, whereas `model` expects already-encoded integer sequences.
predictions = model_2.predict(np.array([sample_tweet[0], padding]))
print(predictions[0])

In [None]:
# Compile the sequential model; its last layer is Dense(1) without an
# activation, hence from_logits=True.
model_2.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# -------------------------------------------------------------------------------------------------------------

In [None]:
import pandas as pd

In [None]:
# Scratch value: a (1, 11) shape tuple, echoed as the cell output.
input_shape = (1, 11)
input_shape

In [None]:
# Scratch data: a four-element Series used to try out reshaping and
# DataFrame conversion.
a = pd.Series([1,2,3,4])

In [None]:
# View the Series' underlying array as a 2x2 matrix.
reshaped = a.to_numpy().reshape(2, 2)
print(reshaped)

In [None]:
# Wrap the Series in a single-column DataFrame.
b = pd.DataFrame(a)

In [None]:
# Show the type of the array backing the DataFrame.
values_type = type(b.to_numpy())
print(values_type)

***

In [None]:
"""
#TODO
Get a word_dict so that all of the sequences are converted using the same pattern,
so that the conversion back to data gets easier if necessary.
This cell is the starter code to convert 'url', 'description' and 'tweets_dna'
to a consistent sequence of numbers.
"""

# `Tokenizer` was never imported in this notebook; reference it through the
# already-imported `tf` namespace so the cell runs on a fresh kernel.
# char_level=True makes every character (DNA letter) its own token.
t  = tf.keras.preprocessing.text.Tokenizer(char_level=True)
fit_text = 'CTCCCAACACTCCACCCACCAAAAATCATACAATATTATAAACCAT'
# The tokenizer API expects a list of texts. Passing a bare string would
# treat every character as a separate document.
t.fit_on_texts([fit_text])
test_text = 'CTCCCAACACTCCACCCACCAAAAATCATACAATATTATAAACCAT'
# Result: a list holding one index sequence for the one input text.
sequences = t.texts_to_sequences([test_text])

print("word_index : ",t.word_index)
print("sequences : ",sequences,'\n')