# Trends-based recommendation
In this notebook we will classify the tweets into trends and these trends will help us decide which hashtags to recommend 

In [1]:
from files_reader import *
import tensorflow as tf

## Get the data

In [2]:
tweets_and_trends = []
tweets = []
trends = []

# tweets_and_trends += (filesReader.read_file(UK_tweets_file))
# tweets_and_trends += (filesReader.read_file(US_tweets_file))
# tweets_and_trends += (filesReader.read_file(AUS_tweets_file))
# tweets_and_trends += (filesReader.read_file(IR_tweets_file))
# tweets_and_trends += (filesReader.read_file(CAN_tweets_file))
tweets_and_trends += (filesReader.read_file(new_US_file))
tweets_and_trends += (filesReader.read_file(new_UK_file))
tweets_and_trends += (filesReader.read_file(new_AUS_file))
tweets_and_trends += (filesReader.read_file(new_CAN_file))
tweets_and_trends += (filesReader.read_file(new_IR_file))   
tweets_and_trends += (filesReader.read_file(new_SINGA_file))   
tweets_and_trends += (filesReader.read_file(new_SA_file))

random.shuffle(tweets_and_trends)

tweets, trends = filesReader.split_tweets_and_trends(tweets_and_trends)
print(f"We have {len(tweets)} tweets.")


We have 214369 tweets.


## Tokenize the text

In [3]:
from keras.preprocessing.text import Tokenizer

tweets_tokenizer = Tokenizer(oov_token="<OOV>")
tweets_tokenizer.fit_on_texts(tweets)
tweets_word_index = tweets_tokenizer.word_index
tweets_index_word = tweets_tokenizer.index_word

print(f"We have {len(tweets_word_index)} different words")
print(tweets_word_index)

We have 66632 different words


## Create the padded sequences

In [4]:
from keras.utils import pad_sequences
sequence_length = 20

tweets_sequences = tweets_tokenizer.texts_to_sequences(tweets)
tweets_sequences_padded = pad_sequences(tweets_sequences, padding="post", maxlen=sequence_length)

## Map the trends to numbers

In [5]:
trends_map = {}

counter = 0

for trend in trends:
    if not (trend in trends_map):
        trends_map[trend] = counter
        counter += 1

no_of_trends = len(trends_map)
inv_trends_map = {v: k for k, v in trends_map.items()}
print(f"We have {no_of_trends} different trends")
print(trends_map)

We have 717 different trends
{'FAYouthCup': 0, 'bucciovertimechallenge': 1, 'rufc': 2, 'MayDay': 3, 'MetGala': 4, 'AMillionLittleThings': 5, 'afldeestigers': 6, 'MentalHealthAwarenessMonth': 7, 'NHLPlayoffs': 8, 'tuesdayvibe': 9, 'TheFlash': 10, 'GoAvsGo': 11, 'Anzac Day': 12, 'LeafsForever': 13, 'emergencyalert': 14, 'SHOONGxTAEYANGxLISA': 15, 'Gigi': 16, 'NationalPhysiciansDay': 17, 'Matthew Knies': 18, 'ChangeAWordRuinAMovieQuote': 19, 'NationalLingerieDay': 20, 'Bank Holiday Monday': 21, 'NationalSuperheroDay': 22, 'Showman': 23, 'WorldPenguinDay': 24, 'HonkaiStarRail': 25, 'PAKvNZ': 26, 'forgotten80s': 27, 'wednesdaythought': 28, 'Flames': 29, 'Labour Day': 30, 'Finally Friday': 31, 'TOTMUN': 32, 'Lebron': 33, 'LetsGoOilers': 34, 'wunvsyd': 35, 'Corrie': 36, 'Dundalk': 37, 'NRLStormWarriors': 38, 'FLAvsBOS': 39, 'NRLRoostersDragons': 40, 'NationalCancerCNSDay': 41, 'Bassitt': 42, 'WednesdayWisdom': 43, 'aflcrowspies': 44, '1802LoveDefiesTime': 45, 'Buchner': 46, 'ReaTsotellaMonday

## Create the trends sequences

In [6]:
trends_sequences = [trends_map[trend] for trend in trends]
print(trends_sequences)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 10, 7, 4, 11, 12, 13, 14, 15, 16, 7, 17, 18, 19, 20, 1, 21, 22, 23, 24, 25, 26, 3, 27, 28, 29, 30, 31, 32, 7, 33, 7, 34, 6, 35, 7, 3, 3, 36, 37, 29, 38, 39, 40, 41, 42, 3, 43, 8, 44, 3, 39, 7, 26, 45, 30, 46, 47, 48, 1, 49, 50, 1, 51, 29, 9, 52, 3, 5, 6, 53, 54, 55, 26, 56, 57, 58, 28, 59, 3, 53, 60, 61, 62, 58, 28, 25, 8, 63, 2, 43, 64, 27, 1, 65, 30, 66, 49, 67, 68, 64, 69, 70, 71, 72, 61, 3, 73, 74, 26, 75, 11, 76, 77, 78, 2, 79, 80, 71, 3, 81, 14, 82, 83, 84, 85, 86, 1, 3, 87, 88, 58, 79, 89, 90, 47, 91, 92, 50, 93, 94, 95, 96, 29, 97, 3, 98, 99, 100, 26, 4, 101, 64, 73, 102, 103, 34, 104, 105, 1, 106, 26, 107, 108, 13, 30, 109, 110, 111, 86, 69, 97, 29, 54, 112, 113, 108, 6, 108, 38, 3, 114, 77, 102, 13, 54, 115, 70, 116, 117, 118, 41, 119, 107, 120, 11, 40, 121, 73, 122, 123, 48, 63, 124, 66, 125, 8, 9, 126, 127, 35, 39, 3, 54, 128, 79, 45, 121, 76, 4, 129, 100, 69, 54, 130, 29, 131, 34, 3, 6, 61, 9, 132, 39, 11, 133, 134, 135, 71, 75, 136, 137, 

## Encode the trends

In [7]:
from keras.utils import to_categorical
import tensorflow as tf

encoded_trends = to_categorical(trends_sequences)
print(encoded_trends.shape)

(214369, 717)


## Prepare the pre-trained embeddings

In [8]:
from Embeddings.embeddings_matrix import get_embeddings_matrix

embeddings_index_path = "./Embeddings/embeddings_index_object.pkl"
embeddings_matrix, hits, misses = get_embeddings_matrix(tweets_word_index, embeddings_index_path)

print(f"Hits: {hits}, Misses: {misses}")

Hits: 46755, Misses: 19877


## Split the data

In [9]:
training_split = 0.8
training_tweets_count = int(0.8 * len(tweets_sequences_padded))

In [10]:
train_data = tweets_sequences_padded[0:training_tweets_count]
train_labels = encoded_trends[0:training_tweets_count]
test_data = tweets_sequences_padded[training_tweets_count:]
test_labels = encoded_trends[training_tweets_count:]

print(f'we have {len(train_data)} tweets for training and {len(test_data)} for testing')

we have 171495 tweets for training and 42874 for testing


## Build the model

In [11]:
#hyperparameters
embedding_dimensions = 300
lstm_units = 128
dropout_value = 0.2
conv_filters = 128
conv_kernel_size = 3
dense_layers = 10000

In [12]:
from keras import initializers

no_of_tweets_words = len(tweets_word_index) + 1

trends_classifier = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        no_of_tweets_words,
        embedding_dimensions,
        input_length=sequence_length,
        embeddings_initializer=initializers.Constant(embeddings_matrix),
        trainable=True
    ),
    tf.keras.layers.Conv1D(conv_filters, conv_kernel_size),
    tf.keras.layers.AveragePooling1D(),
    tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units)),
    tf.keras.layers.Dropout(dropout_value),
    # tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(no_of_trends, activation='softmax')
])

trends_classifier.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

trends_classifier.summary()

# trends_classifier = tf.keras.models.load_model("trends_classifier")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 300)           19989900  
                                                                 
 conv1d (Conv1D)             (None, 18, 128)           115328    
                                                                 
 average_pooling1d (AverageP  (None, 9, 128)           0         
 ooling1D)                                                       
                                                                 
 dropout (Dropout)           (None, 9, 128)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 256)              263168    
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 256)               0

In [13]:
#learning rate callback
def lr_schedule(epoch):
    lr = 0.001
    if epoch > 2:
        lr = 0.0005
    return lr

lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_schedule)

In [14]:
epochs = 6
trends_classifier.fit(train_data, train_labels, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/6
Epoch 2/6

KeyboardInterrupt: 

In [None]:
trends_classifier.save("trends_classifier")

KeyboardInterrupt: 

In [24]:
import numpy as np

def predict(tweet, tweet_tokenizer, trends_map, inv_trends_map, pad_length, model):
    tweet_sequence = tweet_tokenizer.texts_to_sequences([tweet])[0]
    padded_tweet_sequence = pad_sequences([tweet_sequence], maxlen=pad_length, padding='post')
    prediction = (model.predict(padded_tweet_sequence))
    trends_indices = np.argsort(prediction, axis=-1)[0][-3:]
    return [inv_trends_map[trend_index] for trend_index in trends_indices]

In [25]:
tweet = "here is to the great leader stalin."

print(predict(tweet, tweets_tokenizer, trends_map, inv_trends_map, sequence_length, trends_classifier))

['New Month', 'MondayMood', 'MondayMotivation']
