In [33]:
import pandas as pd
import numpy as np
import nltk
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score



#We are going to use word vectorization to embed the words
import gensim

In [2]:
tweets = pd.read_csv('cleaned_tweets_dataset.csv')

# Preprocessing the tweets
Here, we shall do things like change common short hands into full form (in order to deal with the issue of punctuation removal)


In [3]:
#First, we convert words like can't, won't etc into cannot and will not
short_form_dict = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have",}
    
    



In [4]:
tweets.head()

Unnamed: 0,id,uncleaned_text,account_type,emoji,emoji_text,text,url,tags
0,7.87406e+17,"We can't extract the\n""Avenge"" from\n""Scavenge...",bot,{},[],we can t extract the avenge from scavengers,https://t.co/FpAstfF67l,[]
1,7.87406e+17,"We can't get the\n""Other"" out of\n""Geothermal""...",bot,{},[],we can t get the other out of geothermal,https://t.co/pw6xYLUph5,[]
2,7.87406e+17,"You can't spell\n""Panoply""\nwithout ""Ply"" http...",bot,{},[],you can t spell panoply without ply,https://t.co/XYsPhTkcDX,[]
3,7.87406e+17,"Put the ""Myrrh""\nin ""Commercial"" https://t.co/...",bot,{},[],put the myrrh in commercial,https://t.co/yyadIxV48k,[]
4,7.87406e+17,"We need ""Dying""\nto spell ""Muddying"" https://t...",bot,{},[],we need dying to spell muddying,https://t.co/WJxZL1TkfM,[]


In [5]:
#Things to keep: Capitalisation of words, punctuation, emojis and their positions

In [6]:
#Preprocessing steps (ie cleaning of text and including important information like emojis)

# Train test split, model training and word embedding

In [7]:
#create the X and y classes
tweets['isBot'] = 0
tweets.loc[tweets['account_type'] == 'bot', 'isBot'] = 1
y = tweets.isBot
X = tweets[['text']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [8]:
#first, just embed the cleaned tweets and see the results

#store the cleaned tweets in a list of lists representing the split
X_train_sentence = [d.split() for d in X_train.text.tolist()]


In [9]:
#create the word to vector
#try a 100 dimensional vector first
dim = 100
#Fit a Word2Vec model on our dataset
w2v = gensim.models.Word2Vec(sentences = X_train_sentence, vector_size = dim, window = 10, min_count = 1)


In [10]:
w2v.wv.most_similar('bad')

[('damn', 0.9481765031814575),
 ('man', 0.9414718151092529),
 ('honestly', 0.9409035444259644),
 ('yeah', 0.9401690363883972),
 ('shit', 0.9393621683120728),
 ('sad', 0.9388050436973572),
 ('exactly', 0.938416063785553),
 ('funny', 0.9370057582855225),
 ('lol', 0.9318917989730835),
 ('oh', 0.9297439455986023)]

In [11]:
len(w2v.wv.key_to_index)

50728

In [12]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_sentence)

In [13]:
tokenizer.word_index

{'the': 1,
 'to': 2,
 'a': 3,
 'i': 4,
 'and': 5,
 'you': 6,
 'of': 7,
 'in': 8,
 'for': 9,
 'is': 10,
 'it': 11,
 's': 12,
 'this': 13,
 'on': 14,
 'my': 15,
 'that': 16,
 'with': 17,
 't': 18,
 'be': 19,
 'we': 20,
 'at': 21,
 'your': 22,
 'are': 23,
 'me': 24,
 'have': 25,
 'so': 26,
 'all': 27,
 'was': 28,
 'can': 29,
 'just': 30,
 'but': 31,
 'from': 32,
 'not': 33,
 'out': 34,
 'what': 35,
 'by': 36,
 'one': 37,
 'they': 38,
 'm': 39,
 'our': 40,
 'as': 41,
 'more': 42,
 'get': 43,
 'up': 44,
 'if': 45,
 'like': 46,
 'about': 47,
 'he': 48,
 'do': 49,
 'an': 50,
 'new': 51,
 'will': 52,
 'now': 53,
 'love': 54,
 'how': 55,
 'time': 56,
 'when': 57,
 'who': 58,
 'or': 59,
 'there': 60,
 'no': 61,
 'day': 62,
 'has': 63,
 'today': 64,
 'thank': 65,
 'people': 66,
 'here': 67,
 're': 68,
 'don': 69,
 'see': 70,
 'been': 71,
 'good': 72,
 'us': 73,
 'know': 74,
 'their': 75,
 'his': 76,
 'back': 77,
 'great': 78,
 'would': 79,
 'go': 80,
 'some': 81,
 've': 82,
 'her': 83,
 'she': 84

In [14]:
#tokenize the words (this is for padding later)
X_train_tokens = tokenizer.texts_to_sequences(X_train_sentence)

In [15]:
#Since twitter only allows 280 characters, we shall keep the max length to
#the greater of 100 or whatever maximum length is found in the dataset
max_length = max(100, max(len(x) for x in X_train_tokens))


In [16]:
#Next, we can pad the sequences
X_train_tokens = pad_sequences(X_train_tokens, maxlen = max_length)

In [17]:
X_train_tokens.shape

(42947, 100)

In [18]:
vocabulary_size = len(tokenizer.word_index) + 1
vocab = tokenizer.word_index

In [19]:
#create the embedding matrix according to our word embeddings
embedding_mat = np.zeros((vocabulary_size, dim))
num_words = 0
#insert the word embeddings into our initialised object
for word, token in tokenizer.word_index.items():

    #get the corresponding vector to the word and the token
    #if the word is present in the dictionary, then we append the corresponding matrix else we continue
    if word in w2v.wv:
        vector = w2v.wv[word]
        embedding_mat[token] = vector
        num_words +=1
        



In [20]:
print(num_words, vocabulary_size)

50728 50729


In [21]:
w2v.wv['i']

array([-2.3992848 ,  3.1841776 ,  1.2062366 , -1.9689982 ,  1.051914  ,
       -2.9550207 ,  2.6452806 ,  3.466314  ,  1.2474681 , -0.6973632 ,
        1.2842598 , -1.6237582 ,  2.0778978 , -0.04110769,  0.43601024,
       -1.6178536 ,  2.0111864 , -0.5543657 , -1.0898633 , -0.5951887 ,
        1.119117  , -1.5618718 ,  1.6210287 ,  1.0846449 , -2.7858665 ,
        0.43388546,  2.809095  , -0.5093762 , -1.0577548 ,  0.21375081,
        2.579838  , -1.1874669 ,  1.3204618 , -0.24106614, -0.15340355,
        1.8027278 ,  2.8504913 , -0.7817102 , -1.4898494 ,  1.1824074 ,
        2.1488767 ,  0.7653103 ,  0.7508155 , -0.62070227,  2.2415252 ,
        0.09766399,  0.3771947 ,  0.19460534,  0.569336  , -1.0239731 ,
        0.18750551, -1.0761305 , -0.42708784, -2.7616012 , -1.3874998 ,
        2.8107812 , -0.22384486,  1.0448225 , -0.04655965, -0.5599327 ,
        1.3539746 ,  0.50677794,  1.5367647 ,  1.572473  ,  0.73857605,
       -1.3069929 ,  0.08728168,  2.0257118 , -2.5137622 ,  0.96

In [22]:
#Since the dataset is an imbalanced dataset, we shall assign higher weights for the bot examples
#the weights assigned to the classes shall be a reciprocal of the frequencies
class_weights = {0:1.334,1:5} 

In [23]:

#creating the neural network, starting with a relatively simple architechture
model = Sequential()
#create the embedding layer for the conversion of inputs to embedded vectors
model.add(Embedding(vocabulary_size, output_dim = dim, weights = [embedding_mat], input_length = max_length, trainable = False))
#All these LSTMS run for a 100 "timesteps" as determined by the max length parameter the units is the determine the outputsize
model.add(LSTM(units = 256, recurrent_dropout=0.3))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

In [24]:
model.fit(X_train_tokens, y_train,  epochs = 8, class_weight = class_weights)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x29a03de9730>

In [25]:
#applying the preprocessing steps on the test set and seeing the performance
X_test_sentence = [d.split() for d in X_test.text.tolist()]
X_test_tokens = tokenizer.texts_to_sequences(X_test_sentence)

In [26]:
X_test_tokens = pad_sequences(X_test_tokens, maxlen = max_length)

In [27]:
y_pred = np.where(model.predict(X_test_tokens) > 0.5, 1,0)



In [28]:
y_pred = y_pred.reshape((y_pred.shape[0],))

In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.72      0.78      8527
           1       0.34      0.55      0.42      2210

    accuracy                           0.69     10737
   macro avg       0.60      0.64      0.60     10737
weighted avg       0.75      0.69      0.71     10737



In [30]:
y_pred_train = model.predict(X_train_tokens)



In [31]:
y_pred_train = np.where(y_pred_train > 0.5, 1,0)
y_pred_train = y_pred_train.reshape((y_pred_train.shape[0],))

In [32]:
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.91      0.76      0.83     34385
           1       0.42      0.70      0.52      8562

    accuracy                           0.75     42947
   macro avg       0.66      0.73      0.68     42947
weighted avg       0.81      0.75      0.77     42947

