In [8]:
import pandas as pd
import numpy as np
import nltk
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score



#We are going to use word vectorization to embed the words
import gensim

In [9]:
tweets = pd.read_csv('train.csv')

In [10]:
tweets.head()

Unnamed: 0.1,Unnamed: 0,screen_name,text,account.type,class_type
0,0,imranyebot,YEA now that note GOOD,bot,others
1,1,zawvrk,Listen to This Charming Man by The Smiths htt...,human,human
2,2,zawarbot,wish i can i would be seeing other hoes on the...,bot,others
3,3,ahadsheriffbot,The decade in the significantly easier schedul...,bot,others
4,4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn


In [11]:
tweets['account.type'].value_counts()

human    10358
bot      10354
Name: account.type, dtype: int64

# Preprocessing the tweets
Here, we shall do things like change common short hands into full form (in order to deal with the issue of punctuation removal)


In [12]:
#First, we convert words like can't, won't etc into cannot and will not
short_form_dict = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have",}
    
    



In [13]:
tweets.head()

Unnamed: 0.1,Unnamed: 0,screen_name,text,account.type,class_type
0,0,imranyebot,YEA now that note GOOD,bot,others
1,1,zawvrk,Listen to This Charming Man by The Smiths htt...,human,human
2,2,zawarbot,wish i can i would be seeing other hoes on the...,bot,others
3,3,ahadsheriffbot,The decade in the significantly easier schedul...,bot,others
4,4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn


In [14]:
#Things to keep: Capitalisation of words, punctuation, emojis and their positions

In [15]:
#Preprocessing steps (ie cleaning of text and including important information like emojis)

# Train test split, model training and word embedding

In [16]:
#create the X and y classes
tweets['isBot'] = 0
tweets.loc[tweets['account.type'] == 'bot', 'isBot'] = 1
y = tweets.isBot
X = tweets[['text']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [17]:
#first, just embed the cleaned tweets and see the results

#store the cleaned tweets in a list of lists representing the split
X_train_sentence = [d.split() for d in X_train.text.tolist()]


In [18]:
#create the word to vector
#try a 100 dimensional vector first
dim = 100
#Fit a Word2Vec model on our dataset
w2v = gensim.models.Word2Vec(sentences = X_train_sentence, vector_size = dim, window = 10, min_count = 1)


In [19]:
w2v.wv.most_similar('bad')

[('got', 0.9991912245750427),
 ('song', 0.9988420009613037),
 ('even', 0.9986646771430969),
 ('im', 0.9986466765403748),
 ('saying', 0.9985396265983582),
 ('follow', 0.9983105659484863),
 ('actually', 0.9982374906539917),
 ('little', 0.9981598854064941),
 ('tweets', 0.9980735778808594),
 ('give', 0.9980161786079407)]

In [20]:
len(w2v.wv.key_to_index)

49396

In [21]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_sentence)

In [22]:
tokenizer.word_index

{'the': 1,
 'to': 2,
 'a': 3,
 'of': 4,
 'and': 5,
 'i': 6,
 'in': 7,
 'is': 8,
 'for': 9,
 'you': 10,
 'on': 11,
 'that': 12,
 'my': 13,
 'this': 14,
 'with': 15,
 'it': 16,
 'be': 17,
 'have': 18,
 'are': 19,
 '"the': 20,
 'at': 21,
 'not': 22,
 'will': 23,
 'people': 24,
 'me': 25,
 'your': 26,
 'we': 27,
 'as': 28,
 'all': 29,
 'an': 30,
 'but': 31,
 'some': 32,
 'from': 33,
 'was': 34,
 'so': 35,
 'they': 36,
 'like': 37,
 'our': 38,
 'can': 39,
 'just': 40,
 'if': 41,
 'do': 42,
 'get': 43,
 'about': 44,
 'has': 45,
 'by': 46,
 'new': 47,
 'more': 48,
 'what': 49,
 'up': 50,
 'who': 51,
 'one': 52,
 'when': 53,
 'their': 54,
 'out': 55,
 'java': 56,
 'been': 57,
 'would': 58,
 'time': 59,
 '&amp;': 60,
 'good': 61,
 'how': 62,
 'or': 63,
 'no': 64,
 'he': 65,
 'make': 66,
 'see': 67,
 '-': 68,
 'his': 69,
 'still': 70,
 'great': 71,
 'look': 72,
 'now': 73,
 'any': 74,
 "it's": 75,
 'only': 76,
 'think': 77,
 'thank': 78,
 'going': 79,
 'than': 80,
 'because': 81,
 'need': 82,
 '

In [23]:
#tokenize the words (this is for padding later)
X_train_tokens = tokenizer.texts_to_sequences(X_train_sentence)

In [24]:
#Since twitter only allows 280 characters, we shall keep the max length to
#the greater of 100 or whatever maximum length is found in the dataset
max_length = max(100, max(len(x) for x in X_train_tokens))


In [25]:
#Next, we can pad the sequences
X_train_tokens = pad_sequences(X_train_tokens, maxlen = max_length)

In [26]:
X_train_tokens.shape

(16569, 100)

In [27]:
vocabulary_size = len(tokenizer.word_index) + 1
vocab = tokenizer.word_index

In [28]:
#create the embedding matrix according to our word embeddings
embedding_mat = np.zeros((vocabulary_size, dim))
num_words = 0
#insert the word embeddings into our initialised object
for word, token in tokenizer.word_index.items():

    #get the corresponding vector to the word and the token
    #if the word is present in the dictionary, then we append the corresponding matrix else we continue
    if word in w2v.wv:
        vector = w2v.wv[word]
        embedding_mat[token] = vector
        num_words +=1
        



In [29]:
print(num_words, vocabulary_size)

28962 43954


In [30]:
w2v.wv['i']

array([-0.83360064,  1.8637209 , -0.16292998, -1.2810354 ,  2.5605867 ,
       -2.6962118 ,  1.4038603 ,  2.54276   , -0.8279036 , -1.2101963 ,
        0.6456239 , -4.13444   , -1.699847  ,  0.20016766, -0.2736352 ,
       -0.76784545,  0.254238  , -1.8366936 ,  0.6614056 , -3.8135579 ,
        1.1256603 ,  1.2417873 ,  1.5602096 , -1.9141973 , -0.38714257,
       -0.16193832, -1.9047168 , -1.2341831 , -1.1333822 , -1.2892557 ,
        2.1744235 ,  0.0411486 ,  0.73604226, -1.2472005 , -1.1338079 ,
        2.4912446 , -0.7618581 ,  0.3052468 , -0.47535712, -2.0741103 ,
        0.44636378, -0.8069056 , -0.85886824,  1.1107172 ,  2.8274627 ,
       -0.2805508 , -0.5989201 , -0.5323028 ,  1.2754575 ,  1.50086   ,
        0.10030042, -1.0064597 ,  0.3108339 , -1.1862323 ,  0.27735576,
       -0.3447321 ,  0.82573676,  0.26811567, -1.0243837 ,  0.8139931 ,
       -0.22126296,  0.08597779,  1.2582867 ,  0.9683419 , -1.9033693 ,
        1.5906106 ,  0.16434781,  2.4334266 , -2.1853802 ,  2.00

In [31]:
#Since the dataset is an imbalanced dataset, we shall assign higher weights for the bot examples
#the weights assigned to the classes shall be a reciprocal of the frequencies
 

In [32]:

#creating the neural network, starting with a relatively simple architechture
model = Sequential()
#create the embedding layer for the conversion of inputs to embedded vectors
model.add(Embedding(vocabulary_size, output_dim = dim, weights = [embedding_mat], input_length = max_length, trainable = False))
#All these LSTMS run for a 100 "timesteps" as determined by the max length parameter the units is the determine the outputsize
model.add(LSTM(units = 256, recurrent_dropout=0.3))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

In [33]:
model.fit(X_train_tokens, y_train,  epochs = 8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x27233f45a90>

In [34]:
#applying the preprocessing steps on the test set and seeing the performance
X_test_sentence = [d.split() for d in X_test.text.tolist()]
X_test_tokens = tokenizer.texts_to_sequences(X_test_sentence)

In [35]:
X_test_tokens = pad_sequences(X_test_tokens, maxlen = max_length)

In [36]:
y_pred = np.where(model.predict(X_test_tokens) > 0.5, 1,0)



In [37]:
y_pred = y_pred.reshape((y_pred.shape[0],))

In [38]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.45      0.55      2072
           1       0.60      0.80      0.68      2071

    accuracy                           0.63      4143
   macro avg       0.65      0.63      0.62      4143
weighted avg       0.65      0.63      0.62      4143



In [39]:
y_pred_train = model.predict(X_train_tokens)



In [40]:
y_pred_train = np.where(y_pred_train > 0.5, 1,0)
y_pred_train = y_pred_train.reshape((y_pred_train.shape[0],))

In [41]:
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.74      0.73      0.73      8286
           1       0.73      0.75      0.74      8283

    accuracy                           0.74     16569
   macro avg       0.74      0.74      0.74     16569
weighted avg       0.74      0.74      0.74     16569

