In [55]:
import pandas as pd
import numpy as np
import nltk
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix



#We are going to use word vectorization to embed the words
import gensim

In [56]:
# Load the pre-processed bot tweets (path is relative to the working directory).
bot_tweets = pd.read_csv('processed_bot_tweets.csv')

In [57]:
# Load the pre-processed human tweets (path is relative to the working directory).
human_tweets = pd.read_csv('processed_human_tweets.csv')

In [58]:
# Discard the 'Unnamed: 0' column from the bot frame (in place).
bot_tweets.drop(columns='Unnamed: 0', inplace=True)

In [59]:
# Discard the 'Unnamed: 0' column from the human frame (in place).
human_tweets.drop(columns='Unnamed: 0', inplace=True)

In [60]:
# Label every bot tweet with the positive class (isBot = 1).
bot_tweets['isBot'] = 1

In [61]:
# Label every human tweet with the negative class (isBot = 0).
human_tweets['isBot'] = 0

In [62]:
# Stack bot and human tweets into a single frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it each half keeps its own row labels,
# so the combined index contains duplicates and label-based lookups
# (e.g. tweets.loc[0]) become ambiguous.
tweets = pd.concat([bot_tweets, human_tweets], ignore_index=True)

In [63]:
# The text column is named '0' in the CSVs; give it a descriptive name (in place).
tweets.rename(columns={'0': 'text'}, inplace=True)

In [64]:
# Force the text column to str. Missing values are replaced with '' first:
# a bare astype('str') would turn NaN into the literal string "nan", which
# would then be tokenised and embedded as if it were a real word.
tweets['text'] = tweets['text'].fillna('').astype('str')

# Preprocessing the tweets
Here we expand common contractions (e.g. "can't" becomes "cannot") into their full forms, so that their meaning is not lost once punctuation is removed.


In [65]:
# Lookup table for expanding English contractions into their full forms,
# e.g. "can't" -> "cannot", "won't" -> "will not".
# Keys are the contracted spellings, values are the replacement text.
# Both "I..." and "i..." variants are listed because the lookup is case-sensitive.
short_form_dict = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",  # was "he he will have" (duplicated word)
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}
    
    



In [66]:
# Preview the first rows of the combined, labelled frame.
tweets.head()

Unnamed: 0,text,isBot
0,extract aveng scaveng,1
1,get geotherm,1
2,spell panopli without pli,1
3,put myrrh commerci,1
4,need die spell muddi,1


In [67]:
#Things to keep: Capitalisation of words, punctuation, emojis and their positions

In [68]:
#Preprocessing steps (i.e. cleaning the text and including important information such as emojis)

# Train test split, model training and word embedding

In [69]:
# Build the target (isBot: 1 = bot, 0 = human) and the single-column feature frame.
y = tweets.isBot
X = tweets[['text']]
# Hold out 20% of the tweets for testing.
#  - stratify=y keeps the bot/human ratio identical in both splits, which
#    matters because the classes are imbalanced.
#  - random_state pins the shuffle so the split, and every metric computed
#    from it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [70]:
# First experiment: embed the cleaned tweets as-is and look at the results.

# Split every training tweet on whitespace, giving one list of tokens per tweet.
X_train_sentence = [d.split() for d in X_train['text'].tolist()]


In [71]:
# Display the raw training tweets.
# NOTE(review): this renders the entire training set as cell output; consider
# slicing (e.g. the first few items) to keep the notebook readable.
X_train.text.tolist()

['sweepstak monstertruckjumpl',
 'stand hr stop social disguis infrastructur',
 'hey watch ring power stream tonight birthday',
 'bad move juve tie guy year make qualiti move',
 'sure dopplegang mimic',
 'job done cypru big credit tonight',
 'twitterversari twitter year sinc sep via',
 'miss',
 'sorri miss chocol ring doughnut keep eye peel alway new tasti product launch',
 'right amount phallic',
 'feel home mont carlo',
 'take break creat video recov sinu infect move later week settl new hous feel well start pump episod stay cozi beauti photo credit',
 'super excit play lovevalleyfest year saturday main stage see ticket sale',
 'know similarli afflict',
 'sinc june low capr fare xbi spi capr mkt cap still paltri fulli fund p multi billion tam mileston payment partner high likelihood fda let compani file bla acceler approv come wk',
 'octob mean hellrais love movi incorpor storylin blood gut gore write writingcommun horrormovi horror horrorfamili horrorart lgbtq',
 'live chat elana my

In [72]:
# Train Word2Vec embeddings on the tokenised training tweets.
# Start with 100-dimensional vectors.
dim = 100
w2v = gensim.models.Word2Vec(
    sentences=X_train_sentence,
    vector_size=dim,
    window=10,
    min_count=1,
)


In [73]:
# Sanity-check the embeddings: nearest neighbours of the token 'bad'.
w2v.wv.most_similar('bad')

[('knew', 0.9966903328895569),
 ('enough', 0.9966732859611511),
 ('damn', 0.9957500100135803),
 ('meant', 0.9956751465797424),
 ('tbh', 0.9955178499221802),
 ('cuz', 0.9952368140220642),
 ('okay', 0.9946511387825012),
 ('lol', 0.9945940375328064),
 ('noth', 0.9945084452629089),
 ('thought', 0.9943203926086426)]

In [74]:
# Number of distinct tokens in the Word2Vec vocabulary.
len(w2v.wv.key_to_index)

37890

In [75]:
# Build the token -> integer-id vocabulary from the training tweets only
# (the input is the list of pre-split token lists, not raw strings).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_sentence)

In [76]:
# Inspect the learned token -> id mapping.
# NOTE(review): this prints the whole vocabulary; consider showing only a
# small sample to keep the output short.
tokenizer.word_index

{'get': 1,
 'one': 2,
 'thank': 3,
 'love': 4,
 'like': 5,
 'time': 6,
 'go': 7,
 'new': 8,
 'day': 9,
 'u': 10,
 'see': 11,
 'year': 12,
 'make': 13,
 'today': 14,
 'peopl': 15,
 'look': 16,
 'know': 17,
 'good': 18,
 'need': 19,
 'work': 20,
 'check': 21,
 'come': 22,
 'want': 23,
 'back': 24,
 'great': 25,
 'would': 26,
 'happi': 27,
 'got': 28,
 'think': 29,
 'say': 30,
 'watch': 31,
 'right': 32,
 'way': 33,
 'show': 34,
 'take': 35,
 'life': 36,
 'week': 37,
 'much': 38,
 'help': 39,
 'best': 40,
 'first': 41,
 'game': 42,
 'use': 43,
 'thing': 44,
 'still': 45,
 'follow': 46,
 'pleas': 47,
 'person': 48,
 'last': 49,
 'live': 50,
 'let': 51,
 'feel': 52,
 'post': 53,
 'play': 54,
 'even': 55,
 'via': 56,
 'photo': 57,
 'friend': 58,
 'never': 59,
 'tri': 60,
 'realli': 61,
 'call': 62,
 'alway': 63,
 'team': 64,
 'win': 65,
 'start': 66,
 'hope': 67,
 'world': 68,
 'could': 69,
 'well': 70,
 'famili': 71,
 'th': 72,
 'join': 73,
 'support': 74,
 'night': 75,
 'also': 76,
 'give'

In [77]:
#Convert each training tweet's tokens to their integer ids (these sequences are padded later)
X_train_tokens = tokenizer.texts_to_sequences(X_train_sentence)

In [78]:
# Tweets are capped at 280 characters, so token sequences are short.
# Use a sequence length of at least 100, or the longest training tweet if
# that happens to be longer.
longest_tweet = max(map(len, X_train_tokens))
max_length = max(100, longest_tweet)


In [79]:
#Next, pad the id sequences so that every row has exactly max_length entries
X_train_tokens = pad_sequences(X_train_tokens, maxlen = max_length)

In [80]:
# Confirm the padded matrix is (number of training tweets, max_length).
X_train_tokens.shape

(42947, 100)

In [81]:
# Token -> id mapping learned by the tokenizer.
vocab = tokenizer.word_index
# Ids in word_index start at 1, so one extra slot is needed for index 0.
vocabulary_size = len(vocab) + 1

In [82]:
# Build the embedding matrix: row `token` holds the Word2Vec vector of the
# word whose tokenizer id is `token`. Rows without a vector stay all-zero.
embedding_mat = np.zeros((vocabulary_size, dim))
num_words = 0  # how many vocabulary words received a Word2Vec vector
for word, token in tokenizer.word_index.items():
    # Skip words the Word2Vec model has no vector for.
    if word not in w2v.wv:
        continue
    embedding_mat[token] = w2v.wv[word]
    num_words += 1
        



In [83]:
# Compare how many words received an embedding with the size of the embedding matrix.
print(num_words, vocabulary_size)

37890 37891


In [84]:
# Inspect the raw embedding vector learned for the token 'i'.
w2v.wv['i']

array([-0.00994994,  0.05659168,  0.00541038, -0.00117029, -0.01302109,
       -0.0743858 ,  0.02437423,  0.11208599, -0.01724135, -0.03248841,
       -0.01765264, -0.07905414, -0.03936373,  0.01748016,  0.01877048,
       -0.04825697,  0.04178107, -0.06070923,  0.01050115, -0.12047462,
        0.05213865,  0.02698007,  0.01679382, -0.03649551, -0.02672715,
        0.01660547, -0.04656833, -0.03415567, -0.02553605,  0.00404084,
        0.03856372,  0.02344867, -0.02986369, -0.04164566, -0.02026955,
        0.02847963, -0.0239586 , -0.05074574, -0.03531399, -0.10714308,
       -0.01250105, -0.03862583, -0.0139458 , -0.01863185,  0.03815575,
       -0.02158216, -0.06530283, -0.01011082,  0.00289004,  0.04077569,
        0.0254184 , -0.05409361, -0.02887317, -0.00153661, -0.02382375,
        0.02088186,  0.02506712, -0.00692   , -0.03460557,  0.0345263 ,
        0.0130231 , -0.00073897, -0.03044673, -0.0022704 , -0.06477474,
        0.01851139,  0.02593491,  0.03839053, -0.07896655,  0.07

In [85]:
# The dataset is imbalanced, so each class is weighted by the reciprocal of
# its relative frequency in the training labels; the rarer class therefore
# contributes more to the loss. The weights are derived from y_train instead
# of being hard-coded so that they always match the actual split.
class_freq = y_train.value_counts(normalize=True)
# Keras expects plain {class_index: weight} entries.
class_weights = {int(label): float(1.0 / freq) for label, freq in class_freq.items()}

In [86]:

# A deliberately simple first architecture:
# frozen Word2Vec embeddings -> single LSTM -> sigmoid output.
model = Sequential([
    # Look up the pre-trained vector for every token id; the weights are not
    # updated during training (trainable=False).
    Embedding(
        vocabulary_size,
        output_dim=dim,
        weights=[embedding_mat],
        input_length=max_length,
        trainable=False,
    ),
    # The LSTM reads max_length timesteps per tweet; `units` sets the size of
    # its output vector.
    LSTM(units=256, recurrent_dropout=0.3),
    # One sigmoid unit for the binary output.
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
# Train for 8 epochs on the padded training sequences, weighting the loss per class.
model.fit(X_train_tokens, y_train,  epochs = 8, class_weight = class_weights)

Epoch 1/8
Epoch 2/8
Epoch 3/8

In [None]:
# Apply the same preprocessing to the test set, reusing the tokenizer that was
# fitted on the training data.
X_test_sentence = [d.split() for d in X_test['text'].tolist()]
X_test_tokens = tokenizer.texts_to_sequences(X_test_sentence)

In [None]:
# Pad the test sequences to the same length used for training.
X_test_tokens = pad_sequences(X_test_tokens, maxlen = max_length)

In [None]:
# Threshold the model's scores at 0.5 to get hard 0/1 predictions.
test_scores = model.predict(X_test_tokens)
y_pred = np.where(test_scores > 0.5, 1, 0)

In [None]:
# Reshape the predictions to a 1-D array with one entry per test tweet.
y_pred = y_pred.reshape(len(y_pred))

In [None]:
# Classification report for the held-out test set.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of true vs. predicted labels on the test set.
confusion_matrix(y_test, y_pred)

In [None]:
# Class counts in the training labels.
y_train.value_counts()

In [None]:
# Class counts in the test labels.
y_test.value_counts()

In [None]:
# Score the training set as well, so train and test performance can be compared.
y_pred_train = model.predict(X_train_tokens)

In [None]:
# Threshold the training scores at 0.5, then reshape to a 1-D array with one
# entry per training tweet.
train_labels = np.where(y_pred_train > 0.5, 1, 0)
y_pred_train = train_labels.reshape(len(train_labels))

In [None]:
# Classification report for the training set.
print(classification_report(y_train, y_pred_train))