<a href="https://colab.research.google.com/github/RAHULRAANU/ALL-TOGETHER/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install swifter

In [None]:
import os
import pandas as pd
import numpy as np
import swifter
import re
import string
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import tensorflow
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline


import warnings
warnings.filterwarnings('ignore')

In [None]:
# reading path 
path1 = r'/content/cybersecurity_tweets.csv'
path2 = r'/content/not_cybersecurity_tweets.csv'

In [None]:
# Read the two CSV files referenced by path1 and path2.
csv1 = pd.read_csv(path1)
csv2 = pd.read_csv(path2)

# Stack both frames into one. ignore_index=True gives every row a unique
# label; without it each half keeps its own 0..n-1 labels, so the combined
# frame would contain duplicate index values.
df = pd.concat([csv1, csv2], ignore_index=True)

In [None]:
df

In [None]:
# Shuffle the rows so the two source files are interleaved. The fixed seed
# keeps the row order -- and therefore the later train/test split, which
# already uses a fixed random_state -- repeatable from run to run.
df = shuffle(df, random_state=2529)
df

In [None]:
df.text



In [None]:
# Remove repeated rows from df in place (no subset is given, so rows are
# compared on all of their columns).
df.drop_duplicates(inplace = True)

## Text Preprocessing


In [None]:
class text_preprocess:
    """Stateless cleaning steps for raw tweet text.

    Every method takes a ``str`` and returns a new ``str``.
    :meth:`preprocessText` chains all of them in the order that lets each
    step still see the characters it relies on.
    """

    # Patterns are compiled once for the class instead of on every call.
    _MENTION_OR_LINK = re.compile(r"(?:\@|https?\://)\S+")
    _TAG = re.compile(r"<.*?>")
    # Tags plus character entities such as "&amp;", "&#39;" or "&#x27;".
    # The entity part only matches lowercase text.
    _HTML = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    _URL = re.compile(r'https?://\S+|www\.\S+')
    _NON_ASCII = re.compile(r'[^\x00-\x7f]')
    _DIGITS = re.compile(r'\d+')
    # A single ASCII letter with whitespace on both sides. Lookarounds are
    # used so that neighbouring single letters ("a b c") are all matched.
    _SINGLE_LETTER = re.compile(r'(?<=\s)[a-zA-Z](?=\s)')
    _WHITESPACE = re.compile(r'\s+')
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
    _EMOJI = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        u"\u200c"
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)

    def __init__(self):
        pass

    def convert_to_lower(self, text):
        """Return *text* in lowercase."""
        return text.lower()

    def remove_emojis(self, text):
        """Replace @mentions, http(s) links and tags with a space, then delete emoji."""
        text = self._MENTION_OR_LINK.sub(" ", text)
        text = self._TAG.sub(" ", text)
        return self._EMOJI.sub(r'', text)

    def remove_html(self, text):
        """Replace HTML tags and (lowercase) character entities with a space."""
        return self._HTML.sub(' ', text)

    def remove_URL(self, text):
        """Replace ``http(s)://...`` and ``www....`` URLs with a space."""
        return self._URL.sub(r' ', text)

    def remove_non_ascii(self, text):
        """Replace every non-ASCII character with a space."""
        return self._NON_ASCII.sub(r' ', text)

    def remove_numbers(self, text):
        """Replace every run of digits with a space."""
        return self._DIGITS.sub(" ", text)

    def remove_punctuation(self, text):
        """Delete every character listed in ``string.punctuation``."""
        return text.translate(self._PUNCTUATION_TABLE)

    def remove_extra_white_spaces(self, text):
        """Drop stand-alone single letters and normalise whitespace.

        A single ASCII letter that has whitespace on both sides is removed,
        every run of whitespace is collapsed to one space, and leading and
        trailing whitespace is stripped.
        """
        without_single_letters = self._SINGLE_LETTER.sub(" ", text)
        return self._WHITESPACE.sub(" ", without_single_letters).strip()

    def preprocessText(self, text):
        """Run the full cleaning chain on *text* and return the cleaned string.

        Markup and URL removal run before digit and punctuation removal:
        those patterns depend on characters such as ``&``, ``;``, ``#``,
        ``.`` and ``/``, which the later steps strip out.
        """
        text = self.convert_to_lower(text)
        text = self.remove_emojis(text)
        text = self.remove_html(text)
        text = self.remove_URL(text)
        text = self.remove_numbers(text)
        text = self.remove_punctuation(text)
        text = self.remove_non_ascii(text)
        return self.remove_extra_white_spaces(text)


In [None]:
if __name__ == "__main__":
    # One cleaner instance is enough; its bound method is applied to every tweet.
    text_prpocess_obj = text_preprocess()
    df["text"] = df["text"].swifter.apply(text_prpocess_obj.preprocessText)

In [None]:
df.text

## Tokenization

In [None]:
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

# Create a blank English pipeline: it provides the tokenizer and language data only (no tagger, parser, NER or word vectors)
nlp = English()

# !pip install spacy
# !python -m spacy download en_core_web_sm
# import spacy
# nlp = spacy.load("en_core_web_sm")  # giving installation error

In [None]:
# Tokenizing the tweet base texts.
def tokenize(text):
    """Return the text of every token that ``nlp`` produces for *text*, in order."""
    return [token.text for token in nlp(text)]

In [None]:
# Replace each tweet string with its list of tokens.
df["text"] = df["text"].swifter.apply(tokenize)

In [None]:
df.text

## Remove Stopwords

In [None]:
def remove_stopwords(text):
    """Return the non-stop-word tokens of *text* joined by single spaces.

    *text* is an iterable of token strings; a token is dropped when its
    entry in ``nlp.vocab`` is flagged as a stop word.
    """
    kept_words = [word for word in text if not nlp.vocab[word].is_stop]
    return " ".join(kept_words)

In [None]:
# Turn each token list back into a string without stop words.
df["text"] = df["text"].swifter.apply(remove_stopwords)

In [None]:
df.text

## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
import nltk

# Download the NLTK resources. 'words' backs the `words` set built below;
# 'wordnet' and 'omw-1.4' are presumably what WordNetLemmatizer needs --
# TODO confirm which of these are actually required.
nltk.download('words')
nltk.download('wordnet')
nltk.download('omw-1.4')


# Shared helpers used by the lemmatization class: a whitespace tokenizer to
# split a text into words and the WordNet lemmatizer applied to each word.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

# NLTK word list as a set. In the visible code it is only referenced by the
# commented-out lemmatizing_words helper.
words = set(nltk.corpus.words.words())
# words = nltk.word_tokenize(corpus)

class lemmatization:
    """Wrapper around the module-level ``lemmatizer`` and ``w_tokenizer``."""

    def __init__(self):
        pass

    def lemmatizing_space(self, text):
        """Lemmatize each whitespace-separated word of *text* and re-join with single spaces."""
        lemmas = map(lemmatizer.lemmatize, w_tokenizer.tokenize(text))
        return " ".join(lemmas)

    # def lemmatizing_words(self, text):
    #     return " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())

    # def lemmatize(self, text):
    #     return self.lemmatizing_space(self.lemmatizing_words(text))

In [None]:
if __name__ == "__main__":
  # Lemmatize every tweet with a single shared lemmatization instance.
  lemmatization_obj = lemmatization()
  df["text"] = df["text"].swifter.apply(lemmatization_obj.lemmatizing_space)

In [None]:
df

## Stemming

In [None]:
# # Stemming
# from nltk.stem.snowball import SnowballStemmer

# snow_stemmer = SnowballStemmer(language='english')
  
# def stemmizing(text):    
#     #stem of each word
#     stem_words = []
#     for w in text:
#         x = snow_stemmer.stem(w)
#         stem_words.append(x)
#     return "".join(stem_words)

In [None]:
# df['text'] = df.text.swifter.apply(lambda x: stemmizing(x))

In [None]:
# df.text

## Remove words of length 1 or 2 from a string

In [None]:
# A whole word of one or two characters, together with any non-word
# characters (spaces, punctuation) directly in front of it.
_SHORT_WORD_PATTERN = re.compile(r'\W*\b\w{1,2}\b')


def removelt2wordslength(text):
    """Return *text* with every word of one or two characters removed.

    The non-word characters immediately preceding a removed word are removed
    with it. An empty string is returned unchanged, so the result is always
    a ``str``.
    """
    # A single substitution handles the whole string; no per-character loop
    # is needed.
    return _SHORT_WORD_PATTERN.sub('', text)

In [None]:
# Strip one- and two-character words from every tweet.
df["text"] = df["text"].swifter.apply(removelt2wordslength)

In [None]:
df.text

## Lstm
We do not build a plain RNN model because of its vanishing-gradient problem; instead we build an LSTM model. LSTMs have an additional state, called the 'cell state', through which the network regulates the flow of information. The advantage of this state is that the model can remember or forget what it has learned more selectively. First we tokenize the text, then we turn each text into a sequence of token indices. After that we apply padding. Padding is required because the sentences have different lengths, so we need to bring them all to the same length. We do this by appending 0s to the end of each sequence with the pad_sequences function of Keras.


In [None]:
# Drop rows with missing values, then rows whose text is empty or
# whitespace-only after cleaning: such rows would become all-padding
# sequences that carry a label but no information.
df = df.dropna()
df = df[df['text'].str.strip().astype(bool)]
df

In [None]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences


In [None]:
# Turn the cleaned texts into a fixed-width integer matrix for the model.
max_words = 100000  # vocabulary cap given to the Tokenizer; also the Embedding input size further down
max_len = 19  # width of every row of sequences_matrix
tok = Tokenizer(num_words=max_words)
# Build the word index from the whole corpus (this runs before the train/test split).
tok.fit_on_texts(df['text'].values)
sequences = tok.texts_to_sequences(df['text'].values)
# padding='post' appends zeros to short sequences. The truncation side for
# sequences longer than max_len is not set here, so the library default
# applies -- NOTE(review): confirm the default is the one intended.
sequences_matrix = pad_sequences(sequences, padding = 'post', maxlen= max_len)


In [None]:
sequences_matrix

In [None]:
Y = df[['label']]

In [None]:
Y

In [None]:
# Hold out 27% of the rows as the test set. stratify=Y asks for the same
# label proportions in both parts, and random_state fixes the split for a
# given row order.
X_train, X_test, Y_train, Y_test = train_test_split(sequences_matrix,Y, test_size = 0.27, random_state = 2529 ,stratify = Y)
# Print the shapes of both parts as a sanity check.
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

## lstm architecture
# Embedding : Generates an embedding vector for each token in the input sequence
# Dense : Fully connected layer for classification
# Bidirectional : Wrapper that lets the recurrent layer learn from the forward and backward direction of the sequence at the same time
# LSTM : Long short-term memory, a variant of RNN with a memory cell state that lets it learn context from words further along in the text, rather than just from neighbouring words as in a plain RNN

In [None]:
import keras
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, Dropout

# Model hyper-parameters.
embid_dim = 512  # size of each embedding vector
lstm_out = 128   # units of the LSTM wrapped by Bidirectional


# Binary classifier: Embedding -> Bidirectional LSTM -> Dense(relu) -> Dense(sigmoid),
# with dropout after the recurrent layer and after the hidden dense layer.
model = keras.Sequential()
# The Embedding table has max_words rows; the input length is the padded
# sequence width (second dimension of sequences_matrix).
model.add(Embedding(max_words, embid_dim, input_length = sequences_matrix.shape[1]))
model.add(Bidirectional(LSTM(lstm_out)))
model.add(Dropout(0.4))
model.add(Dense(512, activation = 'relu'))
model.add(Dropout(0.4))
# Single sigmoid unit: one score between 0 and 1 per input row.
model.add(Dense(1,activation = 'sigmoid'))

model.summary()

In [None]:
from tensorflow.keras.optimizers import Adam, SGD

model.compile(loss='binary_crossentropy',optimizer= Adam(learning_rate = 0.01), metrics=['accuracy'])

In [None]:
# Earlier training configurations, kept for reference.
# history = model.fit(X_train,Y_train,epochs=10, validation_data= (X_test, Y_test),
#           callbacks=[EarlyStopping(monitor='val_loss',min_delta=0, patience=3, verbose=1, mode='auto' )])

# batch_size = 64
# earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=3, verbose=0, mode='auto')
# history = model.fit(X_train, Y_train, epochs = 10, batch_size=batch_size, verbose = 1, validation_data= (X_test, Y_test),callbacks=[earlystop])

# Active configuration: up to 11 epochs with batches of 128 rows.
batch_size = 128
# NOTE(review): early stopping watches the *training* loss ('loss'), so it
# does not react to the validation loss getting worse -- confirm this is
# intended rather than monitor='val_loss'.
earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=3, verbose=0, mode='auto')
# NOTE(review): the test split is passed as validation_data, so the
# validation curves and the later test metrics come from the same rows.
history = model.fit(X_train, Y_train, epochs = 11, batch_size=batch_size, verbose = 1, validation_data= (X_test, Y_test),callbacks=[earlystop])

## Model Evaluation
# Plot Accuracy and Loss

In [None]:
# Per-epoch metrics recorded by model.fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# x-axis: one point per recorded epoch, starting at 0.
epochs = range(len(acc))

# First figure: training vs. validation accuracy.
plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
# Open a new figure so the loss curves are not drawn on the accuracy plot.
plt.figure()

# Second figure: training vs. validation loss.
plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

In [None]:
Y_pred = model.predict(X_test)
Y_pred

In [None]:
Y_pred[0]

In [None]:
test_accuracy = accuracy_score(Y_test,Y_pred.round())
test_accuracy

In [None]:
conf_matrix = confusion_matrix(Y_test,Y_pred.round())
conf_matrix

In [None]:
# heatmap of confusion matrix
import seaborn as sns

sns.heatmap(confusion_matrix(Y_test, Y_pred.round()), annot=True, fmt = '1d');

In [None]:
# classification report
print(classification_report(Y_test,  Y_pred.round()))

## Testing
The model outputs a prediction score between 0 and 1. We separate the two classes with a threshold value: here the threshold is 0.5, and a text whose score is above it is classified as
cybersecurity text.

In [None]:
def decode_text(score):
  """Map a prediction score to its class name; only scores strictly above 0.50 count as cybersecurity."""
  if score > 0.50:
    return "cybersecurity_text"
  return "Not_cybersecurity_text"

score = model.predict(X_test)  
print(len(score))

In [None]:
Y_pred.round()

In [None]:
# Y_pred.mean() # for threshold

In [None]:
# Class name for every prediction score, in the same order as `score`.
y_pred_text = list(map(decode_text, score))
# y_pred_text

In [None]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences


In [None]:
tokk = Tokenizer()

In [None]:
text =  ['Threat: Someone with the potential to cause harm by damaging or destroying the official data of a system or organization.']

In [None]:
# Encode the sample with `tok`, the tokenizer fitted on the training corpus.
# Fitting a fresh Tokenizer on the sample itself would number its words
# 1, 2, 3, ... with no relation to the indices the Embedding layer was
# trained on, which makes the prediction meaningless.
# The sample also goes through the same cleaning steps as the training text.
cleaned_text = [
    removelt2wordslength(
        lemmatization_obj.lemmatizing_space(
            remove_stopwords(tokenize(text_prpocess_obj.preprocessText(sample)))
        )
    ) or ""  # keep a string even if the last step yields nothing
    for sample in text
]
seq = tok.texts_to_sequences(cleaned_text)
seqmatrix = pad_sequences(seq, padding = 'post', maxlen= max_len)

In [None]:
test = model.predict(seqmatrix)

In [None]:
test

In [None]:
y_pred_text_other = [decode_text(score) for score in test]

In [None]:
y_pred_text_other

In [None]:
model.save('lstm.h5')