In [None]:
# Imports
import json
import os
from string import punctuation

import nltk
import numpy as np
import pandas as pd
import tensor
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import hashing_trick

In [None]:
# Load the two labelled datasets.
raw_data_1 = pd.read_json("data/train.json")
raw_data_2 = pd.read_csv("data/PII43k.csv", on_bad_lines='skip')

# Expose the PII43k columns under the names the rest of the notebook reads
# ('full_text', 'tokens', 'labels').
# NOTE(review): read_csv returns text columns as str; if these columns hold
# list-like text, downstream loops over tokens/labels would iterate characters
# rather than items -- TODO confirm and parse if needed.
raw_data_2['full_text'] = raw_data_2["Filled Template"]
raw_data_2["tokens"] = raw_data_2["Tokenised Filled Template"]
raw_data_2["labels"] = raw_data_2["Tokens"]

# all_data keeps the original PII43k columns; raw_data is the same frame
# without them. (The earlier un-assigned raw_data_2.drop(...) call discarded
# its result and has been removed.)
all_data = pd.concat([raw_data_1, raw_data_2], ignore_index=True)
raw_data = all_data.drop(columns=['Template', 'Filled Template', 'Tokenised Filled Template', 'Tokens'])

In [None]:
# Preview only the first rows instead of rendering the whole combined frame.
all_data.head()

In [None]:
# Tokens to discard: every ASCII punctuation mark plus a few whitespace
# sequences and the bullet character.
omitted_characters = set(punctuation) | {"\n\n", "\n", "\r\n", "\r", " ", "•"}

In [None]:
def clean_tokens(data, omitted=None):
    """
    Return a copy of ``data`` whose 'tokens' cells are normalised and filtered.

    Every token is lower-cased and stripped of surrounding whitespace. Tokens
    that are empty after stripping, or that are members of ``omitted``, are
    dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'tokens' column; each cell is iterated token by token.
    omitted : collection of str, optional
        Tokens to discard. Defaults to the notebook-level
        ``omitted_characters`` set.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with a rewritten 'tokens' column. ``data`` itself
        is left untouched.
    """
    if omitted is None:
        omitted = omitted_characters

    def _clean(tokens):
        # NOTE(review): if a cell is a plain string rather than a list, this
        # iterates character by character -- confirm the input is tokenised.
        normalised = (token.lower().strip() for token in tokens)
        # strip() turns whitespace-only tokens ("\n", " ", ...) into "", which
        # is not a member of the omitted set, so empties are rejected explicitly.
        return [token for token in normalised if token and token not in omitted]

    cleaned_data = data.copy()
    # map() works row by row regardless of the index labels (no .at lookups).
    cleaned_data['tokens'] = data['tokens'].map(_clean)
    return cleaned_data

def binarize_data(data):
    """
    Makes the labels of the data binary (either 0 or 1).

    A row's label becomes 1 when at least one entry of its 'labels' cell
    differs from 'O', and 0 otherwise (including an empty cell).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'labels' column; each cell is iterated label by label.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with an integer 'labels' column. ``data`` is not
        modified, so the call can be repeated on the same input frame.
    """
    def _has_non_o_label(document_labels):
        # any() stops at the first non-'O' label instead of scanning the rest.
        # NOTE(review): a plain-string cell is compared character by
        # character -- confirm the cells really are label sequences.
        return int(any(label != 'O' for label in document_labels))

    binarized = data.copy()
    # map() pairs each cell with its own row, so this is correct for any index
    # (the previous enumerate()/.loc combination assumed labels 0..n-1 and
    # silently appended rows otherwise).
    binarized['labels'] = data['labels'].map(_has_non_o_label)
    return binarized

def remove_stopwords(data, stop_words=None):
    """
    Return a copy of ``data`` with stop words removed from every 'tokens' cell.

    A token is dropped when its lower-cased form is in ``stop_words``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'tokens' column; each cell is a sequence of strings.
    stop_words : collection of str, optional
        Words to remove. When omitted, the NLTK English stop-word list is
        downloaded and extended with a few notebook-specific extras.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with filtered 'tokens' lists. ``data`` and its
        original lists are left untouched.
    """
    if stop_words is None:
        nltk.download('stopwords')
        stop_words = set(list(stopwords.words('english')) + ["and", "2021", "1999", "a", "4", "t."])
    else:
        stop_words = set(stop_words)

    def _keep_content_words(tokens):
        # Build a new list rather than pop() while iterating: popping shifts
        # the remaining items left, so the element after each removed stop
        # word was never examined and consecutive stop words survived.
        return [word for word in tokens if word.lower() not in stop_words]

    filtered = data.copy()
    filtered['tokens'] = data['tokens'].map(_keep_content_words)
    return filtered

def use_word2vec(data, vector_size=100, window=5, min_count=5, workers=4):
    """
    Train Word2Vec on the 'tokens' column and return one mean vector per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'tokens' column; each cell is a list of string tokens.
    vector_size, window, min_count, workers : int, optional
        Passed straight through to ``Word2Vec``; the defaults are the values
        this notebook previously hard-coded.

    Returns
    -------
    list of numpy.ndarray
        One vector of length ``vector_size`` per document, in row order.
        Tokens missing from the trained vocabulary contribute a zero vector;
        a document with no tokens yields an all-zero vector.
    """
    doc_texts = data['tokens'].tolist()
    model = Word2Vec(
        doc_texts,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )
    zero_vector = np.zeros(model.vector_size)

    document_mean_vectors = []
    for doc in doc_texts:
        if not doc:
            # np.mean([]) returns a scalar NaN (with a RuntimeWarning), which
            # would leave a ragged, NaN-contaminated feature list.
            document_mean_vectors.append(zero_vector.copy())
            continue

        # One vector per token: the learned embedding, or zeros when the token
        # is not in the model's vocabulary.
        word_vectors = [
            model.wv[word] if word in model.wv else zero_vector
            for word in doc
        ]
        document_mean_vectors.append(np.mean(word_vectors, axis=0))

    # this should be X when doing test/train/split
    return document_mean_vectors

In [None]:
# Clean data: binarize the labels, normalise the tokens, then remove stop words.
cleaned_data = (
    raw_data
    .pipe(binarize_data)
    .pipe(clean_tokens)
    .pipe(remove_stopwords)
)
cleaned_data.head()

In [None]:
# Prepare X and y for the model.
# Targets: the binarized document labels as integers.
y = cleaned_data['labels'].astype(int)

# Features: one mean Word2Vec vector per document.
# NOTE(review): the later cells visible in this notebook build their inputs
# from `corpus`, not from X -- confirm whether X is still needed.
X = use_word2vec(cleaned_data)

# Document text used by the hashing/padding step.
corpus = cleaned_data['full_text']

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [None]:
# Size of the index space used when hashing words and sizing the Embedding layer.
voc_size = 50_000

In [None]:
# Hash every document into a list of integer word indices.
# one_hot() hashes with Python's built-in hash(), which is salted per process
# for strings, so the same word received a different index after every kernel
# restart. hashing_trick with 'md5' is the same encoding with a stable hash.
onehot_repr = [hashing_trick(words, voc_size, hash_function='md5') for words in corpus]

sent_length = 400  # every encoded document is padded/truncated to this many indices
embedded_docs = pad_sequences(onehot_repr, padding='post', maxlen=sent_length)

In [None]:
# Length of the learned vector that represents each word index.
embedding_vector_features = 100

# Embedding -> LSTM -> single sigmoid unit for the binary label.
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(LSTM(50))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
model.summary()

In [None]:
import numpy as np

# Model inputs (padded index sequences) and targets (binary labels).
X_final = np.array(embedded_docs)
y_final = y

In [None]:
# Preview the first few encoded documents rather than the whole array.
X_final[:5]

In [None]:
# Preview the first few targets rather than the whole Series.
y_final.head()

In [None]:
# Inputs and targets must have the same number of rows.
(X_final.shape, y_final.shape)

In [None]:
# Hold out 20% of the documents for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=5,
)

In [None]:
# NOTE(review): the held-out test split is also passed as validation data here,
# so the per-epoch validation metrics are computed on the test set.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=3,
    batch_size=128,
)

In [None]:
# Model outputs for the held-out documents (sigmoid activations).
y_pred = model.predict(X_test)

In [None]:
# Threshold at 0.5 to turn the outputs into hard 0/1 predictions.
y_pred = np.where(y_pred > 0.5, 1, 0)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against the thresholded predictions.
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.metrics import accuracy_score

# Printed explicitly: a bare expression is only displayed when it is the last
# statement of a cell, and more code follows this line.
print("Accuracy:", accuracy_score(y_test, y_pred))

# This code was adapted from W3Schools: https://w3schools.com/python/python_ml_auc_roc.asp
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve

def plot_roc_curve(true_y, y_prob):
    """
    Plot the ROC curve of ``y_prob`` scored against ``true_y`` and show it.

    The false/true positive rates come from ``roc_curve``; the thresholds it
    also returns are not used.
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(true_y, y_prob)

    plt.plot(false_positive_rate, true_positive_rate)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

# A ROC curve is traced by sweeping a threshold over continuous scores. By this
# point y_pred has already been thresholded to 0/1, which collapses the curve
# to a single interior point, so score the test set again for probabilities.
plot_roc_curve(y_test, model.predict(X_test).ravel())

In [None]:
from sklearn.metrics import classification_report

# classification_report returns a formatted string, so print it to keep the layout.
report = classification_report(y_test, y_pred)
print(report)