In [1]:
# Mount Drive
import nltk
from nltk.corpus import stopwords
import numpy as np
import os
import pandas as pd
import json

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from string import punctuation

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
# Set up Google Drive.
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'ML_Final_Project'
GOOGLE_DRIVE_PATH = os.path.join('drive', 'MyDrive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
print(os.listdir(GOOGLE_DRIVE_PATH))

['data', 'Emmas_Version_aka_V2.ipynb', 'Using_AI_Data_for_Training.ipynb', 'First.ipynb', 'RNN notes.gdoc', 'Project Proposal Report.gdoc', 'Final_Project_ML.ipynb', 'Sams_Version.ipynb', 'RNN.ipynb', 'CS254 Project Proposal Report Format Draft.gdoc']


In [4]:
raw_data_1 = pd.read_json(f"{GOOGLE_DRIVE_PATH}/data/pii_dataset_1.json")
raw_data_2 = pd.read_csv(f"{GOOGLE_DRIVE_PATH}/data/PII43k.csv", on_bad_lines='skip')
raw_data_2['full_text'] = raw_data_2["Filled Template"]
raw_data_2["tokens"] = raw_data_2["Tokenised Filled Template"]
raw_data_2["labels"] = raw_data_2["Tokens"]
raw_data_2.drop(columns=['Template', 'Filled Template', 'Tokenised Filled Template', 'Tokens'])
all_data = pd.concat([raw_data_1, raw_data_2], ignore_index=True)
raw_data = all_data.drop(columns=['Template', 'Filled Template', 'Tokenised Filled Template', 'Tokens'])

In [5]:
all_data

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,Template,Filled Template,Tokenised Filled Template,Tokens
0,7.0,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",,,,
1,10.0,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",,,,
2,16.0,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...",,,,
3,20.0,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...",,,,
4,56.0,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...",,,,
...,...,...,...,...,...,...,...,...,...
49561,,Write a blog post for Stanton LLC about the ro...,"['write', 'a', 'blog', 'post', 'for', 'stanton...",,"['O', 'O', 'O', 'O', 'O', 'B-NAME', 'I-NAME', ...",Write a blog post for [NAME_1] about the role ...,Write a blog post for Stanton LLC about the ro...,"['write', 'a', 'blog', 'post', 'for', 'stanton...","['O', 'O', 'O', 'O', 'O', 'B-NAME', 'I-NAME', ..."
49562,,14. Calculate the return on investment for Con...,"['14', '.', 'calculate', 'the', 'return', 'on'...",,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NA...",14. Calculate the return on investment for [NA...,14. Calculate the return on investment for Con...,"['14', '.', 'calculate', 'the', 'return', 'on'...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NA..."
49563,,Please write an email to Roberta Gutmann V at ...,"['please', 'write', 'an', 'email', 'to', 'robe...",,"['O', 'O', 'O', 'O', 'O', 'B-FULLNAME', 'I-FUL...",Please write an email to [FULLNAME_1] at [EMAI...,Please write an email to Roberta Gutmann V at ...,"['please', 'write', 'an', 'email', 'to', 'robe...","['O', 'O', 'O', 'O', 'O', 'B-FULLNAME', 'I-FUL..."
49564,,Can you help me write a project closure report...,"['can', 'you', 'help', 'me', 'write', 'a', 'pr...",,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",Can you help me write a project closure report...,Can you help me write a project closure report...,"['can', 'you', 'help', 'me', 'write', 'a', 'pr...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [6]:
# Create a set of punctuation and special characters to omit from tokens.
omitted_characters = set(punctuation)
omitted_characters.add("\n\n")
omitted_characters.add("\n")
omitted_characters.add("\r\n")
omitted_characters.add("\r")
omitted_characters.add(" ")
omitted_characters.add("•")
#print(omitted_characters)

In [7]:
def clean_tokens(data):
    cleaned_data = data.copy()
    for idx in cleaned_data.index:
        updated_tokens = []
        tokens = cleaned_data.at[idx, 'tokens']
        for token in tokens:
            token = token.lower().strip()
            if token not in omitted_characters:
                updated_tokens.append(token)

        cleaned_data.at[idx, 'tokens'] = updated_tokens

    return cleaned_data

def binarize_data(data):
    """
    Makes the labels of the data binary (either 0 or 1).
    """
    for index, document_labels in enumerate(data['labels']):
        data.loc[index, 'labels'] = 0
        for label in document_labels:
            if label != 'O':
                data.at[index, 'labels'] = 1

    return data

def remove_stopwords(data):
    nltk.download('stopwords')
    stop_words = set(list(stopwords.words('english')) + ["and", "2021", "1999", "a", "4", "t."])
    for row, tokens in enumerate(data['tokens']):
        for token_index, word in enumerate(tokens):
            if word.lower() in stop_words:
                data['tokens'][row].pop(token_index)

    return data

def use_word2vec(data):
    doc_texts = data['tokens'].tolist()
    model = Word2Vec(doc_texts, vector_size=100, window=5, min_count=5, workers=4)
    document_mean_vectors = []
    for doc in doc_texts:
        #new vector of words for each document
        word_vectors = []
        for word in doc:
            if word in model.wv:
              word_vectors.append(model.wv[word])
            else:
              word_vectors.append(np.zeros(model.vector_size))
        #calculate mean vector for the document
        document_mean_vectors.append(np.mean(word_vectors, axis=0))

    #this should be X when doing test/train/split
    return document_mean_vectors

In [9]:
# Clean data

#raw_data = create_sentence_tokens(raw_data)
cleaned_data = binarize_data(raw_data)
cleaned_data = clean_tokens(cleaned_data)
cleaned_data = remove_stopwords(cleaned_data)
# cleaned_data.drop(['full_text', 'trailing_whitespace', 'document'], axis=1, inplace=True)
cleaned_data.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7.0,Design Thinking for innovation reflexion-Avril...,"[design, thinking, innovation, reflexion, avri...","[True, True, True, True, False, False, True, F...",1
1,10.0,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[diego, estrada, , design, thinking, assignmen...","[True, False, False, True, True, False, False,...",1
2,16.0,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[reporting, process, , gilberto, gamboa, , cha...","[True, False, False, True, True, False, False,...",1
3,20.0,Design Thinking for Innovation\n\nSindy Samaca...,"[design, thinking, innovation, , sindy, samaca...","[True, True, True, False, False, True, False, ...",1
4,56.0,Assignment: Visualization Reflection Submitt...,"[assignment, , visualization, , reflection, , ...","[False, False, False, False, False, False, Fal...",1


In [10]:
#Prepare X and y for the model
y = cleaned_data['labels']
y = y.astype(int)

X = use_word2vec(cleaned_data)
#X = cleaned_data.drop(['labels', 'document', 'tokens', 'trailing_whitespace'], axis=1)
corpus = cleaned_data['full_text']

In [11]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [12]:
voc_size=50000 #how many unique words do I have

In [13]:
onehot_repr=[one_hot(words,voc_size)for words in corpus]

sent_length=400 #how many words in a sentence
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)

In [14]:
embedding_vector_features= 100 ##features representation - every index will be represented by this many features
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(50))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 400, 100)          5000000   
                                                                 
 lstm (LSTM)                 (None, 50)                30200     
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 5030251 (19.19 MB)
Trainable params: 5030251 (19.19 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [15]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=y

In [None]:
X_final

In [None]:
y_final

In [None]:
X_final.shape,y_final.shape

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=5)

In [17]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=3,batch_size=128)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7d9d59df89a0>

In [18]:
y_pred=model.predict(X_test)



In [19]:
y_pred=np.where(y_pred > 0.5, 1,0)

In [20]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[1189,    1],
       [ 192, 8532]])

In [25]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

# This code was references from W3Schools: https://w3schools.com/python/python_ml_auc_roc.asp
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve


# In Development
# def plot_roc_curve(true_y, y_prob):
#     """
#     plots the roc curve based of the probabilities
#     """

#     fpr, tpr, thresholds = roc_curve(true_y, y_prob)
#     plt.plot(fpr, tpr)
#     plt.xlabel('False Positive Rate')
#     plt.ylabel('True Positive Rate')
#     plt.show()

# plot_roc_curve(y_test, y_pred)

In [22]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92      1190
           1       1.00      0.98      0.99      8724

    accuracy                           0.98      9914
   macro avg       0.93      0.99      0.96      9914
weighted avg       0.98      0.98      0.98      9914

