### Seed-Guided LDA with negative sentences input from BERT model
by: Anood Alkatheeri

#### 1) Importing negative sentences from JSON file (and removing duplicates)

In [22]:
import json
import lda.datasets as gldad
import numpy as np
from lda import guidedlda as glda
from collections import defaultdict
import random
import pandas as pd

f = open('extracted_neg_sentences_24th_report.json')

data = json.load(f)
  
# Iterating through the json
original_sentences24 = []

for i in data:
    sentence = i['sentence']
    if sentence not in original_sentences24:
        original_sentences24.append(sentence)

f = open('extracted_neg_sentences_31st_report.json')

data = json.load(f)
  
# Iterating through the json
original_sentences31 = []

for i in data:
    sentence = i['sentence']
    if sentence not in original_sentences31:
        original_sentences31.append(sentence)

f = open('extracted_neg_sentences_32nd_report.json')

data = json.load(f)
  
# Iterating through the json
original_sentences32 = []

for i in data:
    sentence = i['sentence']
    if sentence not in original_sentences32:
        original_sentences32.append(sentence)


In [23]:
combined_sentences = original_sentences24 + original_sentences31 + original_sentences32
len(combined_sentences)

1053

In [24]:
random.choices(combined_sentences, k=7)

['One of the issues of concern at DCPP was that more equipment is being broken during maintenance activities than has occurred historically.',
 'The EDGs are operable but there are issues with obsolete control parts and margin management questions need to be quantified.',
 'Unit 2 LER 2021-002-00 was submitted on 12/14/2021, which documented that the Unit 2 reactor was manually tripped on October 15, 2021, in response to increasing water level in a secondary-side feedwater heater.',
 'The primary issue of concern to the NRC was the fact that an outdated calculation was contained in a section of the current FEMA-approved design report.',
 'He stated during 2018- 2019 the plant experienced performance issues with status control events primarily in the Operations Department.',
 'Unit 2 EDG 2-3 is currently in a Red status window due to a significant fuel oil leak that occurred when the EDG 2-3 was run for maintenance and this event resulted in a critical equipment clock reset for the plan

#### 2) Pre-processing: removing stop words/punctuations/numbers, lemmatization, lowerization...

In [25]:
import re
import spacy

# load the spacy model and stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = nlp.Defaults.stop_words

def preprocess(combined_sentences):
    list_sentences = []
    # assume that 'df' is a list of lists containing strings
    for i in range(len(combined_sentences)):
        
        #join the list into a single string
        text = combined_sentences[i]
        
        #remove months
        text = re.sub('{January|February|March|April|May|June|July|August|September|October|November|December}', '', text)
        
        #remove prefixes
        prefixes = "(Mr|St|Mrs|Ms|Dr|Ph.D|Chair|ViceChair|Volume)"
        text  = re.sub(prefixes, '', text)

        #lemmatization
        lemma_list = [token.lemma_ for token in nlp(text)]

        #remove stopwords and -PRON- tags
        clean_list = [re.sub('-PRON-', '', word) for word in lemma_list if word not in stop_words]

        #join the cleaned tokens back into a string
        clean_text = ' '.join(clean_list).lower()

        #removes numbers and punctuations.
        clean_text=re.sub(r'\d+', '', clean_text)
        clean_text=re.sub(r'[\W_]+', ' ', clean_text).rstrip()

        #remove single character terms or 2-letter words
        terms_list = [word for word in clean_text.split() if len(word) > 2]
        clean_text = ' '.join(terms_list).lower()
            
        #assign the cleaned text back to list
        list_sentences.append(clean_text)
        
    return list_sentences
        
list_sentences = preprocess(combined_sentences)

In [26]:
random.choices(list_sentences, k=7)

['deficiency radiological area pertain existence expire chemical primary chemistry laboratory sample procedure auxiliary salt water lack documentation need use supplemental sampling pump sample flow available normal sampling method',
 'nuclear safety oversight committee express concern leadership turnover acknowledge identify specific negative trend issue occur leadership turnover',
 'outage total yellow condition actually occur versus originally plan',
 'atus regulatory excellence action plan linnen state plan develop dcpp address issue weakness communication nrc weakness reportability determination nrc substantive cross cutting issue problem evaluation adverse trend safety system functional failure ssff',
 'baldwin state cause inadequate design control circuit system assure fan operate time properly understand documentation deficiency',
 'size leak judge render edg unable run duration time need accident analysis failure classify maintenance rule functional failure critical equipment 

#### 3) Creating sentence-term matrix and generating unique vocabulary

Convert a collection of text documents to a matrix of token counts.

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_2 = vectorizer.fit_transform(list_sentences)
X_2.shape

(1053, 2486)

Convert matrix to DataFrame.

In [28]:
feature_names = vectorizer.get_feature_names_out()

X_df = pd.DataFrame(X_2.toarray(), columns=feature_names)

X_df.iloc[:10,0:20]

Unnamed: 0,ability,able,abnormal,abnormally,abort,abrupt,absence,absorb,accelerate,acceptable,acceptance,accepted,access,accident,accidents,accommodate,accordance,accordingly,account,accumulate
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Convert DataFrame to Array.

In [29]:
X_array = X_2.toarray()
X_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Extracting and encoding unique words/vocabulary (Creating Word-id dictionary).

In [32]:
vocab = tuple(feature_names)
word2id = dict((v, idx) for idx, v in enumerate(vocab))
len(vocab)

2486

#### 4) Defining seed words that correspond to each safety trait

Source of seed words: INPO 12-012- Traits of a Healthy Nuclear Safety Culture - April 2013.pdf

In [139]:
seed_words = [
['responsibility', 'accountability', 'help', 'support', 'trained', 'qualified', 'understand', 'complete', 'involvement'],
['complacency', 'complacent', 'challenge', 'error', 'hazard', 'caution', 'discrepancy', 'anomaly', 'assumption', 'question', 'uncertain', 'unknown', 'risk', 'trend', 'unexpected', 'unclear', 'degrading', 'aging'],
['communication', 'licensee', 'event', 'report', 'documentation', 'request', 'LER', 'information', 'safety', 'prompt', 'share', 'respond', 'listen', 'concern', 'expectation', 'clear'],
['leadership', 'management', 'leader', 'owner', 'ownership', 'program', 'guidance', 'policy', 'resource', 'staffing', 'oversight', 'reinforce', 'priority', 'plan', 'delegate', 'align', 'define', 'manage', 'resolve', 'address', 'translate', 'funding', 'implementation', 'violation'],
['thorough', 'conservative', 'systematic', 'consistent', 'process', 'choice', 'consequence', 'authority', 'future', 'timely', 'executive', 'senior'],
['trust', 'respect', 'opinion', 'dignity', 'fair', 'disagree', 'receptive', 'valuable', 'tolerate', 'value', 'insight', 'perspective', 'collaboration', 'conflict', 'listening'],
['learn', 'training', 'assessment', 'improve', 'performance', 'scrutiny', 'monitor', 'adopt', 'idea', 'benchmarking', 'knowledge', 'competent', 'skills', 'develop', 'acquire'],
['identify', 'corrective', 'action','issue', 'yellow', 'red', 'prevent', 'foreign', 'poor', 'inadequate', 'degraded', 'evaluation', 'problem', 'cause', 'root', 'investigation', 'investigate', 'recommendation', 'resolution', 'mitigate'],
['environment', 'fear', 'harassment', 'discrimination', 'promote', 'severity', 'failure', 'submit', 'report', 'expired', 'raise'],
['engineering', 'control', 'activity', 'contingency', 'production', 'schedule', 'work', 'margin', 'operate', 'maintain', 'maintenance', 'procedure', 'package', 'accurate', 'current', 'backlog', 'instruction', 'operation', 'design', 'requirement','standard']
]

# seed_words = [['responsible', 'encourage', 'accountable', 'help', 'ownership', 'support', 'trained', 'qualified', 'understand', 'non-cited', 'involvement', 'self-identifying'],
#               ['complacent', 'discrepancy', 'anomaly', 'assumption', 'question', 'uncertain', 'unknown', 'risk', 'investigate', 'unexpected', 'unclear', 'review', 'degrading', 'aging'],
#               ['communication', 'information', 'safety', 'share', 'respond', 'listen', 'concerns', 'expectations', 'clear'],
#               ['leadership', 'management', 'leaders', 'owner', 'guidance', 'decisions', 'policies', 'resources', 'staffing', 'expectations', 'oversight', 'reinforce', 'decision-making', 'mentoring', 'coaching', 'encourage', 'incentive', 'reward', 'priorities', 'plan', 'delegate', 'define', 'manage', 'support', 'resolve', 'motivate', 'address', 'resources', 'translate', 'funding', 'implementation', 'violation'],
#               ['decision-making', 'thorough', 'conservative', 'systematic', 'consistent', 'process', 'leaders', 'choices', 'consequences', 'reinforce', 'accountable', 'authority', 'responsibility'],
#               ['trust', 'respect', 'encourage', 'opinions', 'bullying', 'fair', 'productive', 'concerns', 'voice', 'discussions', 'value', 'insight', 'perspectives', 'collaboration', 'conflict', 'listening'],
#               ['learn', 'training', 'self-assessments', 'improve', 'performance', 'monitor', 'experience', 'assessment', 'corrective action', 'benchmarking', 'knowledge', 'competence', 'skills', 'develop', 'understand'],
#               ['identify', 'address', 'correct', 'corrective action', 'issues', 'deviation', 'degraded conditions', 'evaluation', 'problems', 'root cause', 'investigation', 'recommendation', 'cause analysis', 'resolution', 'mitigate', 'trends'],
#               ['concerns', 'confidence', 'address', 'feedback', 'listening', 'report'],
#               ['plan', 'control', 'activities', 'production', 'schedule', 'manage', 'execute', 'risk', 'coordinate', 'margins', 'operate', 'maintain', 'maintenance', 'document', 'complete', 'procedures', 'packages', 'accurate', 'current', 'backlog', 'follow', 'review', 'instructions', 'operations', 'design', 'requirements', 'operations']]

# seed_words = [['responsibility', 'authority', 'override', 'adherence', 'accountability', 'organization', 'meeting', 'standard', 'demonstrate', 'proper', 'reinforce', 'discussion', 'peer', 'personally', 'consistent', 'solicit', 'feedback', 'understand', 'foster', 'professional', 'teamwork', 'environment', 'raise', 'ownership', 'preparation', 'execute', 'assign', 'activity', 'work', 'participate', 'briefing', 'qualify', 'communicate', 'coordinate', 'boundary', 'sense', 'operation'],
#               ['complacency', 'challenge', 'discrepancy', 'error', 'anomaly', 'undesirable', 'complex', 'unpredictable', 'oversight', 'caution', 'hazard', 'radioactive', 'core', 'decay', 'heat', 'fuel', 'cooling', 'question', 'degrade', 'equipment', 'risk', 'uncertain', 'unexpected', 'attitude', 'resolve', 'rationalize', 'abnormal', 'investigate', 'supervisor', 'consult', 'expert', 'unclear', 'oppose', 'view', 'management', 'decision', 'contrary', 'possibility', 'mistake', 'inherit', 'risk', 'contingency', 'undesired'],
#               ['communication', 'safety', 'worker', 'equipment', 'labeling', 'operating', 'documentation', 'formal', 'informal', 'convey', 'flow', 'organization', 'group', 'frequent', 'status', 'supervisor', 'information', 'shift', 'turnover', 'briefing', 'meeting', 'daily', 'prompt', 'unintended', 'conflicting', 'ask', 'share', 'reason', 'implication', 'openly', 'candidly', 'respond', 'forthright', 'audit', 'solicit', 'listen', 'assess', 'expectation', 'reliability'],
#               ['leadership', 'commitment', 'decision', 'lead', 'advocate', 'corporate', 'policy', 'resource', 'reliability', 'staffing', 'sufficient', 'qualified', 'personnel', 'facility', 'maintain', 'emergency', 'executive', 'senior', 'manager', 'evaluation', 'ensure', 'expectation', 'disciplinary', 'consistent', 'raise', 'foster', 'oversight', 'cost', 'schedule', 'goal', 'establish', 'align', 'priority', 'systematic', 'process', 'change', 'implement', 'authority', 'plan', 'ownership', 'accountability', 'role', 'recommendation', 'feedback', 'governance', 'monitor', 'perspective', 'tool', 'survey', 'review', 'culture', 'detract', 'act', 'unsafe', 'decision-making'],
#               ['expectation', 'systematic', 'unexpected', 'uncertain', 'reinforce', 'conservative', 'decision', 'consistent', 'process', 'process', 'seek', 'group', 'organization', 'safety', 'risk', 'bias', 'effectiveness', 'future', 'choice', 'timely', 'commensurate', 'executive', 'senior', 'reinforce', 'procedure', 'reactor', 'margin', 'operation', 'shift', 'accountability', 'authority', 'responsibility'],
#               ['trust', 'respect', 'communication', 'opinion', 'employees', 'everyone', 'dignity', 'capability', 'experience', 'valuable', 'asset', 'group', 'bullying', 'humiliating', 'tolerate', 'behavior', 'disagree', 'fair', 'concern', 'suggestion', 'question', 'problems', 'receptive', 'differing', 'discussion', 'expertise', 'value', 'experience', 'perspective', 'program', 'personnel', 'lack', 'information', 'share', 'timely', 'milestone', 'positive', 'negative', 'confidentiality', 'conflict', 'objective', 'resolution', 'equitable', 'consistent', 'defined', 'result', 'professional'],
#               ['opportunities', 'implement', 'learn', 'training', 'assessment', 'benchmarking', 'stimulate', 'performance', 'improve', 'scrutiny', 'monitoring', 'institutionalized', 'procedures', 'adopt', 'ideas', 'routine', 'critical', 'practice', 'corrective', 'topics', 'needs', 'knowledge', 'skills', 'acquire', 'best', 'corrective', 'action', 'transfer', 'retention', 'strategy', 'competent', 'develop'],
#               ['issue', 'impact', 'identify', 'evaluate', 'address', 'correct', 'commensurate', 'significant', 'deviation', 'standard', 'action', 'corrective', 'document', 'threshold', 'describe', 'prioritize', 'assign', 'resolution', 'evaluation', 'classify', 'report', 'operability', 'investigation', 'root', 'cause', 'understand', 'conduct', 'resolution', 'trend', 'mitigate', 'routine'],
#               ['conscious', 'environments', 'fear', 'retaliation', 'intimidation', 'harassment', 'discrimination', 'policies', 'rights', 'responsibility', 'leaders', 'ownership', 'investigate', 'establish', 'support', 'promote', 'concern', 'raise'],
#               ['planning', 'controlling', 'work', 'activities', 'process', 'schedule', 'execute', 'criticize', 'management', 'incorporate', 'contingency', 'action', 'coordinate', 'probabilistic', 'consider', 'conflicting', 'modification', 'design', 'margin', 'operate', 'maintain', 'backlog', 'engineering', 'fission', 'prevent', 'documentation', 'procedures', 'complete', 'adherence', 'human', 'error', 'status', 'validate', 'implementation']]
              
safety_traits = ['Personal Accountability', 'Questioning Attitue', 'Effective Safety Communication', 
                 'Leadership Safety Values and Actions','Decision Making', 'Respectful Work Environment', 
                 'Continuous Learning','Problem Identification and Resolution','Environment for Raising Concerns',
                 'Work Processes']

seed_words_count = sum([len(listt) for listt in seed_words])
print("Total number of seed words:", seed_words_count)


Total number of seed words: 161


Pre-processing seed words (changing them to their basic form "lemmatization")

In [140]:
#Lemmatizing seed words

lemma_seed_words = []

for listt in seed_words:
    new_listt = []
    for words in listt:
        doc = nlp(words)
        i = 0
        for token in doc:
            if token.text == "-":
                continue
            new_listt.append(token.lemma_)
            i+=1
    lemma_seed_words.append(new_listt)
print(lemma_seed_words)
print("\nNumber of words:", sum([len(x) for x in lemma_seed_words]))

[['responsibility', 'accountability', 'help', 'support', 'train', 'qualify', 'understand', 'complete', 'involvement'], ['complacency', 'complacent', 'challenge', 'error', 'hazard', 'caution', 'discrepancy', 'anomaly', 'assumption', 'question', 'uncertain', 'unknown', 'risk', 'trend', 'unexpected', 'unclear', 'degrade', 'age'], ['communication', 'licensee', 'event', 'report', 'documentation', 'request', 'LER', 'information', 'safety', 'prompt', 'share', 'respond', 'listen', 'concern', 'expectation', 'clear'], ['leadership', 'management', 'leader', 'owner', 'ownership', 'program', 'guidance', 'policy', 'resource', 'staff', 'oversight', 'reinforce', 'priority', 'plan', 'delegate', 'align', 'define', 'manage', 'resolve', 'address', 'translate', 'funding', 'implementation', 'violation'], ['thorough', 'conservative', 'systematic', 'consistent', 'process', 'choice', 'consequence', 'authority', 'future', 'timely', 'executive', 'senior'], ['trust', 'respect', 'opinion', 'dignity', 'fair', 'disa

#### 5) Initializing Guided LDA model and mapping seed words to vocabulary terms 
Issue: many seed words do not exist in training data (Increase training size?)

In [141]:
model = glda.GuidedLDA(n_topics=10, n_iter=100, random_state=7, refresh=20)

lemm_seed_topics = {}
for t_id, st in enumerate(lemma_seed_words):
    for word in st:
        try:
            lemm_seed_topics[word2id[word]] = t_id
        except:
            continue
lemm_seed_topics

{1908: 0,
 2200: 0,
 2274: 6,
 1760: 0,
 2326: 0,
 387: 0,
 318: 1,
 762: 1,
 1008: 1,
 651: 1,
 140: 1,
 1768: 1,
 1942: 1,
 2289: 1,
 2332: 1,
 2321: 1,
 585: 7,
 56: 1,
 381: 2,
 1269: 2,
 777: 2,
 1875: 8,
 670: 2,
 1883: 2,
 1125: 2,
 1972: 2,
 1735: 2,
 1906: 2,
 406: 2,
 813: 2,
 343: 2,
 1249: 3,
 1330: 3,
 1248: 3,
 1570: 3,
 1571: 3,
 1728: 3,
 993: 3,
 1644: 3,
 1900: 3,
 2130: 3,
 1564: 3,
 1844: 3,
 1710: 3,
 1635: 3,
 68: 3,
 1329: 3,
 1898: 3,
 41: 3,
 2282: 3,
 945: 3,
 1067: 3,
 2400: 3,
 2248: 4,
 438: 4,
 443: 4,
 1721: 4,
 949: 4,
 2258: 4,
 801: 4,
 2016: 4,
 1902: 5,
 1525: 5,
 2379: 5,
 425: 5,
 1254: 6,
 132: 6,
 1074: 6,
 1602: 6,
 1420: 6,
 1052: 6,
 203: 6,
 1230: 6,
 621: 6,
 1054: 7,
 494: 7,
 27: 7,
 1205: 7,
 2479: 7,
 1825: 7,
 1697: 7,
 913: 7,
 1650: 7,
 1082: 7,
 774: 7,
 1715: 7,
 300: 7,
 1954: 7,
 1192: 7,
 1191: 7,
 1817: 7,
 1897: 7,
 1405: 7,
 752: 8,
 2031: 8,
 844: 8,
 2170: 8,
 820: 8,
 1780: 8,
 740: 9,
 472: 9,
 31: 9,
 461: 9,
 1725: 9,
 1

In [142]:
len(lemm_seed_topics)

119

#### 6) Fitting LDA model, guided by seed words.

seed_confidence: Measures how much the model is biasing the seeded words towards the seeded topics.

In [143]:
model.fit(X_array, seed_topics=lemm_seed_topics, seed_confidence=1)

INFO:lda:n_documents: 1053
INFO:lda:vocab_size: 2486
INFO:lda:n_words: 16371
INFO:lda:n_topics: 10
INFO:lda:n_iter: 100
INFO:lda:<0> log likelihood: -195413
INFO:lda:<20> log likelihood: -127885
INFO:lda:<40> log likelihood: -125077
INFO:lda:<60> log likelihood: -123758
INFO:lda:<80> log likelihood: -123093
INFO:lda:<99> log likelihood: -122794


<lda.guidedlda.GuidedLDA at 0x7fd46baa3190>

#### 7) Printing top words per topic (topic-word distributions)

A mix of words that make up each topic.

In [144]:
n_top_words = 50
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('{}: [{}]'.format(safety_traits[i], ', '.join(topic_words)))
    print('\n')

Personal Accountability: [pump, fuel, unit, result, reactor, water, outage, repair, dcpp, auxiliary, spent, door, power, pool, system, trip, containment, problem, find, dcisc, plant, leak, valve, service, occur, asw, pressure, motor, fan, remove, additional, pipe, coolant, include, replace, feedwater, refueling, leakage, heat, loss, shut, challenge, december, cooling, vessel, fire, core, start, increase, potential]


Questioning Attitue: [dcpp, tube, review, conclude, cause, significant, following, inspection, reference, system, dcisc, outage, safety, fwh, fuel, failure, experience, concern, maintenance, unit, perform, mid, feedwater, design, repair, leak, nrc, damage, opportunity, shut, heater, plant, miss, identify, replacement, refueling, unnecessary, transfer, handling, challenge, inspector, activity, fact, occur, equipment, resident, conclusion, follow, reduce, length]


Effective Safety Communication: [safety, low, system, concern, dcpp, corrective, remain, fuel, margin, issue, p

Test model classification of the word "leadership"

In [145]:
np.set_printoptions(suppress=True)
word_index = vocab.index('leadership')
probabilities = model.word_topic_[word_index]
max_prob = max(probabilities)
max_prob_index = np.argmax(probabilities)
print(list(np.round(probabilities,4)))
print("Most probable topic:", safety_traits[max_prob_index])

[0.0009, 0.0009, 0.0009, 0.9018, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.091]
Most probable topic: Leadership Safety Values and Actions


#### 8) Using the GuidedLDA model to classify test sentences to safety trait labels (from training data)
Probability threshold assumption: 10%
<br> Actual labels are unknown

In [146]:
test = list_sentences[5]
original_test = combined_sentences[5]
print(original_test,"\n")
print(test, "\n")
counter = {}
traits = []
sum_prob = defaultdict(list)
for word in test.split():
    vocab_index = vocab.index(word)
    probabilities = model.word_topic_[vocab_index]
    max_prob = max(probabilities)
    max_prob_index = np.argmax(probabilities)
    if max_prob > 0.5:
        counter[safety_traits[max_prob_index]]=counter.get(safety_traits[max_prob_index],0)+1
    i = 0
    while i < len(probabilities):
        sum_prob[safety_traits[i]].append(probabilities[i])
        i+=1
for trait in safety_traits:
    avg_prob = sum(sum_prob[trait])/len(sum_prob[trait])
    print(trait+":", round(avg_prob,4))
    if (avg_prob >= 0.1) and (trait in counter):
        if trait not in traits:
            traits.append(trait)
print('\nLikely Traits:', traits)
print('\nCounter:', counter)

Problems have been mostly due to age-related issues and lack of adequate inspection, maintenance, and component replacement, especially electrical contacts. 

problem age relate issue lack adequate inspection maintenance component replacement especially electrical contact 

Personal Accountability: 0.0177
Questioning Attitue: 0.0807
Effective Safety Communication: 0.1125
Leadership Safety Values and Actions: 0.0156
Decision Making: 0.0974
Respectful Work Environment: 0.0671
Continuous Learning: 0.0192
Problem Identification and Resolution: 0.0734
Environment for Raising Concerns: 0.0898
Work Processes: 0.4267

Likely Traits: ['Work Processes']

Counter: {'Work Processes': 4}


In [147]:
test = list_sentences[0]
original_test = combined_sentences[0]
print(original_test,"\n")
print(test,"\n")
counter = {}
traits = []
sum_prob = defaultdict(list)
for word in test.split():
    vocab_index = vocab.index(word)
    probabilities = model.word_topic_[vocab_index]
    max_prob = max(probabilities)
    max_prob_index = np.argmax(probabilities)
    if max_prob > 0.5:
        counter[safety_traits[max_prob_index]]=counter.get(safety_traits[max_prob_index],0)+1
    i = 0
    while i < len(probabilities):
        sum_prob[safety_traits[i]].append(probabilities[i])
        i+=1
for trait in safety_traits:
    avg_prob = sum(sum_prob[trait])/len(sum_prob[trait])
    print(trait+":", round(avg_prob,4))
    if (avg_prob >= 0.1) and (trait in counter):
        if trait not in traits:
            traits.append(trait)
print('\nLikely Traits:', traits)
print('\nCounter:', counter)

DCPP acted prompt with corrective actions and submitted a Licensee Event Report when it discovered Technical Specification non-compliance on the Low Temperature Overpressure Protection System. 

dcpp act prompt corrective action submit licensee event report discover technical specification non compliance low temperature overpressure protection system 

Personal Accountability: 0.014
Questioning Attitue: 0.0194
Effective Safety Communication: 0.2547
Leadership Safety Values and Actions: 0.0705
Decision Making: 0.0174
Respectful Work Environment: 0.0836
Continuous Learning: 0.0363
Problem Identification and Resolution: 0.1046
Environment for Raising Concerns: 0.339
Work Processes: 0.0604

Likely Traits: ['Effective Safety Communication', 'Problem Identification and Resolution', 'Environment for Raising Concerns']

Counter: {'Effective Safety Communication': 4, 'Respectful Work Environment': 1, 'Environment for Raising Concerns': 6, 'Problem Identification and Resolution': 1}


In [148]:
test = list_sentences[57]
original_test = combined_sentences[57]
print(original_test,"\n")
print(test,"\n")
counter = {}
traits = []
sum_prob = defaultdict(list)
for word in test.split():
    vocab_index = vocab.index(word)
    probabilities = model.word_topic_[vocab_index]
    max_prob = max(probabilities)
    max_prob_index = np.argmax(probabilities)
    if max_prob > 0.5:
        counter[safety_traits[max_prob_index]]=counter.get(safety_traits[max_prob_index],0)+1
    i = 0
    while i < len(probabilities):
        sum_prob[safety_traits[i]].append(probabilities[i])
        i+=1
for trait in safety_traits:
    avg_prob = sum(sum_prob[trait])/len(sum_prob[trait])
    print(trait+":", round(avg_prob,4))
    if (avg_prob >= 0.1) and (trait in counter):
        if trait not in traits:
            traits.append(trait)
print('\nLikely Traits:', traits)
print('\nCounter:', counter)

This ultimately created an environment that promulgated a human error-likely environment.” More specifically, the RCE team determined that the environment consisted of poor communication, lack of engineering leadership, too much reliance on vendor designs, time pressure, and distractions. 

ultimately create environment promulgate human error likely environment specifically rce team determine environment consist poor communication lack engineering leadership reliance vendor design time pressure distraction 

Personal Accountability: 0.1157
Questioning Attitue: 0.0224
Effective Safety Communication: 0.0277
Leadership Safety Values and Actions: 0.257
Decision Making: 0.0391
Respectful Work Environment: 0.0295
Continuous Learning: 0.1518
Problem Identification and Resolution: 0.2703
Environment for Raising Concerns: 0.0211
Work Processes: 0.0654

Likely Traits: ['Personal Accountability', 'Leadership Safety Values and Actions', 'Continuous Learning', 'Problem Identification and Resolution

#### 9) Evaluating Model Accuracy
Using manually-labelled texts from "Diablo Canyon - On Nuclear Safety and Safety Culture - Hector & Parker 04-08-16"

In [2]:
import csv
import pandas as pd

test_data = []

# opening the CSV file
with open('test_set.csv', mode ='r', encoding='utf-8') as csvfile:    
    csvreader = csv.reader(csvfile, delimiter=',',)
    next(csvreader)
    for row in csvreader:
        sentence = row[0]
        true_labels = [i for i in row[1:] if i]
        test_data.append((sentence,true_labels))

df = pd.read_csv('test_set.csv')
df

Unnamed: 0,sentence,labels,labels.1,labels.2,labels.3,labels.4,labels.5
0,"DCISC identified 11 Non-cited Violations, one ...",leadership safety values and actions,problem identification and resolution,personal accountability,environment for raising concerns,effective safety communication,questioning attitude
1,The number of violations has increased.,leadership safety values and actions,work processes,continuous learning,,,
2,The DCISC has identified a number of potential...,effective safety communication,,,,,
3,New regulatory requirements were not adequatel...,leadership safety values and actions,work processes,,,,
4,The DCISC learned in December 2013 that 16 imp...,leadership safety values and actions,personal accountability,effective safety communication,,,
5,The DCPP Fuel Handling System has been problem...,problem identification and resolution,,,,,
6,Additional efforts also need to be devoted to ...,personal accountability,work processes,continuous learning,,,
7,The loss of power to Unit 2 4kV Bus G during R...,leadership safety values and actions,work processes,,,,
8,Three Station Level Human Performance Event Cl...,leadership safety values and actions,work processes,continuous learning,,,
9,Equipment problems and failures increased the ...,personal accountability,environment for raising concerns,effective safety communication,questioning attitude,,


In [150]:
test_sentences = [x[0] for x in test_data]
clean_sentences = preprocess(test_sentences)
clean_sentences

['dcisc identify non cited violations severity level violation',
 'number violation increase',
 'dcisc identify number potential nuclear safety issue use closed cooling dcpp',
 'new regulatory requirement adequately translate specific calculation plant design basis fail demonstrate prefer offsite power source adequate capacity capability supply minimum require terminal voltage plant engineering safety feature follow limit transmission system contingency',
 'dcisc learn december impaired fire door repair replace funding deferral find unacceptable follow dcisc find door repair replace remain high priority plant door life cycle management plan',
 'dcpp fuel handling system problematic refueling outage problem age relate issue lack adequate inspection maintenance component replacement especially electrical contact',
 'additional effort need devote reduce operator burden workaround backlog deficient critical component require involvement station work group operations',
 'loss power unit bus

Using GuidedLDA model to predict labels of test sentences. Actual labels are known.

In [163]:
def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

all_accuracies = []

index = 0
for sentence in test_sentences:
    test = clean_sentences[index]
    original_test = test_sentences[index]
    print(original_test,"\n")
    print(test,"\n")
    counter = {}
    traits = []
    sum_prob = defaultdict(list)
    for word in test.split():
        try:
            vocab_index = vocab.index(word)
            probabilities = model.word_topic_[vocab_index]
            max_prob = max(probabilities)
            max_prob_index = np.argmax(probabilities)
        except:
            continue
        if max_prob > 0.5:
            counter[safety_traits[max_prob_index]]=counter.get(safety_traits[max_prob_index],0)+1
        i = 0
        while i < len(probabilities):
            sum_prob[safety_traits[i]].append(probabilities[i])
            i+=1
    for trait in safety_traits:
        avg_prob = sum(sum_prob[trait])/len(sum_prob[trait])
        print(trait+":", round(avg_prob,4))
        if (avg_prob >= 0.1) and (trait in counter):
            if trait not in traits:
                traits.append(trait.lower())

    accuracy = jaccard(traits, test_data[index][1])
    all_accuracies.append(accuracy)
    
    print('\nLikely Traits:', sorted(traits))
    print('\nActual Traits:', sorted(test_data[index][1]))
    print('\nCounter:', counter)
    print("\naccuracy score:", accuracy)
    print(100*'-')

    index+=1

DCISC identified 11 Non-cited Violations, one Severity Level IV violation. 

dcisc identify non cited violations severity level violation 

Personal Accountability: 0.0533
Questioning Attitue: 0.0669
Effective Safety Communication: 0.1686
Leadership Safety Values and Actions: 0.2537
Decision Making: 0.1316
Respectful Work Environment: 0.0722
Continuous Learning: 0.1038
Problem Identification and Resolution: 0.0801
Environment for Raising Concerns: 0.0243
Work Processes: 0.0456

Likely Traits: ['decision making', 'leadership safety values and actions']

Actual Traits: ['effective safety communication', 'environment for raising concerns', 'leadership safety values and actions', 'personal accountability', 'problem identification and resolution', 'questioning attitude']

Counter: {'Decision Making': 1, 'Leadership Safety Values and Actions': 1}

accuracy score: 0.14285714285714285
----------------------------------------------------------------------------------------------------
The numbe

In [168]:
test_acc = sum(all_accuracies)/len(all_accuracies)
print("Overall Model Accuracy on Test Data:", round(test_acc*100,2), "%")

Overall Model Accuracy on Test Data: 28.31 %
