In [172]:
import json
from glob import glob
import csv

In [156]:
def sort_by_line_number(elem):
    """Sort key: return the ``'line_number'`` entry of a line-details mapping.

    Raises:
        KeyError: if ``elem`` has no ``'line_number'`` entry.
    """
    line_number = elem['line_number']
    return line_number

def sort_by_weight(elem):
    """Sort key: return the numeric value of ``elem['weight']``.

    The weights read from the topic-terms CSV are stored as strings, and
    comparing strings orders them lexicographically (``'9.5E-4'`` would rank
    above ``'0.2'``). Converting to ``float`` makes the ordering numeric.
    Values that are already numbers pass through unchanged in value.

    Raises:
        KeyError: if ``elem`` has no ``'weight'`` entry.
        ValueError: if the weight cannot be parsed as a number.
    """
    return float(elem['weight'])

In [87]:
# Folder holding the submission files, and the glob pattern matching them.
# The pattern is derived from the folder so the two cannot drift apart.
input_folder_name = 'zero-carbon-bill/input/'
input_file_pattern = input_folder_name + '*.json'

# glob() gives no ordering guarantee (it depends on the file system), so sort
# the matches: positional lookups such as filenames[6] then pick the same file
# on every run and every machine.
filenames = sorted(glob(input_file_pattern))

In [198]:
# Import key phrases

key_phrases_file = 'zero-carbon-bill/key-phrases/key-phrases.json'
with open(key_phrases_file) as f:
    contents = f.read()

raw_key_phrases = json.loads(contents)

# Maps each file name to the list of key phrases found in that file.
key_phrases = {}

for item in raw_key_phrases:
    filename = item['File']
    phrases = item['KeyPhrases']
    for phrase in phrases:
        # Drop the character offsets; the default keeps this from raising
        # KeyError when a phrase does not carry them.
        phrase.pop('BeginOffset', None)
        phrase.pop('EndOffset', None)
    # Accumulate instead of assigning: when a file appears in more than one
    # record, a plain assignment would keep only the last record's phrases.
    key_phrases.setdefault(filename, []).extend(phrases)

In [199]:
# Import sentiment analysis

sentiment_analysis_file = 'zero-carbon-bill/sentiment-analysis/output.json'
with open(sentiment_analysis_file) as f:
    contents = f.read()
raw_sentiment_analysis = json.loads(contents)

# Group the sentiment records by the file they refer to, keeping the order in
# which they appear in the input.
sentiment_analysis = {}
for item in raw_sentiment_analysis:
    filename = item['File']
    sentiment_analysis.setdefault(filename, []).append(item)

In [200]:
# Import topic modelling: topics

# Maps each topic number to its list of {'term', 'weight'} entries.
topic_terms = {}

with open('zero-carbon-bill/topic-modelling/topic-terms.csv', newline='') as topic_terms_file:
    reader = csv.reader(topic_terms_file, delimiter=',')
    for row in reader:
        # Skip blank lines (csv.reader yields an empty list for them) and the
        # header row. `continue` is the statement that skips an iteration; a
        # bare `next` is only a no-op reference to the builtin.
        if not row or row[0] == 'topic':
            continue
        topic_number = row[0]
        item = {'term': row[1], 'weight': row[2]}
        topic_terms.setdefault(topic_number, []).append(item)

# Order each topic's terms by sort_by_weight, descending.
for key, value in topic_terms.items():
    topic_terms[key] = sorted(value, key=sort_by_weight, reverse=True)

In [201]:
# Import topic modelling: documents

# Maps each document name to its list of {'topic', 'confidence'} entries.
docs_with_topics = {}

with open('zero-carbon-bill/topic-modelling/doc-topics.csv', newline='') as doc_topics:
    reader = csv.reader(doc_topics, delimiter=',')
    for row in reader:
        # Skip blank lines (csv.reader yields an empty list for them) and the
        # header row. `continue` is the statement that skips an iteration; a
        # bare `next` is only a no-op reference to the builtin.
        if not row or row[0] == 'docname':
            continue
        filename = row[0]
        item = {'topic': row[1], 'confidence': row[2]}
        docs_with_topics.setdefault(filename, []).append(item)

In [202]:
def key_phrases_for_submission(filename):
    """Return the key phrases recorded for a submission file.

    Args:
        filename: the submission's file name, with or without
            ``input_folder_name`` in it; the folder name is removed before
            the lookup.

    Returns:
        The entry stored in ``key_phrases`` for the file, or ``None`` when
        there is none.
    """
    # str.replace leaves the name untouched when the folder is not part of
    # it, so no separate membership test is needed.
    lookup_name = filename.replace(input_folder_name, '')
    return key_phrases.get(lookup_name)

In [203]:
def topics_for_submission(filename):
    """Return the topic assignments recorded for a submission file.

    Args:
        filename: the submission's file name, with or without
            ``input_folder_name`` in it; the folder name is removed before
            the lookup.

    Returns:
        The entry stored in ``docs_with_topics`` for the file, or ``None``
        when there is none.
    """
    # str.replace leaves the name untouched when the folder is not part of
    # it, so no separate membership test is needed.
    lookup_name = filename.replace(input_folder_name, '')
    return docs_with_topics.get(lookup_name)

In [204]:
def sentiment_analysis_for_submission(filename):
    """Pair each sentiment record of a submission with the line it refers to.

    Args:
        filename: the submission's file name, with or without
            ``input_folder_name`` in it.

    Returns:
        A list of dicts sorted by ``'line_number'``. Each dict holds the
        ``'line_number'``, the ``'original_text'`` of that line as read from
        the input file, and a ``'sentiment'`` dict with the lower-cased
        predicted label and the score stored for that label as
        ``'confidence'``.

    Raises:
        KeyError: if ``sentiment_analysis`` has no entry for the file.
    """
    if input_folder_name not in filename:
        input_file_name = input_folder_name + filename
    else:
        input_file_name = filename
        filename = filename.replace(input_folder_name, '')

    # Looked up before the file is opened, so an unknown file raises KeyError
    # without touching the file system.
    records = sentiment_analysis[filename]

    with open(input_file_name) as f:
        submission_contents = list(f)

    labelled_lines = []
    for record in records:
        line_number = record['Line']
        label = record['Sentiment']
        # The score mapping is indexed with the title-cased label, while the
        # reported label is lower-cased.
        sentiment_details = {
            'sentiment': label.lower(),
            'confidence': record['SentimentScore'][label.title()],
        }
        labelled_lines.append({
            'line_number': line_number,
            'original_text': submission_contents[line_number],
            'sentiment': sentiment_details,
        })

    labelled_lines.sort(key=sort_by_line_number)
    return labelled_lines

In [205]:
# Spot-check: print the labelled lines of one submission.
a = sentiment_analysis_for_submission(filenames[6])

print(filenames[6])
for line in a:
    # One print call with a newline separator emits the same three lines.
    print(line['original_text'], line['sentiment'], '***', sep='\n')

zero-carbon-bill/input/00183_Shaun_Francis.pdf.txt.json
Massive Global Warming hasn't happened, nor has massive Climate Change. Also there has been no change in sea levels.

{'sentiment': 'neutral', 'confidence': 0.5912350416183472}
***


In [207]:
def everything_for_file(filename):
    """Bundle the sentiment, key-phrase and topic results for one submission.

    Returns:
        A dict with the keys ``'sentiment'``, ``'key_phrases'`` and
        ``'topics'``, each holding what the matching ``*_for_submission``
        helper returns for ``filename``.
    """
    # Dict values are evaluated in order, so the helpers run in the same
    # sequence: sentiment, key phrases, topics.
    return {
        'sentiment': sentiment_analysis_for_submission(filename),
        'key_phrases': key_phrases_for_submission(filename),
        'topics': topics_for_submission(filename),
    }

In [218]:
# Display the combined sentiment, key-phrase and topic results for one submission.
everything_for_file(filenames[6])

{'sentiment': [{'line_number': 0,
   'original_text': "Massive Global Warming hasn't happened, nor has massive Climate Change. Also there has been no change in sea levels.\n",
   'sentiment': {'sentiment': 'neutral', 'confidence': 0.5912350416183472}}],
 'key_phrases': [{'Score': 0.9999314593576598, 'Text': 'Massive'},
  {'Score': 0.9999999701976798, 'Text': "n't happened, nor"},
  {'Score': 1.0, 'Text': '.'},
  {'Score': 1.0, 'Text': 'Also there has been no'},
  {'Score': 0.9980705931186065, 'Text': 'sea levels'}],
 'topics': [{'topic': '016', 'confidence': '1.0'}]}

In [219]:
# Display the terms of topic '016', the topic listed for the submission above.
topic_terms['016']

[{'term': 'level', 'weight': '0.21933216'},
 {'term': 'high', 'weight': '0.20494124'},
 {'term': 'urgency', 'weight': '0.2029609'},
 {'term': 'prioritise', 'weight': '0.20183815'},
 {'term': 'bill', 'weight': '0.034151934'},
 {'term': 'business', 'weight': '0.027950712'},
 {'term': 'emission', 'weight': '0.01914937'},
 {'term': 'agriculture', 'weight': '0.016041243'},
 {'term': 'massive', 'weight': '0.013707493'},
 {'term': 'work', 'weight': '0.009518109'}]