In [29]:
import pandas as pd
import numpy as np
import glob
import docx
import openpyxl
import time
import os
from stanfordnlp.server import CoreNLPClient
from sklearn.feature_extraction.text import TfidfVectorizer
from operator import itemgetter
from string import digits
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import word_tokenize
from nltk import tokenize
from nltk.util import ngrams

In [3]:
def SentimentScore(paragraph, weight=1):
    """Return the mean VADER compound score of the sentences in `paragraph`.

    The text is split into sentences with NLTK's sentence tokenizer and each
    sentence is scored with VADER. Negative compound scores are multiplied by
    `weight` before averaging; non-negative scores are used unchanged.

    Parameters
    ----------
    paragraph : str
        Text to score.
    weight : int or float, optional
        Multiplier applied to negative sentence scores (default 1).

    Returns
    -------
    float
        The (weighted) average compound score rounded to two decimals, or
        0.0 when the tokenizer finds no sentences (e.g. an empty string).
    """
    sentences = tokenize.sent_tokenize(paragraph)
    if not sentences:
        # Nothing to score: avoid dividing by zero in the average below.
        return 0.0

    sid = SentimentIntensityAnalyzer()
    sentiment = 0
    for sentence in sentences:
        score = sid.polarity_scores(sentence)['compound']
        # Only negative sentences are scaled by `weight`.
        sentiment += score * weight if score < 0 else score
    return round(sentiment / len(sentences), 2)

def neg(bucket):
    """Return the bin index four positions away from `bucket`.

    Indices below 4 are shifted up by 4; every other index is shifted down by 4.
    """
    offset = 4 if bucket < 4 else -4
    return bucket + offset
    
def remove_nan(l):
    """Return a new list holding the entries of `l` that pandas does not treat as null."""
    kept = []
    for item in l:
        if pd.isnull(item):
            continue
        kept.append(item)
    return kept

def get_text(doc_names):
    """Read Word documents and return their titles and normalised texts.

    For every path in `doc_names` the document is opened with python-docx.
    Non-breaking spaces in each paragraph are replaced by regular spaces. The
    first paragraph is taken as the title; the title and all following
    non-empty paragraphs are joined with single spaces, lower-cased and
    stripped of digits to form the document text.

    Parameters
    ----------
    doc_names : sequence of str
        Paths of the documents to read.

    Returns
    -------
    tuple of (list of str, list of str)
        `titles` (first paragraph of each document, original casing) and
        `documents` (normalised full text), both in the order of `doc_names`.
        A document without any paragraphs yields an empty title and text.
    """
    remove_digits = str.maketrans('', '', digits)
    titles = []
    documents = []
    for doc_name in doc_names:
        paragraphs = docx.Document(doc_name).paragraphs
        par_text = [paragraph.text.replace('\xa0', ' ') for paragraph in paragraphs]
        # Guard against documents with no paragraphs instead of raising IndexError.
        title = par_text[0] if par_text else ''
        titles.append(title)
        # Empty paragraphs are skipped so they do not add extra spaces.
        body = [par for par in par_text[1:] if par]
        text = ' '.join([title] + body)
        documents.append(text.lower().translate(remove_digits))
    return titles, documents

def bin_of_words(filename):
    """Load word bins from an Excel sheet, one bin per column.

    Each column is reduced to its non-null, unique values and returned as a
    sorted list; the bins are returned in column order.
    """
    df = pd.read_excel(filename)
    bins = []
    for col in df.columns:
        unique_entries = set(remove_nan(df[col].values))
        bins.append(sorted(unique_entries))
    return bins

In [4]:
# Tokens treated as negations when scanning the documents' bigrams and trigrams.
negators = [
    'no', 'not', 'lack', 'never', 'none', 'neither', 'nobody',
    'few', 'hardly', 'little', 'rarely', 'scarcely', 'seldom',
]

In [10]:
# Collect the corpus files in a stable (sorted) order and read their titles and texts.
# NOTE(review): this is a machine-specific absolute path; adjust it before running elsewhere.
DOC_PATTERN = '/home/roguehydra/Documents/Jaar 4/TOFU/SA/Webpages/TEST/*.doc*'
doc_names = glob.glob(DOC_PATTERN)
doc_names.sort()
titles, documents = get_text(doc_names)

1: Dream 

2: Economic

3: Health

4: Environment

5: Nightmare

In [11]:
# Load the word bins: one sorted list of unique entries per spreadsheet column.
BINS_FILE = 'BINS - version 2.xlsx'
bins_of_words = bin_of_words(BINS_FILE)

In [12]:
# One cell per (document, bin) pair; object dtype so every cell can hold a list of words.
word_matrix_shape = (len(documents), len(bins_of_words))
word_matrix = np.empty(word_matrix_shape, dtype=object)

In [13]:
# Fill word_matrix[i][j] with the hits of bin j's words in document i.
# A word is listed once per occurrence, so the list length equals the hit count.
# Every cell is rebuilt from scratch, so re-running this cell does not append
# to (and thereby double) the hits of a previous run.
# NOTE: matching uses str.count, i.e. substring matching, so a bin entry is
# also counted when it occurs inside a longer word.
for i, document in enumerate(documents):
    for j, bucket in enumerate(bins_of_words):
        word_matrix[i][j] = [
            word
            for word in bucket
            for _occurrence in range(document.count(word))
        ]

In [23]:
# For every document, collect the words that directly follow a negator.
# Bigram hits come first (the word after the negator), followed by trigram
# hits (the third token, and the second and third token joined by a space).
negations = []
for text in documents:
    tokens = nltk.word_tokenize(text)
    negated = [second for first, second in ngrams(tokens, 2) if first in negators]
    for first, second, third in ngrams(tokens, 3):
        if first in negators:
            negated.extend((third, second + ' ' + third))
    negations.append(negated)

In [25]:
# Move negated words to the opposite bin.
# Only the bins listed in `rows` are searched. For each negated word, the first
# of those bins that contains it loses one occurrence, and the bin returned by
# neg() gains that word.
rows = [0,4]
for i, negated in enumerate(negations):
    for word in negated:
        for j in rows:
            source = word_matrix[i][j]
            if word not in source:
                continue

            # Work on fresh lists and store them back, leaving the old lists untouched.
            remaining = list(source)
            remaining.remove(word)
            word_matrix[i][j] = remaining

            target = neg(j)
            word_matrix[i][target] = list(word_matrix[i][target]) + [word]
            break


In [26]:
# Numeric counterpart of word_matrix: one (initially zero) score per (document, bin) pair.
score_matrix_shape = (len(documents), len(bins_of_words))
score_matrix = np.zeros(score_matrix_shape)

In [27]:
# Each score is the number of words stored in the matching word_matrix cell.
for i, j in np.ndindex(len(documents), len(bins_of_words)):
    score_matrix[i, j] = len(word_matrix[i, j])

In [30]:
# Print a sentiment verdict and the topic distribution for every document.
for i, doc in enumerate(documents):
    print("Document {} \n\n{}\n".format(i+1, titles[i]))

    # VADER average over the sentences, with negative sentences weighted by 2.
    vader_score = SentimentScore(doc,2)

    # Bin-based sentiment: the surplus of positive over negative hits,
    # normalised by the count of the dominant side.
    positive = score_matrix[i][0]
    negative = score_matrix[i][4]
    total_sentiment = positive - negative
    if total_sentiment > 0:
        sen_score = total_sentiment/positive
    elif total_sentiment < 0:
        sen_score = total_sentiment/negative
    else:
        # Plain `else` so sen_score is always assigned in this iteration.
        sen_score = 0

    economic = score_matrix[i][1]
    health = score_matrix[i][2]
    environment = score_matrix[i][3]
    total_topic =  economic + health + environment

    # The final score is the mean of the bin-based and the VADER score.
    score = (sen_score + vader_score) / 2
    if score > 0.1:
        judgement = 'positive'
    elif score < -0.1:
        judgement = 'negative'
    elif score == 0:
        judgement = 'neutral'
    elif score > 0:
        judgement = 'neutral (leaning towards positive)'
    else:
        judgement = 'neutral (leaning towards negative)'

    if total_topic > 0:
        economic_share = economic/total_topic
        health_share = health/total_topic
        environment_share = environment/total_topic
    else:
        # No topic words were found: report 0 for every topic rather than
        # dividing zero by zero, which would print nan.
        economic_share = health_share = environment_share = 0.0

    print("Sentiment (score) : {}, ({})\n".format(judgement,round(score,2)))

    print("Economic score    : {}".format(round(economic_share,2)))
    print("Health score      : {}".format(round(health_share,2)))
    print("Evironmental score: {}".format(round(environment_share,2)))
    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")

Document 1 

Geothermal Basics

Sentiment (score) : positive, (0.45)

Economic score    : 0.36
Health score      : 0.11
Evironmental score: 0.53
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Document 2 

Geothermal Energy Information and Facts

Sentiment (score) : neutral (leaning towards positive), (0.09)

Economic score    : 0.32
Health score      : 0.12
Evironmental score: 0.56
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Document 3 

Heat Without Fire: Geothermal For A Cleaner, Sustainable Future In New York City

Sentiment (score) : positive, (0.45)

Economic score    : 0.27
Health score      : 0.19
Evironmental score: 0.54
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Document 4 

Geothermal Heating and Cooling Technologies

Sentiment (score) : positive, (0.3)

Economic score    : 0.32
Health score      : 0.18
Evironmental score: 0.5
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Document 5 

Geothermal for Canad

In [14]:
# TF-IDF over uni- to tri-grams of the documents, with English stop words removed.
vectorizer = TfidfVectorizer(use_idf=True, stop_words='english', ngram_range=(1,3), min_df=2)
x = vectorizer.fit_transform(documents)
tf_idf_scores = x.toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()
# Flat list of every entry of every bin.
all_words = [n for m in bins_of_words for n in m]

In [15]:
# For every document, print candidate terms: TF-IDF features with a positive
# score that do not already appear in any bin.
for i, document in enumerate(tf_idf_scores):
    print('Document {}\n'.format(i+1))
    sug = [
        (score, words[j])
        for j, score in enumerate(document)
        if score > 0 and words[j] not in all_words
    ]

    if sug:
        # Rank by score, highest first; only the slice [-100:-90] of the ranking is printed.
        sug.sort(key=lambda pair: pair[0], reverse=True)
        for _score, word in sug[-100:-90]:
            print(word)
    print('- - - - - - - - - - - - - - - -\n')

Document 1

heat energy
life
located western
located western states
mile deep
recharge
reservoirs hot
reservoirs hot water
reservoirs tap
reservoirs tap steam
- - - - - - - - - - - - - - - -

Document 2

sulfide
tap steam
tap steam hot
transferring
turn
underground reservoirs tap
upper
use percent
used directly heating
used drive turbine
- - - - - - - - - - - - - - - -

Document 3

conditioning systems
efficient environmentally
efficient environmentally clean
end
energy efficient environmentally
energy resources
environmentally clean
epa
feasibility
geothermal energy resources
- - - - - - - - - - - - - - - -

Document 4

used heat
usually
variety
variety applications
ventilation
water rises
water rises surface
wide range
words
yellowstone
- - - - - - - - - - - - - - - -

Document 5

potential geothermal
power generation geothermal
power heat
producing
provided
pump geothermal
pump geothermal heat
real
reasonable
recent
- - - - - - - - - - - - - - - -

Document 6

electricity geothermal