# Sentiment Analysis

Think for this exploration I will primarily be drawing from _Opinion Digger: An Unsupervised Opinion Miner from Unstructured Product Reviews_.

### Load the Data

In [1]:
import pandas as pd

def load_data():
    article_table = pd.read_csv("../data/raw/kaggle1/articles1.csv")
    return article_table

In [2]:
articles = load_data()

### Load the Aspects

In [3]:
import json

aspect_data = {}
pos_sentences = []
document_sentences = []
sentence_documents = []
with open("../data/cache/kaggle_aspects/aspects.json") as in_file:
    aspect_data = json.load(in_file)

with open("../data/cache/kaggle_aspects/pos.json") as in_file:
    pos_sentences = json.load(in_file)
    
with open("../data/cache/kaggle_aspects/sent_doc.json") as in_file:
    sentence_documents = json.load(in_file)
    
with open("../data/cache/kaggle_aspects/doc_sent.json") as in_file:
    document_sentences = json.load(in_file)

In [4]:
aspect_data

{'republicans': {'count': 171,
  'sentences': [0,
   2,
   3,
   4,
   6,
   10,
   12,
   18,
   19,
   21,
   23,
   1654,
   1657,
   1696,
   1698,
   1703,
   1712,
   1713,
   1715,
   1718,
   1720,
   1721,
   1728,
   1731,
   1733,
   1746,
   2374,
   2376,
   2383,
   2388,
   2413,
   2426,
   2429,
   2487,
   2494,
   2495,
   2495,
   2507,
   2518,
   2519,
   2520,
   2527,
   2534,
   2547,
   2559,
   2617,
   3271,
   3272,
   3272,
   3274,
   3281,
   3283,
   3291,
   3301,
   3302,
   3303,
   3314,
   3346,
   3353,
   3383,
   3383,
   3387,
   3388,
   3389,
   3389,
   3399,
   3408,
   4512,
   4948,
   5030,
   5031,
   5394,
   5405,
   5416,
   5428,
   5440,
   5478,
   5482,
   5483,
   5522,
   7306,
   7314,
   7492,
   7496,
   8548,
   8552,
   8821,
   8873,
   10030,
   10042,
   10338,
   10871,
   11003,
   11020,
   11026,
   11027,
   11036,
   11363,
   11677,
   12582,
   12896,
   12908,
   12926,
   12930,
   12944,
   12960,
   12975,
 

In [5]:
pos_sentences

[[['washington', 'JJ'],
  ['congressional', 'JJ'],
  ['republicans', 'NNS'],
  ['have', 'VBP'],
  ['a', 'DT'],
  ['new', 'JJ'],
  ['fear', 'NN'],
  ['when', 'WRB'],
  ['it', 'PRP'],
  ['comes', 'VBZ'],
  ['to', 'TO'],
  ['their', 'PRP$'],
  ['health', 'NN'],
  ['care', 'NN'],
  ['lawsuit', 'NN'],
  ['against', 'IN'],
  ['the', 'DT'],
  ['obama', 'JJ'],
  ['administration', 'NN'],
  ['they', 'PRP'],
  ['might', 'MD'],
  ['win', 'VB']],
 [['the', 'DT'],
  ['incoming', 'JJ'],
  ['trump', 'NN'],
  ['administration', 'NN'],
  ['could', 'MD'],
  ['choose', 'VB'],
  ['to', 'TO'],
  ['no', 'DT'],
  ['longer', 'JJR'],
  ['defend', 'VB'],
  ['the', 'DT'],
  ['executive', 'NN'],
  ['branch', 'NN'],
  ['against', 'IN'],
  ['the', 'DT'],
  ['suit', 'NN'],
  ['which', 'WDT'],
  ['challenges', 'VBZ'],
  ['the', 'DT'],
  ['administration', 'NN'],
  ['authority', 'NN'],
  ['to', 'TO'],
  ['spend', 'VB'],
  ['billions', 'NNS'],
  ['of', 'IN'],
  ['dollars', 'NNS'],
  ['on', 'IN'],
  ['health', 'NN'],
  

### Aspect Pruning

In [2]:
pruned_data = {}

for aspect in aspect_data.keys():
    if aspect_data[aspect]["flr"] > 1.0:
        pruned_data[aspect] = aspect_data[aspect]


print(len(aspect_data))
print(len(pruned_data))

19495
15662


### Find closest adjectives to each aspect

In [3]:
from nltk.corpus import sentiwordnet as swn

import nltk
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to /home/dwl/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [4]:
from tqdm import tqdm

for aspect in tqdm(pruned_data.keys()):
    pos_aspect = pruned_data[aspect]["pos"]
    
    # add a key to the data dictionary to record sentiments
    pruned_data[aspect]["sentiments"] = {}
    
    # iterate through every sentence mentioning this aspect
    for sentence_index in pruned_data[aspect]["sentences"]:
        sentence = pos_sentences[sentence_index]
        
        aspect_score = [0.0, 0.0] # pos = [0], neg = [1]
        
        # find the index of the aspect
        try:
            aspect_index = sentence.index(pos_aspect[0]) # NOTE: if difference between NN and NNS, this won't catch, so that needs to be dealt with
        except:
            pruned_data[aspect]["sentiments"][sentence_index] = aspect_score
            continue
        
        # iterate on each side
        adjectives = []
        weights = [] # TODO: weight less the farther away from the word it is
        for i in range(1, 6):
            left = aspect_index - i
            right = aspect_index + len(pos_aspect) + i
            
            # check for adjectives
            if left >= 0:
                if sentence[left][1] == "JJ":
                    adjectives.append(sentence[left][0])
                    break
            
            if right < len(sentence):
                if sentence[right][1] == "JJ":
                    adjectives.append(sentence[right][0])
        
        #print(adjectives)
        
        # go through each adjective and get a combined score
        for adjective in adjectives:
            try:
                score = swn.senti_synset(adjective + ".a.01")
                aspect_score[0] += score.pos_score()
                aspect_score[1] += score.neg_score()
            except: continue

            
        #print(aspect_score)
        pruned_data[aspect]["sentiments"][sentence_index] = aspect_score
    #print(pruned_data[aspect])
        

        

100%|██████████| 15662/15662 [00:01<00:00, 8089.00it/s]


Try to find information about the documents of the first aspect

In [5]:
aspect = list(pruned_data.keys())[0]
print("aspect '" + aspect + "'")

doc_sentiment = {}

things = pruned_data[aspect]
for sentence_index in things['sentences']:
    
    sentiment = things['sentiments'][sentence_index]
    assoc_doc_index = sentence_documents[sentence_index]
    
    if assoc_doc_index not in doc_sentiment:
        doc_sentiment[assoc_doc_index] = [0.0, 0.0]
    
    doc_sentiment[assoc_doc_index][0] += sentiment[0]
    doc_sentiment[assoc_doc_index][1] += sentiment[1]
    
    #print(sentence_index, assoc_doc_index, sentiment)
    
print(doc_sentiment)

aspect 'republicans'
{0: [1.25, 0.75], 24: [0.25, 0.0], 25: [2.375, 2.125], 26: [0.0, 0.0], 39: [0.5, 0.375], 40: [0.0, 0.0], 41: [1.25, 0.25], 42: [0.0, 0.0], 43: [0.0, 0.0], 57: [1.5, 0.375], 58: [0.0, 0.0], 59: [0.75, 0.375], 75: [0.75, 0.0], 87: [0.0, 0.0], 89: [0.25, 0.125], 91: [0.0, 0.0], 93: [0.0, 0.0], 94: [0.0, 0.0]}


In [66]:
articles.iloc[57].content

'WASHINGTON  —   Congress opened for battle over the Affordable Care Act on Wednesday as Republicans pushed immediately forward to repeal the health care law and President Obama made a rare trip to Capitol Hill to defend it. The bitterness that has long marked the fight intensified as Republicans seized the opportunity to make good on a central campaign promise to get rid of the law, a pledge reinforced on Wednesday by Vice   Mike Pence, who met with House Republicans not far from where the president gathered with Democrats. The Affordable Care Act, Mr. Obama’s signature health care law, has created online insurance marketplaces, offered new protections to people seeking health insurance, and provided coverage to millions of people near the poverty line through expanded Medicaid. Health policy experts say that system could collapse if Republicans cut off funds for the expanded coverage and end penalties for people who go without health insurance. “The American people voted decisively f