# Sentiment analysis of NER-extracted sentences from short stories


In [1]:
# Install the Python packages this notebook uses (versions are not pinned).
!pip install tensorflow-gpu torch pandas numpy scikit-learn transformers afinn nltk
# Upgrade spaCy separately; the model download in the next cell installs a 3.x model.
!pip install --upgrade spacy



In [2]:
# Download the large English spaCy pipeline that is loaded later via spacy.load('en_core_web_lg').
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.3.0/en_core_web_lg-3.3.0-py3-none-any.whl (400.7 MB)
[K     |████████████████████████████████| 400.7 MB 6.7 kB/s 
Installing collected packages: en-core-web-lg
  Attempting uninstall: en-core-web-lg
    Found existing installation: en-core-web-lg 2.2.5
    Uninstalling en-core-web-lg-2.2.5:
      Successfully uninstalled en-core-web-lg-2.2.5
Successfully installed en-core-web-lg-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


### I/O device registering

Current working directory is set to `/content` by default. You can also give access to your Google Drive to save models/results/... there.

In [3]:
# Mount Google Drive into the runtime's filesystem.
from google.colab import drive
drive.mount("/content/drive/")

# Drive contents are now reachable under '/content/drive/MyDrive'.

# List the project folder to confirm the mount worked; later cells read from
# its `data` and `ner` sub-folders.
!ls -lah /content/drive/MyDrive/NLP-Project/

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
total 8.0K
drwx------ 2 root root 4.0K May  4 10:34 data
drwx------ 2 root root 4.0K May  4 10:35 ner


### GPU device selection review

You can check this directly with the system command `nvidia-smi`, or through a Python library (e.g. TensorFlow or PyTorch).

In [4]:
# Print the driver/CUDA versions and the GPUs attached to this runtime.
!nvidia-smi

Wed May  4 11:08:51 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import tensorflow as tf
import os
print(f"Tensorflow version: {tf.__version__}")

# Upper bound (in MB) on the memory TensorFlow may allocate on the first GPU.
GPU_MEMORY_LIMIT_MB = 4096

# Restrict TensorFlow to only allocate 4GBs of memory on the first GPU.
# The stable tf.config API is used instead of the deprecated
# tf.config.experimental aliases (same behaviour, different names).
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=GPU_MEMORY_LIMIT_MB)])
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(f"The system contains '{len(gpus)}' Physical GPUs and '{len(logical_gpus)}' Logical GPUs")
  except RuntimeError as e:
    # Per the TensorFlow docs, the logical device configuration must be set
    # before the GPU is initialised; report the problem and carry on.
    print(e)
else:
  print("Your system does not contain a GPU that could be used by Tensorflow!")

Tensorflow version: 2.8.0
The system contains '1' Physical GPUs and '1' Logical GPUs


## NER

This is the code from the `ner` branch, adapted for further use in sentiment analysis.


In [27]:
# Imports

import codecs
import afinn
import json
import matplotlib.pyplot as plt
import sklearn
import numpy as np
import pandas as pd
import spacy
import os
import pathlib
import nltk

from pathlib import PurePath

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [39]:
# Word list used by process_ner to reject tokens that are not real names.
# NOTE(review): judging by the file name this is presumably a list of the
# 1000 most common words, one per line -- confirm against the file.
# The encoding is given explicitly so the result does not depend on the
# runtime's locale; blank lines and surrounding whitespace are dropped.
with codecs.open('/content/drive/MyDrive/NLP-Project/ner/1-1000.txt', 'r', encoding='utf-8') as f:
  common_words = {line.strip() for line in f if line.strip()}

# Directory holding the story files. The path is absolute, so it is built
# directly: joining it onto os.getcwd() silently discarded the cwd part.
datadir = pathlib.Path('/content/drive/MyDrive/NLP-Project/data/imapbook')

# spaCy pipeline used for named-entity recognition in process_ner.
nlp = spacy.load('en_core_web_lg')


def process_ner(input_tokenized):
  """Extract candidate character names and a sentiment summary from sentences.

  Args:
    input_tokenized: sequence of sentence strings (the caller passes the
      output of nltk.sent_tokenize).

  Returns:
    A tuple (names, alignment):
      names: lower-cased name tokens taken from PERSON entities, in order of
        first appearance. A token is kept when it has at least 2 characters,
        is not in `common_words`, and occurs at least
        0.0005 * len(input_tokenized) times.
      alignment: sum of the per-sentence AFINN scores divided by the number
        of sentences with a non-zero score, multiplied by -2. It is 0.0 when
        no sentence has a non-zero score (previously a division by zero that
        produced nan and a RuntimeWarning).
        NOTE(review): the meaning of the -2 factor is not visible here.
  """
  import collections  # local import: not part of the notebook's import cell

  scorer = afinn.Afinn()
  sentiment = [scorer.score(sentence) for sentence in input_tokenized]
  scored_sentences = len(np.nonzero(sentiment)[0])
  # Guard the division: empty input or all-neutral sentences give zero scored sentences.
  alignment = np.sum(sentiment) / scored_sentences * -2 if scored_sentences else 0.0

  # Count every accepted name token; Counter keeps first-appearance order.
  counts = collections.Counter()
  for sentence in input_tokenized:
    for entity in nlp(sentence).ents:
      if entity.label_ != 'PERSON':
        continue
      # "Tom Sawyer's" -> "tom sawyer" -> ["tom", "sawyer"]
      for token in str(entity).lower().replace("'s", "").split(' '):
        if len(token) >= 2 and token not in common_words:
          counts[token] += 1

  # Drop names that are too rare relative to the length of the text.
  min_count = 0.0005 * len(input_tokenized)
  names = [name for name, count in counts.items() if count >= min_count]

  return names, alignment


def read(story, path):
  """Load and concatenate every file in `path` whose name contains `story`.

  Args:
    story: substring that a file name must contain to be included.
    path: directory to search; a str or a pathlib.Path.

  Returns:
    One string in which each matching file's text is preceded by a single
    space and every carriage return / line feed is replaced by a space.
    Files are concatenated in sorted file-name order so the result does not
    depend on the order os.listdir happens to return. Returns '' when no
    file matches.
  """
  directory = pathlib.Path(path)
  matching = sorted(name for name in os.listdir(directory) if story in name)

  parts = []
  for name in matching:
    with codecs.open(directory / name, 'r', encoding='utf-8') as f:
      # NOTE(review): the original also called .replace("\'", "'"), which is
      # a no-op ("\'" is the same string as "'"); if typographic apostrophes
      # were meant to be normalised, that still needs to be added.
      parts.append(f.read().replace('\r', ' ').replace('\n', ' '))

  return ''.join(' ' + part for part in parts)


def flatten(l):
  """Return the leaves of an arbitrarily nested list as one flat list.

  Only objects whose type is exactly `list` are descended into; anything
  else (tuples, strings, list subclasses, ...) is treated as a leaf. A
  non-list argument yields a one-element list; an empty list yields [].

  Implemented with an explicit stack of iterators instead of recursing on
  `l[1:]`, so long or deeply nested lists no longer raise RecursionError
  and no quadratic slicing takes place.
  """
  if type(l) is not list:
    return [l]

  flat = []
  pending = [iter(l)]
  while pending:
    for item in pending[-1]:
      if type(item) is list:
        # Descend into the sub-list; the parent iterator resumes afterwards.
        pending.append(iter(item))
        break
      flat.append(item)
    else:
      # Current iterator exhausted: go back to its parent.
      pending.pop()
  return flat


def process_list(oldlist):
  """Flatten nested lists inside `oldlist` into a single flat list.

  Every element that is a `list` is expanded recursively; all other
  elements are copied over unchanged, preserving their order.
  """
  def _leaves(items):
    # Depth-first walk that yields the non-list elements in order.
    for item in items:
      if type(item) == list:
        yield from _leaves(item)
      else:
        yield item

  return list(_leaves(oldlist))


def top_names(name_list, novel, top_n=30):
  """Rank candidate names by how often they occur in the text.

  Args:
    name_list: candidate names, used as the fixed CountVectorizer vocabulary.
    novel: full text to count occurrences in; it is lower-cased first.
    top_n: maximum number of names to return (default 30).

  Returns:
    A tuple (names, name_frequency) of two parallel lists, sorted by
    descending count and truncated to `top_n` entries. Both lists are empty
    when `name_list` is empty.
  """
  name_list = list(name_list)
  if not name_list:
    # CountVectorizer raises ValueError for an empty vocabulary, which used
    # to abort the run for any text without detected names.
    return [], []

  # Imported explicitly: a bare `import sklearn` does not load subpackages,
  # so `sklearn.feature_extraction.text` is only reachable if something else
  # happened to import it first.
  from sklearn.feature_extraction.text import CountVectorizer

  vect = CountVectorizer(vocabulary=name_list, stop_words='english')
  counts = vect.fit_transform([novel.lower()])

  # One row per name, a single column (0) holding its count.
  name_frequency = (
      pd.DataFrame(counts.toarray(), columns=vect.get_feature_names_out())
      .T
      .sort_values(by=0, ascending=False)
      .iloc[:top_n]
  )

  return list(name_frequency.index), list(name_frequency[0])

def extract_and_output(chars, freqs, sent_tokens, story_selected, data_dir, max_characters=4):
  """Write the sentences mentioning the top characters to a JSON file.

  For each of the first `max_characters` entries of `chars`, every sentence
  whose lower-cased word tokens contain the character name is collected.
  Only the sentence itself is extracted for now.
  TODO: also extract the immediate (and maybe semantic) context.

  Args:
    chars: character names, most frequent first.
    freqs: frequencies parallel to `chars`.
    sent_tokens: sentence strings of the story.
    story_selected: story name; used as the output file's base name.
    data_dir: base directory; output goes to `<data_dir>/ner/<story>.json`.
    max_characters: number of leading characters to export (default 4).

  The file holds a JSON list of objects with the keys "character",
  "frequency" and "appearances".
  """
  # Tokenise each sentence once up front rather than once per character.
  tokenized = [
      (sent, {word.lower() for word in nltk.word_tokenize(sent)})
      for sent in sent_tokens
  ]

  output = []
  for char, freq in zip(chars[:max_characters], freqs[:max_characters]):
    output.append({
        "character": char,
        "frequency": freq,
        "appearances": [sent for sent, words in tokenized if char in words],
    })

  # Create the output folder on first use instead of failing when it is missing.
  out_dir = os.path.join(str(data_dir), "ner")
  os.makedirs(out_dir, exist_ok=True)

  # Export the json file
  with open(os.path.join(out_dir, f"{story_selected}.json"), "w") as f:
    json.dump(output, f)
    

if __name__ == '__main__':

  # Run the pipeline once for every regular file found in the data directory.
  for file in os.listdir(datadir):
    if not os.path.isfile(os.path.join(datadir, file)):
      # Skip sub-directories and anything else that is not a plain file.
      continue

    # The file name without its extension identifies the story.
    story_selected = PurePath(file).stem

    # Load the text and split it into sentences.
    story = read(story_selected, datadir)
    sent_tokens = nltk.sent_tokenize(story)

    # Candidate names (the sentiment summary is not used further here).
    out_ner, alignment = process_ner(sent_tokens)

    # Rank the names, then export the sentences for the top characters.
    chars, freqs = top_names(out_ner, story)
    extract_and_output(chars, freqs, sent_tokens, story_selected, datadir)

  print('done')

done


In [40]:
# Run this at the end

# Run this at the end: flush pending writes to Drive and unmount it.
# `drive` is the google.colab module imported in the mount cell.
drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

All changes made in this colab session should now be visible in Drive.
