In [None]:
import re
import spacy
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline, DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from concurrent.futures import ThreadPoolExecutor, as_completed

# Download the NLTK resources required for sentence tokenization.
# Older NLTK releases load the pickled "punkt" models, while newer releases
# load the "punkt_tab" tables from sent_tokenize; requesting both keeps the
# notebook runnable on either. nltk.download reports a failure for an
# unavailable resource instead of raising, so the extra request is harmless.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

class SentimentAnalysis:
    """Find the characters of a story and vote on a sentiment label for each.

    The class wraps two models that are loaded once in ``__init__``:

    * a spaCy pipeline (``en_core_web_trf``) used for PERSON entity detection;
    * a DistilBERT sequence classifier and its tokenizer, both read from the
      local directory ``./sentiment_model``.
    """

    def __init__(self):
        # Load the pre-trained SpaCy model
        self.nlp = spacy.load("en_core_web_trf")
        self.model_path = './sentiment_model'
        self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_path, local_files_only=True)
        # Load the classifier with the same local-only policy as the tokenizer
        # so that both halves of the checkpoint are resolved the same way.
        self.model = DistilBertForSequenceClassification.from_pretrained(
            self.model_path, num_labels=2, local_files_only=True
        )

    def detect_characters(self, story):
        """Return the distinct PERSON entity texts found in ``story``.

        Args:
            story: Full story text passed to the spaCy pipeline.

        Returns:
            A list of unique entity strings, sorted so that the result (and
            everything derived from it) is the same on every run.
        """
        doc = self.nlp(story)
        characters = {ent.text for ent in doc.ents if ent.label_ == "PERSON"}
        return sorted(characters)

    def extract_character_sentences(self, story, characters):
        """Group the sentences of ``story`` by the character names they mention.

        Args:
            story: Full story text; split into sentences with ``sent_tokenize``.
            characters: Iterable of names to look for. Matching is whole-word
                and case-insensitive.

        Returns:
            A dict mapping each name that occurs at least once to the list of
            sentences mentioning it, in story order. Names that never occur
            are absent from the dict.
        """
        sentences = sent_tokenize(story)
        # Compile each name pattern once instead of once per sentence.
        patterns = [
            (name, re.compile(r'\b' + re.escape(name) + r'\b', re.IGNORECASE))
            for name in characters
        ]
        character_sentences = {}

        for sentence in sentences:
            for name, pattern in patterns:
                if pattern.search(sentence):
                    character_sentences.setdefault(name, []).append(sentence)

        return character_sentences

    def fine_tuned_sentiment_analysis(self, character_sentences, max_length=512):
        """Classify every sentence of every character and take a majority vote.

        Args:
            character_sentences: Dict mapping a character name to a list of
                sentences, as produced by ``extract_character_sentences``.
            max_length: Maximum number of tokens kept per sentence; longer
                sentences are truncated by the tokenizer.

        Returns:
            A dict mapping each character with at least one sentence to
            ``"positive"`` or ``"negative"``. A tie counts as ``"negative"``.
        """
        character_sentiments = {}

        for name, contexts in character_sentences.items():
            if not contexts:
                continue

            positive_votes = 0
            for context in contexts:
                inputs = self.tokenizer(
                    context,
                    padding=True,
                    truncation=True,
                    max_length=max_length,
                    return_tensors='pt',
                )
                with torch.no_grad():
                    logits = self.model(**inputs).logits

                # Class index 0 is counted as negative, any other index as
                # positive. NOTE(review): assumes label 1 means "positive" in
                # the fine-tuned checkpoint - confirm against the training setup.
                if torch.argmax(logits, dim=1).item() >= 1:
                    positive_votes += 1

            negative_votes = len(contexts) - positive_votes
            if positive_votes > negative_votes:
                character_sentiments[name] = "positive"
            else:
                character_sentiments[name] = "negative"

        return character_sentiments

In [23]:
import csv
import json

# Load the fallback list of non-name character words (one entry per line),
# lower-cased. The file is read as plain lines: a newline is already the
# record terminator, so it cannot act as a csv field delimiter, and indexing
# row[0] on a csv row raised IndexError for any blank line.
# NOTE(review): unlike csv.reader, this does not strip surrounding double
# quotes from an entry - confirm the file contains none.
with open('non_names_characters.csv', 'r') as csvfile:
    non_names = [line.strip().lower() for line in csvfile if line.strip()]


def process_story(i, row, sentiment, id_offset=52665):
    """Run character detection and sentiment voting for one story row.

    Args:
        i: Zero-based position of the row in the input file.
        row: Sequence of text fields; joined with single spaces to form the story.
        sentiment: Object providing ``detect_characters``,
            ``extract_character_sentences`` and ``fine_tuned_sentiment_analysis``.
        id_offset: Value added to ``i`` to build ``story_id``. The default of
            52665 keeps the previous hard-coded behaviour.
            NOTE(review): presumably the number of training stories, so that
            validation ids continue after them - confirm.

    Returns:
        ``{"story_id": i + id_offset, "characters": {name: label, ...}}``.
        The same dict is also printed as a progress trace.
    """
    story = ' '.join(row)
    characters = sentiment.detect_characters(story)
    if not characters:
        # No PERSON entity found: fall back to the generic non-name word list.
        characters = non_names
    character_sentences = sentiment.extract_character_sentences(story, characters)
    character_sentiments = sentiment.fine_tuned_sentiment_analysis(character_sentences)

    characters_info = {
        "story_id": i + id_offset,
        "characters": dict(character_sentiments),
    }
    print(characters_info)
    return characters_info


# Analyse every row of the validation CSV and write the results as JSON.
sentiment = SentimentAnalysis()
# newline='' lets the csv module do its own newline handling, so quoted
# fields that contain line breaks are parsed correctly.
with open('val_reformatted.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    header = next(reader, None)  # skip the header row; tolerate an empty file

    # NOTE(review): one SentimentAnalysis instance (spaCy pipeline + torch
    # model) is shared by all worker threads - confirm this is safe for the
    # library versions in use.
    with ThreadPoolExecutor() as executor:
        # Keep the futures in submission order and read them back in that
        # order, so the output list follows the input rows (and story ids)
        # instead of whichever story happened to finish first.
        futures = [executor.submit(process_story, i, row, sentiment) for i, row in enumerate(reader)]
        stories = [future.result() for future in futures]

with open("val_dataset.json", "w") as outfile:
    json.dump(stories, outfile, indent=4)

{'story_id': 28687, 'characters': {'Rocky': 'positive'}}
{'story_id': 28695, 'characters': {'Sarah': 'positive'}}
{'story_id': 28689, 'characters': {'Jessica': 'positive'}}
{'story_id': 28682, 'characters': {'Jason': 'negative'}}
{'story_id': 28674, 'characters': {'Sarah': 'positive', 'Mike': 'positive'}}
{'story_id': 28692, 'characters': {'Chris': 'negative'}}
{'story_id': 28691, 'characters': {'John': 'positive'}}
{'story_id': 28686, 'characters': {'Jeremy': 'positive', 'Jim': 'negative'}}
{'story_id': 28694, 'characters': {'Sarah': 'positive'}}
{'story_id': 28688, 'characters': {'Abdul': 'positive'}}
{'story_id': 28705, 'characters': {'Henry': 'positive'}}
{'story_id': 28693, 'characters': {'i': 'positive', 'boyfriend': 'positive'}}
{'story_id': 28703, 'characters': {'Justin': 'positive'}}
{'story_id': 28696, 'characters': {'Mike': 'negative'}}
{'story_id': 52676, 'characters': {'Danny': 'positive'}}
{'story_id': 28699, 'characters': {'Richard': 'positive'}}
{'story_id': 28697, 'cha

KeyboardInterrupt: 

In [None]:
# Read the training story records, order them by story_id, and save the
# ordered list to a second file.
with open("ROC_train_stories_context.json", 'r') as src:
    data = json.load(src)

data.sort(key=lambda record: record['story_id'])

with open("ROC_train_stories_context1.json", "w") as dst:
    json.dump(data, dst, indent=4)

In [None]:
# Print every record stored in new_final.json, one per line.
with open("new_final.json", 'r') as src:
    data = json.load(src)

for story in data:
    print(story)

In [21]:
# Load the extra records and the sorted training records, then report how
# many training records there are.
with open("new_final.json", "r") as miss:
    to_add = json.load(miss)

with open("ROC_train_stories_context1.json", "r") as train_file:
    orig = json.load(train_file)

# len() replaces the manual counting loop; the count stays in `i` as before.
i = len(orig)

print(i)


52665


In [22]:
# Read the validation story records, order them by story_id, and save the
# ordered list to a second file.
with open("ROC_val_stories_context.json", 'r') as src:
    data = json.load(src)

data.sort(key=lambda record: record['story_id'])

with open("ROC_val_stories_context1.json", "w") as dst:
    json.dump(data, dst, indent=4)

In [26]:
# Empty dict; not used anywhere in this cell.
moje = {}

with open("ROC_val_stories_context1.json", 'r') as src:
    data = json.load(src)

# Shift every validation story_id by 52665.
# NOTE(review): presumably the training-set size, so validation ids continue
# after the training ids - confirm.
for item in data:
    item['story_id'] += 52665

with open("ROC_val_stories_context2.json", "w") as dst:
    json.dump(data, dst, indent=4)