In [1]:

import torch
from torch.utils.data import Dataset
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline, AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import requests
import time
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face checkpoint shared by the tokenizer and the classifier.
SENTIMENT_CHECKPOINT = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

sent_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
sent_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT)

# Sentiment pipeline built from the tokenizer/model pair loaded above.
sent_pipeline = pipeline(
    task="sentiment-analysis",
    model=sent_model,
    tokenizer=sent_tokenizer,
)

Device set to use cpu


In [None]:
import csv

# Parallel lists: entry i of each list describes the same sentence.
sentences = []    # token lists, one per sentence
gold_labels = []  # label lists (third CSV column), aligned with `sentences`
dates = []        # text following "date:" on the marker row that opened the sentence
found = []        # tokens of each sentence whose label is not 'O'


def _flush_sentence(date, tokens, labels):
    """Append one finished sentence to the four parallel result lists.

    Does nothing when no token rows were collected, so a date marker that is
    not followed by any token never produces an entry. The date is recorded
    here (not when the marker row is read) so that `dates` always has exactly
    one entry per sentence.

    Args:
        date: Date text of the marker row that opened the sentence, or "" when
            tokens appeared before any marker row.
        tokens: Token strings collected for the sentence.
        labels: Labels collected for the sentence, aligned with `tokens`.
    """
    if not labels:
        return
    # A leading empty token is dropped together with its label.
    if tokens[0] == '':
        tokens, labels = tokens[1:], labels[1:]
    dates.append(date)
    sentences.append(tokens)
    gold_labels.append(labels)
    found.append([tok for tok, lab in zip(tokens, labels) if lab != 'O'])


# newline="" is what the csv module asks for when it is handed a file object.
with open("Data/test_with_predictions.csv", "r", encoding="utf-8", newline="") as f:
    combined_t = []
    combined_l = []
    current_date = ""

    reader = csv.reader(f)
    next(reader, None)  # skip the header row; tolerate a completely empty file

    for line in reader:
        if not line:
            # csv.reader yields [] for blank lines; they carry no token.
            continue
        token, ignor_label, label = line

        # Rows of the form ("date:<text>", "O", "") separate sentences.
        if token.startswith("date:") and ignor_label == "O" and label == "":
            _flush_sentence(current_date, combined_t, combined_l)
            combined_t = []
            combined_l = []
            current_date = token[5:]
            continue

        combined_t.append(token)
        combined_l.append(label)

    # The last sentence has no marker row after it.
    _flush_sentence(current_date, combined_t, combined_l)

if sentences:
    print("Date:", dates[0])
    print("Sentence:", " ".join(sentences[0]))
    print("Entities (non-O):", found[0])


Date:  2020-11-27
Sentence: big tech and the fa ##ke news media have partnered to suppress . freedom of the press is gone , a thing of the past . that ’ s why they refuse to report the real facts and figures of the 2020 election or even , where ’ s hunter !  
Entities (non-O): []


In [13]:
# Spot-check one parsed record: date, tokens, labels, then non-'O' tokens.
sample_index = 5004
for parsed_column in (dates, sentences, gold_labels, found):
    print(parsed_column[sample_index])

 2020-03-02
['michelle', '@', 'fis', '##ch', '##bac', '##hm', '##n', '##7', 'is', 'running', 'for', 'congress', 'in', 'minnesota', '.', 'nicole', 'is', 'strong', 'on', 'crime', 'and', 'borders', ',', 'cutting', 'taxes', ',', 'your', '#', '2a', ',', 'love', '##s', 'our', 'military', ',', 'vet', '##s', ',', 'and', 'will', 'stand', 'with', 'our', 'great', 'farmers', '.', 'michelle', 'has', 'my', 'complete', 'and', 'total', 'endorsement', '!', '', '']
['PER_B', 'O', 'ORG_B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORG_B', 'O', 'LOC_B', 'O', 'PER_B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PER_B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['michelle', 'fis', 'congress', 'minnesota', 'nicole', 'michelle']


In [None]:
import csv

output_file = "Data/processed_outputagg.csv"


def _output_rows():
    """Yield one [date, sentence text, entity-list repr] row per parsed sentence."""
    known_dates = len(dates)
    for idx, tokens in enumerate(sentences):
        # Sentences beyond the end of `dates` get an empty date cell.
        date_cell = "" if idx >= known_dates else dates[idx]
        yield [date_cell, " ".join(tokens), str(found[idx])]


with open(output_file, "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Date", "Sentence", "Found Entities"])
    writer.writerows(_output_rows())

print(f"CSV saved to {output_file}")


CSV saved to processed_outputagg.csv


In [None]:
import pandas as pd
import ast

df_text = pd.read_csv("Data/processed_outputagg.csv")
df_companies = pd.read_csv("../sp500_ticker_data_collection/ticker_data/sp500_with_appended_aliases.csv")

# Lower-cased company name or alias -> ticker. Later rows overwrite earlier
# ones when two rows share a name or alias.
name_to_ticker = {}
for _, row in df_companies.iterrows():
    ticker = row['Ticker']

    company = row['Company']
    if isinstance(company, str):  # a missing cell is read as NaN (float), which has no .lower()
        name_to_ticker[company.lower()] = ticker

    # 'Aliases' is parsed as a Python literal; rows whose cell is missing or
    # not a valid literal simply contribute no aliases (best effort).
    try:
        aliases = ast.literal_eval(row['Aliases'])
    except (ValueError, SyntaxError, TypeError):
        continue
    if not isinstance(aliases, (list, tuple, set)):
        continue

    for alias in aliases:
        # Skip only the offending entry instead of abandoning the rest of the row.
        if isinstance(alias, str):
            name_to_ticker[alias.lower()] = ticker

def match_entities(entities_str, lookup=None):
    """Map the entities of one row to known company tickers.

    Args:
        entities_str: String holding a Python list literal of entity strings
            (the 'Found Entities' cell). Anything that cannot be parsed into a
            list/tuple yields an empty result.
        lookup: Optional mapping of lower-cased name -> ticker. Defaults to
            the module-level `name_to_ticker`.

    Returns:
        pd.Series of two lists: the entities that had a ticker (in input
        order, duplicates kept) and the distinct tickers in first-seen order.
    """
    if lookup is None:
        lookup = name_to_ticker

    try:
        entities = ast.literal_eval(entities_str)
    except (ValueError, SyntaxError, TypeError):
        # Unparseable or missing cell: treat the row as having no entities.
        return pd.Series([[], []])
    if not isinstance(entities, (list, tuple)):
        return pd.Series([[], []])

    matched_companies = []
    matched_tickers = []
    for entity in entities:
        if not isinstance(entity, str):
            continue
        ticker = lookup.get(entity.lower())
        if ticker:
            matched_companies.append(entity)
            matched_tickers.append(ticker)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is identical from run to run (set iteration order is not).
    unique_tickers = list(dict.fromkeys(matched_tickers))
    return pd.Series([matched_companies, unique_tickers])

# Columns that receive the two lists returned by match_entities for each row.
MATCH_COLUMNS = ['Matched Companies', 'Matched Tickers']

entity_matches = df_text['Found Entities'].apply(match_entities)
df_text[MATCH_COLUMNS] = entity_matches

report_columns = ['Date', 'Sentence', 'Found Entities', *MATCH_COLUMNS]
print(df_text[report_columns])

df_text.to_csv("Data/matched_with_tickers.csv", index=False)


              Date                                           Sentence  \
0       2020-11-27  big tech and the fa ##ke news media have partn...   
1       2020-11-16  the rate of rejected mail - in ball ##ot ##s i...   
2       2020-11-16  georgia won ’ t let us look at the all importa...   
3       2020-11-18  wo ##w ! governor kem ##p will hopefully see t...   
4       2020-11-16  european countries are sad ##ly getting clo ##...   
...            ...                                                ...   
15175   2020-01-03  iran never won a war , but never lost a negoti...   
15176   2020-01-01  thank you to the @ dc ##exa ##mine ##r washing...   
15177   2020-01-01  one of my greatest honor ##s was to have gotte...   
15178   2020-10-22  just signed an order to support the workers of...   
15179   2020-10-22  suburban women want safety & amp ; security . ...   

                                     Found Entities Matched Companies  \
0                                                [

In [None]:
# One row per (sentence, ticker) pair, then count occurrences of each ticker.
ticker_mentions = df_text['Matched Tickers'].explode()
c = ticker_mentions.value_counts()

c.to_csv("Data/ticker_counts.csv", header=["Count"])


In [None]:
import ast

import pandas as pd

tweets = pd.read_csv("Data/matched_with_tickers.csv")

# Only trim surrounding whitespace. The previous fixed slice [2:-1] also
# removed the first two characters of the text itself, because the parsing
# step already drops the leading empty token before joining.
tweets['Sentence'] = tweets['Sentence'].str.strip()

# The ticker lists were written to CSV as their Python repr. literal_eval
# parses them back without executing arbitrary code from the file (eval would).
tweets['Matched Tickers'] = tweets['Matched Tickers'].apply(ast.literal_eval)

# Only sentences that mention at least one known ticker are scored.
has_ticker = tweets['Matched Tickers'].map(len) > 0
filtered = tweets[has_ticker].copy()

# One pipeline call per sentence; each call's return value is stored as-is.
sent_list = [sent_pipeline(sentence) for sentence in filtered['Sentence']]
filtered['Sentiment'] = sent_list

# Attach the scores by row index. Merging on the sentence text would multiply
# rows whenever the same text occurs more than once. Rows that were not
# scored get NaN.
tweets = tweets.assign(Sentiment=filtered['Sentiment'])

tweets.to_csv("Data/processed_with_sentiment.csv", index=False)
