# Dataset Setup

In [1]:
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


### 1. sweatSmile/news-sentiment-data

In [2]:
ds1 = load_dataset("sweatSmile/news-sentiment-data")
ds1

DatasetDict({
    train: Dataset({
        features: ['title', 'num_comments', 'score', 'upvote_ratio'],
        num_rows: 400
    })
    test: Dataset({
        features: ['title', 'num_comments', 'score', 'upvote_ratio'],
        num_rows: 100
    })
})

In [3]:
df_train = ds1["train"].to_pandas()
df_test = ds1["test"].to_pandas()

df = pd.concat([df_train, df_test], ignore_index=True)


print (df.shape)
df.head()

(500, 4)


Unnamed: 0,title,num_comments,score,upvote_ratio
0,Russia lost about 60 vessels in southern Ukrai...,121,3193,0.96
1,/r/WorldNews Live Thread: Russian Invasion of ...,92,429,0.96
2,Orban's right-wing group meets EU parliament's...,3,35,0.72
3,RockYou2024: 10 billion passwords leaked in th...,638,6638,0.96
4,Racist attacks against Syrians continue in Tur...,72,352,0.85


In [5]:
from transformers import pipeline
import pandas as pd
from tqdm import tqdm

sentiment = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

results = []
for text in tqdm(df["title"].tolist()[:1000]):
    pred = sentiment(text)[0]["label"].lower()
    results.append(pred)

df["label"] = results

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
100%|██████████| 500/500 [00:32<00:00, 15.60it/s]


In [6]:
df.head()

Unnamed: 0,title,num_comments,score,upvote_ratio,label
0,Russia lost about 60 vessels in southern Ukrai...,121,3193,0.96,negative
1,/r/WorldNews Live Thread: Russian Invasion of ...,92,429,0.96,neutral
2,Orban's right-wing group meets EU parliament's...,3,35,0.72,neutral
3,RockYou2024: 10 billion passwords leaked in th...,638,6638,0.96,neutral
4,Racist attacks against Syrians continue in Tur...,72,352,0.85,negative


In [None]:
df.drop(columns=["num_comments","score","upvote_ratio"], inplace=True)

In [9]:
df.head()

Unnamed: 0,title,label
0,Russia lost about 60 vessels in southern Ukrai...,negative
1,/r/WorldNews Live Thread: Russian Invasion of ...,neutral
2,Orban's right-wing group meets EU parliament's...,neutral
3,RockYou2024: 10 billion passwords leaked in th...,neutral
4,Racist attacks against Syrians continue in Tur...,negative


In [10]:
df.to_csv("datasets/news_sentiment_hf.csv", index=False)
print("Saved pseudo-labeled dataset:", len(df))

Saved pseudo-labeled dataset: 500


### 2. Twitter financial news

In [11]:
ds2 = load_dataset("zeroshot/twitter-financial-news-sentiment")

In [17]:
ds2

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9543
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2388
    })
})

In [12]:
df_train = ds2["train"].to_pandas()
df_test = ds2["validation"].to_pandas()

df = pd.concat([df_train, df_test], ignore_index=True)


print (df.shape)
df.head()

(11931, 2)


Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0


In [13]:
label_map = {0: "negative", 1: "positive", 2: "neutral"}
df["label"] = df["label"].map(label_map)
df.rename(columns={"text": "title"}, inplace=True)
df.head()

Unnamed: 0,title,label
0,$BYND - JPMorgan reels in expectations on Beyo...,negative
1,$CCL $RCL - Nomura points to bookings weakness...,negative
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",negative
3,$ESS: BTIG Research cuts to Neutral https://t....,negative
4,$FNKO - Funko slides after Piper Jaffray PT cu...,negative


In [14]:
df.to_csv("datasets/twitter_financial_news.csv", index=False)
print("Twitter Financial News saved:", len(df))
print(df["label"].value_counts())

Twitter Financial News saved: 11931
label
neutral     7744
positive    2398
negative    1789
Name: count, dtype: int64


### 3. AG News Dataset

In [15]:
ag_dataset = load_dataset("ag_news",split="train")


df_ag = ag_dataset.to_pandas()
df_ag.head()

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [17]:
df_ag = df_ag[["text","label"]]

df_ag['label'] = 'neutral'

df_ag.rename(columns={"text": "title"}, inplace=True)

df_ag.to_csv("datasets/ag_news.csv", index=False)

print("AG News saved:", len(df_ag))

AG News saved: 120000


### 4. news_sentiment_kaggle

In [21]:
df = pd.read_csv("datasets/news_sentiment.csv")

df.head(2)

Unnamed: 0,news_title,reddit_title,sentiment,text,url
0,Mark Cuban launches generic drug company,Billionaire Mark Cuban just launched a drug co...,1.0,Billionaire investor and Shark Tank star Mark ...,https://www.beckershospitalreview.com/pharmacy...
1,From Defendant to Defender: One Wrongfully Con...,"Man falsely imprisoned for 10 years, uses pris...",1.0,Attorney Jarrett Adams recently helped overtur...,https://www.nbcnews.com/news/us-news/defendant...


In [22]:
df.drop(columns=["reddit_title","text","url"], inplace=True)

label_map = {0: "negative", 1: "positive"}
df["sentiment"] = df["sentiment"].map(label_map)

df.rename(columns={"sentiment": "label", "news_title": "title"}, inplace=True)

df.head(2)

Unnamed: 0,title,label
0,Mark Cuban launches generic drug company,positive
1,From Defendant to Defender: One Wrongfully Con...,positive


In [23]:
df.to_csv("datasets/news_sentiment_kaggle.csv", index=False)
print("Saved news sentiment dataset:", len(df))

Saved news sentiment dataset: 848


### 5. Financial PhraseBank Dataset

In [27]:
df = pd.read_csv('datasets/financial_phrasebank.csv',encoding='ISO-8859-1')

In [28]:
df.head(3)

Unnamed: 0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...


In [29]:
df.columns = ['label','title']
df.head(3)

Unnamed: 0,label,title
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...


In [30]:
df.to_csv("datasets/financial_phrasebank.csv", index=False)
print("Saved financial phrasebank dataset:", len(df))

Saved financial phrasebank dataset: 4845
