# Financial Sentiment Analysis Dataset Exploration

This notebook loads, cleans, and combines two financial sentiment datasets, then exports the raw combined data and class-balanced samples to CSV for further analysis.

In [67]:
# Import libraries
import pandas as pd
import numpy as np
from datasets import load_dataset

def load_dataset1():
    """Download the stock-news sentiment dataset and keep only what is needed.

    Reads the 'train' split into a DataFrame, removes the metadata columns
    that are not used downstream, and lowercases the 'sentiment' labels.

    Returns:
        pandas.DataFrame: the remaining columns, with 'sentiment' lowercased.
    """
    hub_data = load_dataset("NickyNicky/finance-financialmodelingprep-stock-news-sentiments-rss-feed")
    unused_columns = ['symbol', 'title', 'publishedDate', 'image', 'site', 'url', 'sentimentScore']
    news = pd.DataFrame(hub_data['train']).drop(columns=unused_columns)
    # Labels are compared as lowercase strings later on (e.g. 'neutral')
    news['sentiment'] = news['sentiment'].str.lower()
    return news

def load_dataset2():
    """Download the second dataset and keep only its sentiment-analysis rows.

    Reads the 'train' split, keeps the rows whose 'system_prompt' equals the
    sentiment-analysis prompt exactly, drops the 'system_prompt' and
    'task_type' columns, and renames 'user_prompt' -> 'text' and
    'answer' -> 'sentiment' so the columns line up with dataset 1.

    Returns:
        pandas.DataFrame: the filtered rows with the renamed columns.
    """
    # Must match the dataset's prompt character-for-character
    # (including the missing space after "text.").
    sentiment_prompt = 'You are a financial sentiment analysis expert. Your task is to analyze the sentiment expressed in the given financial text.Only reply with positive, neutral, or negative.'
    raw = pd.DataFrame(load_dataset("NickyNicky/Finance_sentiment_and_topic_classification_En")['train'])
    is_sentiment_task = raw['system_prompt'] == sentiment_prompt
    return (
        raw[is_sentiment_task]
        .drop(columns=['system_prompt', 'task_type'])
        .rename(columns={"user_prompt": "text", "answer": "sentiment"})
    )

def combine_datasets(df1, df2):
    """Stack df1 on top of df2 and renumber the rows from 0.

    Args:
        df1: first DataFrame (its rows come first in the result).
        df2: second DataFrame.

    Returns:
        pandas.DataFrame: the row-wise concatenation with a fresh RangeIndex.
    """
    frames = (df1, df2)
    return pd.concat(frames, ignore_index=True)

def normalize_dataset(df, samples=14212, random_state=None, replace=True):
    """Even out sentiment counts by drawing the same number of rows per class.

    Args:
        df: DataFrame with a 'sentiment' column to group by.
        samples: number of rows to draw from each sentiment class.
        random_state: seed (or NumPy random state/generator) forwarded to
            pandas so the draw can be reproduced. The default ``None`` keeps
            the previous unseeded behaviour.
        replace: whether to sample with replacement. The default ``True``
            keeps the previous behaviour, which means a class can contain
            duplicated rows even when it has at least ``samples`` rows.
            Pass ``False`` to draw distinct rows; pandas then raises
            ``ValueError`` if any class has fewer than ``samples`` rows.

    Returns:
        pandas.DataFrame: ``samples`` rows for every sentiment class, keeping
        the original index labels of the sampled rows.
    """
    return df.groupby('sentiment').sample(
        n=samples,
        replace=replace,
        random_state=random_state,
    )

def remove_neutral(df):
    """Return the rows of df whose 'sentiment' is anything but 'neutral'.

    The comparison is an exact string match, and the surviving rows keep
    their original index labels.
    """
    keep_mask = df['sentiment'].ne('neutral')
    return df.loc[keep_mask]

In [68]:
# Fetch both source datasets in their cleaned, two-column form
df_1, df_2 = load_dataset1(), load_dataset2()

# Report the size of each frame as a quick sanity check
print('Dataset 1 shape:', df_1.shape)
print('Dataset 2 shape:', df_2.shape)

Dataset 1 shape: (142000, 2)
Dataset 2 shape: (28628, 2)


In [None]:
# Stack both sources into one frame and save the unbalanced result to CSV
df = combine_datasets(df_1, df_2)
df.to_csv('stock_news_sentiments_RAW.csv', index=False)

# Show how many rows fall into each sentiment class
print('Combined dataset sentiment distribution:', df['sentiment'].value_counts(), sep='\n')

Combined dataset sentiment distribution:
positive    134257
negative     22159
neutral      14212
Name: sentiment, dtype: int64


In [70]:
# Normalize the dataset (including neutral) and export
# Seed NumPy's global RNG first: normalize_dataset samples without an explicit
# random_state, so without this the exported CSV would differ on every run.
np.random.seed(42)
# Balance every class to the size of the smallest one (14212 'neutral' rows in
# the distribution printed by the previous cell) instead of hard-coding it.
smallest_class_size = int(df['sentiment'].value_counts().min())
df_normalized = normalize_dataset(df, samples=smallest_class_size)
print('Normalized (with neutral) distribution:')
print(df_normalized['sentiment'].value_counts())
df_normalized.to_csv('stock_news_sentiments_Normalized_Neu_Pos_Neg.csv', index=False)

Normalized (with neutral) distribution:
negative    14212
neutral     14212
positive    14212
Name: sentiment, dtype: int64


In [71]:
# Remove neutral sentiment, normalize for positive/negative, and export
# Seed NumPy's global RNG so the sampled rows (and the CSV) are reproducible.
np.random.seed(42)
df_no_neutral = remove_neutral(df)
# Balance to the smaller of the two remaining classes (22159 'negative' rows in
# the combined distribution printed earlier) rather than a hard-coded count.
pos_neg_class_size = int(df_no_neutral['sentiment'].value_counts().min())
# Reuse normalize_dataset instead of repeating its groupby/sample logic inline.
df_normalized_neg_pos = normalize_dataset(df_no_neutral, samples=pos_neg_class_size)
df_normalized_neg_pos.to_csv('stock_news_sentiments_Normalized_Pos_Neg.csv', index=False)