# Public Dataset Quickstart

Use this notebook to explore `public_data.csv`, located in the `data/` directory next to this notebook.
The steps below load the dataset, preview a few rows, and compute simple summaries
that you can adapt for your own analysis.

The dataset includes `day_label`, `week_label`, and `month_label` columns where `1` indicates an up movement and `-1` indicates down.


In [4]:
from pathlib import Path
import pandas as pd

# Location of the dataset. The path is relative, so it is resolved against the
# kernel's current working directory when the cell runs.
DATA_PATH = Path('data/public_data.csv')
if not DATA_PATH.exists():
    # Include the absolute location that was checked so that a wrong working
    # directory is easy to diagnose.
    raise FileNotFoundError(
        f'Expected to find {DATA_PATH} relative to the current working directory '
        f'(looked at {DATA_PATH.resolve()}).'
    )

# Load the full CSV into memory and show its (rows, columns) shape.
data = pd.read_csv(DATA_PATH)
data.shape

(2098, 17)

In [3]:
# Preview the first five rows of the loaded dataset.
data.head(n=5)

Unnamed: 0,Clarity,Freecomment,Logic,Persuasiveness,Readability,Usefulness,doc_id,financial_decision_day,financial_decision_month,financial_decision_week,generated_report,team_id,company_id,company_name,day_label,week_label,month_label
0,7,,7,6,7,7,7_36,0,1,1,<h1>CompanyX (MDT) Investment Report - Fiscal ...,7,36,MDT_q2_2022,1,-1,-1
1,7,,7,7,6,7,10_39,-1,-1,-1,<p>CompanyX's second-quarter 2020 earnings cal...,10,39,MYE_q2_2020,-1,1,1
2,7,,6,6,7,7,6_35,1,0,1,"<ul> <li>Q3 earnings per share $5.25, 4x highe...",6,35,LYB_q3_2021,1,1,-1
3,7,,7,7,7,7,3_32,1,1,1,<h2>Investment Analysis Report: CompanyX (KW)<...,3,32,KW_q3_2021,1,1,-1
4,7,,7,7,7,7,9_38,1,1,1,<p><strong>LONG with HIGH conviction</strong><...,9,38,MSI_q3_2021,-1,-1,-1


In [None]:
# Rating columns to summarise.
likert_columns = ['Clarity', 'Logic', 'Persuasiveness', 'Readability', 'Usefulness']

# One row per rating column, with the mean, standard deviation and
# non-missing count as columns.
summary = data.loc[:, likert_columns].agg(['mean', 'std', 'count']).transpose()
summary

In [None]:
# Frequency of each decision value per horizon, ordered by the decision value.
decision_columns = ['financial_decision_day', 'financial_decision_week', 'financial_decision_month']
decision_counts = {name: data[name].value_counts().sort_index() for name in decision_columns}

# Frequency of up/down labels per horizon. Labels are translated to readable
# names (1 -> 'up', -1 -> 'down') and always reported in that order, with a
# zero entry when a direction never occurs.
label_columns = ['day_label', 'week_label', 'month_label']
direction_names = {1: 'up', -1: 'down'}
direction_order = ['up', 'down']
label_counts = {
    name: data[name].map(direction_names).value_counts().reindex(direction_order, fill_value=0)
    for name in label_columns
}

# Show both groups of counts together as the cell output.
dict(financial_decision=decision_counts, label_counts=label_counts)
