## Sentiment (Fear & Greed) Data Cleaning

In [1]:
import pandas as pd
import numpy as np

## Load Dataset

In [3]:
# Read the raw Fear & Greed index export, parsing the "date" column into
# datetimes while loading.
# NOTE(review): dayfirst=True asks pandas to prefer DD/MM when parsing dates —
# confirm this matches the date format actually used in the raw file.
sentiment = pd.read_csv(
    "../data/raw/sentiment_data/fear_greed_index.csv",
    low_memory=False,
    dayfirst=True,
    parse_dates=["date"],
)

## Standardize columns

In [4]:
# Normalise header names: trim, lowercase, turn spaces into underscores, then
# drop every character that is neither a word character nor whitespace.
snake_case_columns = sentiment.columns.str.strip().str.lower().str.replace(" ", "_")
sentiment.columns = snake_case_columns.str.replace(r"[^\w\s]", "", regex=True)

In [6]:
# Preview the first rows of the frame.
sentiment.head()

Unnamed: 0,timestamp,value,classification,date
0,2018-02-01 05:30:00+00:00,30,Fear,2018-02-01
1,2018-02-02 05:30:00+00:00,15,Extreme Fear,2018-02-02
2,2018-02-03 05:30:00+00:00,40,Fear,2018-02-03
3,2018-02-04 05:30:00+00:00,24,Extreme Fear,2018-02-04
4,2018-02-05 05:30:00+00:00,11,Extreme Fear,2018-02-05


## Convert UNIX timestamp → UTC datetime

In [5]:
# Interpret the "timestamp" values as epoch seconds and store them as
# timezone-aware UTC datetimes.
raw_epoch = sentiment["timestamp"]
sentiment["timestamp"] = pd.to_datetime(raw_epoch, utc=True, unit="s")

## Ensure numeric

In [7]:
# Coerce the index value to a numeric dtype; entries that cannot be parsed
# become NaN instead of raising.
raw_values = sentiment["value"]
sentiment["value"] = pd.to_numeric(raw_values, errors="coerce")

## Standardize classification

In [8]:
# Normalise label text: trim surrounding whitespace, then Title Case each word.
stripped_labels = sentiment["classification"].str.strip()
sentiment["classification"] = stripped_labels.str.title()

## Check Duplicates

In [9]:
# Count repeated values within each column independently (this is a
# per-column check, not a duplicate-row check).
for column_name, column_values in sentiment.items():
    repeated = column_values.duplicated().sum()
    print(f"{column_name} : {repeated} duplicates")

timestamp : 0 duplicates
value : 2554 duplicates
classification : 2639 duplicates
date : 0 duplicates


## Check for unique values

In [11]:
# Report how many distinct values each column holds.
for column_name, distinct_count in sentiment.nunique().items():
    print(f"{column_name} : {distinct_count} uniques.")

timestamp : 2644 uniques.
value : 90 uniques.
classification : 5 uniques.
date : 2644 uniques.


## Check for Null/Missing values

In [14]:
# Count missing entries in each column.
sentiment.isna().sum(axis=0)

timestamp         0
value             0
classification    0
date              0
dtype: int64

## Sort by date & forward fill missing values

In [13]:
# Order rows chronologically, then carry the last seen value forward into any
# NaN cells.
# NOTE(review): ffill() only fills gaps inside existing rows; it does not add
# rows for calendar days that are absent from the data. Reindex to a daily
# range first if filling missing days is the intent.
sentiment = (
    sentiment
    .sort_values(by="date")
    .ffill()
)

In [15]:
# Display the frame after sorting and forward filling.
sentiment

Unnamed: 0,timestamp,value,classification,date
0,2018-02-01 05:30:00+00:00,30,Fear,2018-02-01
1,2018-02-02 05:30:00+00:00,15,Extreme Fear,2018-02-02
2,2018-02-03 05:30:00+00:00,40,Fear,2018-02-03
3,2018-02-04 05:30:00+00:00,24,Extreme Fear,2018-02-04
4,2018-02-05 05:30:00+00:00,11,Extreme Fear,2018-02-05
...,...,...,...,...
2639,2025-04-28 05:30:00+00:00,54,Neutral,2025-04-28
2640,2025-04-29 05:30:00+00:00,60,Greed,2025-04-29
2641,2025-04-30 05:30:00+00:00,56,Greed,2025-04-30
2642,2025-05-01 05:30:00+00:00,53,Neutral,2025-05-01


## Map classification → score

In [16]:
# Ordinal score per label: each step from "Extreme Fear" to "Extreme Greed"
# adds 25 points, giving 0, 25, 50, 75, 100.
mapping = {
    label: 25 * rank
    for rank, label in enumerate(
        ("Extreme Fear", "Fear", "Neutral", "Greed", "Extreme Greed")
    )
}

In [17]:
# Fail loudly on labels the mapping does not cover: Series.map would otherwise
# turn them into NaN scores that end up in the saved CSV unnoticed.
unmapped_labels = set(sentiment["classification"].dropna().unique()) - set(mapping)
if unmapped_labels:
    raise ValueError(f"Unmapped classification labels: {sorted(unmapped_labels)}")

# Translate each classification label into its numeric score.
sentiment["sentiment_score"] = sentiment["classification"].map(mapping)

In [18]:
# Display the frame with the new sentiment_score column.
sentiment

Unnamed: 0,timestamp,value,classification,date,sentiment_score
0,2018-02-01 05:30:00+00:00,30,Fear,2018-02-01,25
1,2018-02-02 05:30:00+00:00,15,Extreme Fear,2018-02-02,0
2,2018-02-03 05:30:00+00:00,40,Fear,2018-02-03,25
3,2018-02-04 05:30:00+00:00,24,Extreme Fear,2018-02-04,0
4,2018-02-05 05:30:00+00:00,11,Extreme Fear,2018-02-05,0
...,...,...,...,...,...
2639,2025-04-28 05:30:00+00:00,54,Neutral,2025-04-28,50
2640,2025-04-29 05:30:00+00:00,60,Greed,2025-04-29,75
2641,2025-04-30 05:30:00+00:00,56,Greed,2025-04-30,75
2642,2025-05-01 05:30:00+00:00,53,Neutral,2025-05-01,50


## Save cleaned sentiment

In [19]:

# Persist the cleaned frame as CSV without writing the row index.
sentiment.to_csv(
    "../data/preprocessed/sentiment_data/sentiment_clean.csv",
    index=False,
)