# Combining Sentiment and Technical Datasets

In [1]:
import numpy as np
import pandas as pd
import os

# Keeping dataframes within viewing window
pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)

# Ticker name
tick = "NVDA"

# Load datasets
ndata_path = f"../{tick.lower()}_data/sentiments.csv"
sdata_path = f"../../stock_data/{tick.lower()}.csv"
sentiment_df = pd.read_csv(
    ndata_path,
    index_col="Date",
    dtype={
        "Lsa_confidence":np.float32,
        "Luhn_confidence":np.float32,
        "Textrank_confidence":np.float32,
        "Lexrank_confidence":np.float32,
        "Lsa_sentiment":np.bytes_,
        "Luhn_sentiment":np.bytes_,
        "Textrank_sentiment":np.bytes_,
        "Lexrank_sentiment":np.bytes_,
    },
)

stock_df = pd.read_csv(
    sdata_path,
    index_col="Date",
)

# Remove time from dates
sentiment_df.index = pd.to_datetime(sentiment_df.index, errors="coerce").date
stock_df.index = pd.to_datetime(stock_df.index, errors="coerce", utc=True).to_series().dt.date

# Reverse sentiment dataframe
sentiment_df = sentiment_df[::-1]

# Remove empty stock price/indicator data & other unnecessary columns
stock_df.dropna(inplace=True)
sentiment_df.drop("Stock_symbol", axis=1, inplace=True)

In [2]:
sentiment_df

Unnamed: 0,Lsa_sentiment,Lsa_confidence,Luhn_sentiment,Luhn_confidence,Textrank_sentiment,Textrank_confidence,Lexrank_sentiment,Lexrank_confidence
2021-08-17,neutral,0.883882,neutral,0.883882,neutral,0.883882,neutral,0.883882
2021-08-18,neutral,0.558872,neutral,0.558872,neutral,0.558872,neutral,0.558872
2021-08-18,positive,0.921450,positive,0.921450,positive,0.921450,positive,0.921450
2021-08-18,positive,0.694652,positive,0.694652,positive,0.694652,positive,0.694652
2021-08-18,neutral,0.757233,neutral,0.757233,neutral,0.757233,neutral,0.757233
...,...,...,...,...,...,...,...,...
2023-12-16,positive,0.936409,positive,0.936409,positive,0.936409,positive,0.936409
2023-12-16,positive,0.623835,positive,0.623835,positive,0.623835,positive,0.623835
2023-12-16,neutral,0.948334,neutral,0.948334,neutral,0.948334,neutral,0.948334
2023-12-16,neutral,0.483542,neutral,0.483542,neutral,0.483542,neutral,0.483542


In [3]:
stock_df

Unnamed: 0_level_0,Close,Dividends,High,Low,Open,Stock Splits,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-03-03,0.478412,0.0,0.489415,0.463970,0.486664,0.0,1593704000
2011-03-04,0.475890,0.0,0.483684,0.470618,0.478870,0.0,959760000
2011-03-07,0.469243,0.0,0.480933,0.457322,0.479329,0.0,1019140000
2011-03-08,0.448153,0.0,0.476807,0.438296,0.474286,0.0,1947184000
2011-03-09,0.438754,0.0,0.451133,0.434857,0.446778,0.0,1318976000
...,...,...,...,...,...,...,...
2023-12-11,46.606258,0.0,47.509856,45.809616,47.469876,0.0,509728000
2023-12-12,47.635803,0.0,47.644798,46.025520,46.025520,0.0,372387000
2023-12-13,48.066616,0.0,48.572392,47.586830,47.607821,0.0,447792000
2023-12-14,48.328491,0.0,48.648349,47.400906,48.368474,0.0,391232000


In [4]:
# Summary AI model names
summ_bots = ["Lsa", "Luhn", "Textrank", "Lexrank"]

# Create each summary score
for bot in summ_bots:
    sentiment_df[bot] = np.where(
        sentiment_df[bot + "_sentiment"] == "positive",
        sentiment_df[bot + "_confidence"],
        -sentiment_df[bot + "_confidence"]
    )

# Drop unnecessary columns
sentiment_df.drop([bot + "_sentiment" for bot in summ_bots], axis=1, inplace=True)
sentiment_df.drop([bot + "_confidence" for bot in summ_bots], axis=1, inplace=True)

sentiment_df

Unnamed: 0,Lsa,Luhn,Textrank,Lexrank
2021-08-17,-0.883882,-0.883882,-0.883882,-0.883882
2021-08-18,-0.558872,-0.558872,-0.558872,-0.558872
2021-08-18,0.921450,0.921450,0.921450,0.921450
2021-08-18,0.694652,0.694652,0.694652,0.694652
2021-08-18,-0.757233,-0.757233,-0.757233,-0.757233
...,...,...,...,...
2023-12-16,0.936409,0.936409,0.936409,0.936409
2023-12-16,0.623835,0.623835,0.623835,0.623835
2023-12-16,-0.948334,-0.948334,-0.948334,-0.948334
2023-12-16,-0.483542,-0.483542,-0.483542,-0.483542


In [5]:
# Sum by date
sentiment_df = sentiment_df.groupby(sentiment_df.index).sum()
sentiment_df

Unnamed: 0,Lsa,Luhn,Textrank,Lexrank
2021-08-17,-0.883882,-0.883882,-0.883882,-0.883882
2021-08-18,-5.309715,-5.309715,-5.309715,-5.309715
2021-08-19,1.202789,1.202789,1.202789,1.202789
2021-08-20,-4.399806,-4.399806,-4.399806,-4.399806
2021-08-21,-0.916391,-0.916391,-0.916391,-0.916391
...,...,...,...,...
2023-12-12,-3.547586,-3.547586,-3.547586,-3.547586
2023-12-13,-0.665740,-0.665740,-0.665740,-0.665740
2023-12-14,3.056196,3.056196,3.056196,3.056196
2023-12-15,2.649434,2.649434,2.649434,2.649434


In [6]:
# Put the prices, indicators, and sentiments together
combined_df = pd.concat([stock_df, sentiment_df], axis=1)
# sentiment_df, stock_df = None, None

# Remove where no price data
combined_df.dropna(subset="Open", inplace=True)

combined_df

Unnamed: 0,Close,Dividends,High,Low,Open,Stock Splits,Volume,Lsa,Luhn,Textrank,Lexrank
2011-03-03,0.478412,0.0,0.489415,0.463970,0.486664,0.0,1.593704e+09,,,,
2011-03-04,0.475890,0.0,0.483684,0.470618,0.478870,0.0,9.597600e+08,,,,
2011-03-07,0.469243,0.0,0.480933,0.457322,0.479329,0.0,1.019140e+09,,,,
2011-03-08,0.448153,0.0,0.476807,0.438296,0.474286,0.0,1.947184e+09,,,,
2011-03-09,0.438754,0.0,0.451133,0.434857,0.446778,0.0,1.318976e+09,,,,
...,...,...,...,...,...,...,...,...,...,...,...
2023-12-11,46.606258,0.0,47.509856,45.809616,47.469876,0.0,5.097280e+08,-5.583224,-5.583224,-5.583224,-5.583224
2023-12-12,47.635803,0.0,47.644798,46.025520,46.025520,0.0,3.723870e+08,-3.547586,-3.547586,-3.547586,-3.547586
2023-12-13,48.066616,0.0,48.572392,47.586830,47.607821,0.0,4.477920e+08,-0.665740,-0.665740,-0.665740,-0.665740
2023-12-14,48.328491,0.0,48.648349,47.400906,48.368474,0.0,3.912320e+08,3.056196,3.056196,3.056196,3.056196


In [8]:
# Rename columns before export
combined_df = (
    combined_df[["Open", "Close", "High", "Low", "Volume", "Dividends", "Stock Splits", "Lsa", "Luhn", "Textrank", "Lexrank"]]
    .rename(columns=lambda col: col.lower())
    .rename_axis("date")
)

In [9]:
# Export full
out_dir = f"../{tick.lower()}_data"
combined_df.to_csv(os.path.join(out_dir, "full.csv"))

# Remove empty
combined_df.dropna(inplace=True)

# Export truncated
combined_df.to_csv(os.path.join(out_dir, "truncated.csv"))
print("✅ Export done.")
combined_df

✅ Export done.


Unnamed: 0_level_0,open,close,high,low,volume,dividends,stock splits,lsa,luhn,textrank,lexrank
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-08-17,19.643272,19.416754,19.728093,19.226159,204585000.0,0.0,0.0,-0.883882,-0.883882,-0.883882,-0.883882
2021-08-18,19.458665,18.999641,19.592382,18.959725,285913000.0,0.0,0.0,-5.309715,-5.309715,-5.309715,-5.309715
2021-08-19,19.452678,19.756035,20.451558,18.722230,766555000.0,0.0,0.0,1.202789,1.202789,1.202789,1.202789
2021-08-20,19.948629,20.771881,20.820777,19.890753,675741000.0,0.0,0.0,-4.399806,-4.399806,-4.399806,-4.399806
2021-08-23,20.926550,21.911457,21.950374,20.905594,575807000.0,0.0,0.0,0.094080,0.094080,0.094080,0.094080
...,...,...,...,...,...,...,...,...,...,...,...
2023-12-11,47.469876,46.606258,47.509856,45.809616,509728000.0,0.0,0.0,-5.583224,-5.583224,-5.583224,-5.583224
2023-12-12,46.025520,47.635803,47.644798,46.025520,372387000.0,0.0,0.0,-3.547586,-3.547586,-3.547586,-3.547586
2023-12-13,47.607821,48.066616,48.572392,47.586830,447792000.0,0.0,0.0,-0.665740,-0.665740,-0.665740,-0.665740
2023-12-14,48.368474,48.328491,48.648349,47.400906,391232000.0,0.0,0.0,3.056196,3.056196,3.056196,3.056196
