In [1]:
import pandas as pd

def load_large_csv_in_chunks(
    file_path: str,
    usecols: list[str] = None,
    filter_col: str = None,
    filter_values: list[str] = None,
    chunksize: int = 100_000,
    dropna_cols: list[str] = None,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Loads a large CSV file in chunks and returns a concatenated DataFrame.

    Each chunk is optionally cleaned (rows with NaN in `dropna_cols` removed)
    and then filtered (rows whose `filter_col` value is in `filter_values`
    kept) before being stored, so only the surviving rows are held in memory.

    Parameters:
    - file_path (str): Path to the CSV file.
    - usecols (list[str], optional): Columns to load. None loads every column.
    - filter_col (str, optional): Column to apply filtering on.
    - filter_values (list[str], optional): Values to keep in filter_col.
      Filtering only happens when both filter_col and filter_values are given.
      An empty collection keeps no rows (it is not treated as "no filter").
    - chunksize (int): Number of rows per chunk.
    - dropna_cols (list[str], optional): Drop rows with NaN in these columns.
    - verbose (bool): Print one progress line per chunk and a final row count.
      Defaults to True, which matches the previous behaviour.

    Returns:
    - pd.DataFrame: Filtered and loaded data in memory, with a fresh 0..n-1 index.

    Raises:
    - KeyError: If filter_col or an entry of dropna_cols is not a loaded column.
    - Any exception raised by pd.read_csv (missing file, bad usecols, ...).
    """
    # `is not None` instead of truthiness: an empty list must filter everything
    # out, and array-like values must not trigger an ambiguous-truth error.
    apply_filter = bool(filter_col) and filter_values is not None

    chunks = []

    # The context manager closes the underlying file handle even when a chunk
    # fails part-way through (e.g. a KeyError from a missing column).
    with pd.read_csv(file_path, usecols=usecols, chunksize=chunksize) as reader:
        for i, chunk in enumerate(reader):
            if verbose:
                print(f"🔄 Processing chunk {i + 1}")

            if dropna_cols:
                chunk = chunk.dropna(subset=dropna_cols)

            if apply_filter:
                chunk = chunk[chunk[filter_col].isin(filter_values)]

            chunks.append(chunk)

    # Single concat at the end; per-chunk indexes are discarded.
    df = pd.concat(chunks, ignore_index=True)
    if verbose:
        print(f"✅ Loaded {len(df):,} rows into memory.")
    return df


In [2]:
# Tickers for which a sentiment export is loaded.
stocks = [
    "AAPL",
    "MSFT",
    "GOOGL",
    "AMZN",
    "NVDA",
    "META",
    "TSLA",
]

# Maps ticker -> DataFrame read from that ticker's sentiment CSV.
stock_text_dfs = dict()

# NOTE(review): the directory below is an absolute, machine-specific path;
# consider moving it into one configurable constant near the imports.
for stock in stocks:
    file_path = f"/Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Sentiment Data/{stock}_sentiment_output.csv"
    print(f"\n Loading: {file_path}")
    # Read in 10,000-row chunks; no column selection, filtering or NaN dropping.
    df = load_large_csv_in_chunks(chunksize=10_000, file_path=file_path)
    stock_text_dfs[stock] = df


 Loading: /Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Sentiment Data/AAPL_sentiment_output.csv
🔄 Processing chunk 1
🔄 Processing chunk 2
🔄 Processing chunk 3
🔄 Processing chunk 4
🔄 Processing chunk 5
🔄 Processing chunk 6
🔄 Processing chunk 7
🔄 Processing chunk 8
🔄 Processing chunk 9
🔄 Processing chunk 10
🔄 Processing chunk 11
✅ Loaded 104,345 rows into memory.

 Loading: /Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Sentiment Data/MSFT_sentiment_output.csv
🔄 Processing chunk 1
🔄 Processing chunk 2
🔄 Processing chunk 3
🔄 Processing chunk 4
🔄 Processing chunk 5
✅ Loaded 45,110 rows into memory.

 Loading: /Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Sentiment Data/GOOGL_sentiment_output.csv
🔄 Processing chunk 1
🔄 Processing chunk 2
🔄 Processing chunk 3
🔄 Processing chunk 4
🔄 Processing chunk 5
🔄 Processing chunk 6
🔄 Processing chunk 7
✅ Loaded 60,794 rows into memory.

 Loading: /Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Sentiment Data/AMZN_sent

In [3]:
# Preview the first five rows of the AAPL sentiment frame.
stock_text_dfs["AAPL"].head(n=5)

Unnamed: 0,index,Date,sentiment_vader,sentiment_finbert,sentiment_roberta
0,36,2023-09-25 00:00:00 UTC,5.0,5.0,2.5
1,92,2023-06-02 00:00:00 UTC,5.0,5.0,2.5
2,258,2023-01-27 00:00:00 UTC,5.0,0.0,5.0
3,374,2022-09-29 00:00:00 UTC,5.0,2.5,2.5
4,1108,2020-10-27 00:00:00 UTC,5.0,5.0,2.5


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the AAPL sentiment frame.
stock_text_dfs["AAPL"].describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,index,sentiment_vader,sentiment_finbert,sentiment_roberta
count,104345.0,104345.0,104345.0,104345.0
mean,6222204.0,3.535483,4.168288,2.903062
std,4707213.0,1.73728,1.512117,1.121295
min,36.0,0.0,0.0,0.0
25%,1878172.0,2.5,2.5,2.5
50%,5474242.0,5.0,5.0,2.5
75%,10201050.0,5.0,5.0,2.5
max,15549290.0,5.0,5.0,5.0


In [5]:
# Tickers for which an auxiliary CSV is loaded.
# "META" is deliberately left out: the original note marks it as missing
# (presumably there is no Auxiliary Data file for it; confirm).
# NOTE(review): this rebinds `stocks`, so it no longer matches the keys of
# the sentiment dictionary built earlier.
stocks = [
    "AAPL",
    "MSFT",
    "GOOGL",
    "AMZN",
    "NVDA",
    "TSLA",
]

# Maps ticker -> DataFrame read from that ticker's auxiliary CSV.
stock_num_dfs = dict()

# NOTE(review): the directory below is an absolute, machine-specific path;
# consider moving it into one configurable constant near the imports.
for stock in stocks:
    file_path = f"/Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Auxiliary Data/{stock}.csv"
    print(f"\n Loading: {file_path}")
    # Read in 10,000-row chunks; no column selection, filtering or NaN dropping.
    df = load_large_csv_in_chunks(chunksize=10_000, file_path=file_path)
    stock_num_dfs[stock] = df


 Loading: /Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Auxiliary Data/AAPL.csv
🔄 Processing chunk 1
🔄 Processing chunk 2
✅ Loaded 10,852 rows into memory.

 Loading: /Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Auxiliary Data/MSFT.csv
🔄 Processing chunk 1
✅ Loaded 9,526 rows into memory.

 Loading: /Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Auxiliary Data/GOOGL.csv
🔄 Processing chunk 1
✅ Loaded 3,932 rows into memory.

 Loading: /Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Auxiliary Data/AMZN.csv
🔄 Processing chunk 1
✅ Loaded 6,700 rows into memory.

 Loading: /Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Auxiliary Data/NVDA.csv
🔄 Processing chunk 1
✅ Loaded 6,275 rows into memory.

 Loading: /Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Auxiliary Data/TSLA.csv
🔄 Processing chunk 1
✅ Loaded 3,399 rows into memory.


In [6]:
# Preview the first five rows of the AAPL auxiliary frame.
stock_num_dfs["AAPL"].head(n=5)

Unnamed: 0,date,open,high,low,close,adj close,volume
0,2023-12-28,194.139999,194.660004,193.169998,193.580002,193.580002,34014500
1,2023-12-27,192.490005,193.5,191.089996,193.149994,193.149994,48087700
2,2023-12-26,193.610001,193.889999,192.830002,193.050003,193.050003,28919300
3,2023-12-22,195.179993,195.410004,192.970001,193.600006,193.600006,37122800
4,2023-12-21,196.100006,197.080002,193.5,194.679993,194.679993,46482500


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the AAPL auxiliary frame.
stock_num_dfs["AAPL"].describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,open,high,low,close,adj close,volume
count,10852.0,10852.0,10852.0,10852.0,10852.0,10852.0
mean,43.751011,44.215337,43.299883,43.776632,41.7999,85749330.0
std,67.960381,68.688843,67.310474,68.041329,66.836958,83085330.0
min,0.112723,0.112723,0.112723,0.112723,0.087228,0.0
25%,1.178571,1.205357,1.151786,1.178571,0.988569,34442100.0
50%,2.035714,2.071429,2.008929,2.044643,1.713047,60414900.0
75%,75.580715,76.124283,74.780712,75.690712,67.453892,106217100.0
max,367.850006,372.380005,363.910004,366.529999,366.529999,1855410000.0
