In [6]:
import pandas as pd
import glob
import os
import re

In [None]:

# Path to your folder
folder_path = './news_data'  # Update this if you're running from a different directory

# Get all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, '*_alpha_news_data.csv'))

# Read and concatenate all CSVs
df_list = [pd.read_csv(file) for file in csv_files]
all_data = pd.concat(df_list, ignore_index=True)

# Display the first few rows
print(all_data.head())


    published_date                                              title  \
0  20220301T080000  Tesla Rival Slashes 2022 Production Outlook; L...   
1  20220301T080000  Nasdaq moves into positive territory as broade...   
2  20220301T113014  BYD to use Baidu's autonomous driving technolo...   
3  20220301T163019  Chinese electric car makers' February sales de...   
4  20220302T080000  Box Stock Jumps As Fourth-Quarter Results, Out...   

                                             summary ticker  \
0  Lucid Stock Sinks As Tesla Rival Slashes 2022 ...   TSLA   
1  Live updates: Dow tumbles as Russia threatens ...   TSLA   
2  BYD, China's biggest electric vehicle (EV) bui...   TSLA   
3  China's three biggest makers of smart electric...   TSLA   
4  Box Stock Jumps As Earnings Beat Estimates Inv...   TSLA   

   ticker_sentiment_score ticker_sentiment_label  
0               -0.058479                Neutral  
1               -0.101763                Neutral  
2                0.124467    

In [8]:
# Define input and output directories
input_dir = './news_data'
output_dir = './merged'
output_file = 'merged_alpha_news_data.csv'

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Get all relevant CSV files
csv_files = glob.glob(os.path.join(input_dir, '*_alpha_news_data.csv'))

# Load and merge all CSVs
df_list = [pd.read_csv(file) for file in csv_files]
merged_df = pd.concat(df_list, ignore_index=True)

# Optional: list of columns to clean
text_columns = ['title', 'summary']

# Clean text columns
def clean_text(text):
    if isinstance(text, str):
        text = text.replace('\n', ' ').replace('\r', ' ')
        text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
        text = text.strip()
    return text

for col in text_columns:
    if col in merged_df.columns:
        merged_df[col] = merged_df[col].apply(clean_text)

# Save cleaned DataFrame
merged_df.to_csv(os.path.join(output_dir, output_file), index=False)

print(f"✅ Cleaned and saved merged data to {os.path.join(output_dir, output_file)}")

✅ Cleaned and saved merged data to ./merged/merged_alpha_news_data.csv


In [None]:
import pandas as pd
import os
import re
import glob

# Input and output paths
input_dir = './news_data'
output_base = './split'
cleaned_dir = './cleaned'

# Clean text function
def clean_text(text):
    if isinstance(text, str):
        text = text.replace('\n', ' ').replace('\r', ' ')
        text = re.sub(r'\s+', ' ', text).strip()
    return text

# Process each file separately
csv_files = glob.glob(os.path.join(input_dir, '*_alpha_news_data.csv'))

for file_path in csv_files:
    # Extract ticker name from filename
    base_name = os.path.basename(file_path)
    ticker = base_name.split('_')[0].upper()  # e.g., AAPL

    # Create ticker-specific output directory
    ticker_dir = os.path.join(output_base, ticker)
    os.makedirs(ticker_dir, exist_ok=True)

    # Load and clean the CSV
    df = pd.read_csv(file_path)

    for col in ['title', 'summary']:
        if col in df.columns:
            df[col] = df[col].apply(clean_text)
            
    # Save the entire cleaned CSV for this stock
    os.makedirs(cleaned_dir, exist_ok=True)
    cleaned_csv_path = os.path.join(cleaned_dir, f'{ticker}_alpha_news_data.csv')
    df.to_csv(cleaned_csv_path, index=False)

    # # Split into 1000-row chunks
    # chunk_size = 1000
    # total_rows = df.shape[0]
    # num_chunks = (total_rows + chunk_size - 1) // chunk_size

    # for i in range(num_chunks):
    #     start = i * chunk_size
    #     end = min(start + chunk_size, total_rows)
    #     chunk_df = df.iloc[start:end]

    #     # Save each chunk
    #     output_file = os.path.join(ticker_dir, f'stock_data_part_{i+1}.csv')
    #     chunk_df.to_csv(output_file, index=False)


✅ Processed NVDA: 20 file(s) saved in ./split/NVDA
✅ Processed AMZN: 20 file(s) saved in ./split/AMZN
✅ Processed TSLA: 20 file(s) saved in ./split/TSLA
✅ Processed NKE: 20 file(s) saved in ./split/NKE
✅ Processed AAPL: 20 file(s) saved in ./split/AAPL


In [13]:
import pandas as pd
import os
import glob

# Paths
input_dir = './stocks_data'
output_dir = './stocks_cleaned'
os.makedirs(output_dir, exist_ok=True)

for file in glob.glob(os.path.join(input_dir, '*.csv')):
    print(f"📂 Processing: {file}")

    # Load with headers
    df = pd.read_csv(file)

    # Clean 'Date' column
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce', utc=True)
        df['Date'] = df['Date'].dt.tz_convert(None)
        df.dropna(subset=['Date'], inplace=True)
    else:
        print(f"⚠️ Skipping {file} - no 'Date' column found")
        continue

    # Remove empty or unnamed columns
    df.dropna(axis=1, how='all', inplace=True)
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # Sort by date
    df.sort_values(by='Date', inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Save cleaned file
    out_path = os.path.join(output_dir, os.path.basename(file))
    df.to_csv(out_path, index=False)
    print(f"✅ Saved cleaned file to: {out_path}")


📂 Processing: ./stocks_data/NVDA_yahoo_data_0.csv
✅ Saved cleaned file to: ./stocks_cleaned/NVDA_yahoo_data_0.csv
📂 Processing: ./stocks_data/AAPL_yahoo_data_0.csv
✅ Saved cleaned file to: ./stocks_cleaned/AAPL_yahoo_data_0.csv
📂 Processing: ./stocks_data/NKE_yahoo_data_0.csv
✅ Saved cleaned file to: ./stocks_cleaned/NKE_yahoo_data_0.csv
📂 Processing: ./stocks_data/AMZN_yahoo_data_0.csv
✅ Saved cleaned file to: ./stocks_cleaned/AMZN_yahoo_data_0.csv
📂 Processing: ./stocks_data/TSLA_yahoo_data_0.csv
✅ Saved cleaned file to: ./stocks_cleaned/TSLA_yahoo_data_0.csv
