In [1]:
# VesterAI - Notebook 02: Data Cleaning & Preprocessing

"""
Objective:
Clean and preprocess raw data (Twitter, Reddit, News, and Stock Prices) to:
- Remove noise (URLs, mentions, emojis, symbols)
- Lowercase, normalize text
- Filter by language (English)
- Unify timestamps and structures
- Save cleaned outputs in /data/processed/

Input: ../data/raw/
Output: ../data/processed/
"""

'\nObjective:\nClean and preprocess raw data (Twitter, Reddit, News, and Stock Prices) to:\n- Remove noise (URLs, mentions, emojis, symbols)\n- Lowercase, normalize text\n- Filter by language (English)\n- Unify timestamps and structures\n- Save cleaned outputs in /data/processed/\n\nInput: ../data/raw/\nOutput: ../data/processed/\n'

In [5]:
!pip install emoji langdetect
!pip install -U jupyterlab ipywidgets jupyterlab-widgets

Defaulting to user installation because normal site-packages is not writeable
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 3.8 MB/s eta 0:00:01
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [done
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993221 sha256=f1e33927a624ba6c7198fc4f0214a78d43f99076c531849c48aeb5b179d1e346
  Stored in directory: /home/tandel.r/.cache/pip/wheels/13/c7/b0/79f66658626032e78fc1a83103690ef6797d551cb22e56e734
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [6]:
import os
import pandas as pd
import re
import string
import emoji
from langdetect import detect
from tqdm import tqdm

# Paths
raw_path = "../data/raw/"
processed_path = "../data/processed/"
os.makedirs(processed_path, exist_ok=True)

print(f"Cleaned data will be saved in: {processed_path}")

Cleaned data will be saved in: ../data/processed/


In [7]:
# Load all available raw data
stock_df = pd.read_csv(os.path.join(raw_path, "AAPL_stock_data.csv"))
news_df = pd.read_csv(os.path.join(raw_path, "AAPL_google_news.csv"))
tweet_df = pd.read_csv(os.path.join(raw_path, "AAPL_twitter_api.csv"))
reddit_path = os.path.join(raw_path, "AAPL_reddit_posts.csv")
reddit_df = pd.read_csv(reddit_path) if os.path.exists(reddit_path) else pd.DataFrame()

print("Data Loaded:")
print(f"Stock data: {stock_df.shape}")
print(f"News data: {news_df.shape}")
print(f"Twitter data: {tweet_df.shape}")
print(f"Reddit data: {reddit_df.shape if not reddit_df.empty else 'Not available'}")

Data Loaded:
Stock data: (1314, 6)
News data: (20, 4)
Twitter data: (100, 6)
Reddit data: (50, 6)


In [8]:
def clean_text(text):
    if pd.isnull(text): return ""
    # Remove URLs
    text = re.sub(r"http\S+|www.\S+", "", text)
    # Remove mentions and hashtags
    text = re.sub(r"@\w+|#\w+", "", text)
    # Remove emojis
    text = emoji.replace_emoji(text, replace="")
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Lowercase
    text = text.lower()
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [9]:
tqdm.pandas()

# Clean tweets
tweet_df["clean_text"] = tweet_df["text"].progress_apply(clean_text)
tweet_df["date"] = pd.to_datetime(tweet_df["date"])
tweet_df.dropna(subset=["clean_text"], inplace=True)

# Optional: Remove non-English tweets
def is_english(text):
    try:
        return detect(text) == "en"
    except:
        return False

tweet_df = tweet_df[tweet_df["clean_text"].progress_apply(is_english)]

print(f"Cleaned Tweets: {tweet_df.shape}")
tweet_df[["date", "clean_text"]].head()


100%|██████████| 100/100 [00:00<00:00, 5396.55it/s]
100%|██████████| 100/100 [00:00<00:00, 103.52it/s]

Cleaned Tweets: (95, 7)





Unnamed: 0,date,clean_text
0,2025-03-25 22:54:18+00:00,best stock traders group out there updates cha...
1,2025-03-25 22:53:02+00:00,rt over the long term its hard for a stock to ...
2,2025-03-25 22:51:18+00:00,best stock trade group out there free chatroom...
3,2025-03-25 22:50:18+00:00,bast stock group discord kfc solo w jagx ino b...
4,2025-03-25 22:47:40+00:00,bast stock group discord mac byfc tsla spy box...


In [12]:
news_df["clean_headline"] = news_df["headline"].progress_apply(clean_text)
news_df["date"] = pd.to_datetime(news_df["date"])
news_df.dropna(subset=["clean_headline"], inplace=True)

print(f"Cleaned News: {news_df.shape}")
news_df[["date", "clean_headline"]].head()

100%|██████████| 20/20 [00:00<00:00, 9675.44it/s]

Cleaned News: (20, 5)





Unnamed: 0,date,clean_headline
0,2025-03-25,nasdaqmoreaapl quantitative stock analysis9 ho...
1,2025-03-25,tipranksmoreapple aapl stock shoots higher on ...
2,2025-03-25,seeking alphamoreapple buy now before the ipho...
3,2025-03-25,yahoo financemoreapple inc aapl among the 10 g...
4,2025-03-25,markets insidermoredon’t expect an ai upgrade ...


In [13]:
if not reddit_df.empty:
    reddit_df["full_text"] = reddit_df["title"].fillna("") + " " + reddit_df["content"].fillna("")
    reddit_df["clean_text"] = reddit_df["full_text"].progress_apply(clean_text)
    reddit_df["date"] = pd.to_datetime(reddit_df["date"])
    reddit_df = reddit_df[reddit_df["clean_text"].progress_apply(is_english)]
    
    print(f"Cleaned Reddit: {reddit_df.shape}")
    reddit_df[["date", "clean_text"]].head()
else:
    print("Reddit data not available, skipping.")

100%|██████████| 50/50 [00:00<00:00, 573.85it/s]
100%|██████████| 50/50 [00:00<00:00, 155.16it/s]

Cleaned Reddit: (50, 8)





In [14]:
# Convert date to datetime
stock_df["Date"] = pd.to_datetime(stock_df["Date"])
stock_df = stock_df.dropna()
print(f"Stock price data cleaned: {stock_df.shape}")
stock_df.head()

Stock price data cleaned: (1313, 6)


Unnamed: 0,Date,Close,High,Low,Open,Volume
1,2020-01-02,72.71607208251953,72.77659819422657,71.46681225027338,71.72101896406637,135480400
2,2020-01-03,72.00912475585938,72.7717522953066,71.78396939069293,71.94133580542943,146322800
3,2020-01-06,72.5829086303711,72.62164622763687,70.87607527260708,71.12786596061405,118387200
4,2020-01-07,72.2415542602539,72.84923143823697,72.02123831231323,72.59260129853506,108872000
5,2020-01-08,73.40364837646484,73.70627893727402,71.943758846659,71.943758846659,132079200


In [15]:
tweet_df.to_csv(os.path.join(processed_path, "AAPL_twitter_cleaned.csv"), index=False)
news_df.to_csv(os.path.join(processed_path, "AAPL_news_cleaned.csv"), index=False)
stock_df.to_csv(os.path.join(processed_path, "AAPL_stock_cleaned.csv"), index=False)
if not reddit_df.empty:
    reddit_df.to_csv(os.path.join(processed_path, "AAPL_reddit_cleaned.csv"), index=False)

print("Cleaned data saved to /data/processed/")

Cleaned data saved to /data/processed/


In [16]:
print("Cleaning Summary:")
print(f"Tweets: {tweet_df.shape}")
print(f"News: {news_df.shape}")
print(f"Stock: {stock_df.shape}")
print(f"Reddit: {reddit_df.shape if not reddit_df.empty else 'N/A'}")

print("\nNext: Perform sentiment analysis in `03_sentiment_analysis.ipynb`")

Cleaning Summary:
Tweets: (95, 7)
News: (20, 5)
Stock: (1313, 6)
Reddit: (50, 8)

Next: Perform sentiment analysis in `03_sentiment_analysis.ipynb`
