In [None]:
# Data cleaning for sentiment CSV -> produce `stock.csv`
# Steps:
# - find latest `sentiment_stock_news_*.csv`
# - normalize columns (dates, strings, numeric scores)
# - standardize recommendations to Buy/Sell/Hold
# - drop duplicates (symbol + title + published)
# - save cleaned file as `stock.csv`

import os, glob, sys, subprocess
from datetime import datetime

# ensure pandas installed
def ensure_pkg(module_name, pip_name=None):
    """Make sure ``module_name`` can be imported, installing it with pip if not.

    Args:
        module_name: Name of the module as used in an ``import`` statement.
        pip_name: Name to pass to ``pip install`` when it differs from the
            import name. Falls back to ``module_name`` when falsy.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command exits
            with a non-zero status.
    """
    import importlib

    try:
        importlib.import_module(module_name)
    except ImportError:
        name = pip_name if pip_name else module_name
        # Run pip through the current interpreter so the package is installed
        # into the same environment this script is running in.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', name])
        # The import system caches directory contents; without this, a module
        # installed after interpreter start-up may not be found by the next
        # ``import`` in this same process.
        importlib.invalidate_caches()

# Install pandas on demand first; the import below is deliberately placed
# after this call so that a missing package is installed before it is imported.
ensure_pkg('pandas')

import pandas as pd

# Find the sentiment export to clean. Candidates are matched in the current
# working directory and ordered by file name, so the "latest" file is simply
# the one that sorts last lexicographically.
pattern = os.path.join(os.getcwd(), 'sentiment_stock_news_*.csv')
files = glob.glob(pattern)
files.sort()

if files:
    sent_file = files[-1]
else:
    # Nothing to clean: fail loudly and show the pattern that was searched.
    raise FileNotFoundError(f'No sentiment CSV files found with pattern: {pattern}')

print('Loading:', sent_file)

# Read the CSV with every column as a string. keep_default_na=False keeps blank
# cells as '' instead of NaN, so the string handling below sees real strings.
df = pd.read_csv(sent_file, dtype=str, keep_default_na=False)
orig_rows = len(df)

# Normalize column names: strip surrounding whitespace and lowercase them, so a
# header such as 'Symbol' or ' Date ' is recognized as the expected lowercase
# column instead of being shadowed by a blank placeholder column below.
# A name is only lowercased when that does not collide with another column, so
# no duplicate labels are introduced and already-lowercase files are untouched.
stripped_names = [c.strip() for c in df.columns]
taken = set(stripped_names)
normalized_names = []
for col_name in stripped_names:
    lowered = col_name.lower()
    if lowered != col_name and lowered not in taken:
        taken.add(lowered)
        col_name = lowered
    normalized_names.append(col_name)
df.columns = normalized_names

# Guarantee every expected column exists (filled with empty strings) so the
# later steps can index them unconditionally.
expected = ['date','company','symbol','title','source','published','url','sentiment_score','recommendation']
for col in expected:
    if col not in df.columns:
        df[col] = ''

# Strip leading/trailing whitespace from the free-text columns.
str_cols = ['company','symbol','title','source','url','recommendation']
for name in str_cols:
    as_text = df[name].astype(str)
    df[name] = as_text.str.strip()

# 'date' -> plain calendar dates; values that cannot be parsed become missing.
date_ts = pd.to_datetime(df['date'], errors='coerce')
df['date_parsed'] = date_ts.dt.date

# 'published' -> UTC timestamps; values that cannot be parsed become NaT.
df['published_parsed'] = pd.to_datetime(df['published'], utc=True, errors='coerce')

# 'sentiment_score' -> numeric; non-numeric text becomes NaN.
df['sentiment_score'] = pd.to_numeric(df['sentiment_score'], errors='coerce')

# Collapse recommendations onto the three canonical labels.
# Values are title-cased first (so 'BUY' / 'buy' -> 'Buy'); anything that is
# still not one of the canonical labels, including blanks, falls back to 'Hold'.
valid = {'Buy','Sell','Hold'}
raw_rec = df['recommendation']
titled_rec = raw_rec.str.title().where(raw_rec.notna(), '')
df['recommendation_clean'] = titled_rec.where(titled_rec.isin(valid), 'Hold')

# Keep only rows that have a non-empty symbol and a successfully parsed date.
before_drop = len(df)
has_symbol = df['symbol'].astype(bool)
has_date = df['date_parsed'].notna()
df = df.loc[has_symbol & has_date].copy()
after_drop = len(df)

# De-duplicate on (upper-cased symbol, title, parsed published timestamp),
# keeping the first occurrence. The three parts are joined into one string key.
# NOTE(review): rows whose 'published' could not be parsed all contribute the
# same 'NaT' text to the key, so such rows with equal symbol and title collapse
# into one - confirm this is intended.
symbol_part = df['symbol'].str.upper()
title_part = df['title'].fillna('')
published_part = df['published_parsed'].astype(str)
df['dedupe_key'] = symbol_part + '|' + title_part + '|' + published_part
is_repeat = df.duplicated(subset=['dedupe_key'], keep='first')
df = df[~is_repeat].copy()

# Assemble the output table: replace the raw date/published/recommendation
# columns with their cleaned versions, then keep only the output columns in
# their final order.
published_iso = df['published_parsed'].dt.strftime('%Y-%m-%dT%H:%M:%SZ').fillna('')
output_cols = [
    'date',
    'company',
    'symbol',
    'title',
    'source',
    'published',
    'url',
    'sentiment_score',
    'recommendation',
]
clean = df.assign(
    date=df['date_parsed'].astype(str),
    published=published_iso,  # '' where the timestamp could not be parsed
    recommendation=df['recommendation_clean'],
)[output_cols].copy()

# Write the cleaned table next to the input, without the index column.
out_path = os.path.join(os.getcwd(), 'stock.csv')
clean.to_csv(out_path, index=False)

# Report what the cleaning run did: row counts at each stage, the label
# distribution, the covered date range and where the result was written.
removed = before_drop - after_drop
final_rows = len(clean)
rec_counts = clean['recommendation'].value_counts(dropna=False)
earliest = clean['date'].min()
latest = clean['date'].max()

print(f'Original rows: {orig_rows}')
print(f'Rows after removing missing symbol/date: {after_drop} (removed {removed})')
print(f'Rows after deduplication: {final_rows}')
print('\nRecommendation counts:')
print(rec_counts)
print('\nDate range: ', earliest, '->', latest)
print('\nSaved cleaned data to', out_path)
