In [1]:
import pandas as pd
import os

# Ensure the output directory for cleaned fundamentals is present
# (exist_ok=True makes this a no-op when the directory already exists)
os.makedirs(name="../data/processed/fundamental", exist_ok=True)

def clean_fundamental_data(ticker):
    """Clean one ticker's raw fundamentals CSV and save the processed copy.

    Reads ``../data/raw/fundamental/{ticker}_fundamentals.csv``, normalizes the
    column names (lower-case, spaces removed), keeps the first matching source
    column for each canonical field, adds ``debt_to_equity`` and ``roe`` when
    their inputs are available, parses the ``date`` column to datetimes, and
    writes the result to
    ``../data/processed/fundamental/{ticker}_fundamental_clean.csv``.

    Args:
        ticker: Symbol used to build the input and output file names.

    Returns:
        The cleaned DataFrame, or ``None`` if any step failed (the error is
        printed rather than raised so a batch run can continue).
    """
    # Canonical output column -> accepted normalized source names, in
    # priority order (the first name present in the file wins).
    col_mapping = {
        'date': ['fiscaldateending'],
        'netincome': ['netincome', 'netinc', 'profit', 'netprofit'],
        'totalliabilities': ['totalliabilities', 'liabilities'],
        'totalshareholderequity': ['totalshareholderequity', 'equity', 'shareholderequity'],
        'totalassets': ['totalassets', 'assets'],
        'cashandcashequivalentsatcarryingvalue': ['cashandcashequivalentsatcarryingvalue', 'cash', 'cashandequivalents']
    }
    output_dir = "../data/processed/fundamental"

    try:
        # Load data
        df = pd.read_csv(f"../data/raw/fundamental/{ticker}_fundamentals.csv")

        # Normalize column names; regex=False treats the space as a literal
        # regardless of the installed pandas version's default.
        df.columns = df.columns.str.lower().str.replace(' ', '', regex=False)

        # Pick the first available source column for every canonical field;
        # fields with no match are simply left out of the result.
        selected_cols = {}
        for target, possible_names in col_mapping.items():
            source = next((name for name in possible_names if name in df.columns), None)
            if source is not None:
                selected_cols[target] = source

        # Build the cleaned frame under the canonical column names
        clean_df = pd.DataFrame()
        for target, source in selected_cols.items():
            clean_df[target] = df[source]

        # Calculate financial ratios only when both inputs were found
        has_equity = 'totalshareholderequity' in clean_df
        if has_equity and 'totalliabilities' in clean_df:
            clean_df['debt_to_equity'] = clean_df['totalliabilities'] / clean_df['totalshareholderequity']

        if has_equity and 'netincome' in clean_df:
            clean_df['roe'] = clean_df['netincome'] / clean_df['totalshareholderequity']

        # Convert dates. The column has already been renamed to its canonical
        # name 'date', so that is the key to check (checking the source name
        # 'fiscaldateending' here would never match and skip the conversion).
        if 'date' in clean_df:
            clean_df['date'] = pd.to_datetime(clean_df['date'])

        # Save cleaned data; make sure the target directory exists so the
        # function does not depend on module-level setup having run.
        os.makedirs(output_dir, exist_ok=True)
        clean_df.to_csv(f"{output_dir}/{ticker}_fundamental_clean.csv", index=False)
        return clean_df

    except Exception as e:
        # Batch boundary: report the failure and let the caller move on
        print(f"❌ Error processing {ticker}: {str(e)}")
        return None

# Clean every ticker in turn and preview the result of each successful run
tickers = ['AAPL', 'JPM', 'AMZN', 'PFE', 'XOM']
for ticker in tickers:
    print(f"\nProcessing {ticker}...")
    data = clean_fundamental_data(ticker)
    if data is None:
        # Nothing to preview for a failed ticker; move on to the next one
        continue
    print(f"✅ Successfully processed {ticker}")
    print(data.head())


Processing AAPL...
✅ Successfully processed AAPL
         date  totalliabilities  totalshareholderequity   totalassets  \
0  2024-09-30      308030000000             56950000000  364980000000   
1  2023-09-30      290437000000             62146000000  352583000000   
2  2022-09-30      302083000000             50672000000  352755000000   
3  2021-09-30      287912000000             63090000000  351002000000   
4  2020-09-30      258549000000             65339000000  323888000000   

   cashandcashequivalentsatcarryingvalue  debt_to_equity  
0                            29943000000        5.408780  
1                            29965000000        4.673462  
2                            23646000000        5.961537  
3                            34940000000        4.563512  
4                            38016000000        3.957039  

Processing JPM...
✅ Successfully processed JPM
         date  totalliabilities  totalshareholderequity    totalassets  \
0  2024-12-31     3658056000000    