In [1]:
import os
import pandas as pd

FILE_PATH = '../data/labeled_multilingual_data.csv'

# Files at or below this many bytes are treated as essentially empty
# (only a header row or similarly minimal content).
MIN_USEFUL_SIZE_BYTES = 100


def describe_data_file(path, min_useful_bytes=MIN_USEFUL_SIZE_BYTES):
    """Check that ``path`` is a regular file and describe its size.

    Args:
        path: Location of the file to inspect.
        min_useful_bytes: Sizes less than or equal to this value trigger the
            "too small" warning instead of the "adequate" message.

    Returns:
        A ``(size_bytes, report_lines)`` tuple. ``size_bytes`` is the file
        size in bytes, or ``None`` when ``path`` is not a regular file.
        ``report_lines`` is the list of status messages to print, in order.
    """
    # isfile() rather than exists(): a directory sitting at this path would
    # otherwise be reported as "File found!" with a meaningless size.
    if not os.path.isfile(path):
        return None, ["❌ ERROR: File not found. The output of Step 4 was never created."]

    size_bytes = os.path.getsize(path)
    report_lines = [
        "✅ File found!",
        f"File Size: {size_bytes / 1024:.2f} KB",
    ]
    if size_bytes <= min_useful_bytes:
        report_lines.append(
            "⚠️ WARNING: File is too small. It likely contains only headers or minimal data."
        )
    else:
        report_lines.append("File size looks adequate.")
    return size_bytes, report_lines


file_size_bytes, file_report = describe_data_file(FILE_PATH)
print("\n".join(file_report))
if file_size_bytes is not None:
    # Kept as a module-level value in case other cells use the size in KB.
    file_size_kb = file_size_bytes / 1024

✅ File found!
File Size: 5002.87 KB
File size looks adequate.


In [2]:
def summarize_labeled_data(df):
    """Build the sanity-check report for a loaded dataset.

    Args:
        df: DataFrame to summarise.

    Returns:
        The list of messages to print, in order: the row count, then either
        an "empty file" error, or a preview of the first rows followed by the
        'sentiment' label distribution (or an error if that column is absent).
    """
    report_lines = [f"\nTotal rows in file: {len(df)}"]

    if len(df) == 0:
        report_lines.append("❌ ERROR: File is empty (0 rows).")
        return report_lines

    report_lines.append("✅ Data successfully loaded. First 5 rows:")
    report_lines.append(str(df.head()))

    # Check for required columns
    if 'sentiment' in df.columns:
        # dropna=False so rows with a missing label are counted too; otherwise
        # the distribution can silently sum to less than the row total.
        label_counts = df['sentiment'].value_counts(dropna=False)
        report_lines.append(f"\nSentiment distribution:\n{label_counts}")
    else:
        report_lines.append("❌ ERROR: 'sentiment' column is missing!")
    return report_lines


try:
    df_check = pd.read_csv(FILE_PATH, encoding='utf-8')
except Exception as e:
    # Deliberately broad: this is a diagnostic cell, and any failure while
    # reading should be reported with its details rather than halt the notebook.
    print(f"❌ ERROR: Failed to read file. It might be corrupted. Details: {e}")
else:
    # Reporting runs outside the try so that a problem in the summary itself
    # is not mislabelled as a corrupted file.
    print("\n".join(summarize_labeled_data(df_check)))


Total rows in file: 4856
✅ Data successfully loaded. First 5 rows:
                                                text language  \
0         Grok is openly rebelling against its owner       en   
1  Graphic designers panicking about losing their...       en   
2                              He s absolutely right       en   
3  Elon Musk s AI chatbot estimates 75-85 likelih...       en   
4  UAE deposited 2 billion in Trump s crypto firm...       en   

        source_type source_name sentiment         emotion  
0  Reddit_Subreddit  artificial  negative  N/A (Excluded)  
1  Reddit_Subreddit  artificial  negative  N/A (Excluded)  
2  Reddit_Subreddit  artificial   neutral  N/A (Excluded)  
3  Reddit_Subreddit  artificial   neutral  N/A (Excluded)  
4  Reddit_Subreddit  artificial   neutral  N/A (Excluded)  

Sentiment distribution:
sentiment
negative    2294
neutral     1984
positive     578
Name: count, dtype: int64
