In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Expected locations of the raw dataset CSVs, relative to the working directory.
IMDB_PATH = Path('data/external') / 'imdb_reviews.csv'
S140_PATH = Path('data/external') / 'sentiment140.csv'
AMAZON_TRAIN = Path('bz2') / 'amazon_train.csv'
AMAZON_TEST = Path('bz2') / 'amazon_test.csv'
FILES = [
    IMDB_PATH,
    S140_PATH,
    AMAZON_TRAIN,
    AMAZON_TEST,
]

# Report up front which of the inputs are actually present on disk.
for dataset_path in FILES:
    is_present = dataset_path.exists()
    print(f"Exists? {dataset_path}: {is_present}")

def load_or_empty(path, encoding=None):
    """Read a CSV file into a DataFrame, or return an empty one if unavailable.

    Args:
        path: Location of the CSV file, as a ``str`` or ``pathlib.Path``.
        encoding: Text encoding forwarded to ``pandas.read_csv``; ``None``
            leaves the pandas default in effect.

    Returns:
        The parsed DataFrame, or an empty ``pandas.DataFrame`` when ``path``
        is not an existing regular file or the file has no content to parse.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    # is_file() also rejects directories, which read_csv cannot open.
    if not path.is_file():
        return pd.DataFrame()
    try:
        return pd.read_csv(path, encoding=encoding)
    except pd.errors.EmptyDataError:
        # A file with nothing to parse is treated like a missing file so
        # callers can rely on the `.empty` check in both cases.
        return pd.DataFrame()

# Load each dataset; a missing file yields an empty DataFrame rather than an error.
imdb_data = load_or_empty(IMDB_PATH)
# Sentiment140 is the only file read with an explicit (latin1) encoding.
sentiment140_data = load_or_empty(S140_PATH, encoding='latin1')
amazon_train = load_or_empty(AMAZON_TRAIN)
amazon_test = load_or_empty(AMAZON_TEST)

# Show the first rows of every dataset under its own heading. Headings after
# the first start with a newline so the previews are separated by a blank line.
previews = [
    ('IMDb Data', imdb_data),
    ('\nSentiment140 Data', sentiment140_data),
    ('\nAmazon Train Data', amazon_train),
    ('\nAmazon Test Data', amazon_test),
]
for heading, frame in previews:
    print(heading)
    print(frame.head())

def plot_sentiment_distribution(data, sentiment_column, title):
    """Show a bar chart of how often each value of a column occurs.

    Args:
        data: DataFrame holding the dataset.
        sentiment_column: Name of the column whose values are counted.
        title: Chart title; also used in the skip message.

    Returns:
        None. When ``data`` is empty or lacks ``sentiment_column``, a skip
        message is printed and nothing is drawn.
    """
    can_plot = not data.empty and sentiment_column in data.columns
    if not can_plot:
        print(f"Skipping plot for {title}; missing data or column.")
        return

    # Order the bars by label value rather than by frequency.
    label_counts = data[sentiment_column].value_counts()
    label_counts = label_counts.sort_index()
    label_counts.plot(kind='bar', title=title)
    plt.show()

# Draw one label-count chart per dataset; all of them use the 'sentiment' column.
plot_jobs = [
    (imdb_data, 'IMDb Sentiment'),
    (sentiment140_data, 'Sentiment140 Sentiment'),
    (amazon_train, 'Amazon Train Sentiment'),
    (amazon_test, 'Amazon Test Sentiment'),
]
for frame, chart_title in plot_jobs:
    plot_sentiment_distribution(frame, 'sentiment', chart_title)

# Print the mean character length of the 'text' column for each listed dataset.
length_targets = (
    ('imdb', imdb_data),
    ('s140', sentiment140_data),
    ('amazon_train', amazon_train),
)
for label, frame in length_targets:
    has_text = not frame.empty and 'text' in frame.columns
    if has_text:
        # Values are cast to str before measuring; the lengths are stored on
        # the frame itself as a new 'len' column.
        frame['len'] = frame['text'].astype(str).str.len()
        print(label, 'len mean:', frame['len'].mean())
    else:
        print(f'Skipping length stats for {label}')