In [None]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset

# Global plotting configuration, applied once so every figure below shares it.
# Start from matplotlib's stock style ...
plt.style.use('default')
# ... and make seaborn's 'husl' palette the default colour cycle.
sns.set_palette('husl')

In [None]:
# Load dataset
# The Hugging Face Hub identifier is kept in one named place so it is easy to swap.
DATASET_ID = "newsmediabias/news-bias-full-data"
dataset = load_dataset(DATASET_ID)
# Only the 'train' split is analysed; convert it to a DataFrame for the cells below.
train_df = dataset['train'].to_pandas()

# News Bias Detection - EDA

In [None]:
# Basic info: frame size, column names and the total number of missing cells
column_names = train_df.columns.tolist()
# First sum counts nulls per column, second sum collapses that to one total.
total_missing = train_df.isna().sum().sum()
print(f'Shape: {train_df.shape}')
print(f'Columns: {column_names}')
print(f'Missing values: {total_missing}')

In [None]:
# Label distributions: one bar chart of value counts per column on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (column, panel title, number of most frequent values to keep; None keeps all)
distribution_panels = [
    ('label', 'Bias Labels', None),
    ('sentiment', 'Sentiment', None),
    ('dimension', 'Top Dimensions', 8),
    ('toxic', 'Toxicity', None),
]

# axes.flat walks the grid row by row, matching the order of the panels above.
for ax, (column, panel_title, top_n) in zip(axes.flat, distribution_panels):
    value_freq = train_df[column].value_counts()
    if top_n is not None:
        value_freq = value_freq.head(top_n)
    value_freq.plot(kind='bar', ax=ax, title=panel_title)
    # Tilt the category names so long labels do not overlap.
    ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Text analysis: character count and whitespace-separated word count per text
train_df['text_length'] = train_df['text'].str.len()
train_df['word_count'] = train_df['text'].str.split().str.len()

# (derived column, panel title) for the two side-by-side histograms
length_panels = (('text_length', 'Text Length'), ('word_count', 'Word Count'))

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (column, panel_title) in zip(axes, length_panels):
    train_df[column].hist(bins=50, ax=ax, alpha=0.7)
    ax.set_title(panel_title)
    # Dashed red line marks the mean of the plotted column.
    ax.axvline(train_df[column].mean(), color='red', linestyle='--')
plt.tight_layout()
plt.show()

avg_text_length = train_df['text_length'].mean()
avg_word_count = train_df['word_count'].mean()
print(f'Avg text length: {avg_text_length:.0f}')
print(f'Avg word count: {avg_word_count:.0f}')

In [None]:
# Bias vs Sentiment heatmap
# normalize='index' turns each label's row into fractions that sum to 1 ...
sentiment_share = pd.crosstab(
    train_df['label'],
    train_df['sentiment'],
    normalize='index',
)
# ... which are then scaled to percentages for display.
crosstab = sentiment_share.mul(100)

plt.figure(figsize=(8, 5))
sns.heatmap(crosstab, annot=True, fmt='.1f', cmap='Blues')
plt.title('Bias vs Sentiment (%)')
plt.show()

In [None]:
# Top aspects: horizontal bars for the 12 most frequent values of 'aspect'
aspect_freq = train_df['aspect'].value_counts()
top_aspects = aspect_freq.head(12)

plt.figure(figsize=(10, 6))
top_aspects.plot(kind='barh')
plt.title('Top 12 Aspects')
plt.tight_layout()
plt.show()

unique_aspects = train_df['aspect'].nunique()
print(f'Unique aspects: {unique_aspects}')

In [None]:
# Biased words analysis
def _count_biased_words(value):
    """Return the number of biased words held in one `biased_words` cell.

    A plain ``len`` is fragile here: it raises ``TypeError`` on missing values
    (None / NaN) and, if the cell is a *stringified* list such as
    ``"['a', 'b']"``, it counts characters instead of words (``"[]"`` -> 2).
    NOTE(review): which representation the column actually uses depends on how
    the dataset was serialised -- confirm against the loaded data.

    Args:
        value: One cell of the column. Handled forms: list-like containers,
            strings (Python-literal lists or comma-separated words), and
            missing / non-sized scalars.

    Returns:
        int: the number of words; 0 for missing, empty, or non-sized values.
    """
    if value is None:
        return 0
    if isinstance(value, str):
        text = value.strip()
        if not text:
            return 0
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. "hate, vile"): fall back to splitting
            # on commas and ignoring empty pieces.
            return sum(1 for piece in text.strip('[]').split(',') if piece.strip())
        if isinstance(parsed, (list, tuple, set, frozenset)):
            return len(parsed)
        # A single literal (e.g. "'word'") counts as one entry.
        return 1
    try:
        # Lists, tuples, numpy arrays, ...: same result as the plain len().
        return len(value)
    except TypeError:
        # Non-sized scalars such as float NaN or pd.NA mean "no words recorded".
        return 0


train_df['biased_words_count'] = train_df['biased_words'].apply(_count_biased_words)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: the 8 most common per-text word counts.
train_df['biased_words_count'].value_counts().head(8).plot(kind='bar', ax=axes[0])
axes[0].set_title('Biased Words Count Distribution')

# Right panel: mean number of biased words within each bias label.
train_df.groupby('label')['biased_words_count'].mean().plot(kind='bar', ax=axes[1])
axes[1].set_title('Avg Biased Words by Label')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()
print(f'Texts with biased words: {(train_df["biased_words_count"] > 0).mean()*100:.1f}%')

In [None]:
# Summary stats: headline numbers gathered first, then printed as one report
total_samples = len(train_df)
dimension_count = train_df['dimension'].nunique()
aspect_count = train_df['aspect'].nunique()
# The mean of a boolean mask is the fraction of rows matching the condition.
toxic_pct = (train_df['toxic'] == 1.0).mean() * 100
identity_pct = (train_df['identity_mention'] == 'YES').mean() * 100

print('=== DATASET SUMMARY ===')
print(f'Total samples: {total_samples:,}')
print(f'Dimensions: {dimension_count}')
print(f'Aspects: {aspect_count}')
print(f'Toxic content: {toxic_pct:.1f}%')
print(f'Identity mentions: {identity_pct:.1f}%')
print('\nBias distribution:')
label_counts = train_df['label'].value_counts()
for label, count in label_counts.items():
    # Share is relative to all rows, including any with a missing label.
    print(f'  {label}: {count:,} ({count/total_samples*100:.1f}%)')