# 🎮 Toxic Chat Detection - Data Exploration

This notebook explores the sample dataset for our toxic chat detection project.

**Objectives:**
- Load and inspect the data
- Analyze class distribution
- Explore text characteristics
- Visualize word frequencies


## 1. Setup and Imports


In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Configure display settings: allow up to 100 characters per cell so chat
# messages are not cut short when DataFrames are printed.
pd.set_option('display.max_colwidth', 100)

print("Libraries loaded successfully! ✅")


## 2. Load the Data


In [None]:
# Read the sample dataset into a DataFrame. The path is relative, so it is
# resolved against the current working directory.
df = pd.read_csv('../data/sample_data.csv')

# Quick structural overview: dimensions, column names, and per-column dtypes.
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nData types:", df.dtypes, sep="\n")


In [None]:
# Preview the first ten rows. As the last expression of the cell, the
# resulting DataFrame is what the notebook displays.
print("\n📝 Sample rows:")
df.head(10)


## 3. Class Distribution Analysis


In [None]:
# Count of each class.
# value_counts() orders by frequency, so sort by label to give downstream
# readers of `class_counts.values` a stable label order.
class_counts = df['label'].value_counts().sort_index()

print("\n📊 Class Distribution:")
# Use .get() so a class that is absent from the data reports 0 instead of
# raising KeyError; guard the percentage against an empty dataset.
for heading, label in (("Non-Toxic (0):", 0), ("Toxic (1):    ", 1)):
    count = class_counts.get(label, 0)
    share = count / len(df) * 100 if len(df) else 0.0
    print(f"  {heading} {count} ({share:.1f}%)")


In [None]:
# Visualize class distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# value_counts() orders by frequency, not by label. Pin the order to the
# labels [0, 1] (assumed to be the integer classes used throughout this
# notebook) so each count lines up with its name and colour below. A class
# missing from the data is shown as 0.
class_names = ['Non-Toxic', 'Toxic']
ordered_counts = class_counts.reindex([0, 1], fill_value=0)

# Bar chart
colors = ['#2ecc71', '#e74c3c']
axes[0].bar(class_names, ordered_counts.values, color=colors)
axes[0].set_title('Class Distribution (Count)', fontsize=12)
axes[0].set_ylabel('Number of Messages')
# Annotate each bar with its count, slightly above the bar top.
for i, v in enumerate(ordered_counts.values):
    axes[0].text(i, v + 0.5, str(v), ha='center', fontsize=12, fontweight='bold')

# Pie chart (skipped when there is nothing to plot, since percentages of a
# zero total are undefined)
if ordered_counts.sum() > 0:
    axes[1].pie(ordered_counts.values, labels=class_names,
                autopct='%1.1f%%', colors=colors, startangle=90,
                explode=(0, 0.05))
axes[1].set_title('Class Distribution (Percentage)', fontsize=12)

plt.tight_layout()
plt.show()

# Check balance: ratio of the smaller class to the larger one. Computed on
# the reindexed counts so a dataset with only one class yields 0.0 instead
# of a misleading 1.0.
largest_class = ordered_counts.max()
balance_ratio = ordered_counts.min() / largest_class if largest_class else 0.0
print(f"\n⚖️ Class balance ratio: {balance_ratio:.2f}")
if balance_ratio > 0.8:
    print("   → Dataset is well-balanced!")
elif balance_ratio > 0.5:
    print("   → Dataset has moderate imbalance")
else:
    print("   → Dataset is imbalanced - consider class weights or oversampling")


## 4. Text Analysis


In [None]:
# Calculate text statistics.
# Missing messages (None/NaN) are treated as empty text; stringifying them
# directly would count "nan" as one 3-character word and skew the minimums.
clean_text = df['text'].apply(lambda value: '' if pd.isna(value) else str(value))
df['word_count'] = clean_text.apply(lambda message: len(message.split()))
df['char_count'] = clean_text.apply(len)

print("📏 Text Length Statistics:")
print("\n  Word count:")
print(f"    Min: {df['word_count'].min()}")
print(f"    Max: {df['word_count'].max()}")
print(f"    Mean: {df['word_count'].mean():.1f}")
print(f"    Median: {df['word_count'].median():.1f}")

print("\n  Character count:")
print(f"    Min: {df['char_count'].min()}")
print(f"    Max: {df['char_count'].max()}")
print(f"    Mean: {df['char_count'].mean():.1f}")


## 5. Word Frequency Analysis


In [None]:
def get_word_freq(texts):
    """Count lowercase, whitespace-separated tokens across an iterable of texts.

    Each item is converted with ``str()``, lowercased, and split on
    whitespace. Punctuation is not stripped, so ``"gg!"`` and ``"gg"`` are
    counted as different tokens.

    Missing values (``None`` or float NaN, which is how pandas represents
    empty cells) are skipped rather than being counted as the literal words
    ``"none"`` / ``"nan"``.

    Args:
        texts: Iterable of messages (strings or values convertible to str).

    Returns:
        A ``collections.Counter`` mapping token -> number of occurrences.
    """
    import math  # local import keeps the cell self-contained

    frequencies = Counter()
    for text in texts:
        if text is None or (isinstance(text, float) and math.isnan(text)):
            continue
        frequencies.update(str(text).lower().split())
    return frequencies

# Get word frequencies by class (label 0 = non-toxic, label 1 = toxic)
non_toxic_texts = df.loc[df['label'] == 0, 'text'].tolist()
toxic_texts = df.loc[df['label'] == 1, 'text'].tolist()

non_toxic_words = get_word_freq(non_toxic_texts)
toxic_words = get_word_freq(toxic_texts)

# Print the ten most frequent tokens for each class.
for heading, frequencies in (
    ("🔤 Top 10 words in NON-TOXIC messages:", non_toxic_words),
    ("\n⚠️ Top 10 words in TOXIC messages:", toxic_words),
):
    print(heading)
    for word, count in frequencies.most_common(10):
        print(f"   {word}: {count}")


In [None]:
# Visualize top words: one horizontal bar chart per class, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

top_non_toxic = non_toxic_words.most_common(10)
top_toxic = toxic_words.most_common(10)

panels = (
    (axes[0], top_non_toxic, '#2ecc71', 'Top 10 Words in Non-Toxic Messages'),
    (axes[1], top_toxic, '#e74c3c', 'Top 10 Words in Toxic Messages'),
)
for ax, top_words, color, title in panels:
    # zip(*[]) produces nothing to unpack, so fall back to empty sequences
    # when a class has no words (e.g. the class is absent from the data).
    words, counts = zip(*top_words) if top_words else ((), ())
    ax.barh(words, counts, color=color)
    ax.set_xlabel('Frequency')
    ax.set_title(title)
    # barh draws the first item at the bottom; flip so the most frequent
    # word appears at the top.
    ax.invert_yaxis()

plt.tight_layout()
plt.show()


## 6. Example Messages


In [None]:
# Show up to five example messages from each class, numbered from 1.
for heading, label in (
    ("✅ Example NON-TOXIC messages:", 0),
    ("\n⚠️ Example TOXIC messages:", 1),
):
    print(heading)
    print("-" * 50)
    for i, text in enumerate(df.loc[df['label'] == label, 'text'].head(5), 1):
        print(f'{i}. "{text}"')


## 7. Summary & Next Steps


In [None]:
# Final recap. The balance and average-length bullets are derived from the
# loaded data rather than hard-coded, so they stay true if the dataset
# changes. The remaining bullets are qualitative notes about the sample data.
total_samples = len(df)
# .get() reports 0 for a class that is absent instead of raising KeyError.
non_toxic_total = class_counts.get(0, 0)
toxic_total = class_counts.get(1, 0)
non_toxic_pct = non_toxic_total / total_samples * 100 if total_samples else 0.0
toxic_pct = toxic_total / total_samples * 100 if total_samples else 0.0
avg_words = df['word_count'].mean()

# Same thresholds as the class-balance check earlier in the notebook.
larger_class = max(non_toxic_total, toxic_total)
summary_ratio = min(non_toxic_total, toxic_total) / larger_class if larger_class else 0.0
split_text = f"{non_toxic_pct:.0f}/{toxic_pct:.0f} split"
if summary_ratio > 0.8:
    balance_note = f"Dataset is well-balanced ({split_text})"
elif summary_ratio > 0.5:
    balance_note = f"Dataset has moderate imbalance ({split_text})"
else:
    balance_note = f"Dataset is imbalanced ({split_text})"

print("="*60)
print("📋 DATA EXPLORATION SUMMARY")
print("="*60)
print("\n📊 Dataset Statistics:")
print(f"   Total samples: {total_samples}")
print(f"   Non-toxic: {non_toxic_total} ({non_toxic_pct:.1f}%)")
print(f"   Toxic: {toxic_total} ({toxic_pct:.1f}%)")
print(f"   Average word count: {avg_words:.1f}")
print("\n🔑 Key Observations:")
print(f"   - {balance_note}")
print(f"   - Short messages typical of chat (avg ~{avg_words:.0f} words)")
print("   - Toxic messages contain negative words")
print("   - Non-toxic messages are encouraging/positive")
print("\n📝 Next Steps:")
print("   1. Train baseline model: python src/train_baseline.py")
print("   2. Train PyTorch model: python src/train_torch.py")
print("   3. Run web app: streamlit run app/app_streamlit.py")
print("="*60)

print("\n✅ Exploration complete! Ready for model training.")
