# Data Exploration Notebook

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from src.config import load_config
from src.data_loader import DataLoader

# Project configuration; the training-data path is looked up from it below.
config = load_config()

# Build the loader from the configuration, then read the file that
# config['data']['train_path'] points at into `df`.
data_loader = DataLoader(config)
train_path = config['data']['train_path']
df = data_loader.load_data(train_path)


In [None]:
# Dataset overview: dimensions, column names, then the row count per label.
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Name the label column once instead of indexing into config inside the f-string.
label_col = config['data']['label_column']
label_counts = df[label_col].value_counts()
print(f"Label distribution:\n{label_counts}")


In [None]:
# Bar chart with one bar per label value, showing how many rows carry it.
label_col = config['data']['label_column']

plt.figure(figsize=(8, 6))
# countplot draws on the current figure and returns its Axes, so the title
# is set on that Axes directly.
ax = sns.countplot(data=df, x=label_col)
ax.set_title('Label Distribution')
plt.show()


In [None]:
# Text length analysis: histogram of per-row character counts, coloured by label.
text_col = config['data']['text_column']
label_col = config['data']['label_column']

# Vectorized length. Unlike `.apply(len)`, which raises TypeError on a missing
# text (NaN is a float), `.str.len()` yields NaN for those rows.
df['text_length'] = df[text_col].str.len()

plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='text_length', hue=label_col, bins=50)
plt.title('Text Length Distribution by Label')
plt.show()


In [None]:
# Word clouds for each class: one figure per distinct label value, built from
# all texts of that class joined into a single string.
label_col = config['data']['label_column']
text_col = config['data']['text_column']

# Drop NaN labels up front: `== NaN` never matches, so such a "class" would
# always select zero rows.
for label in df[label_col].dropna().unique():
    # Single .loc selection instead of chained indexing. Missing texts are
    # dropped and the rest coerced to str, because str.join raises TypeError
    # on any non-string element.
    class_texts = df.loc[df[label_col] == label, text_col].dropna().astype(str)
    text = ' '.join(class_texts)

    # WordCloud.generate raises ValueError when it has no words to draw;
    # skip classes with no usable text instead of aborting the loop.
    if not text.strip():
        print(f'Skipping class {label}: no text available')
        continue

    wordcloud = WordCloud(width=800, height=400).generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for Class {label}')
    plt.axis('off')
    plt.show()
