In [None]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

from src.utils.config_loader import ConfigLoader

# Pull the label mapping out of the project configuration. Later cells iterate
# it as (label_id, name) pairs; the names are also kept as a flat list.
config = ConfigLoader()
label_map = config.get_labels()
label_names = [*label_map.values()]

In [None]:
# Read the labeled dataset into a DataFrame.
df = pd.read_csv(
    '../data/labeled_data.csv',  # adjust path if needed
)

# Quick structural overview: dimensions, column names, first rows.
column_names = df.columns.tolist()
print("Shape:", df.shape)
print("Columns:", column_names)
print(df.head())

In [None]:
# Per-class sample counts, ordered by label id.
class_counts = df['label_id'].value_counts().sort_index()

# Derive each bar's name from the id it actually counts. Pairing the counts
# positionally with `label_names` mislabels bars when the config order differs
# from sorted-id order, and fails on a length mismatch when a configured class
# has no rows in the data. Ids missing from the label map fall back to their
# string form so they still show up in the plot.
bar_labels = [label_map.get(class_id, str(class_id)) for class_id in class_counts.index]

# savefig does not create missing directories, so make sure the target exists.
Path('../saved_models').mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(10,5))
sns.barplot(x=bar_labels, y=class_counts.values, palette="viridis")
plt.title('Sample Count per Class')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('../saved_models/class_distribution.png', dpi=300)
plt.show()

print("Class counts:\n", class_counts)

In [None]:
# Character length of every comment. Missing comments are replaced with an
# empty string first: astype(str) alone would turn NaN into the literal text
# 'nan' and report a length of 3 for rows that have no comment at all.
df['comment_length'] = df['comment'].fillna('').astype(str).str.len()

# Stacked histogram of comment lengths, one colour per label id.
plt.figure(figsize=(10,5))
sns.histplot(data=df, x='comment_length', hue='label_id', bins=50, multiple='stack')
plt.title('Comment Length Distribution by Class')
plt.xlabel('Comment Length')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

print("Avg. comment length per class:\n", df.groupby('label_id')['comment_length'].mean())

In [None]:
# Print up to three example comments for every class in the label map.
for label_id, name in label_map.items():
    print(f"\n{name} samples:")
    class_comments = df.loc[df['label_id'] == label_id, 'comment']

    # A configured class may have no rows in this dataset; report that
    # instead of attempting to sample from an empty Series.
    if class_comments.empty:
        print("(no samples for this class)")
        continue

    # Series.sample raises ValueError when asked for more rows than exist
    # (replace=False), so cap the request at the class size.
    n_examples = min(3, len(class_comments))
    samples = class_comments.sample(n_examples, random_state=1).tolist()
    for s in samples:
        print("-", s)

In [None]:
# savefig does not create missing directories, so make sure the target exists.
Path('../saved_models').mkdir(parents=True, exist_ok=True)

# Render and save one word cloud per class.
for label_id, name in label_map.items():
    # Drop missing comments before stringifying; otherwise every NaN would be
    # joined into the text as the word 'nan' and show up in the cloud.
    class_comments = df.loc[df['label_id'] == label_id, 'comment'].dropna()
    text = ' '.join(class_comments.astype(str))
    if not text.strip():  # avoid empty wordclouds
        continue

    wc = WordCloud(width=800, height=500, background_color='white').generate(text)
    fig = plt.figure(figsize=(8,5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'WordCloud - {name}')
    plt.tight_layout()
    plt.savefig(f'../saved_models/wordcloud_{name.replace(" ", "_").lower()}.png', dpi=300)
    plt.show()
    # Release the figure explicitly so figures do not pile up across iterations.
    plt.close(fig)

In [None]:
# Comments per language, most frequent first. Computed once and reused for
# both the bar ordering and the printed summary.
language_counts = df['language'].value_counts()

# savefig does not create missing directories, so make sure the target exists.
Path('../saved_models').mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(8,5))
sns.countplot(data=df, x='language', order=language_counts.index, palette="Set2")
plt.title('Distribution of Comments by Language')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('../saved_models/language_distribution.png', dpi=300)
plt.show()

print("Language counts:\n", language_counts)