# Data Exploration - Keyword Feedback Analysis

This notebook explores the feedback data collected from user annotations.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn's white grid theme first, then the
# default figure size (individual cells may still pass their own figsize).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Load Data

In [None]:
# Read the feedback CSV (path is relative to this notebook's directory) into `df`,
# which every later cell works from.
df = pd.read_csv('../../data/feedback.csv')

# Report the row count, then end the cell on a preview so it renders as a table.
print(f"Total samples: {df.shape[0]}")
df.head()

## Basic Statistics

In [None]:
# Class distribution: how many rows carry each label value.
label_counts = df['label'].value_counts()

# Mean of the label column; this is the share of approved rows as long as the
# labels are coded 0 (rejected) / 1 (approved), as the later cells treat them.
approval_rate = df['label'].mean()

print("\nClass Distribution:")
print(label_counts)
print(f"\nApproval rate: {approval_rate:.2%}")

In [None]:
# Feature statistics.
# `feature_cols` is reused by the plotting and comparison cells further down,
# so it must be defined before they run.
feature_cols = [
    'length',
    'yake_score',
    'f1_wfreq',
    'f2_wcase',
    'f3_wpos',
    'f4_wrel',
    'f5_wspread',
]

# Summary statistics (count, mean, std, quartiles, ...) for each feature column.
df.loc[:, feature_cols].describe()

## Class Balance Visualization

In [None]:
# Plot class distribution.
# value_counts() orders rows by frequency, not by label, so the bar order (and any
# positional colour list) would flip whenever approvals outnumber rejections.
# Sort by label and look each colour up from the label value instead, so that
# 0 is always red and 1 is always green regardless of which class is larger.
counts_by_label = df['label'].value_counts().sort_index()
label_palette = {0: '#e74c3c', 1: '#2ecc71'}
# Any unexpected label value falls back to a neutral grey rather than borrowing
# one of the two class colours.
bar_colors = [label_palette.get(value, '#95a5a6') for value in counts_by_label.index]

fig, ax = plt.subplots(figsize=(8, 5))
counts_by_label.plot(kind='bar', color=bar_colors, ax=ax)
ax.set_title('Keyword Label Distribution')
ax.set_xlabel('Label (0=Rejected, 1=Approved)')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=0)
fig.tight_layout()
plt.show()

## Feature Distributions

In [None]:
# Plot feature distributions by class.
# Panels are driven by `feature_cols` (defined in the feature-statistics cell) so
# the set of panels drawn and the set of panels hidden below cannot drift apart.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(feature_cols) // n_grid_cols))  # ceiling division
# squeeze=False keeps `axes` two-dimensional even for a single row, so ravel()
# always yields a flat array of Axes.
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(15, 4 * n_grid_rows), squeeze=False
)
axes = axes.ravel()

for idx, col in enumerate(feature_cols):
    # Compute one set of bin edges from the whole column so both classes are
    # binned identically; otherwise each hist() call picks its own edges and the
    # overlaid bars are not comparable.
    bin_edges = np.histogram_bin_edges(df[col].dropna(), bins=20)

    for label in [0, 1]:
        # Drop NaNs: histogram range detection raises on NaN input.
        data = df.loc[df['label'] == label, col].dropna()
        axes[idx].hist(data, alpha=0.6, label=f'Label {label}', bins=bin_edges)

    axes[idx].set_title(col)
    axes[idx].legend()
    axes[idx].set_xlabel('Value')
    axes[idx].set_ylabel('Frequency')

# Hide extra subplots (grid cells beyond the number of features)
for idx in range(len(feature_cols), len(axes)):
    axes[idx].axis('off')

plt.tight_layout()
plt.show()

## Feature Correlations

In [None]:
# Correlation matrix
plt.figure(figsize=(10, 8))
corr = df[feature_cols].corr()
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## Keyword Characteristics

In [None]:
# Compare approved vs rejected keywords
print("Approved Keywords (mean features):")
print(df[df['label'] == 1][feature_cols].mean())
print("\nRejected Keywords (mean features):")
print(df[df['label'] == 0][feature_cols].mean())

In [None]:
# Sample keywords
print("Sample Approved Keywords:")
print(df[df['label'] == 1][['keyword', 'yake_score']].head(10))
print("\nSample Rejected Keywords:")
print(df[df['label'] == 0][['keyword', 'yake_score']].head(10))