# Exploratory Data Analysis: Language-Mixed Sentiment Data

This notebook analyzes code-mixed Nepali-English data for:
- **Language ratios** (Nepali/English/mixed)
- **Vocabulary distribution**
- **Sentiment label balance**
- **Code-mixing density** (token-level)
- **UMAP visualization** of text embeddings (mean-pooled XLM-RoBERTa outputs, one vector per text)

> _Visualizations are saved to `notebooks/eda_outputs/`. They can be linked from the main README by running the script at the end of this notebook._

In [ ]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.manifold import TSNE
import umap
from collections import Counter

# Every figure produced by this notebook is written under this folder;
# create it (including missing parents) if it is not there yet.
output_dir = Path('../notebooks/eda_outputs')
output_dir.mkdir(exist_ok=True, parents=True)


In [ ]:
# Read the processed training split from disk, then preview its first rows
# (the trailing expression is what the notebook cell displays).
DATA_PATH = '../data/processed/sentiment_train.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)

## Language Ratio Analysis

In [ ]:
# Tally rows per value of the 'language' column and chart the tallies.
lang_counts = df['language'].value_counts()

lang_fig, lang_ax = plt.subplots(figsize=(5, 3))
sns.barplot(
    x=lang_counts.index,
    y=lang_counts.values,
    palette='viridis',
    ax=lang_ax,
)
lang_ax.set_title('Language Distribution (Nepali/English/Mixed)')
lang_ax.set_xlabel('Language')
lang_ax.set_ylabel('Count')
lang_fig.tight_layout()

# Write the chart to the shared output folder, then display it in the notebook.
lang_fig.savefig(output_dir / 'language_distribution.png')
plt.show()

## Sentiment Label Balance

In [ ]:
# Translate the numeric class ids in 'label' into readable names, store them
# in a new 'sentiment_label' column, and chart how many rows each name has.
label_map = {
    0: 'Negative',
    1: 'Neutral',
    2: 'Positive',
}
df['sentiment_label'] = df['label'].map(label_map)
sentiment_counts = df['sentiment_label'].value_counts()

sent_fig, sent_ax = plt.subplots(figsize=(5, 3))
sns.barplot(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    palette='Set2',
    ax=sent_ax,
)
sent_ax.set_title('Sentiment Label Balance')
sent_ax.set_xlabel('Sentiment')
sent_ax.set_ylabel('Count')
sent_fig.tight_layout()

# Write the chart to the shared output folder, then display it in the notebook.
sent_fig.savefig(output_dir / 'sentiment_balance.png')
plt.show()

## Vocabulary Distribution

In [ ]:
from collections import Counter

# Count whitespace-separated tokens across all texts and chart the 25 most
# frequent ones.
#
# Missing texts are dropped *before* the string conversion: astype(str) on its
# own turns NaN into the literal string 'nan', which would then be counted as
# a real word and could show up in the chart.
vocab_texts = df['processed_text'].dropna().astype(str)
all_words = [word for text in vocab_texts for word in text.split()]
word_counts = Counter(all_words)
most_common = word_counts.most_common(25)

if most_common:
    words, counts = zip(*most_common)
    plt.figure(figsize=(10,4))
    sns.barplot(x=list(words), y=list(counts), palette='magma')
    plt.title('Top 25 Most Common Words')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(output_dir/'top_words.png')
    plt.show()
else:
    # zip(*[]) yields nothing to unpack into (words, counts), so there is
    # nothing to draw when the corpus has no tokens at all.
    print('No words found in processed_text; skipping top-words plot.')

## Code-Mixing Density (Token-Level)

In [ ]:
def is_nepali_token(token):
    """Report whether ``token`` contains at least one Devanagari character.

    Args:
        token: The string to inspect, one character at a time.

    Returns:
        True if any character falls in the inclusive range 'ऀ'..'ॿ'
        (the Devanagari block, U+0900-U+097F); False otherwise, including
        for an empty string.
    """
    return any('ऀ' <= ch <= 'ॿ' for ch in token)

def _count_tokens(text):
    """Return ``(nepali, english)`` token counts for one text value.

    Args:
        text: A cell from ``processed_text``; may be missing (NaN/None).

    Returns:
        A 2-tuple of ints. A token counts as Nepali when ``is_nepali_token``
        accepts it, and as English when it is purely ASCII letters. Missing
        text yields ``(0, 0)``; without this check ``str(NaN)`` would be the
        token 'nan' and be counted as one English word.
    """
    if pd.isna(text):
        return 0, 0
    tokens = str(text).split()  # split once, reuse for both counts
    nepali = sum(is_nepali_token(tok) for tok in tokens)
    english = sum(tok.isascii() and tok.isalpha() for tok in tokens)
    return nepali, english


_token_counts = [_count_tokens(text) for text in df['processed_text']]
df['num_nepali_tokens'] = [nepali for nepali, _ in _token_counts]
df['num_english_tokens'] = [english for _, english in _token_counts]

# Share of Nepali tokens among the tokens classified as Nepali or English.
# Rows with no such tokens get NaN instead of a ratio: the previous epsilon
# in the denominator made them report 0.0, i.e. look like pure-English text.
_classified = df['num_nepali_tokens'] + df['num_english_tokens']
df['mix_density'] = df['num_nepali_tokens'] / _classified.where(_classified > 0)

plt.figure(figsize=(6,3))
# Undefined (NaN) densities are left out of the histogram explicitly.
sns.histplot(df['mix_density'].dropna(), bins=20, kde=True, color='teal')
plt.title('Code-Mixing Density (Nepali tokens ratio)')
plt.xlabel('Nepali token ratio')
plt.tight_layout()
plt.savefig(output_dir/'mix_density.png')
plt.show()

## UMAP Visualization of Text Embeddings (Language Overlap)

In [ ]:
from transformers import AutoTokenizer, AutoModel
import torch
import umap

# Embed a sample of texts with XLM-RoBERTa, project the embeddings to 2-D with
# UMAP, and colour each point by the row's 'language' value.

# Use a subset for speed. The size check is made on the rows that actually
# have text: checking len(df) instead let sample(n=200) raise when the frame
# had more than 200 rows but fewer than 200 non-missing texts.
MAX_SAMPLES = 200
sampled_rows = df.dropna(subset=['processed_text'])
if len(sampled_rows) > MAX_SAMPLES:
    sampled_rows = sampled_rows.sample(n=MAX_SAMPLES, random_state=42)
sample_texts = sampled_rows['processed_text']
# Taken from the same sampled rows, so texts and languages stay aligned
# position by position without any lookup by index label.
sample_languages = sampled_rows['language'].to_numpy()

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModel.from_pretrained('xlm-roberta-base')
model.eval()

embeddings = []
with torch.no_grad():
    for text in sample_texts:
        # str() guards against non-string cells (e.g. values pandas parsed as
        # numbers); inputs longer than 32 tokens are truncated.
        inputs = tokenizer(str(text), return_tensors='pt', truncation=True, max_length=32)
        outputs = model(**inputs)
        # Average the last hidden state over dim 1 to get one vector per
        # text, then drop the leading batch dimension of size 1.
        emb = outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()
        embeddings.append(emb)

embeddings = np.stack(embeddings)
reducer = umap.UMAP(random_state=42)
umap_emb = reducer.fit_transform(embeddings)
plt.figure(figsize=(7,5))
sns.scatterplot(x=umap_emb[:,0], y=umap_emb[:,1], hue=sample_languages, palette='Set1', alpha=0.7)
plt.title('UMAP Projection of Text Embeddings (Language Overlap)')
plt.legend(title='Language')
plt.tight_layout()
plt.savefig(output_dir/'umap_embeddings.png')
plt.show()

---

## Windsurf/README Integration

- The figures above are saved to `notebooks/eda_outputs/`.
- To update the main README with these outputs, run the script below from the directory that contains both `README.md` and `notebooks/` (its paths are relative), or use a Windsurf automation.

```python
# Update README.md with EDA outputs
#
# Appends a heading plus one Markdown image link per PNG found in the EDA
# output folder. Both paths are relative to the current working directory.
from pathlib import Path

eda_dir = Path('notebooks/eda_outputs')
readme_path = Path('README.md')

# Sorted so the links appear in the same order on every run; glob() itself
# makes no ordering promise.
figs = sorted(eda_dir.glob('*.png'))

# NOTE: the file is opened in append mode, so every run adds another section
# rather than replacing one written earlier.
with readme_path.open('a', encoding='utf-8') as f:
    # Newlines are written as \n escapes: a single-quoted literal cannot span
    # physical lines, which made the earlier version a SyntaxError.
    f.write('\n## Exploratory Data Analysis (Auto-Generated)\n')
    for fig in figs:
        # as_posix() keeps forward slashes in the link even on Windows.
        f.write(f'![EDA Output]({fig.as_posix()})\n')
```
