# Data Exploration for Nepali-English Code-Mixed Text

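This notebook explores the characteristics of Nepali-English code-mixed text: it builds a small sample dataset, runs it through the project's `TextPreprocessor`, and plots the resulting language and text-length distributions.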

In [None]:
import sys
# Add the parent directory to the import path so the `src` package below
# resolves. NOTE(review): this assumes the notebook is run from a directory
# one level below the project root — confirm.
sys.path.append('..')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.data.preprocess import TextPreprocessor

# Use seaborn's 'whitegrid' style for the figures in this notebook.
sns.set_style('whitegrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Create Sample Data

In [None]:
# Hand-written examples: three code-mixed sentences, one English-only
# sentence, and one Nepali-only sentence.
sample_texts = [
    "यो movie राम्रो थियो but ending disappointing थियो",
    "I love Nepali food especially मोमो and चाउमिन",
    "Weather आज धेरै राम्रो छ let's go hiking",
    "This is completely in English",
    "यो पूर्ण रूपमा नेपालीमा छ",
]

# One row per example, stored in a single 'text' column.
df = pd.DataFrame(sample_texts, columns=['text'])
df.head()

## Preprocess and Analyze

In [None]:
# Run the project's preprocessing step over the sample frame.
preprocessor = TextPreprocessor()
df_processed = preprocessor.preprocess_dataset(df)

# The plotting cells further down read these two columns; fail here with a
# clear message instead of a KeyError later in the notebook.
missing_columns = {'language', 'text_length'} - set(df_processed.columns)
assert not missing_columns, (
    f"preprocess_dataset output is missing columns: {sorted(missing_columns)}"
)

# Show a preview rather than the whole frame.
df_processed.head()

In [None]:
# Language distribution: number of rows per value of the 'language' column.
language_counts = df_processed['language'].value_counts()

# Draw on an explicit Axes instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(10, 6))
language_counts.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Language Distribution')
ax.set_xlabel('Language')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

## Text Length Analysis

In [None]:
# Histogram of the 'text_length' column, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_processed['text_length'], bins=20, edgecolor='black')
ax.set(
    title='Text Length Distribution',
    xlabel='Text Length (characters)',
    ylabel='Frequency',
)
fig.tight_layout()
plt.show()