# Exploring Embeddings and Email Display Issues

This notebook investigates two issues:
1. Why only a small percentage of emails got embeddings via the Jina API
2. Why the Streamlit application is failing with the `Invalid height 16px for st.text_area` error

In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Pandas display configuration: print every column, cap printed rows at 50,
# and truncate cell text at 100 characters.
pd.options.display.max_columns = None
pd.options.display.max_rows = 50
pd.options.display.max_colwidth = 100

## 1. Load the Embeddings DataFrame

In [None]:
# Define the project and path. The path is relative, so it is resolved against
# the kernel's current working directory.
ACTIVE_PROJECT = "Projet Demo"
embeddings_path = os.path.join('data', "Projects", ACTIVE_PROJECT, 'emails_with_embeddings.pkl')

# Fail early with the fully resolved location: a relative path that silently
# depends on the working directory is the usual reason this cell breaks.
if not os.path.isfile(embeddings_path):
    raise FileNotFoundError(
        f"Embeddings file not found: {os.path.abspath(embeddings_path)} "
        f"(current working directory: {os.getcwd()})"
    )

# Load the data.
# SECURITY NOTE: unpickling can execute arbitrary code. Only load pickle files
# produced by this project's own pipeline, never files from an untrusted source.
df = pd.read_pickle(embeddings_path)
print(f"DataFrame shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")

In [None]:
# Flag rows whose 'embeddings' cell is non-null, then report how the dataset
# splits between the two groups.
# NOTE(review): notna() treats any non-null value as "has an embedding";
# confirm that missing embeddings are stored as None/NaN rather than e.g. empty lists.
has_embeddings = df['embeddings'].notna()
total_emails = len(df)
embedding_count = has_embeddings.sum()
empty_count = (~has_embeddings).sum()

for group_label, group_size in (("with", embedding_count), ("without", empty_count)):
    print(f"Emails {group_label} embeddings: {group_size} ({group_size/total_emails:.2%})")

## 2. Analyze Email Bodies and Empty Lines

In [None]:
# Function to count lines in a text
def count_lines(text):
    """Return the number of lines in ``text``, as split by ``str.splitlines``.

    Args:
        text: The email body. Any value that is not a ``str`` (None, NaN,
            bytes, lists, arrays, ...) is treated as having no lines.

    Returns:
        int: Line count; 0 for non-string input or an empty string.
    """
    # Check the type only. The earlier `pd.isna(text) or ...` guard raised
    # ValueError for array-like values, because pd.isna then returns an array
    # whose truth value is ambiguous. Missing values are never str instances,
    # so this single check covers them too.
    if not isinstance(text, str):
        return 0
    return len(text.splitlines())

# Function to count empty lines
def count_empty_lines(text):
    """Return how many lines of ``text`` are empty or whitespace-only.

    Args:
        text: The email body. Any value that is not a ``str`` (None, NaN,
            bytes, lists, arrays, ...) is treated as having no lines.

    Returns:
        int: Number of lines for which ``line.strip()`` is empty; 0 for
        non-string input.
    """
    # Check the type only. The earlier `pd.isna(text) or ...` guard raised
    # ValueError for array-like values (ambiguous truth value of an array).
    if not isinstance(text, str):
        return 0
    return sum(1 for line in text.splitlines() if not line.strip())

# Function to calculate text length
def text_length(text):
    """Return the length of ``text`` in characters.

    Args:
        text: The email body. Any value that is not a ``str`` (None, NaN,
            bytes, lists, arrays, ...) is treated as empty.

    Returns:
        int: ``len(text)`` for strings; 0 for non-string input.
    """
    # Check the type only. The earlier `pd.isna(text) or ...` guard raised
    # ValueError for array-like values (ambiguous truth value of an array).
    if not isinstance(text, str):
        return 0
    return len(text)

# Derive per-email size metrics from the 'body' column and store them on df.
email_bodies = df['body']
df['line_count'] = email_bodies.apply(count_lines)
df['empty_line_count'] = email_bodies.apply(count_empty_lines)
# Lines that carry content = all lines minus the blank ones
df['non_empty_line_count'] = df['line_count'].sub(df['empty_line_count'])
df['text_length'] = email_bodies.apply(text_length)

In [None]:
# Split the emails by embedding presence and summarise the size metrics of
# each group side by side.
size_metrics = ['line_count', 'empty_line_count', 'non_empty_line_count', 'text_length']
emails_with_embeddings = df.loc[has_embeddings]
emails_without_embeddings = df.loc[~has_embeddings]

for heading, subset in (
    ("Statistics for emails WITH embeddings:", emails_with_embeddings),
    ("\nStatistics for emails WITHOUT embeddings:", emails_without_embeddings),
):
    print(heading)
    print(subset[size_metrics].describe())

In [None]:
# Single-line bodies are the candidates for the Streamlit height problem.
very_short_emails = df.loc[df['line_count'].eq(1)]
print(f"Number of emails with only 1 line: {len(very_short_emails)}")

# Print up to five of them together with their character counts
print("\nSample of emails with only 1 line:")
for sample_number, body in enumerate(very_short_emails['body'].head(5), start=1):
    print(f"\nEmail {sample_number} (length: {len(body)}):\n{body}")

## 3. Investigate Why Some Emails Got Embeddings and Others Didn't

In [None]:
# Compare the size distributions of the two groups in a two-panel figure.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# (column, upper clip bound, x-axis label, title) for each panel; values are
# clipped to a reasonable range so long outliers do not flatten the histogram.
panels = (
    ('text_length', 5000, 'Text Length (characters)', 'Email Body Length Distribution'),
    ('line_count', 100, 'Line Count', 'Email Line Count Distribution'),
)

for ax, (column, upper_bound, x_label, panel_title) in zip(axes, panels):
    sns.histplot(emails_with_embeddings[column].clip(0, upper_bound),
                 label='With Embeddings', alpha=0.5, bins=30, ax=ax)
    sns.histplot(emails_without_embeddings[column].clip(0, upper_bound),
                 label='Without Embeddings', alpha=0.5, bins=30, ax=ax)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')
    ax.legend()
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

## 4. Identify Patterns in Embeddings Selection

In [None]:
# Function to check if text is mostly English
def is_mostly_english(text):
    """Heuristically flag text as "mostly English" by its share of ASCII characters.

    The check only measures how many characters have a code point below 128.
    Text in other Latin-script languages is also largely ASCII, so it is
    flagged True as well; this is a rough proxy, not language detection.

    Args:
        text: The email body. Any value that is not a ``str`` (None, NaN,
            bytes, lists, arrays, ...) yields False.

    Returns:
        bool: True when the text has at least 10 characters and more than 80%
        of them are ASCII; False otherwise.
    """
    # Check the type only. The earlier `pd.isna(text) or ...` guard raised
    # ValueError for array-like values (ambiguous truth value of an array).
    if not isinstance(text, str) or len(text) < 10:
        return False

    # Simple heuristic: check ratio of ASCII characters
    ascii_count = sum(1 for c in text if ord(c) < 128)
    return ascii_count / len(text) > 0.8

# Function to estimate the language complexity (word diversity)
def word_diversity(text):
    """Return the ratio of distinct words to total words in ``text``.

    Words are the whitespace-separated tokens of the lower-cased text;
    punctuation is not stripped, so "end" and "end." count as different words.

    Args:
        text: The email body. Any value that is not a ``str`` (None, NaN,
            bytes, lists, arrays, ...) yields 0.

    Returns:
        float | int: A value in (0, 1]; 0 when the text is shorter than 10
        characters, contains no words, or is not a string.
    """
    # Check the type only. The earlier `pd.isna(text) or ...` guard raised
    # ValueError for array-like values (ambiguous truth value of an array).
    if not isinstance(text, str) or len(text) < 10:
        return 0

    words = text.lower().split()
    if not words:
        # Whitespace-only text: avoid dividing by zero
        return 0
    return len(set(words)) / len(words)

# Store both text heuristics as new columns on df, one per heuristic function.
for feature_column, feature_function in (
    ('mostly_english', is_mostly_english),
    ('word_diversity', word_diversity),
):
    df[feature_column] = df['body'].apply(feature_function)

In [None]:
# Check language patterns.
# The 'mostly_english' and 'word_diversity' columns were added to df after
# emails_with_embeddings / emails_without_embeddings were sliced from it, so
# those snapshots do not contain them and indexing them raises KeyError.
# Read the columns from df through the has_embeddings mask instead.
english_with_embeddings = df.loc[has_embeddings, 'mostly_english'].mean()
english_without_embeddings = df.loc[~has_embeddings, 'mostly_english'].mean()

print(f"Percentage of English text in emails WITH embeddings: {english_with_embeddings:.2%}")
print(f"Percentage of English text in emails WITHOUT embeddings: {english_without_embeddings:.2%}")

# Check complexity patterns (same masking, for the same reason)
diversity_with_embeddings = df.loc[has_embeddings, 'word_diversity'].mean()
diversity_without_embeddings = df.loc[~has_embeddings, 'word_diversity'].mean()

print(f"Average word diversity in emails WITH embeddings: {diversity_with_embeddings:.3f}")
print(f"Average word diversity in emails WITHOUT embeddings: {diversity_without_embeddings:.3f}")

## 5. Examine the Streamlit Error

In [None]:
# Single-line bodies yield the 16px height named in the Streamlit error.
single_line_mask = df['line_count'] == 1
problematic_emails = df[single_line_mask]
print(f"Found {len(problematic_emails)} potential problematic emails (single line)")

# Reproduce the height estimate: 16px per line, capped at 180px
df['calculated_height'] = df['line_count'].mul(16).clip(upper=180)
emails_with_small_height = df[df['calculated_height'].lt(68)]
print(f"Found {len(emails_with_small_height)} emails that would generate a height < 68px")

# Print the first three offenders with the values that lead to the error
print("\nSample problematic emails that would cause the Streamlit height error:")
for position, (_, email) in enumerate(emails_with_small_height.head(3).iterrows(), start=1):
    print(f"\nEmail {position}:")
    print(f"Line count: {email['line_count']}")
    print(f"Calculated height: {email['calculated_height']}px")
    print(f"Content: {email['body']!r}")

## 6. Find the Fix for Streamlit Error

In [None]:
# The fix is to ensure the height is at least 68px
# Let's modify the calculation to see if it would work

def safe_height_calculation(line_count, line_height=16, max_height=180, min_height=68):
    """Estimate a text-area height in pixels from a line count, clamped to a range.

    Args:
        line_count: Number of lines in the text to display.
        line_height: Pixels allotted per line (default 16).
        max_height: Upper bound for the height in pixels (default 180).
        min_height: Lower bound for the height in pixels (default 68).

    Returns:
        ``line_count * line_height`` limited to ``[min_height, max_height]``.
        The lower bound is applied last, so ``min_height`` wins if it is
        larger than ``max_height``.
    """
    estimated_height = line_count * line_height
    # Cap first, then raise to the floor so the result is never below min_height
    return max(min(estimated_height, max_height), min_height)

df['safe_height'] = df['line_count'].apply(safe_height_calculation)

# Check if any problematic heights remain
emails_with_unsafe_height = df[df['safe_height'] < 68]
print(f"After fix, emails with height < 68px: {len(emails_with_unsafe_height)}")

# Show the differences for the previously problematic emails.
# emails_with_small_height was sliced from df before 'safe_height' existed, so
# that snapshot has no such column and reading it raises KeyError. Re-select
# the same rows from the current df so both height columns are available.
previously_problematic = df[df['calculated_height'] < 68]
print("\nHeight changes for previously problematic emails:")
for i, (idx, row) in enumerate(previously_problematic.iloc[:3].iterrows()):
    print(f"Email {i+1}: Old height: {row['calculated_height']}px, New height: {row['safe_height']}px")

## 7. Recommend a Fix for the Streamlit Component

In [None]:
# Proposed change for app/components/email_viewer.py:403-406.
# The two triple-quoted blocks below are bare string literals: they show the
# code before and after the change and are never executed.

# Current code:
'''
st.text_area(
    "Contenu de l'email",
    value=decoded_body,
    height=min(len(decoded_body.splitlines()) * 16, 180),  # Estimated height based on line count
)
'''

# Replacement:
'''
st.text_area(
    "Contenu de l'email",
    value=decoded_body,
    height=max(min(len(decoded_body.splitlines()) * 16, 180), 68),  # Ensure minimum height of 68px
)
'''

# Print the one-line summary of the recommendation
recommended_expression = "max(min(len(decoded_body.splitlines()) * 16, 180), 68)"
print(f"Recommended fix: Ensure the text_area height is at least 68 pixels by using {recommended_expression}")

## 8. Conclusions

### Issue 1: Low Embedding Rate
Based on our analysis, the Jina API appears to be selectively embedding emails based on criteria that may include:
1. Text length and complexity
2. Language detection
3. Content quality

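If the statistics above show a clear difference between the two groups, the API would seem to be prioritizing more substantial and diverse content, and this selective embedding would likely be by design, not a bug. This remains a hypothesis: the analysis only compares properties of the two groups and does not inspect the embedding requests themselves, so failures on the calling side (for example rate limits, failed batches, inputs over the size limit, or an interrupted run) should be ruled out before accepting it.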

### Issue 2: Streamlit Error
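The error occurs because the text_area height is estimated at 16 pixels per line, so emails with four or fewer lines produce a height below the required minimum of 68 pixels. For example, a single-line email yields 16 px, which is the value reported in the error message.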

**Fix**: Modify the height calculation to enforce a minimum of 68 pixels:
```python
height=max(min(len(decoded_body.splitlines()) * 16, 180), 68)
```

This change will resolve the Streamlit error while maintaining appropriate sizing for emails of varying lengths.