In [None]:
# -*- coding: utf-8 -*-
""" 
COMPREHENSIVE EDA TEMPLATE FOR TEXT DATA WITH VECTOR SEARCH PREPARATION
========================================================================

This notebook provides a complete template for:
1. Exploratory Data Analysis (EDA) of text data
2. Advanced cleaning and preprocessing
3. Preparation for vector search with LLMs

Each step includes detailed explanations of:
- What the code does
- Why it's important
- How it processes the data
"""

# ======================
# 1. SETUP & CONFIGURATION
# ======================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from tqdm import tqdm
import plotly.express as px
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

# Setup visualization style
plt.style.use('ggplot')
sns.set_palette("husl")
%matplotlib inline

# Configuration
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 50)

# ======================
# 2. DATA LOADING
# ======================
"""
WHY: Proper data loading is crucial for reproducibility and handling different data formats.
HOW: We'll use pandas to load the data with error handling.
"""
import kagglehub

# Download latest version; `path` is the local location reported by kagglehub
path = kagglehub.dataset_download("thedevastator/text-classification-for-qa-dataset")

print("Path to dataset files:", path)

# NOTE(review): this frame is loaded but not read again anywhere in the visible
# notebook (all later steps use `df`). Kept so existing references still work.
best_books=pd.read_csv(f"{path}/test.csv")

try:
    # Load your dataset - update this path as needed.
    # Only the load itself is guarded, so unrelated errors are not mistaken
    # for a loading failure.
    df = pd.read_csv('your_dataset.csv')  # Replace with your actual data loading code
except (OSError, ValueError) as e:
    # OSError: missing/unreadable file. ValueError: pandas parsing, empty-data
    # and decoding errors.
    print(f"❌ Error loading data: {e}")
    # If using the hate speech dataset from your example:
    print("\n⚠️ Trying sample data...")
    import requests
    from io import StringIO
    url = "https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv"
    # A timeout keeps the cell from hanging on a stalled connection, and
    # raise_for_status() stops an HTTP error page from being parsed as CSV.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    content = response.content
    df = pd.read_csv(StringIO(content.decode('utf-8')))
    print("✅ Sample hate speech data loaded as fallback!")
    display(df.head())
else:
    print("✅ Data loaded successfully!")

    # Display basic loading info
    print(f"\n📊 Dataset Dimensions: {df.shape[0]} rows, {df.shape[1]} columns")
    print(f"\n🔍 First 5 rows:")
    display(df.head())

# ======================
# 3. BASIC EDA
# ======================
"""
WHY: Understanding the raw data structure before any processing.
HOW: We'll examine metadata, distributions, and basic statistics.
"""

# Section banner, emitted with a single print call
print("\n" + "="*50, "🧐 BASIC EXPLORATORY DATA ANALYSIS", "="*50, sep="\n")

# 3.1 Dataset Metadata
# Column names, dtypes, per-column missing counts and the number of fully
# duplicated rows, one report line (or table) each.
print(
    "\n📌 DATASET METADATA",
    "-"*40,
    f"• Column names: {list(df.columns)}",
    f"• Data types:\n{df.dtypes}",
    f"• Missing values:\n{df.isna().sum()}",
    f"• Duplicate rows: {df.duplicated().sum()}",
    sep="\n",
)

# 3.2 Target Variable Analysis (if exists)
if 'class' in df.columns:
    print("\n🎯 TARGET VARIABLE ANALYSIS ('class')")
    print("-"*40)

    # Share of each class in percent, rounded to one decimal place
    target_dist = df['class'].value_counts(normalize=True).mul(100).round(1)
    display(target_dist)

    # Count plot with "<count>\n(<percent of all rows>%)" written above every bar
    plt.figure(figsize=(10,5))
    ax = sns.countplot(data=df, x='class')
    for p in ax.patches:
        bar_height = p.get_height()
        # ':.0f' prints the count without a trailing '.0' when the bar height
        # is reported as a float.
        ax.annotate(f'{bar_height:.0f}\n({bar_height/len(df)*100:.1f}%)',
                   (p.get_x() + p.get_width() / 2., bar_height),
                   ha='center', va='center', xytext=(0,10), textcoords='offset points')
    plt.title('Class Distribution with Percentages')
    plt.show()

# 3.3 Text Length Analysis
print("\n📏 TEXT LENGTH ANALYSIS")
print("-"*40)

# Calculate text statistics with the vectorized .str accessor:
# characters per text and whitespace-separated tokens per text.
# Missing texts yield NaN here instead of raising, unlike .apply(len).
df['text_length'] = df['tweet'].str.len()
df['word_count'] = df['tweet'].str.split().str.len()

# Plot distributions: side-by-side histograms (50 bins, KDE overlay)
fig, axes = plt.subplots(1, 2, figsize=(15,5))
sns.histplot(df['text_length'], bins=50, ax=axes[0], kde=True)
axes[0].set_title('Character Length Distribution')
sns.histplot(df['word_count'], bins=50, ax=axes[1], kde=True)
axes[1].set_title('Word Count Distribution')
plt.show()

# Show text length by class (if available)
if 'class' in df.columns:
    plt.figure(figsize=(12,6))
    sns.boxplot(data=df, x='class', y='word_count')
    plt.title('Word Count Distribution by Class')
    plt.show()

# ======================
# 4. TEXT PREPROCESSING
# ======================
"""
WHY: Clean text data improves model performance and vector search quality.
HOW: We'll implement a comprehensive cleaning pipeline with explanations.
"""

# Section banner for the cleaning / preprocessing stage
print("\n" + "="*50)
print("🧹 TEXT CLEANING & PREPROCESSING")
print("="*50)

# Install necessary packages
# The `!` lines are IPython shell escapes, so this cell only runs inside a
# notebook / IPython session. They must run before the imports below.
!pip install -U spacy
!python -m spacy download en_core_web_sm
import spacy
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK 'stopwords' corpus that stopwords.words() reads below
nltk.download('stopwords')

# Initialize NLP tools
# spaCy English pipeline with the 'parser' and 'ner' components disabled;
# clean_text() only reads token.lemma_ from it.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# English stopwords held in a set; clean_text() tests membership per word
stop_words = set(stopwords.words('english'))
tqdm.pandas()  # Enable progress bars (provides the progress_apply used when cleaning)

# Patterns are compiled once so every row reuses them instead of looking
# them up again on each call.
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_MENTION_HASHTAG_PATTERN = re.compile(r'@\w+|#\w+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(text):
    """
    Comprehensive text cleaning function with explanations for each step.

    Relies on the module-level spaCy pipeline `nlp` and the `stop_words` set.

    Parameters:
    text (str): Raw input text. Any non-string value (None, NaN, numbers)
        is treated as missing text.

    Returns:
    str: Cleaned text ready for vectorization; '' when the input is not a
        string or nothing is left after cleaning.
    """
    # 0. Missing / non-text cells (e.g. NaN from pandas) clean to an empty string
    if not isinstance(text, str):
        return ''

    # 1. Lowercasing (standardizes text)
    text = text.lower()

    # 2. Remove URLs (common in social media text)
    text = _URL_PATTERN.sub('', text)

    # 3. Remove user mentions (@) and hashtags (#)
    text = _MENTION_HASHTAG_PATTERN.sub('', text)

    # 4. Remove special characters and numbers (keep only letters)
    text = _NON_LETTER_PATTERN.sub('', text)

    # 5. Remove extra whitespace
    text = _WHITESPACE_PATTERN.sub(' ', text).strip()

    # Nothing left to lemmatize: skip the spaCy call entirely
    if not text:
        return ''

    # 6. Lemmatization (reduce words to base form)
    doc = nlp(text)
    lemmatized = ' '.join(token.lemma_ for token in doc)

    # 7. Remove stopwords (common words that add little meaning) and any
    #    word of two characters or fewer
    return ' '.join(word for word in lemmatized.split()
                    if word not in stop_words and len(word) > 2)

# Apply cleaning with progress bar
print("🔄 Cleaning text... (this may take a few minutes)")
df['cleaned_text'] = df['tweet'].progress_apply(clean_text)

# Show before/after examples
print("\n🆚 BEFORE/AFTER CLEANING EXAMPLES")
# At most three rows are shown. Bounding the range by len(df) avoids an
# IndexError from .iloc when the frame has fewer than three rows.
for i in range(min(3, len(df))):
    print(f"\nOriginal ({i}): {df['tweet'].iloc[i]}")
    print(f"Cleaned ({i}): {df['cleaned_text'].iloc[i]}")

# ======================
# 5. ADVANCED EDA
# ======================
"""
WHY: Deeper understanding of text patterns and relationships.
HOW: We'll use visualization and statistical analysis of cleaned text.
"""

# Section banner, emitted with a single print call
print("\n" + "="*50, "🔍 ADVANCED TEXT ANALYSIS", "="*50, sep="\n")

# 5.1 Word Clouds by Class
print("\n☁️ WORD CLOUDS BY CLASS")
if 'class' in df.columns:
    for label in sorted(df['class'].unique()):
        # All cleaned texts of this class joined into one string
        text = ' '.join(df.loc[df['class'] == label, 'cleaned_text'])

        # Class 1 is drawn in reds, every other class in blues
        if label == 1:
            cloud_colormap = 'Reds'
        else:
            cloud_colormap = 'Blues'

        cloud_builder = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap=cloud_colormap,
            max_words=100,
        )
        wordcloud = cloud_builder.generate(text)

        # One figure per class, axes hidden
        plt.figure(figsize=(10,5))
        plt.imshow(wordcloud)
        plt.title(f'Class {label} - Most Frequent Words')
        plt.axis('off')
        plt.show()

# 5.2 N-gram Analysis
print("\n📊 N-GRAM ANALYSIS (Most Common Phrases)")

def plot_top_ngrams(series, n=20, ngram_range=(1,1), title=""):
    """
    Analyze and visualize most common n-grams as a horizontal bar chart.

    Parameters:
    series: Text data to analyze (iterable of strings)
    n: Number of top n-grams to show
    ngram_range: (min_n, max_n) range of n-grams (1=unigram, 2=bigram, etc.)
    title: Plot title suffix

    Returns:
    None. The chart is shown with plt.show(); if no vocabulary can be built
    from `series`, a notice is printed and nothing is plotted.
    """
    vec = CountVectorizer(ngram_range=ngram_range, max_features=2000)
    try:
        # fit_transform learns the vocabulary and counts in one pass over the
        # corpus instead of fitting and transforming separately.
        bag_of_words = vec.fit_transform(series)
    except ValueError as err:
        # scikit-learn raises ValueError when no usable token is found
        # (e.g. every document is empty); skip the plot instead of aborting.
        print(f"⚠️ Skipping n-gram plot ({title}): {err}")
        return

    # Column sums = total occurrences of each vocabulary entry
    sum_words = bag_of_words.sum(axis=0)
    words_freq = sorted(
        ((word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()),
        key=lambda item: item[1],
        reverse=True,
    )
    top = words_freq[:n]

    # Readable label: "1" for unigrams only, "1-2" for a mixed range
    min_n, max_n = ngram_range
    ngram_label = str(min_n) if min_n == max_n else f"{min_n}-{max_n}"

    plt.figure(figsize=(10,5))
    sns.barplot(x=[count for _, count in top],
                y=[word for word, _ in top])
    plt.title(f'Top {n} {ngram_label}-grams: {title}')
    plt.show()

# Unigrams first, then bigrams, both over the whole cleaned corpus
for ngram_size in (1, 2):
    plot_top_ngrams(
        df['cleaned_text'],
        ngram_range=(ngram_size, ngram_size),
        title="All Text",
    )

# Class-specific n-grams (if available): unigrams and bigrams together,
# one chart per class label in sorted order
if 'class' in df.columns:
    for label in sorted(df['class'].unique()):
        class_text = df.loc[df['class'] == label, 'cleaned_text']
        plot_top_ngrams(class_text, ngram_range=(1,2), title=f"Class {label}")

# ======================
# 6. VECTOR SEARCH PREPARATION
# ======================
"""
WHY: Prepare text embeddings for efficient similarity search with LLMs.
HOW: We'll generate sentence embeddings and visualize the vector space.
"""

print("\n" + "="*50)
print("🔮 VECTOR SEARCH PREPARATION")
print("="*50)

# 6.1 Install and setup sentence transformers
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer

print("\n🔧 Setting up sentence transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
"""
Model choice explanation:
- 'all-MiniLM-L6-v2': Good balance between speed (384-dim) and quality
- Other options: 
  - 'all-mpnet-base-v2' (higher quality, 768-dim)
  - 'multi-qa-mpnet-base-dot-v1' (optimized for semantic search)
"""

# 6.2 Generate embeddings (using sample for demonstration)
sample_size = min(1000, len(df))  # Adjust based on your resources
sample_texts = df['cleaned_text'].sample(sample_size, random_state=42).tolist()

print(f"\n🔄 Generating embeddings for {sample_size} samples...")
embeddings = model.encode(sample_texts, 
                         batch_size=32, 
                         show_progress_bar=True,
                         convert_to_numpy=True)

# 6.3 Dimensionality reduction for visualization
print("\n🎨 Visualizing embeddings with t-SNE...")
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
embeddings_2d = tsne.fit_transform(embeddings)

# Create visualization
plt.figure(figsize=(12,10))
if 'class' in df.columns:
    classes = df.loc[sample_texts.index, 'class']
    scatter = plt.scatter(embeddings_2d[:,0], embeddings_2d[:,1], 
                         c=classes, alpha=0.6, cmap='viridis')
    plt.colorbar(scatter).set_label('Class')
else:
    plt.scatter(embeddings_2d[:,0], embeddings_2d[:,1], alpha=0.6)
plt.title('t-SNE Visualization of Text Embeddings')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()

# 6.4 Save embeddings for future use
print("\n💾 Saving embeddings for vector search...")
embedding_df = pd.DataFrame({
    'text': sample_texts,
    'embedding': list(embeddings)
})

if 'class' in df.columns:
    embedding_df['class'] = df.loc[sample_texts.index, 'class'].values

# Example: Save to file
embedding_df.to_pickle('text_embeddings.pkl')
print("✅ Embeddings saved to 'text_embeddings.pkl'")

# ======================
# 7. NEXT STEPS
# ======================
"""
Suggestions for what to do next with your prepared data:
1. Vector Search: Use FAISS or Annoy for efficient similarity search
2. Classification: Train a model using the embeddings as features
3. Clustering: Group similar texts using K-Means or HDBSCAN
4. Topic Modeling: Discover latent topics with LDA or BERTopic
"""

print("\n" + "="*50)
print("🚀 NEXT STEPS SUGGESTIONS")
print("="*50)

# Copy-paste starting points printed for the reader. The classification
# snippet takes its labels from embedding_df['class'], which is row-aligned
# with `embeddings`; `sample_texts` is a list and has no usable `.index`.
next_steps = """
Recommended next steps for your project:

1. 🎯 VECTOR SEARCH IMPLEMENTATION:
   - Install: !pip install faiss-cpu
   - Build index: 
     import faiss
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(embeddings)

2. 🤖 CLASSIFICATION MODEL:
   from sklearn.ensemble import RandomForestClassifier
   clf = RandomForestClassifier()
   clf.fit(embeddings, embedding_df['class'])

3. 🌀 CLUSTERING ANALYSIS:
   from sklearn.cluster import KMeans
   kmeans = KMeans(n_clusters=3)
   clusters = kmeans.fit_predict(embeddings)

4. 📚 TOPIC MODELING:
   !pip install bertopic
   from bertopic import BERTopic
   topic_model = BERTopic()
   topics, _ = topic_model.fit_transform(sample_texts)
"""

print(next_steps)
print("\n✨ EDA and Vector Search Preparation Complete! ✨")