# 1.0 Data Exploration and Initial Analysis

This notebook performs initial exploration of our dataset and analyzes the quality of different columns before implementing the cleaning pipeline.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from langdetect import detect
import spacy
import warnings
import logging
from typing import List, Dict, Any
from pathlib import Path

# Configure warnings and logging.
# NOTE: filterwarnings('ignore') silences every warning for the rest of the
# notebook, including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set plotting style and figure size defaults.
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and 3.8 removed the old names, so plt.style.use('seaborn') raises OSError on
# current releases. Try the new name first and fall back for old installs.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = [12, 8]

# Try loading the spaCy model; download it once if it is not installed.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    logger.error("spaCy model 'en_core_web_sm' not found. Installing...")
    import subprocess
    import sys
    # Use the interpreter that is running this notebook so the model lands in
    # the same environment (a bare 'python' may resolve to a different one),
    # and let a failed download raise instead of being silently ignored.
    subprocess.run(
        [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'],
        check=True,
    )
    nlp = spacy.load('en_core_web_sm')
except Exception as e:
    logger.error(f"Error loading spaCy model: {str(e)}")
    raise

## 1. Data Loading and Initial Inspection

In [None]:
# Load the dataset, validate its raw schema, and expose the working column
# names that the analysis cells below read.
try:
    data_path = Path('../data/raw/zero_waste.csv')
    if not data_path.exists():
        raise FileNotFoundError(f"Data file not found at {data_path}")

    df = pd.read_csv(data_path)

    # Validate that required columns exist based on config
    required_columns = ['text', 'hashtags', 'place_country_code', 'Developed / Developing']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    # The cells below read df['country_code'] and df['development_status'],
    # which do not exist in the raw file; without this step they fail with
    # KeyError. Rename the validated raw columns to those working names,
    # skipping any alias that is already present to avoid duplicate columns.
    column_aliases = {
        'place_country_code': 'country_code',
        'Developed / Developing': 'development_status',
    }
    rename_map = {
        raw_name: alias
        for raw_name, alias in column_aliases.items()
        if alias not in df.columns
    }
    if rename_map:
        df = df.rename(columns=rename_map)
        logger.info(f"Renamed columns for analysis: {rename_map}")

    # Display basic information
    print("Dataset Shape:", df.shape)
    print("\nColumns:", df.columns.tolist())
    print("\nDataset Info:")
    df.info()
    print("\nMissing Values:")
    print(df.isnull().sum())

    # Log successful data load
    logger.info(f"Successfully loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns")

except FileNotFoundError as e:
    logger.error(f"Error loading data: {str(e)}")
    raise
except ValueError as e:
    logger.error(f"Data validation error: {str(e)}")
    raise
except Exception as e:
    logger.error(f"Unexpected error loading data: {str(e)}")
    raise

## 2. Text Column Analysis

In [None]:
def analyze_text_column(texts):
    """Summarise a text column and plot its length distribution.

    Args:
        texts: pandas Series holding the text values.

    Returns:
        pandas Series with the keys total_rows, null_count, empty_count,
        unique_count, avg_length, min_length and max_length.

    Raises:
        TypeError: if ``texts`` is not a pandas Series.
        ValueError: if ``texts`` is empty, or if any computed metric is NaN.
    """
    # Guard clauses: reject anything that is not a non-empty Series
    if not isinstance(texts, pd.Series):
        raise TypeError("Input must be a pandas Series")
    if texts.empty:
        raise ValueError("Input Series is empty")

    try:
        # Character length per row, reused for the statistics and the histogram
        lengths = texts.str.len()
        blank_mask = texts.str.strip().eq('')

        metrics = {
            'total_rows': len(texts),
            'null_count': texts.isnull().sum(),
            'empty_count': blank_mask.sum(),
            'unique_count': texts.nunique(),
            'avg_length': lengths.mean(),
            'min_length': lengths.min(),
            'max_length': lengths.max(),
        }

        # A NaN metric means the statistics could not be computed
        for value in metrics.values():
            if pd.isna(value):
                raise ValueError("Error calculating metrics - check input data")

        # Histogram of text lengths
        plt.figure(figsize=(12, 6))
        lengths.hist(bins=50)
        plt.title('Text Length Distribution')
        plt.xlabel('Length')
        plt.ylabel('Frequency')
        plt.show()

        return pd.Series(metrics)

    except Exception as e:
        # Log and propagate so the calling cell decides how to react
        logger.error(f"Error analyzing text column: {str(e)}")
        raise

# Run the text-column summary; a failure is printed and the notebook continues.
try:
    text_metrics = analyze_text_column(df['text'])
    print("Text Column Metrics:", text_metrics, sep="\n")
except Exception as err:
    print(f"Failed to analyze text column: {str(err)}")

In [None]:
def detect_text_issues(texts):
    """Count common quality issues in a text column.

    Null values and strings that are empty after stripping are skipped.
    Every remaining text is checked for: a URL, a detected language other
    than English, fewer than ``min_words`` whitespace-separated words,
    characters outside word characters / whitespace / ``.,!?'"-``, and a
    case-insensitive ``rt `` prefix.

    Args:
        texts: pandas Series holding the text values.

    Returns:
        dict mapping issue name ('urls', 'non_english', 'short_text',
        'special_chars', 'rt_prefix') to the number of affected texts.

    Raises:
        TypeError: if ``texts`` is not a pandas Series.
    """
    # Input validation
    if not isinstance(texts, pd.Series):
        raise TypeError("Input must be a pandas Series")

    # Positions (enumerate order, not index labels) of the affected texts
    issues = {
        'urls': [],
        'non_english': [],
        'short_text': [],
        'special_chars': [],
        'rt_prefix': []
    }

    # Get validation parameters from config
    min_words = 3  # From cleaning_config validation min_words

    # Compile the patterns once instead of on every iteration.
    # URL regex from cleaning config
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    # Anything that is not a word character, whitespace or allowed punctuation
    special_char_pattern = re.compile(r'[^\w\s.,!?\'"-]')

    for idx, text in enumerate(texts):
        if pd.isna(text):
            continue

        text = str(text).strip()
        if not text:
            continue

        if url_pattern.search(text):
            issues['urls'].append(idx)

        # Check language using language detection model.
        # A detection failure is logged, but must not skip the remaining
        # checks: the texts it fails on still need the short-text,
        # special-character and RT-prefix checks below.
        try:
            if detect(text) != 'en':
                issues['non_english'].append(idx)
        except Exception as e:
            logger.warning(f"Language detection failed for text at index {idx}: {str(e)}")

        # Check minimum word count
        if len(text.split()) < min_words:
            issues['short_text'].append(idx)

        if special_char_pattern.search(text):
            issues['special_chars'].append(idx)

        # Check RT prefix case-insensitively
        if text.lower().startswith('rt '):
            issues['rt_prefix'].append(idx)

    # Calculate issue counts
    issue_counts = {name: len(positions) for name, positions in issues.items()}

    # A non-empty input with zero findings is suspicious enough to flag
    if sum(issue_counts.values()) == 0 and len(texts) > 0:
        logger.warning("No issues detected - verify detection logic")

    return issue_counts

# Report each issue count and its share of all rows; failures are logged and re-raised.
try:
    text_issues = detect_text_issues(df['text'])
    print("\nText Issues Found:")
    row_total = len(df)
    for issue, count in text_issues.items():
        # Avoid dividing by zero on an empty frame
        if row_total > 0:
            percentage = count / row_total * 100
        else:
            percentage = 0
        print(f"{issue}: {count} instances ({percentage:.2f}%)")
except Exception as err:
    logger.error(f"Failed to detect text issues: {str(err)}")
    raise

## 3. Hashtag Analysis

In [None]:
def analyze_hashtags(hashtags):
    """Compute hashtag statistics and plot the 20 most frequent tags.

    String entries are split on whitespace and every token is given a
    leading '#' if it lacks one. Non-string, non-null entries contribute no
    tags but still count towards 'rows_with_tags' and 'max_tags_per_row'.

    Args:
        hashtags: pandas Series of whitespace-separated hashtag strings.

    Returns:
        pandas Series with the keys total_hashtags, unique_hashtags,
        avg_per_row, rows_with_tags, empty_rows and max_tags_per_row.

    Raises:
        TypeError: if ``hashtags`` is not a pandas Series.
    """
    if not isinstance(hashtags, pd.Series):
        raise TypeError("Input must be a pandas Series")

    present = hashtags.dropna()

    # Tally normalised tags directly; Counter keeps first-seen order for ties
    tag_counts = Counter()
    for entry in present:
        if not isinstance(entry, str):
            continue
        for token in entry.split():
            tag_counts[token if token.startswith('#') else f'#{token}'] += 1

    total_tags = sum(tag_counts.values())
    row_count = len(hashtags)

    # Token counts per non-null row, using the string form of each entry
    tokens_per_row = [len(str(entry).split()) for entry in present]

    metrics = {
        'total_hashtags': total_tags,
        'unique_hashtags': len(tag_counts),
        'avg_per_row': total_tags / row_count if row_count > 0 else 0,
        'rows_with_tags': hashtags.notna().sum(),
        'empty_rows': hashtags.isna().sum(),
        'max_tags_per_row': max(tokens_per_row, default=0)
    }

    if not tag_counts:
        print("No hashtags found to plot")
    else:
        # Bar chart of the most frequent hashtags
        plt.figure(figsize=(12, 6))
        top_tags = pd.Series(dict(tag_counts.most_common(20)))
        top_tags.plot(kind='bar')
        plt.title('Top 20 Hashtags')
        plt.xlabel('Hashtag')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

    return pd.Series(metrics)

# Run the hashtag summary; a failure is printed and the notebook continues.
try:
    hashtag_metrics = analyze_hashtags(df['hashtags'])
    print("\nHashtag Metrics:", hashtag_metrics, sep="\n")
except Exception as err:
    print(f"Error analyzing hashtags: {str(err)}")

## 4. Country Code Analysis

In [None]:
def analyze_country_codes(codes):
    """Analyze country code distribution and validity.

    Codes are upper-cased and compared against the ISO 3166-1 alpha-2 list
    (plus 'UK', accepted as a common variant of 'GB'). The 20 most frequent
    codes are plotted when at least one non-null code exists.

    Args:
        codes: pandas Series of country code strings; nulls are allowed.

    Returns:
        pandas Series with the keys total_rows, null_count, unique_codes,
        valid_codes, invalid_codes, most_common_code, valid_codes_pct and
        invalid_codes_pct. 'most_common_code' is the string 'None' when the
        column holds no non-null code.

    Raises:
        TypeError: if ``codes`` is not a pandas Series.
    """
    if not isinstance(codes, pd.Series):
        raise TypeError("Input must be a pandas Series")

    # ISO 3166-1 alpha-2 codes. The last row of official codes (AX ... SX)
    # was missing from the earlier list, so e.g. Serbia (RS) and
    # Montenegro (ME) were reported as invalid.
    valid_codes = {
        "AF", "AL", "DZ", "AS", "AD", "AO", "AI", "AQ", "AG", "AR", "AM", "AW", "AU", "AT", "AZ",
        "BS", "BH", "BD", "BB", "BY", "BE", "BZ", "BJ", "BM", "BT", "BO", "BA", "BW", "BV", "BR",
        "IO", "BN", "BG", "BF", "BI", "KH", "CM", "CA", "CV", "KY", "CF", "TD", "CL", "CN", "CX",
        "CC", "CO", "KM", "CG", "CD", "CK", "CR", "CI", "HR", "CU", "CY", "CZ", "DK", "DJ", "DM",
        "DO", "EC", "EG", "SV", "GQ", "ER", "EE", "ET", "FK", "FO", "FJ", "FI", "FR", "GF", "PF",
        "TF", "GA", "GM", "GE", "DE", "GH", "GI", "GR", "GL", "GD", "GP", "GU", "GT", "GN", "GW",
        "GY", "HT", "HM", "VA", "HN", "HK", "HU", "IS", "IN", "ID", "IR", "IQ", "IE", "IL", "IT",
        "JM", "JP", "JO", "KZ", "KE", "KI", "KP", "KR", "KW", "KG", "LA", "LV", "LB", "LS", "LR",
        "LY", "LI", "LT", "LU", "MO", "MK", "MG", "MW", "MY", "MV", "ML", "MT", "MH", "MQ", "MR",
        "MU", "YT", "MX", "FM", "MD", "MC", "MN", "MS", "MA", "MZ", "MM", "NA", "NR", "NP", "NL",
        "NC", "NZ", "NI", "NE", "NG", "NU", "NF", "MP", "NO", "OM", "PK", "PW", "PS", "PA", "PG",
        "PY", "PE", "PH", "PN", "PL", "PT", "PR", "QA", "RE", "RO", "RU", "RW", "SH", "KN", "LC",
        "PM", "VC", "WS", "SM", "ST", "SA", "SN", "SC", "SL", "SG", "SK", "SI", "SB", "SO", "ZA",
        "GS", "ES", "LK", "SD", "SR", "SJ", "SZ", "SE", "CH", "SY", "TW", "TJ", "TZ", "TH", "TL",
        "TG", "TK", "TO", "TT", "TN", "TR", "TM", "TC", "TV", "UG", "UA", "AE", "GB", "US", "UM",
        "UY", "UZ", "VU", "VE", "VN", "VG", "VI", "WF", "EH", "YE", "ZM", "ZW",
        "AX", "BL", "BQ", "CW", "GG", "IM", "JE", "ME", "MF", "RS", "SS", "SX",
        "UK",  # Including UK as common variant
    }

    # Convert codes to uppercase for consistent comparison. Casting to object
    # first lets an all-NaN column (read by pandas as float) through the .str
    # accessor instead of raising AttributeError.
    codes = codes.astype(object).str.upper()

    non_null = codes.dropna()
    valid_count = int(non_null.isin(valid_codes).sum())
    invalid_count = len(non_null) - valid_count

    # mode() is empty when every value is null, so indexing it blindly fails
    modes = codes.mode()

    # Calculate metrics
    metrics = {
        'total_rows': len(codes),
        'null_count': codes.isnull().sum(),
        'unique_codes': codes.nunique(),
        'valid_codes': valid_count,
        'invalid_codes': invalid_count,
        'most_common_code': modes.iloc[0] if not modes.empty else 'None'
    }

    # Calculate percentages over the non-null codes only
    total_non_null = len(non_null)
    if total_non_null > 0:
        metrics['valid_codes_pct'] = (valid_count / total_non_null) * 100
        metrics['invalid_codes_pct'] = (invalid_count / total_non_null) * 100
    else:
        metrics['valid_codes_pct'] = 0
        metrics['invalid_codes_pct'] = 0

    # Plot code distribution (skipped when there is nothing to draw)
    value_counts = codes.value_counts().head(20)
    if value_counts.empty:
        print("No country codes found to plot")
    else:
        plt.figure(figsize=(12, 6))
        ax = value_counts.plot(kind='bar')
        plt.title('Country Code Distribution (Top 20)')
        plt.xlabel('Country Code')
        plt.ylabel('Count')
        plt.xticks(rotation=45, ha='right')

        # Add value labels on top of bars
        for i, v in enumerate(value_counts):
            ax.text(i, v, str(v), ha='center', va='bottom')

        plt.tight_layout()
        plt.show()

    return pd.Series(metrics)

# Run the country-code summary on df['country_code']; a failure is printed
# and the notebook continues.
try:
    country_metrics = analyze_country_codes(df['country_code'])
    print("\nCountry Code Metrics:", country_metrics, sep="\n")
except Exception as err:
    print(f"Error analyzing country codes: {str(err)}")

## 5. Development Status Analysis

In [None]:
def analyze_development_status(status):
    """Analyze development status distribution and standardization needs.

    Values are lower-cased and nulls become 'unknown'. A value counts as
    developed if it contains 'developed' or 'advanced', as developing if it
    contains 'developing' or 'emerging', and as unclear if it contains none
    of those terms. The checks are substring matches, so one value can be
    counted as both developed and developing.

    Args:
        status: pandas Series of development-status labels; nulls allowed.

    Returns:
        pandas Series with the keys total_rows, null_count, unique_values,
        developed_count, developing_count, unclear_count, developed_pct,
        developing_pct and unclear_pct. Percentages are 0 for empty input.

    Raises:
        TypeError: if ``status`` is not a pandas Series.
    """
    if not isinstance(status, pd.Series):
        raise TypeError("Input must be a pandas Series")

    # Standardize status for analysis. Casting to object first lets an
    # all-NaN column (read by pandas as float) through the .str accessor.
    status = status.astype(object).str.lower().fillna('unknown')

    developed_terms = ('developed', 'advanced')
    developing_terms = ('developing', 'emerging')
    known_terms = developed_terms + developing_terms

    def _mentions(value, terms):
        """Return True if ``value`` contains any of ``terms``."""
        return any(term in value for term in terms)

    metrics = {
        'total_rows': len(status),
        'null_count': (status == 'unknown').sum(),
        'unique_values': status.nunique(),
        'developed_count': sum(1 for s in status if _mentions(s, developed_terms)),
        'developing_count': sum(1 for s in status if _mentions(s, developing_terms)),
        'unclear_count': sum(1 for s in status if not _mentions(s, known_terms))
    }

    # Calculate percentages; an empty column would otherwise divide by zero
    total = metrics['total_rows']
    for group in ('developed', 'developing', 'unclear'):
        count = metrics[f'{group}_count']
        metrics[f'{group}_pct'] = (count / total) * 100 if total > 0 else 0

    # Plot status distribution (skipped when there is nothing to draw)
    status_counts = status.value_counts()
    if status_counts.empty:
        print("No development status values found to plot")
    else:
        plt.figure(figsize=(10, 6))
        # value_counts() orders slices by frequency, so pick each colour from
        # the label rather than the position: green for 'developed', red for
        # 'developing', gray for anything else.
        palette = {'developed': '#2ecc71', 'developing': '#e74c3c'}
        colors = [palette.get(label, '#95a5a6') for label in status_counts.index]
        plt.pie(status_counts, labels=status_counts.index, autopct='%1.1f%%', colors=colors)
        plt.title('Development Status Distribution')
        plt.axis('equal')

        # Add legend
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.show()

    return pd.Series(metrics)

# Run the development-status summary on df['development_status']; a failure
# is printed and the notebook continues.
try:
    status_metrics = analyze_development_status(df['development_status'])
    print("\nDevelopment Status Metrics:", status_metrics, sep="\n")
except Exception as err:
    print(f"Error analyzing development status: {str(err)}")

## 6. Cross-Column Analysis

In [None]:
# Analyze relationships between columns
def analyze_cross_columns(df):
    """Plot relationships between text length, hashtags, country and status.

    Draws three figures: text length by country code, hashtag count by
    development status, and a correlation heatmap of text length, hashtag
    count and an 'is_developed' flag (1 when the lower-cased status contains
    'developed').

    Side effect: adds a 'hashtag_count' column to ``df`` in place.

    Args:
        df: pandas DataFrame containing the columns 'text', 'hashtags',
            'country_code' and 'development_status'.

    Raises:
        ValueError: if any required column is missing; the message lists
            the missing columns.
        Exception: any other error is printed and re-raised unchanged.
    """
    try:
        # Validate input dataframe and name the columns that are missing
        required_columns = ['text', 'hashtags', 'country_code', 'development_status']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns in dataframe: {missing_columns}")

        # Text length by country (null text counts as length 0)
        plt.figure(figsize=(12, 6))
        text_lengths = df['text'].fillna('').str.len()
        sns.boxplot(x='country_code', y=text_lengths, data=df)
        plt.title('Text Length Distribution by Country')
        plt.xticks(rotation=45)
        plt.show()

        # Hashtag count by development status (null hashtags count as 0 tags)
        df['hashtag_count'] = df['hashtags'].fillna('').str.split().str.len()
        plt.figure(figsize=(10, 6))
        sns.boxplot(x='development_status', y='hashtag_count', data=df)
        plt.title('Hashtag Count by Development Status')
        plt.show()

        # Correlation matrix
        correlation_data = pd.DataFrame({
            'text_length': text_lengths,
            'hashtag_count': df['hashtag_count'],
            'is_developed': (df['development_status'].str.lower().str.contains('developed', na=False)).astype(int)
        })

        plt.figure(figsize=(8, 6))
        sns.heatmap(correlation_data.corr(), annot=True, cmap='coolwarm')
        plt.title('Correlation Matrix')
        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"Error in cross-column analysis: {str(e)}")
        raise

# Run the cross-column plots; a failure is printed and the notebook continues.
try:
    analyze_cross_columns(df)
except Exception as err:
    print(f"Failed to perform cross-column analysis: {str(err)}")