# 1.0 Data Exploration and Initial Analysis

This notebook performs initial exploration of our dataset and analyzes the quality of different columns before implementing the cleaning pipeline.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from langdetect import detect
import spacy
import warnings
# Silence library warnings so the exploratory output stays readable
warnings.filterwarnings('ignore')

# Set plotting style
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and is no longer shipped by newer releases, so use whichever
# of the two names this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette('husl')

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

## 1. Data Loading and Initial Inspection

In [None]:
# Read the raw CSV into the working DataFrame used by every later cell
df = pd.read_csv('../data/raw/input_dataset.csv')

# Report size, schema, dtypes and per-column null counts
print("Dataset Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nDataset Info:")
df.info()
print("\nMissing Values:", df.isna().sum(), sep="\n")

## 2. Text Column Analysis

In [None]:
def analyze_text_column(texts):
    """Summarise a text column and plot the distribution of its lengths.

    Parameters
    ----------
    texts : pandas.Series
        Column to inspect; it is accessed through the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        Row count, null count, blank (whitespace-only) count, unique count
        and the mean / minimum / maximum character length.
    """
    # Per-row character length, reused by the metrics and the histogram
    char_lengths = texts.str.len()
    is_blank = texts.str.strip().eq('')

    summary = {
        'total_rows': len(texts),
        'null_count': texts.isnull().sum(),
        'empty_count': is_blank.sum(),
        'unique_count': texts.nunique(),
        'avg_length': char_lengths.mean(),
        'min_length': char_lengths.min(),
        'max_length': char_lengths.max(),
    }

    # Histogram of text lengths
    plt.figure(figsize=(12, 6))
    char_lengths.hist(bins=50)
    plt.title('Text Length Distribution')
    plt.xlabel('Length')
    plt.ylabel('Frequency')
    plt.show()

    return pd.Series(summary)

# Summarise the `text` column and print the resulting metrics
text_metrics = analyze_text_column(df['text'])
print("Text Column Metrics:", text_metrics, sep="\n")

In [None]:
def detect_text_issues(texts):
    """Count common data-quality issues in a collection of texts.

    Parameters
    ----------
    texts : iterable
        Text values (for example a pandas Series). Null entries are skipped;
        any other non-string entry is converted with ``str`` before checking.

    Returns
    -------
    dict
        Maps each issue name (``urls``, ``non_english``, ``short_text``,
        ``special_chars``, ``rt_prefix``) to the number of affected entries.
        An entry whose language cannot be detected is not counted as
        ``non_english``.
    """
    # Compile the patterns once instead of on every row.
    # `http\S+` already matches anything starting with "https", so no
    # separate alternative is needed for it.
    url_pattern = re.compile(r'http\S+|www\S+')
    special_char_pattern = re.compile(r'[^\w\s.,!?]')
    # Require a word boundary after the prefix so that words which merely
    # start with those letters ("RTX ...", "rtfm ...") are not counted.
    rt_pattern = re.compile(r'rt\b', re.IGNORECASE)

    issues = {
        'urls': [],
        'non_english': [],
        'short_text': [],
        'special_chars': [],
        'rt_prefix': []
    }

    for idx, text in enumerate(texts):
        if pd.isna(text):
            continue
        if not isinstance(text, str):
            # e.g. a numeric value in a mostly-text column
            text = str(text)

        # Check for URLs
        if url_pattern.search(text):
            issues['urls'].append(idx)

        # Check language
        try:
            if detect(text) != 'en':
                issues['non_english'].append(idx)
        except Exception:
            # Best effort: a failed detection leaves the row unclassified.
            # Unlike a bare `except:`, this still lets KeyboardInterrupt and
            # SystemExit stop a long-running loop.
            pass

        # Check length (fewer than three whitespace-separated tokens)
        if len(text.split()) < 3:
            issues['short_text'].append(idx)

        # Check special characters (anything but word chars, whitespace and . , ! ?)
        if special_char_pattern.search(text):
            issues['special_chars'].append(idx)

        # Check RT prefix
        if rt_pattern.match(text):
            issues['rt_prefix'].append(idx)

    return {k: len(v) for k, v in issues.items()}

# Count each issue type in the `text` column and report it as a share of all rows
text_issues = detect_text_issues(df['text'])
print("\nText Issues Found:")
for issue in text_issues:
    count = text_issues[issue]
    print(f"{issue}: {count} instances ({count/len(df)*100:.2f}%)")

## 3. Hashtag Analysis

In [None]:
def analyze_hashtags(hashtags):
    """Summarise hashtag usage and plot the 20 most frequent tags.

    Parameters
    ----------
    hashtags : pandas.Series
        One entry per row. String entries are split on whitespace into
        individual tags; null and non-string entries contribute no tags.

    Returns
    -------
    pandas.Series
        ``total_hashtags``, ``unique_hashtags``, ``avg_per_row`` (tags divided
        by the number of rows, including rows without tags; 0 for an empty
        Series) and ``rows_with_tags`` (rows that yielded at least one tag).
    """
    tag_counts = Counter()
    rows_with_tags = 0
    for tags in hashtags.dropna():
        if not isinstance(tags, str):
            continue
        row_tags = tags.split()
        if row_tags:
            # Only rows that really contain a tag count; a blank or
            # whitespace-only string is non-null but has no tags.
            rows_with_tags += 1
            tag_counts.update(row_tags)

    total_hashtags = sum(tag_counts.values())
    total_rows = len(hashtags)

    metrics = {
        'total_hashtags': total_hashtags,
        'unique_hashtags': len(tag_counts),
        # Guarded so an empty Series reports 0 instead of dividing by zero
        'avg_per_row': total_hashtags / total_rows if total_rows else 0.0,
        'rows_with_tags': rows_with_tags,
    }

    # Plot top hashtags; skipped when there are none because a bar plot of an
    # empty Series raises instead of drawing an empty chart.
    if tag_counts:
        plt.figure(figsize=(12, 6))
        pd.Series(dict(tag_counts.most_common(20))).plot(kind='bar')
        plt.title('Top 20 Hashtags')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    return pd.Series(metrics)

# Summarise the `hashtags` column and print the resulting metrics
hashtag_metrics = analyze_hashtags(df['hashtags'])
print("\nHashtag Metrics:", hashtag_metrics, sep="\n")

## 4. Country Code Analysis

In [None]:
def analyze_country_codes(codes, valid_codes=None):
    """Summarise country codes, count invalid ones and plot the top 20.

    Parameters
    ----------
    codes : pandas.Series
        Country code per row. Null entries are excluded from the validity
        check; other entries are compared case-insensitively.
    valid_codes : iterable of str, optional
        Codes accepted as valid (compared case-insensitively). Defaults to
        the small example list below.

    Returns
    -------
    pandas.Series
        ``total_rows``, ``null_count``, ``unique_codes`` and ``invalid_codes``.
    """
    if valid_codes is None:
        # Valid country codes (example list)
        # NOTE(review): 'UK' is not the ISO 3166-1 alpha-2 code for the United
        # Kingdom ('GB') - confirm which convention the dataset follows.
        valid_codes = {'US', 'UK', 'IN', 'CN', 'JP', 'DE', 'FR', 'IT', 'BR', 'CA'}
    else:
        valid_codes = {str(code).upper() for code in valid_codes}

    metrics = {
        'total_rows': len(codes),
        'null_count': codes.isnull().sum(),
        'unique_codes': codes.nunique(),
        # str() keeps a stray non-string value (e.g. a number) from raising
        # AttributeError; it is simply reported as invalid.
        'invalid_codes': sum(1 for code in codes.dropna() if str(code).upper() not in valid_codes)
    }

    # Plot code distribution; skipped when every value is null because a bar
    # plot of an empty Series raises instead of drawing an empty chart.
    top_codes = codes.value_counts().head(20)
    if not top_codes.empty:
        plt.figure(figsize=(12, 6))
        top_codes.plot(kind='bar')
        plt.title('Country Code Distribution (Top 20)')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    return pd.Series(metrics)

# Summarise the `country_code` column and print the resulting metrics
country_metrics = analyze_country_codes(df['country_code'])
print("\nCountry Code Metrics:", country_metrics, sep="\n")

## 5. Development Status Analysis

In [None]:
def analyze_development_status(status):
    """Summarise development-status labels and plot their shares as a pie chart.

    Labels are lower-cased first, then matched by substring: a label counts
    as "developed" if it contains ``developed``, as "developing" if it
    contains ``developing``, and as "unclear" if it contains neither.

    Parameters
    ----------
    status : pandas.Series
        Status label per row; accessed through the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        ``total_rows``, ``null_count``, ``unique_values``, ``developed_count``,
        ``developing_count`` and ``unclear_count``.
    """
    lowered = status.str.lower()

    # Single pass over the non-null labels, tallying each category
    developed_total = 0
    developing_total = 0
    unclear_total = 0
    for raw_label in lowered.dropna():
        label = str(raw_label)
        mentions_developed = 'developed' in label
        mentions_developing = 'developing' in label
        if mentions_developed:
            developed_total += 1
        if mentions_developing:
            developing_total += 1
        if not (mentions_developed or mentions_developing):
            unclear_total += 1

    summary = {
        'total_rows': len(lowered),
        'null_count': lowered.isnull().sum(),
        'unique_values': lowered.nunique(),
        'developed_count': developed_total,
        'developing_count': developing_total,
        'unclear_count': unclear_total
    }

    # Pie chart of the lower-cased label frequencies
    plt.figure(figsize=(10, 6))
    label_shares = lowered.value_counts()
    label_shares.plot(kind='pie', autopct='%1.1f%%')
    plt.title('Development Status Distribution')
    plt.axis('equal')
    plt.show()

    return pd.Series(summary)

# Summarise the `development_status` column and print the resulting metrics
status_metrics = analyze_development_status(df['development_status'])
print("\nDevelopment Status Metrics:", status_metrics, sep="\n")

## 6. Cross-Column Analysis

In [None]:
# Analyze relationships between columns
def analyze_cross_columns(df):
    """Plot relationships between the text, hashtag, country and status columns.

    Draws three figures: text length by country code, hashtag count by
    development status, and a correlation heatmap of text length, hashtag
    count and an ``is_developed`` flag.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``text``, ``hashtags``, ``country_code`` and
        ``development_status``.

    Notes
    -----
    Adds (or overwrites) a ``hashtag_count`` column on the DataFrame that is
    passed in; nothing is returned.
    """
    # Text length by country
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='country_code', y=df['text'].str.len(), data=df)
    plt.title('Text Length Distribution by Country')
    plt.xticks(rotation=45)
    plt.show()

    # Hashtag count by development status
    # Missing hashtags become '' so they count as zero tags
    df['hashtag_count'] = df['hashtags'].fillna('').str.split().str.len()
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='development_status', y='hashtag_count', data=df)
    plt.title('Hashtag Count by Development Status')
    plt.show()

    # Correlation matrix
    # na=False: without it str.contains returns NaN for rows with a missing
    # status and the integer cast below raises ValueError.
    is_developed = df['development_status'].str.lower().str.contains('developed', na=False)
    correlation_data = pd.DataFrame({
        'text_length': df['text'].str.len(),
        'hashtag_count': df['hashtag_count'],
        'is_developed': is_developed.astype(int)
    })

    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_data.corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()

# Perform cross-column analysis
# (this call also adds a `hashtag_count` column to `df` as a side effect)
analyze_cross_columns(df)