# Data Cleaning Results Analysis

This notebook analyzes the results of our ML-based data cleaning pipeline, comparing the original and cleaned datasets to evaluate the effectiveness of our cleaning process.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
from scipy import stats
import yaml

# Set style for visualizations.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8'
# and later releases removed the bare 'seaborn' alias, so pick whichever
# name this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette('husl')

# Configure pandas display options: show every column, cap printed rows at 50
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

In [None]:
# Load the original and cleaned datasets
original_df = pd.read_csv('../data/raw/input_dataset.csv')
cleaned_df = pd.read_csv('../data/processed/cleaned_dataset.csv')

# Load configuration.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('../configs/cleaning_config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

## 1. Basic Statistics Comparison

In [None]:
def calculate_basic_stats(original_df, cleaned_df):
    """Build a side-by-side table of headline statistics for both datasets.

    Parameters
    ----------
    original_df, cleaned_df : pandas.DataFrame
        The dataset before and after cleaning. Both must contain the columns
        'text', 'hashtags' and 'country_code'.

    Returns
    -------
    pandas.DataFrame
        One row per metric with the columns 'Metric', 'Original' and
        'Cleaned'; numeric values are rounded to two decimals. The
        'Records Removed' and 'Removal Rate (%)' rows are 0 for the original
        dataset. The removal rate is reported as 0 when the original dataset
        is empty (instead of raising ZeroDivisionError).
    """
    def _per_dataset_stats(df):
        # Metrics that are computed identically for either dataset.
        text_lengths = df['text'].str.len()
        return [
            text_lengths.mean(),
            text_lengths.median(),
            # '#' occurrences per record; records with a missing hashtags
            # value are skipped by mean(), not counted as zero.
            df['hashtags'].str.count('#').mean(),
            df['country_code'].nunique(),
            df.isnull().sum().sum(),
        ]

    original_count = len(original_df)
    cleaned_count = len(cleaned_df)
    records_removed = original_count - cleaned_count
    # Guard the denominator: an empty original dataset has nothing to remove.
    removal_rate = (records_removed / original_count * 100) if original_count else 0.0

    stats_dict = {
        'Metric': [
            'Total Records',
            'Records Removed',
            'Removal Rate (%)',
            'Average Text Length',
            'Median Text Length',
            'Average Hashtags per Record',
            'Unique Country Codes',
            'Missing Values (Total)'
        ],
        'Original': [original_count, 0, 0] + _per_dataset_stats(original_df),
        'Cleaned': [cleaned_count, records_removed, removal_rate] + _per_dataset_stats(cleaned_df)
    }

    return pd.DataFrame(stats_dict).round(2)

# Build the statistics table for the two loaded datasets; the bare name on the
# last line makes the notebook render it as the cell output.
basic_stats = calculate_basic_stats(original_df, cleaned_df)
basic_stats

## 2. Text Quality Analysis

In [None]:
def analyze_text_quality(df, column='text'):
    """Summarise word- and character-level quality metrics for a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset holding the text column.
    column : str, default 'text'
        Name of the column to analyse.

    Returns
    -------
    pandas.Series
        Metrics indexed by name: total and unique whitespace-separated words,
        mean words per text, standard deviation of character length, number
        of missing texts, and number of texts shorter than 10 characters.
    """
    texts = df[column]
    char_lengths = texts.str.len()
    word_counts = texts.str.split().str.len()

    # Vocabulary is taken over all non-missing texts joined into one string
    vocabulary = set(' '.join(texts.dropna()).split())

    return pd.Series({
        'Total Words': word_counts.sum(),
        'Unique Words': len(vocabulary),
        'Average Words per Text': word_counts.mean(),
        'Text Length Std Dev': char_lengths.std(),
        'Empty Texts': texts.isna().sum(),
        'Short Texts (<10 chars)': (char_lengths < 10).sum()
    })

# Score both datasets with the same metric function
original_quality = analyze_text_quality(original_df)
cleaned_quality = analyze_text_quality(cleaned_df)

# Side-by-side table; the last column is the relative change versus the original
quality_comparison = pd.DataFrame({
    'Original': original_quality,
    'Cleaned': cleaned_quality,
})
quality_comparison['Change (%)'] = (
    cleaned_quality.sub(original_quality).div(original_quality).mul(100).round(2)
)

quality_comparison

## 3. Visualization of Changes

In [None]:
def plot_text_length_distribution(original_df, cleaned_df):
    """Overlay histograms of text length (characters) before and after cleaning.

    Both histograms are drawn on one axes with a single, shared set of bin
    edges computed from the combined data, so bars at the same x position
    cover the same length interval and can be compared directly.

    Parameters
    ----------
    original_df, cleaned_df : pandas.DataFrame
        Datasets containing a 'text' column. Missing texts are ignored.
    """
    original_lengths = original_df['text'].str.len().dropna()
    cleaned_lengths = cleaned_df['text'].str.len().dropna()

    # Derive 50 common bins over the pooled lengths; fall back to a plain bin
    # count when there is no data to derive edges from.
    pooled_lengths = pd.concat([original_lengths, cleaned_lengths]).to_numpy(dtype=float)
    bins = np.histogram_bin_edges(pooled_lengths, bins=50) if pooled_lengths.size else 50

    fig, ax = plt.subplots(figsize=(12, 6))

    # Plot original distribution
    sns.histplot(x=original_lengths, label='Original', alpha=0.5, bins=bins, ax=ax)

    # Plot cleaned distribution
    sns.histplot(x=cleaned_lengths, label='Cleaned', alpha=0.5, bins=bins, ax=ax)

    ax.set_title('Text Length Distribution Before and After Cleaning')
    ax.set_xlabel('Text Length (characters)')
    ax.set_ylabel('Count')
    ax.legend()
    plt.show()

# Draw the overlaid text-length histograms for the two loaded datasets
plot_text_length_distribution(original_df, cleaned_df)

In [None]:
def plot_country_distribution(original_df, cleaned_df):
    """Show a grouped bar chart of record counts per country code.

    One bar series is drawn for each dataset, using the 'country_code'
    column of the given DataFrames.
    """
    # (legend label, bar colour, dataset) for each trace, in draw order
    trace_specs = (
        ('Original', 'lightblue', original_df),
        ('Cleaned', 'lightgreen', cleaned_df),
    )

    fig = go.Figure()
    for label, colour, frame in trace_specs:
        code_counts = frame['country_code'].value_counts()
        fig.add_trace(go.Bar(
            x=code_counts.index,
            y=code_counts.values,
            name=label,
            marker_color=colour
        ))

    fig.update_layout(
        title='Country Code Distribution Before and After Cleaning',
        xaxis_title='Country Code',
        yaxis_title='Count',
        barmode='group'
    )

    fig.show()

# Draw the grouped country-code bar chart for the two loaded datasets
plot_country_distribution(original_df, cleaned_df)

## 4. Hashtag Analysis

In [None]:
def analyze_hashtags(df, column='hashtags'):
    """Plot and summarise hashtag usage in a dataset.

    Each non-missing value of ``column`` is treated as a whitespace-separated
    list of hashtags. A two-panel figure is shown (top-10 bar chart and a
    word cloud), and summary statistics are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset holding the hashtag column.
    column : str, default 'hashtags'
        Name of the column to analyse.

    Returns
    -------
    pandas.Series
        'Total Hashtags', 'Unique Hashtags', 'Average Hashtags per Record'
        and 'Records with Hashtags (%)'. The two ratios are 0 when ``df`` has
        no rows.
    """
    # Split hashtags and create a list of all hashtags
    all_hashtags = [tag for tags in df[column].dropna()
                    for tag in str(tags).split()]

    # Count every hashtag once; the counts drive the statistics, the bar
    # chart and the word cloud so all three agree with each other.
    hashtag_counts = pd.Series(all_hashtags, dtype='object').value_counts()
    unique_hashtags = len(hashtag_counts)
    top_hashtags = hashtag_counts.head(10)

    # Plotting
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    if all_hashtags:
        # Plot top hashtags
        top_hashtags.plot(kind='barh', ax=ax1)

        # Build the cloud from the exact counts. Feeding the joined text to
        # generate() would re-tokenize it (dropping '#', filtering stopwords,
        # merging neighbouring tags into bigrams) and distort the sizes.
        wordcloud = WordCloud(width=800, height=400, background_color='white')
        wordcloud.generate_from_frequencies(hashtag_counts.to_dict())
        ax2.imshow(wordcloud, interpolation='bilinear')
    else:
        # Nothing to draw: both the bar plot and WordCloud raise on empty
        # input, so show a placeholder instead.
        for ax in (ax1, ax2):
            ax.text(0.5, 0.5, 'No hashtags found',
                    ha='center', va='center', transform=ax.transAxes)

    ax1.set_title('Top 10 Hashtags')
    ax2.axis('off')
    ax2.set_title('Hashtag Word Cloud')

    plt.tight_layout()
    plt.show()

    record_count = len(df)
    if record_count:
        average_per_record = len(all_hashtags) / record_count
        share_with_hashtags = round(float(df[column].notna().sum() / record_count * 100), 2)
    else:
        # Avoid dividing by zero for an empty dataset
        average_per_record = 0.0
        share_with_hashtags = 0.0

    return pd.Series({
        'Total Hashtags': len(all_hashtags),
        'Unique Hashtags': unique_hashtags,
        'Average Hashtags per Record': average_per_record,
        'Records with Hashtags (%)': share_with_hashtags
    })

# Run the hashtag analysis on the original dataset: the call shows its figure,
# then the returned summary statistics are printed.
print("Original Dataset Hashtag Analysis:")
original_hashtag_stats = analyze_hashtags(original_df)
print(original_hashtag_stats)

# Same analysis for the cleaned dataset, for comparison with the output above.
print("\nCleaned Dataset Hashtag Analysis:")
cleaned_hashtag_stats = analyze_hashtags(cleaned_df)
print(cleaned_hashtag_stats)

## 5. Development Status Changes

In [None]:
def analyze_development_status(original_df, cleaned_df):
    """Compare the share of each development status before and after cleaning.

    Shows one pie chart per dataset and returns a comparison table.

    Parameters
    ----------
    original_df, cleaned_df : pandas.DataFrame
        Datasets containing a 'development_status' column.

    Returns
    -------
    pandas.DataFrame
        Indexed by development status with the columns 'Original' and
        'Cleaned' (share of records in percent) and 'Change (%)' (Cleaned
        minus Original, i.e. a difference in percentage points). A status
        that occurs in only one dataset is reported with a 0 share in the
        other. All values are rounded to two decimals.
    """
    original_status = original_df['development_status']
    cleaned_status = cleaned_df['development_status']

    # Create comparison DataFrame. Building it from two Series aligns them on
    # the union of statuses; a status absent from one side comes out as NaN,
    # which really means "0% of that dataset", so fill it accordingly.
    comparison = pd.DataFrame({
        'Original': original_status.value_counts(normalize=True) * 100,
        'Cleaned': cleaned_status.value_counts(normalize=True) * 100
    }).fillna(0)

    # Calculate changes from the unrounded shares, then round once at the end
    # so the delta does not accumulate rounding error.
    comparison['Change (%)'] = comparison['Cleaned'] - comparison['Original']
    comparison = comparison.round(2)

    # Plotting
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Pie charts
    original_status.value_counts().plot(
        kind='pie', autopct='%1.1f%%', ax=ax1, title='Original Distribution')
    cleaned_status.value_counts().plot(
        kind='pie', autopct='%1.1f%%', ax=ax2, title='Cleaned Distribution')

    plt.tight_layout()
    plt.show()

    return comparison

# Show the pie charts and keep the returned comparison table; the bare name on
# the last line makes the notebook render it as the cell output.
development_status_comparison = analyze_development_status(original_df, cleaned_df)
development_status_comparison