In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's default style with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Input locations, all relative to the directory the notebook is run from
RAW_DATA_DIR = Path("..") / "data" / "raw"
WEATHER_DIR = RAW_DATA_DIR  # weather files are globbed directly from the raw directory
RETAIL_FILE = RAW_DATA_DIR.joinpath("retail_sales_dataset.csv")
HEADLINES_DIR = RAW_DATA_DIR.joinpath("web")

# Show the resolved location so a wrong working directory is easy to spot
print(f"Raw data directory: {RAW_DATA_DIR.absolute()}")

In [None]:
# Data quality assessment functions
def completeness_score(df):
    """Return the percentage of non-null cells in each column.

    The result is a Series indexed by column name, rounded to two decimals.
    """
    filled_per_column = df.notnull().sum()
    row_count = len(df)
    percent_filled = filled_per_column / row_count * 100
    return percent_filled.round(2)

def uniqueness_score(df, column):
    """Return the share of distinct non-null values in ``column`` as a percentage.

    Parameters:
        df: DataFrame to inspect.
        column: Name of the column to score.

    Returns:
        ``nunique / row count * 100`` rounded to two decimals, or 0 when the
        column is absent or the frame has no rows.
    """
    row_count = len(df)
    # A zero-row frame would otherwise divide by zero; there is nothing to
    # be unique, so it scores 0 just like a missing column.
    if column not in df.columns or row_count == 0:
        return 0
    return round(df[column].nunique() / row_count * 100, 2)

def detect_outliers_iqr(df, column):
    """Flag values of ``column`` that fall outside the 1.5 * IQR fences.

    Parameters:
        df: DataFrame to inspect.
        column: Name of the column to test.

    Returns:
        A boolean Series aligned with ``df`` that is True where the value is
        below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``. An empty boolean
        Series is returned when the column is absent or not numeric.
    """
    if column not in df.columns:
        return pd.Series([], dtype=bool)

    values = df[column]
    # Accept every numeric dtype (int32, float32, nullable Int64, ...), not
    # only int64/float64. Booleans count as numeric in pandas but have no
    # meaningful quartiles, so they are excluded.
    is_numeric = pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values)
    if not is_numeric:
        return pd.Series([], dtype=bool)

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    return (values < lower_bound) | (values > upper_bound)

def data_quality_report(df, name):
    """Build dataset-level and column-level data quality metrics.

    Parameters:
        df: DataFrame to profile.
        name: Label stored under the ``'dataset'`` key of the report.

    Returns:
        A ``(report, column_metrics)`` tuple. ``report`` is a dict with the
        keys ``dataset``, ``total_records``, ``total_columns``,
        ``completeness_overall``, ``columns_with_nulls`` and
        ``duplicate_records``. ``column_metrics`` is a DataFrame with one row
        per column of ``df`` and the columns ``column``, ``dtype``,
        ``completeness``, ``unique_values``, ``null_count``, ``outliers`` and
        ``outlier_percentage``. Those columns are present even when ``df``
        has no columns, so callers can index them unconditionally.

    Percentages are rounded to two decimals. For a frame with no rows (or no
    cells at all) every percentage is reported as 0.0 rather than NaN.
    """
    metric_columns = [
        'column', 'dtype', 'completeness', 'unique_values',
        'null_count', 'outliers', 'outlier_percentage',
    ]
    row_count = len(df)
    total_cells = row_count * len(df.columns)

    def _percent(part, whole):
        # Guard the zero denominator that an empty frame would produce.
        return round(float(part) / whole * 100, 2) if whole else 0.0

    # Counts are cast to plain ints so the report holds no numpy scalars.
    report = {
        'dataset': name,
        'total_records': row_count,
        'total_columns': len(df.columns),
        'completeness_overall': _percent(df.notnull().sum().sum(), total_cells),
        'columns_with_nulls': int((df.isnull().sum() > 0).sum()),
        'duplicate_records': int(df.duplicated().sum())
    }

    # Column-level metrics
    column_metrics = []
    for col in df.columns:
        series = df[col]
        col_info = {
            'column': col,
            'dtype': str(series.dtype),
            'completeness': _percent(series.notnull().sum(), row_count),
            'unique_values': int(series.nunique()),
            'null_count': int(series.isnull().sum())
        }

        # Outliers are only meaningful for numeric columns. Any numeric dtype
        # qualifies (not just int64/float64); booleans are excluded.
        is_numeric = pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series)
        if is_numeric:
            outlier_count = int(detect_outliers_iqr(df, col).sum())
            col_info['outliers'] = outlier_count
            col_info['outlier_percentage'] = _percent(outlier_count, row_count)
        else:
            col_info['outliers'] = 0
            col_info['outlier_percentage'] = 0

        column_metrics.append(col_info)

    return report, pd.DataFrame(column_metrics, columns=metric_columns)

## 1. Weather Data Quality Assessment

In [None]:
# Load weather data: every matching JSON file becomes one row of weather_df.
def _weather_record(data, source_file):
    """Flatten one parsed weather JSON document into a flat record dict.

    Missing keys yield None. Sections that are present but null or empty
    ('main', 'wind', 'weather') are treated like missing ones instead of
    raising, so a single incomplete file still contributes a row.
    """
    main = data.get('main') or {}
    wind = data.get('wind') or {}
    # 'weather' is read as a list whose first element carries the summary;
    # an empty list falls back to an empty entry.
    conditions = data.get('weather') or [{}]
    epoch_seconds = data.get('dt')
    return {
        'timestamp': epoch_seconds,
        # 'dt' is interpreted as epoch seconds and truncated to midnight
        'date': pd.to_datetime(epoch_seconds, unit='s').normalize() if epoch_seconds else None,
        'city': data.get('name'),
        'temperature': main.get('temp'),
        'humidity': main.get('humidity'),
        'pressure': main.get('pressure'),
        'weather_main': conditions[0].get('main'),
        'wind_speed': wind.get('speed'),
        'source_file': source_file
    }


# Sorted so the row order is reproducible regardless of file-system ordering.
weather_files = sorted(WEATHER_DIR.glob("*_weather_*.json"))
weather_records = []

for file_path in weather_files:
    try:
        # Explicit UTF-8 keeps decoding independent of the platform locale.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        weather_records.append(_weather_record(data, file_path.name))
    except Exception as e:
        # Best effort: report the unreadable file and keep loading the rest.
        print(f"Error processing {file_path.name}: {e}")

weather_df = pd.DataFrame(weather_records)
print(f"Weather data loaded: {len(weather_df)} records")

In [None]:
# Weather data quality report
# Guarded like the retail report: when no weather files were loaded there is
# nothing to profile, and the later cells already test for the presence of
# weather_report before using it.
if not weather_df.empty:
    weather_report, weather_column_metrics = data_quality_report(weather_df, 'Weather')

    print("Weather Data Quality Report:")
    for key, value in weather_report.items():
        print(f"{key}: {value}")

    print("\nColumn-level Quality Metrics:")
    display(weather_column_metrics)
else:
    print("No weather data to analyze")

In [None]:
# Weather data quality visualizations
# Guarded like the retail plots: an empty frame has no columns to index and
# nothing for the heatmap to draw, so plotting would raise.
if not weather_df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Weather Data Quality Analysis', fontsize=16)

    # Completeness by column
    weather_column_metrics.set_index('column')['completeness'].plot(kind='bar', ax=axes[0,0])
    axes[0,0].set_title('Data Completeness by Column')
    axes[0,0].set_ylabel('Completeness (%)')
    axes[0,0].tick_params(axis='x', rotation=45)

    # Missing values heatmap (one cell per record/column, marked where null)
    sns.heatmap(weather_df.isnull(), cbar=False, ax=axes[0,1])
    axes[0,1].set_title('Missing Values Pattern')
    axes[0,1].set_xlabel('Columns')
    axes[0,1].set_ylabel('Records')

    # Outlier analysis for numeric columns
    numeric_cols = weather_column_metrics[weather_column_metrics['dtype'].isin(['int64', 'float64'])]
    if not numeric_cols.empty:
        numeric_cols.set_index('column')['outlier_percentage'].plot(kind='bar', ax=axes[1,0])
        axes[1,0].set_title('Outlier Percentage by Numeric Column')
        axes[1,0].set_ylabel('Outlier Percentage (%)')
        axes[1,0].tick_params(axis='x', rotation=45)
    else:
        axes[1,0].text(0.5, 0.5, 'No numeric columns', transform=axes[1,0].transAxes, ha='center')

    # Duplicate analysis: a text-only panel, so the axes frame is hidden
    duplicate_info = f"Duplicate records: {weather_report['duplicate_records']}"
    axes[1,1].text(0.5, 0.5, duplicate_info, transform=axes[1,1].transAxes, ha='center', fontsize=12)
    axes[1,1].set_title('Duplicate Records')
    axes[1,1].set_xlim(0, 1)
    axes[1,1].set_ylim(0, 1)
    axes[1,1].axis('off')

    plt.tight_layout()
    plt.show()
else:
    print("No weather data to visualize")

In [None]:
# Weather data validation rules
print("Weather Data Validation Results:")

# (label, column, inclusive lower bound, inclusive upper bound)
weather_range_rules = [
    ("Temperature", "temperature", -50, 60),    # -50°C to 60°C
    ("Humidity", "humidity", 0, 100),           # 0-100%
    ("Pressure", "pressure", 800, 1200),        # 800-1200 hPa
]

for label, column, lower, upper in weather_range_rules:
    if column not in weather_df.columns:
        continue
    readings = weather_df[column]
    # Missing readings are reported on their own: they are a completeness
    # problem, not a range violation, and would otherwise inflate "invalid".
    missing = int(readings.isnull().sum())
    valid = int(((readings >= lower) & (readings <= upper)).sum())
    invalid = len(readings) - missing - valid
    print(f"{label} range validation: {valid} valid, {invalid} invalid, {missing} missing")

# Date validation
if 'date' in weather_df.columns:
    # The dates are built from epoch seconds, i.e. they are UTC. Compare them
    # with the current UTC time, not the machine's local clock, so a fresh
    # observation is not flagged as "future" in time zones behind UTC.
    dates_utc = pd.to_datetime(weather_df['date'], utc=True)
    now_utc = pd.Timestamp.now(tz='UTC')
    future_dates = int((dates_utc > now_utc).sum())
    old_dates = int((dates_utc < pd.Timestamp('2020-01-01', tz='UTC')).sum())
    print(f"Date validation: {future_dates} future dates, {old_dates} very old dates")

## 2. Retail Sales Data Quality Assessment

In [None]:
# Load retail data, falling back to an empty frame when the CSV cannot be read
retail_df = pd.DataFrame()
try:
    retail_df = pd.read_csv(RETAIL_FILE)
except Exception as e:
    print(f"Error loading retail data: {e}")
else:
    print(f"Retail data loaded: {len(retail_df)} records")

In [None]:
if retail_df.empty:
    print("No retail data to analyze")
else:
    # Profile the retail frame and print the dataset-level summary
    retail_report, retail_column_metrics = data_quality_report(retail_df, 'Retail')

    print("Retail Data Quality Report:")
    for metric_name, metric_value in retail_report.items():
        print(f"{metric_name}: {metric_value}")

    # Per-column breakdown, shown as a rich table
    print("\nColumn-level Quality Metrics:")
    display(retail_column_metrics)

In [None]:
if not retail_df.empty:
    # 2x2 dashboard: completeness, missing-value map, outlier share, duplicates
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Retail Data Quality Analysis', fontsize=16)
    (ax_completeness, ax_missing), (ax_outliers, ax_duplicates) = axes

    # Top-left: share of non-null values per column
    completeness_by_column = retail_column_metrics.set_index('column')['completeness']
    completeness_by_column.plot(kind='bar', ax=ax_completeness)
    ax_completeness.set_title('Data Completeness by Column')
    ax_completeness.set_ylabel('Completeness (%)')
    ax_completeness.tick_params(axis='x', rotation=45)

    # Top-right: null mask of the raw frame drawn as a heatmap
    null_mask = retail_df.isnull()
    sns.heatmap(null_mask, cbar=False, ax=ax_missing)
    ax_missing.set_title('Missing Values Pattern')
    ax_missing.set_xlabel('Columns')
    ax_missing.set_ylabel('Records')

    # Bottom-left: outlier share, restricted to int64/float64 columns
    has_numeric_dtype = retail_column_metrics['dtype'].isin(['int64', 'float64'])
    numeric_cols = retail_column_metrics[has_numeric_dtype]
    if numeric_cols.empty:
        ax_outliers.text(0.5, 0.5, 'No numeric columns', transform=ax_outliers.transAxes, ha='center')
    else:
        outlier_share = numeric_cols.set_index('column')['outlier_percentage']
        outlier_share.plot(kind='bar', ax=ax_outliers)
        ax_outliers.set_title('Outlier Percentage by Numeric Column')
        ax_outliers.set_ylabel('Outlier Percentage (%)')
        ax_outliers.tick_params(axis='x', rotation=45)

    # Bottom-right: duplicate count as centred text on a frameless panel
    duplicate_info = f"Duplicate records: {retail_report['duplicate_records']}"
    ax_duplicates.text(0.5, 0.5, duplicate_info, transform=ax_duplicates.transAxes, ha='center', fontsize=12)
    ax_duplicates.set_title('Duplicate Records')
    ax_duplicates.set_xlim(0, 1)
    ax_duplicates.set_ylim(0, 1)
    ax_duplicates.axis('off')

    plt.tight_layout()
    plt.show()

## 3. Headlines Data Quality Assessment

In [None]:
# Load headlines data: each JSON file holds either a list of headline
# objects or a single headline object.
def _headline_record(item, source_file):
    """Flatten one headline object into a flat record dict.

    Missing keys yield None. A 'source' entry that is null or not a mapping
    gives a None source name instead of raising.
    """
    source = item.get('source')
    return {
        'title': item.get('title'),
        'description': item.get('description'),
        'published_at': item.get('publishedAt'),
        'source': source.get('name') if isinstance(source, dict) else None,
        'url': item.get('url'),
        'source_file': source_file
    }


# Sorted so the row order is reproducible regardless of file-system ordering.
headlines_files = sorted(HEADLINES_DIR.glob("*.json"))
headlines_records = []

for file_path in headlines_files:
    try:
        # Explicit UTF-8 keeps decoding independent of the platform locale.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Treat a single object as a one-element list so both shapes share
        # the same code path.
        items = data if isinstance(data, list) else [data]
        # Build the file's records first and add them only if all of them
        # parsed, so a file reported as failed never contributes partial rows.
        file_records = [_headline_record(item, file_path.name) for item in items]
        headlines_records.extend(file_records)

    except Exception as e:
        # Best effort: report the unreadable file and keep loading the rest.
        print(f"Error processing {file_path.name}: {e}")

headlines_df = pd.DataFrame(headlines_records)
print(f"Headlines data loaded: {len(headlines_df)} records")

In [None]:
# Headlines data quality report
# Guarded like the retail report: when no headline files were loaded there is
# nothing to profile, and the later cells already test for the presence of
# headlines_report before using it.
if not headlines_df.empty:
    headlines_report, headlines_column_metrics = data_quality_report(headlines_df, 'Headlines')

    print("Headlines Data Quality Report:")
    for key, value in headlines_report.items():
        print(f"{key}: {value}")

    print("\nColumn-level Quality Metrics:")
    display(headlines_column_metrics)
else:
    print("No headlines data to analyze")

In [None]:
# Headlines data quality visualizations
# Guarded like the retail plots: an empty frame has no columns to index and
# nothing for the heatmap to draw, so plotting would raise.
if not headlines_df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Headlines Data Quality Analysis', fontsize=16)

    # Completeness by column
    headlines_column_metrics.set_index('column')['completeness'].plot(kind='bar', ax=axes[0,0])
    axes[0,0].set_title('Data Completeness by Column')
    axes[0,0].set_ylabel('Completeness (%)')
    axes[0,0].tick_params(axis='x', rotation=45)

    # Missing values heatmap (one cell per record/column, marked where null)
    sns.heatmap(headlines_df.isnull(), cbar=False, ax=axes[0,1])
    axes[0,1].set_title('Missing Values Pattern')
    axes[0,1].set_xlabel('Columns')
    axes[0,1].set_ylabel('Records')

    # Title length distribution (if titles exist); null titles are skipped
    if 'title' in headlines_df.columns:
        title_lengths = headlines_df['title'].dropna().str.len()
        title_lengths.hist(bins=20, ax=axes[1,0], alpha=0.7)
        axes[1,0].set_title('Title Length Distribution')
        axes[1,0].set_xlabel('Title Length (characters)')
        axes[1,0].set_ylabel('Frequency')
    else:
        axes[1,0].text(0.5, 0.5, 'No title column', transform=axes[1,0].transAxes, ha='center')

    # Duplicate analysis: a text-only panel, so the axes frame is hidden
    duplicate_info = f"Duplicate records: {headlines_report['duplicate_records']}"
    axes[1,1].text(0.5, 0.5, duplicate_info, transform=axes[1,1].transAxes, ha='center', fontsize=12)
    axes[1,1].set_title('Duplicate Records')
    axes[1,1].set_xlim(0, 1)
    axes[1,1].set_ylim(0, 1)
    axes[1,1].axis('off')

    plt.tight_layout()
    plt.show()
else:
    print("No headlines data to visualize")

In [None]:
# Headlines validation rules
print("Headlines Data Validation Results:")

# URL validation: a URL is valid when it starts with an http(s) scheme.
# na=False makes missing URLs count as not valid explicitly.
if 'url' in headlines_df.columns:
    valid_urls = int(headlines_df['url'].str.startswith(('http://', 'https://'), na=False).sum())
    invalid_urls = len(headlines_df) - valid_urls
    print(f"URL format validation: {valid_urls} valid, {invalid_urls} invalid")

# Title validation (non-empty): counts titles that are blank after stripping
if 'title' in headlines_df.columns:
    empty_titles = headlines_df['title'].str.strip().eq('').sum()
    print(f"Title validation: {empty_titles} empty titles")

# Date validation
if 'published_at' in headlines_df.columns:
    try:
        # Parse everything as UTC and compare against UTC references.
        # Timestamps that carry an offset (e.g. a trailing "Z") parse as
        # tz-aware values, and comparing those with a naive Timestamp raises
        # TypeError, which previously ended in "Could not validate dates".
        dates = pd.to_datetime(headlines_df['published_at'], errors='coerce', utc=True)
        now_utc = pd.Timestamp.now(tz='UTC')
        future_dates = (dates > now_utc).sum()
        old_dates = (dates < pd.Timestamp('2020-01-01', tz='UTC')).sum()
        # Unparseable and missing values both end up as NaT here
        invalid_dates = dates.isnull().sum()
        print(f"Date validation: {future_dates} future dates, {old_dates} very old dates, {invalid_dates} invalid formats")
    except (TypeError, ValueError) as e:
        # Only the expected parsing/comparison failures are caught, and the
        # reason is reported instead of being swallowed.
        print(f"Could not validate dates: {e}")

## 4. Cross-Dataset Quality Analysis

In [None]:
# Cross-dataset quality comparison
# A report is included only if its cell ran and the dataset has records.
# The zero-record filter used to apply to retail alone; an empty dataset has
# no meaningful completeness figure, so it is now applied to every dataset.
datasets = []
for report_name in ('weather_report', 'retail_report', 'headlines_report'):
    report = globals().get(report_name)
    if report is not None and report['total_records'] > 0:
        datasets.append(report)

if datasets:
    comparison_df = pd.DataFrame(datasets)
    print("Cross-Dataset Quality Comparison:")
    display(comparison_df)

    # Quality scores visualization: one bar per dataset
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    comparison_df.set_index('dataset')['completeness_overall'].plot(kind='bar', ax=ax)
    ax.set_title('Overall Data Completeness by Dataset')
    ax.set_ylabel('Completeness (%)')
    ax.set_xlabel('Dataset')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("No datasets available for comparison")

## 5. Data Quality Recommendations

In [None]:
print("DATA QUALITY RECOMMENDATIONS")
print("=" * 50)


def _print_dataset_recommendations(heading, report, column_metrics, check_outliers=True):
    """Print the follow-up actions suggested by one dataset's quality report.

    Parameters:
        heading: Section title printed above the recommendations.
        report: Dataset-level dict returned by data_quality_report.
        column_metrics: Column-level DataFrame returned by data_quality_report.
        check_outliers: When False the outlier recommendation is skipped.
    """
    print(f"\n{heading}:")
    if report['columns_with_nulls'] > 0:
        print(f"- Handle {report['columns_with_nulls']} columns with missing values")
        null_cols = column_metrics[column_metrics['null_count'] > 0]['column'].tolist()
        # str() keeps the join working for non-string column labels
        print(f"  Columns: {', '.join(map(str, null_cols))}")

    if check_outliers:
        outliers = column_metrics[column_metrics['outliers'] > 0]
        if not outliers.empty:
            print(f"- Review {len(outliers)} columns with potential outliers")
            print(f"  Columns: {', '.join(map(str, outliers['column'].tolist()))}")

    if report['duplicate_records'] > 0:
        print(f"- Remove {report['duplicate_records']} duplicate records")


# Each section is printed only when the corresponding report cell produced a
# report earlier in the notebook.

# Weather data recommendations
if 'weather_report' in locals():
    _print_dataset_recommendations("WEATHER DATA", weather_report, weather_column_metrics)

# Retail data recommendations
if 'retail_report' in locals() and retail_report['total_records'] > 0:
    _print_dataset_recommendations("RETAIL DATA", retail_report, retail_column_metrics)

# Headlines data recommendations (no outlier check for this dataset)
if 'headlines_report' in locals():
    _print_dataset_recommendations("HEADLINES DATA", headlines_report, headlines_column_metrics,
                                   check_outliers=False)

print("\nGENERAL RECOMMENDATIONS:")
print("- Standardize date formats across all datasets")
print("- Implement data validation rules in ETL pipeline")
print("- Set up automated data quality monitoring")
print("- Document data quality expectations and thresholds")
print("- Consider data profiling tools for ongoing monitoring")