In [None]:
# Import required libraries
import pandas as pd
import json
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Plot appearance: start from matplotlib's stock style, then switch the color
# cycle to seaborn's "husl" palette. The order is kept as-is - presumably so
# the style reset does not undo the palette (TODO confirm).
plt.style.use("default")
sns.set_palette(palette="husl")

# Input locations. All of them are relative paths, so they resolve against
# the working directory the notebook is run from.
RAW_DATA_DIR = Path("../data/raw")
WEATHER_DIR = RAW_DATA_DIR  # weather JSON files are globbed directly from the raw folder
RETAIL_FILE = RAW_DATA_DIR.joinpath("retail_sales_dataset.csv")
HEADLINES_DIR = RAW_DATA_DIR.joinpath("web")

# Echo every location once so a wrong working directory is easy to spot.
for _label, _location in (
    ("Raw data directory", RAW_DATA_DIR.absolute()),
    ("Weather files location", WEATHER_DIR),
    ("Retail data", RETAIL_FILE),
    ("Headlines data", HEADLINES_DIR),
):
    print(f"{_label}: {_location}")

## 1. Weather Data Exploration

In [None]:
# List weather JSON files.
# Path.glob() yields matches in no guaranteed order, so the result is sorted:
# the listing and the choice of "first" sample file are then the same on
# every run and on every machine.
weather_files = sorted(WEATHER_DIR.glob("*_weather_*.json"))
print(f"Found {len(weather_files)} weather data files:")
for file in weather_files:
    print(f"  - {file.name}")

# Load and examine first weather file (skipped when nothing matched)
if weather_files:
    sample_weather_file = weather_files[0]
    print(f"\nExamining sample file: {sample_weather_file.name}")

    # Explicit UTF-8 so decoding does not depend on the platform's locale
    with open(sample_weather_file, 'r', encoding='utf-8') as f:
        sample_weather = json.load(f)

    print("Sample weather data structure:")
    print(json.dumps(sample_weather, indent=2))

In [None]:
# Load all weather data into a DataFrame

def _as_dict(value):
    """Return ``value`` when it is a dict, otherwise an empty dict.

    Lets nested lookups treat a missing, ``null`` or wrongly typed section
    the same way: every field read from it simply becomes None.
    """
    return value if isinstance(value, dict) else {}


def _weather_record(data, source_name):
    """Flatten one decoded weather JSON object into a single-level record.

    Args:
        data: Decoded JSON object (dict) for one weather observation.
        source_name: Name of the file the observation was read from.

    Returns:
        dict with one entry per DataFrame column; fields that are absent
        from ``data`` are None.

    Raises:
        AttributeError: If ``data`` is not a dict (no ``.get``).
    """
    main = _as_dict(data.get('main'))
    wind = _as_dict(data.get('wind'))

    # 'weather' is read as a list of condition dicts and only the first one
    # is used. An empty or malformed list yields None fields instead of an
    # IndexError that would discard the whole file.
    conditions = data.get('weather')
    if isinstance(conditions, list) and conditions:
        first_condition = _as_dict(conditions[0])
    else:
        first_condition = {}

    # 'dt' is interpreted as seconds since the Unix epoch (unit='s').
    # Compare against None so that a legitimate value of 0 is not dropped.
    observed_at = data.get('dt')
    if observed_at is not None:
        observed_date = pd.to_datetime(observed_at, unit='s').normalize()
    else:
        observed_date = None

    return {
        'timestamp': observed_at,
        'date': observed_date,
        'city': data.get('name'),
        'country': _as_dict(data.get('sys')).get('country'),
        'temperature': main.get('temp'),
        'feels_like': main.get('feels_like'),
        'humidity': main.get('humidity'),
        'pressure': main.get('pressure'),
        'weather_main': first_condition.get('main'),
        'weather_description': first_condition.get('description'),
        'wind_speed': wind.get('speed'),
        'wind_direction': wind.get('deg'),
        'clouds': _as_dict(data.get('clouds')).get('all'),
        'source_file': source_name,
    }


weather_records = []

for file_path in weather_files:
    try:
        # Explicit UTF-8 so decoding does not depend on the platform's locale
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        weather_records.append(_weather_record(data, file_path.name))
    except Exception as e:
        # Best-effort load: report the unreadable file and keep going
        print(f"Error processing {file_path.name}: {e}")

# Create DataFrame
weather_df = pd.DataFrame(weather_records)
print(f"\nWeather data shape: {weather_df.shape}")
print("\nWeather data info:")
weather_df.info()

In [None]:
# Weather data statistics
# When no weather file could be loaded, weather_df has neither rows nor
# columns: describe() raises on a column-less frame and the 'city' /
# 'weather_main' lookups raise KeyError. Guard the same way the headlines
# cells do.
if weather_df.empty:
    print("No weather records loaded - skipping weather summary")
else:
    print("Weather Data Summary:")
    display(weather_df.describe())

    print("\nMissing values:")
    display(weather_df.isnull().sum())

    print("\nUnique cities:")
    display(weather_df['city'].value_counts())

    print("\nWeather conditions:")
    display(weather_df['weather_main'].value_counts())

In [None]:
# Weather data visualizations: a 2x2 grid with temperature and humidity
# histograms on the top row and per-city / per-condition counts below.
# Skipped when nothing was loaded, because the frame then has no columns
# to plot.
if weather_df.empty:
    print("No weather records loaded - skipping weather plots")
else:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Weather Data Exploration', fontsize=16)

    # Temperature distribution
    axes[0,0].hist(weather_df['temperature'].dropna(), bins=20, alpha=0.7)
    axes[0,0].set_title('Temperature Distribution')
    # Degree sign written as an escape so the label cannot be mangled by a
    # mis-decoded source file (it previously rendered as "Â°C").
    # NOTE(review): assumes temperatures are stored in Celsius - confirm
    # against the units used when the data was collected.
    axes[0,0].set_xlabel('Temperature (\u00b0C)')
    axes[0,0].set_ylabel('Frequency')

    # Humidity distribution
    axes[0,1].hist(weather_df['humidity'].dropna(), bins=20, alpha=0.7)
    axes[0,1].set_title('Humidity Distribution')
    axes[0,1].set_xlabel('Humidity (%)')
    axes[0,1].set_ylabel('Frequency')

    # Cities
    weather_df['city'].value_counts().plot(kind='bar', ax=axes[1,0])
    axes[1,0].set_title('Weather Data by City')
    axes[1,0].set_xlabel('City')
    axes[1,0].set_ylabel('Count')
    axes[1,0].tick_params(axis='x', rotation=45)

    # Weather conditions
    weather_df['weather_main'].value_counts().plot(kind='bar', ax=axes[1,1])
    axes[1,1].set_title('Weather Conditions')
    axes[1,1].set_xlabel('Condition')
    axes[1,1].set_ylabel('Count')
    axes[1,1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

## 2. Retail Sales Data Exploration

In [None]:
# Load retail sales data.
# Only the read itself sits inside the try block: a failure while printing
# or displaying the preview must not be reported as a load error, and must
# not throw away a frame that was read successfully.
# On any load failure retail_df is set to None; later cells check for that.
try:
    retail_df = pd.read_csv(RETAIL_FILE)
except FileNotFoundError:
    print(f"Retail data file not found: {RETAIL_FILE}")
    retail_df = None
except Exception as e:
    # Best-effort: any other read/parse problem is reported, not raised
    print(f"Error loading retail data: {e}")
    retail_df = None
else:
    print(f"Retail data shape: {retail_df.shape}")
    print("\nRetail data columns:")
    print(retail_df.columns.tolist())

    print("\nFirst 5 rows:")
    display(retail_df.head())

In [None]:
if retail_df is not None:
    # Descriptive statistics, dtypes and null counts for the retail frame
    print("Retail Data Summary:")
    display(retail_df.describe())

    print("\nData types:")
    display(retail_df.dtypes)

    print("\nMissing values:")
    display(retail_df.isnull().sum())

    # Check for date columns: any column whose name contains "date"
    # (case-insensitive) is converted to datetime in place.
    # NOTE(review): the name match is a substring test, so it also picks up
    # names such as "update_count" - confirm that is acceptable.
    date_columns = [col for col in retail_df.columns if 'date' in col.lower()]
    if date_columns:
        print(f"\nPotential date columns: {date_columns}")
        for col in date_columns:
            try:
                retail_df[col] = pd.to_datetime(retail_df[col])
                print(f"Converted {col} to datetime")
            except Exception as exc:
                # Was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit and hid the reason.
                # Still best-effort: report the failure and move on.
                print(f"Could not convert {col} to datetime: {exc}")

In [None]:
if retail_df is not None:
    # Histograms for (at most) the first three numeric columns
    numeric_cols = retail_df.select_dtypes(include=['number']).columns
    plotted_cols = numeric_cols[:3]

    if len(plotted_cols) > 0:
        # squeeze=False always returns a 2-D array of axes, so a single
        # numeric column needs no special handling
        fig, axes = plt.subplots(1, len(plotted_cols), figsize=(15, 5), squeeze=False)
        fig.suptitle('Retail Data Numeric Distributions', fontsize=16)

        for ax, col in zip(axes.ravel(), plotted_cols):
            retail_df[col].hist(bins=20, ax=ax, alpha=0.7)
            ax.set_title(f'{col} Distribution')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')

        plt.tight_layout()
        plt.show()

    # Ten most frequent values for (at most) the first three object columns
    categorical_cols = retail_df.select_dtypes(include=['object']).columns
    if not categorical_cols.empty:
        print("\nCategorical columns value counts:")
        for col in categorical_cols[:3]:
            print(f"\n{col}:")
            top_values = retail_df[col].value_counts().head(10)
            display(top_values)

## 3. News Headlines Data Exploration

In [None]:
# List headlines JSON files.
# Sorted because Path.glob() yields matches in no guaranteed order; sorting
# keeps the listing and the row order of the resulting frame reproducible.
headlines_files = sorted(HEADLINES_DIR.glob("*.json"))
print(f"Found {len(headlines_files)} headlines data files:")
for file in headlines_files:
    print(f"  - {file.name}")


def _headline_record(item, source_name):
    """Flatten one decoded headline object into a single-level record.

    Args:
        item: Decoded JSON object (dict) describing one headline.
        source_name: Name of the file the headline was read from.

    Returns:
        dict with the keys title, description, url, published_at, source,
        author and source_file; fields absent from ``item`` are None.
    """
    # 'source' is read as an object with a 'name' key. A plain string is
    # kept as the source name; anything else (including null) becomes None
    # instead of raising and aborting the rest of the file.
    source = item.get('source')
    if isinstance(source, dict):
        source_label = source.get('name')
    elif isinstance(source, str):
        source_label = source
    else:
        source_label = None

    return {
        'title': item.get('title'),
        'description': item.get('description'),
        'url': item.get('url'),
        'published_at': item.get('publishedAt'),
        'source': source_label,
        'author': item.get('author'),
        'source_file': source_name,
    }


# Load and examine headlines data
headlines_records = []

for file_path in headlines_files:
    try:
        # Explicit UTF-8 so decoding does not depend on the platform's locale
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        # Best-effort load: report the unreadable file and keep going
        print(f"Error processing {file_path.name}: {e}")
        continue

    # A file holds either a list of headline objects or one single object
    # (adjust based on actual structure)
    items = data if isinstance(data, list) else [data]

    skipped = 0
    for item in items:
        if isinstance(item, dict):
            headlines_records.append(_headline_record(item, file_path.name))
        else:
            # One malformed entry must not discard the rest of the file
            skipped += 1
    if skipped:
        print(f"Skipped {skipped} non-object entries in {file_path.name}")

# Create DataFrame
headlines_df = pd.DataFrame(headlines_records)
print(f"\nHeadlines data shape: {headlines_df.shape}")
print("\nHeadlines data info:")
headlines_df.info()

In [None]:
if not headlines_df.empty:
    # Descriptive statistics, null counts and the ten most frequent sources
    print("Headlines Data Summary:")
    display(headlines_df.describe())

    print("\nMissing values:")
    display(headlines_df.isnull().sum())

    print("\nTop sources:")
    display(headlines_df['source'].value_counts().head(10))

    # Convert published_at to datetime if possible (done in place; the
    # plotting cell checks the resulting dtype)
    if 'published_at' in headlines_df.columns:
        try:
            headlines_df['published_at'] = pd.to_datetime(headlines_df['published_at'])
            print("\nConverted published_at to datetime")
            print(f"Date range: {headlines_df['published_at'].min()} to {headlines_df['published_at'].max()}")
        except Exception as exc:
            # Was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid the reason.
            # Still best-effort: the column is left unconverted.
            print(f"\nCould not convert published_at to datetime: {exc}")

In [None]:
if not headlines_df.empty:
    # Headlines data visualizations: top sources (left) and headlines per
    # day (right). Each panel falls back to a centred placeholder message
    # when it has nothing to draw.
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    fig.suptitle('Headlines Data Exploration', fontsize=16)

    # Sources distribution
    source_counts = headlines_df['source'].value_counts().head(10)
    if not source_counts.empty:
        source_counts.plot(kind='bar', ax=axes[0])
        axes[0].set_xlabel('Source')
        axes[0].set_ylabel('Count')
        axes[0].tick_params(axis='x', rotation=45)
    else:
        # value_counts() drops missing values, so the result is empty when
        # every source is missing; an empty Series has no bars to draw
        axes[0].text(0.5, 0.5, 'No source data available',
                     transform=axes[0].transAxes, ha='center', va='center')
    axes[0].set_title('Top News Sources')

    # Publication timeline (if datetime conversion worked)
    has_datetimes = (
        'published_at' in headlines_df.columns
        and pd.api.types.is_datetime64_any_dtype(headlines_df['published_at'])
    )
    if has_datetimes:
        daily_counts = headlines_df['published_at'].dt.date.value_counts().sort_index()
    else:
        daily_counts = None

    # daily_counts is also empty when the column converted but holds only
    # missing timestamps
    if daily_counts is not None and not daily_counts.empty:
        daily_counts.plot(ax=axes[1])
        axes[1].set_title('Headlines Over Time')
        axes[1].set_xlabel('Date')
        axes[1].set_ylabel('Number of Headlines')
        axes[1].tick_params(axis='x', rotation=45)
    else:
        axes[1].text(0.5, 0.5, 'No datetime data available',
                     transform=axes[1].transAxes, ha='center', va='center')
        axes[1].set_title('Publication Timeline')

    plt.tight_layout()
    plt.show()

## 4. Data Quality Summary

In [None]:
# Summary statistics across all datasets.
# retail_df is None when the CSV could not be loaded, so it contributes 0
# records and 0 columns in that case.
summary_data = {
    'Dataset': ['Weather', 'Retail', 'Headlines'],
    'Records': [len(weather_df), len(retail_df) if retail_df is not None else 0, len(headlines_df)],
    'Columns': [len(weather_df.columns), len(retail_df.columns) if retail_df is not None else 0, len(headlines_df.columns)]
}

summary_df = pd.DataFrame(summary_data)
print("Dataset Summary:")
display(summary_df)

# Data quality issues identified
print("\nData Quality Notes:")
print("1. Weather data:")
print(f"   - Missing values: {weather_df.isnull().sum().sum()} total")
# A frame built from zero records has no 'city' column; report 0 instead of
# raising KeyError (mirrors the guard on the headlines 'source' line below)
print(f"   - Cities covered: {weather_df['city'].nunique() if 'city' in weather_df.columns else 0}")

print("2. Retail data:")
if retail_df is not None:
    print(f"   - Missing values: {retail_df.isnull().sum().sum()} total")
    print(f"   - Numeric columns: {len(retail_df.select_dtypes(include=['number']).columns)}")
else:
    # Keep the numbered list complete even when the CSV was not loaded
    print("   - Not loaded")

print("3. Headlines data:")
print(f"   - Missing values: {headlines_df.isnull().sum().sum()} total")
print(f"   - Sources: {headlines_df['source'].nunique() if not headlines_df.empty else 0}")

print("\nNext steps:")
print("- Review missing value handling strategies")
print("- Standardize date formats across datasets")
print("- Validate data ranges and business rules")
print("- Consider data enrichment opportunities")