In [None]:
import pandas as pd
from datetime import datetime, timedelta


In [None]:
# Function to check for missing weeks in time series data
def check_missing_weeks(csv_file_path, date_column='WkStrtActual'):
    """
    Check for missing weeks in a time series dataset.

    Parameters:
    csv_file_path (str): Path to the CSV file
    date_column (str): Name of the column containing week start dates

    Returns:
    dict: Summary of missing weeks analysis
    """
    # Load the data
    try:
        df = pd.read_csv(csv_file_path)
        print(f"Loaded {len(df)} rows from {csv_file_path}")
        print(f"Columns in dataset: {list(df.columns)}")
    except FileNotFoundError:
        return {"error": f"File {csv_file_path} not found"}

    # Check if the date column exists
    if date_column not in df.columns:
        available_cols = [col for col in df.columns if 'date' in col.lower() or 'week' in col.lower() or 'time' in col.lower()]
        return {"error": f"Column '{date_column}' not found. Available date-like columns: {available_cols}"}

    # Convert date column to datetime
    df[date_column] = pd.to_datetime(df[date_column])

    # Sort by date to ensure proper order
    df = df.sort_values(date_column)

    # Get unique week start dates
    unique_weeks = df[date_column].drop_duplicates().sort_values()

    # Get the date range
    min_date = unique_weeks.min()
    max_date = unique_weeks.max()

    print(f"Date range: {min_date.date()} to {max_date.date()}")
    print(f"Number of unique weeks in data: {len(unique_weeks)}")

    # Generate expected weekly sequence
    # Assuming weeks start on the same day of week as the first date
    expected_weeks = []
    current_week = min_date

    while current_week <= max_date:
        expected_weeks.append(current_week)
        current_week += timedelta(days=7)

    expected_weeks = pd.Series(expected_weeks)
    print(f"Expected number of weeks: {len(expected_weeks)}")

    # Find missing weeks
    missing_weeks = expected_weeks[~expected_weeks.isin(unique_weeks)]

    # Find extra weeks (shouldn't happen in a proper weekly sequence, but checking)
    extra_weeks = unique_weeks[~unique_weeks.isin(expected_weeks)]

    # Results summary
    results = {
        "total_rows": len(df),
        "unique_weeks_in_data": len(unique_weeks),
        "expected_weeks": len(expected_weeks),
        "missing_weeks_count": len(missing_weeks),
        "extra_weeks_count": len(extra_weeks),
        "date_range": (min_date.date(), max_date.date()),
        "missing_weeks": missing_weeks.dt.date.tolist() if len(missing_weeks) > 0 else [],
        "extra_weeks": extra_weeks.dt.date.tolist() if len(extra_weeks) > 0 else [],
        "data_sample": df.head()
    }

    return results


In [None]:
# Check your specific file with the correct path
file_path = "data/sandiego_epideimilogy/Time_Series_0427D0184F5A45B7973E2512848A1EE4.csv"

# Run the missing weeks check
missing_weeks_analysis = check_missing_weeks(file_path, 'WkStrtActual')

# Display results
print("\n" + "="*60)
print("MISSING WEEKS ANALYSIS")
print("="*60)

if "error" in missing_weeks_analysis:
    print(f"❌ Error: {missing_weeks_analysis['error']}")
else:
    print(f"📊 Total rows in dataset: {missing_weeks_analysis['total_rows']:,}")
    print(f"📅 Unique weeks in data: {missing_weeks_analysis['unique_weeks_in_data']:,}")
    print(f"🔢 Expected weeks: {missing_weeks_analysis['expected_weeks']:,}")
    print(f"❓ Missing weeks: {missing_weeks_analysis['missing_weeks_count']:,}")
    print(f"📈 Date range: {missing_weeks_analysis['date_range'][0]} to {missing_weeks_analysis['date_range'][1]}")

    # Show first few rows of data for context
    print(f"\n📋 Sample of data:")
    print(missing_weeks_analysis['data_sample'].to_string())

    if missing_weeks_analysis['missing_weeks_count'] > 0:
        print(f"\n⚠️  MISSING WEEKS DETECTED ({missing_weeks_analysis['missing_weeks_count']} total):")
        for i, week in enumerate(missing_weeks_analysis['missing_weeks']):
            if i < 20:  # Show first 20
                print(f"   - {week}")
            elif i == 20:
                print(f"   ... and {len(missing_weeks_analysis['missing_weeks']) - 20} more")
                break
    else:
        print("\n✅ NO MISSING WEEKS - Dataset is complete!")

    if missing_weeks_analysis['extra_weeks_count'] > 0:
        print(f"\n⚠️  UNEXPECTED WEEKS DETECTED: {missing_weeks_analysis['extra_weeks_count']}")
        for week in missing_weeks_analysis['extra_weeks']:
            print(f"   - {week}")


In [None]:
# Optional: Create a detailed week-by-week analysis
def detailed_week_analysis(csv_file_path, date_column='WkStrtActual'):
    """
    Create a detailed week-by-week analysis showing gaps
    """
    try:
        df = pd.read_csv(csv_file_path)
        df[date_column] = pd.to_datetime(df[date_column])
        df = df.sort_values(date_column)

        # Get unique weeks and sort
        unique_weeks = df[date_column].drop_duplicates().sort_values().reset_index(drop=True)

        # Calculate gaps between consecutive weeks
        gaps = []
        for i in range(1, len(unique_weeks)):
            current_week = unique_weeks.iloc[i]
            previous_week = unique_weeks.iloc[i-1]
            expected_next = previous_week + timedelta(days=7)

            if current_week != expected_next:
                gap_days = (current_week - expected_next).days
                gap_weeks = gap_days // 7
                gaps.append({
                    'after_week': previous_week.date(),
                    'before_week': current_week.date(),
                    'gap_days': gap_days,
                    'gap_weeks': gap_weeks,
                    'missing_weeks': gap_weeks
                })

        return pd.DataFrame(gaps)
    except Exception as e:
        print(f"Error in detailed analysis: {e}")
        return pd.DataFrame()

# Run detailed analysis
print("\n" + "="*60)
print("DETAILED GAP ANALYSIS")
print("="*60)

gaps_df = detailed_week_analysis(file_path, 'WkStrtActual')
if len(gaps_df) > 0:
    print("🔍 Gaps found in the time series:")
    print(gaps_df.to_string(index=False))
    print(f"\n📊 Summary: {len(gaps_df)} gap(s) found, totaling {gaps_df['missing_weeks'].sum()} missing weeks")
else:
    print("✅ No gaps found - weeks are consecutive!")


In [None]:
# Additional analysis: Show week distribution and patterns
def week_pattern_analysis(csv_file_path, date_column='WkStrtActual'):
    """
    Analyze weekly patterns in the data
    """
    try:
        df = pd.read_csv(csv_file_path)
        df[date_column] = pd.to_datetime(df[date_column])

        # Add week information
        df['year'] = df[date_column].dt.year
        df['week_of_year'] = df[date_column].dt.isocalendar().week
        df['day_of_week'] = df[date_column].dt.day_name()

        print("\n📈 WEEK PATTERN ANALYSIS")
        print("-" * 40)
        print(f"Years covered: {df['year'].min():} - {df['year'].max()}")

        # Show basic distributions
        print("\n📅 Week Distribution:")
        print(df['week_of_year'].value_counts().sort_index())

        print("\n☀️ Day of Week Distribution:")
        print(df['day_of_week'].value_counts())

        # Show weekly trend
        weekly_trend = df.groupby(['year', 'week_of_year']).size().reset_index(name='count')
        print("\nWeekly Trend:")
        print(weekly_trend.head())

        # You can add more trend analysis here, like plotting, aggregation, etc.

    except Exception as e:
        print(f"Error in week pattern analysis: {e}")

# Run week pattern analysis
week_pattern_analysis(file_path, 'WkStrtActual')


In [None]:
import pandas as pd


In [2]:
# Generate year, week, start date for weeks starting on Sunday (2020-2027)
def generate_weekly_data(start_year=2020, end_year=2027):
    """
    Generate year, week number, and start date for each week from start_year to end_year.
    Weeks start on Sunday and follow ISO week numbering adjusted for Sunday start.
    """
    results = []

    for year in range(start_year, end_year + 1):
        # Start from January 1st of the year
        current_date = pd.Timestamp(year, 1, 1)

        # Find the first Sunday of the year or use Jan 1 if it's already Sunday
        # Sunday = 6 in pandas (Monday = 0)
        days_until_sunday = (6 - current_date.weekday()) % 7
        first_sunday = current_date + pd.Timedelta(days=days_until_sunday)

        # If Jan 1 is not Sunday and first Sunday is more than 3 days away,
        # consider the week containing Jan 1 as week 1
        if current_date.weekday() != 6 and days_until_sunday > 3:
            # Start from the Sunday before Jan 1
            first_sunday = current_date - pd.Timedelta(days=(current_date.weekday() + 1) % 7)

        week_num = 1
        week_start = first_sunday

        # Generate weeks for the entire year
        while week_start.year <= year:
            if week_start.year == year or (week_start.year == year - 1 and week_start + pd.Timedelta(days=6) >= pd.Timestamp(year, 1, 1)):
                results.append({
                    'year': year,
                    'week': week_num,
                    'week_start_date': week_start.date()
                })

            week_start += pd.Timedelta(days=7)
            week_num += 1

            # Stop if we've moved to the next year and it's more than a few days in
            if week_start.year > year and week_start > pd.Timestamp(year + 1, 1, 7):
                break

    return pd.DataFrame(results)

# Generate the data
weekly_data = generate_weekly_data(2020, 2027)

# Display first few rows
print("Sample of weekly data:")
print(weekly_data.head(10))

# Show weeks per year to verify some years have 53 weeks
weeks_per_year = weekly_data.groupby('year')['week'].max()
print("\nWeeks per year:")
print(weeks_per_year)

# Show years with 53 weeks
years_with_53_weeks = weeks_per_year[weeks_per_year == 53].index.tolist()
print(f"\nYears with 53 weeks: {years_with_53_weeks}")

weekly_data

Sample of weekly data:
   year  week week_start_date
0  2020     1      2019-12-29
1  2020     2      2020-01-05
2  2020     3      2020-01-12
3  2020     4      2020-01-19
4  2020     5      2020-01-26
5  2020     6      2020-02-02
6  2020     7      2020-02-09
7  2020     8      2020-02-16
8  2020     9      2020-02-23
9  2020    10      2020-03-01

Weeks per year:
year
2020    53
2021    52
2022    52
2023    53
2024    53
2025    53
2026    52
2027    52
Name: week, dtype: int64

Years with 53 weeks: [2020, 2023, 2024, 2025]


Unnamed: 0,year,week,week_start_date
0,2020,1,2019-12-29
1,2020,2,2020-01-05
2,2020,3,2020-01-12
3,2020,4,2020-01-19
4,2020,5,2020-01-26
...,...,...,...
415,2027,48,2027-11-28
416,2027,49,2027-12-05
417,2027,50,2027-12-12
418,2027,51,2027-12-19
