In [None]:
import calendar
import csv
import datetime
import itertools
import os
import random
from datetime import datetime, timedelta
from itertools import cycle, islice

import numpy as np
import pandas as pd

# Seed every RNG the notebook draws from so that Restart & Run All is reproducible.
# The stdlib generator drives the outlier-date helpers; NumPy's global generator is
# what pandas' `DataFrame.sample` falls back to when no `random_state` is passed.
random.seed(42)
np.random.seed(42)

In [None]:
# Load the clean source data and parse `order_date` (stored as month-day-2-digit-year text).
raw = pd.read_csv("data/pizza_sales.csv").assign(
    order_date=lambda frame: pd.to_datetime(frame["order_date"], format="%m-%d-%y")
)
raw.head()

#### Generate random rows with outlying dates.


In [None]:
def generate_random_date_in_range(start_year, end_year):
    """Return a random midnight ``datetime`` between January 1 of ``start_year``
    and December 31 of ``end_year``, both ends inclusive.

    The day offset is drawn with the stdlib ``random`` module, so the result is
    controlled by ``random.seed``.
    """
    first_day = datetime(start_year, 1, 1)
    last_day = datetime(end_year, 12, 31)
    span_days = (last_day - first_day).days
    # randint is inclusive on both ends, so the last day can be picked too
    offset = timedelta(days=random.randint(0, span_days))
    return first_day + offset


def generate_random_past_or_present_row(df):
    """Clone one randomly chosen row of ``df`` and give it an out-of-range order date.

    The replacement date is drawn from either 1700-1800 or 2100-2200 (a coin flip
    decides which), and ``order_details_id`` is set to one more than the current
    maximum found in ``df``.

    Note: ``df`` itself is never modified, so repeated calls against the same
    frame all return the same ``order_details_id``.
    """
    # Derive the pandas sampling seed from the stdlib RNG so that a single
    # random.seed() call controls the whole generation.
    row_seed = random.randint(1, 10000)
    outlier_row = df.sample(1, random_state=row_seed).iloc[0].copy()

    # Coin flip: distant past vs. distant future
    year_bounds = (1700, 1800) if random.choice([True, False]) else (2100, 2200)

    # The new date is stored as a datetime object, not a string
    outlier_row["order_date"] = generate_random_date_in_range(*year_bounds)

    # One past the largest id currently present in df
    outlier_row["order_details_id"] = df["order_details_id"].max() + 1

    return outlier_row


# Work on a copy so the `raw` frame loaded earlier stays untouched.
df = raw.copy()

# Generate 60 outlier rows. `order_date` was already parsed to datetime when the
# file was loaded, so no further conversion is needed before sampling.
new_rows = [generate_random_past_or_present_row(df) for _ in range(60)]

# Each generated row is a Series; stack them into a DataFrame.
df_new_rows = pd.DataFrame(new_rows)

# Every row above was built against the same unchanged `df`, so they all carry
# the identical id `max + 1`. Hand out consecutive ids instead so the new rows
# collide neither with existing rows nor with each other.
first_new_id = df["order_details_id"].max() + 1
df_new_rows["order_details_id"] = first_new_id + np.arange(len(df_new_rows))

# Append the outlier rows to the original data
output = pd.concat([df, df_new_rows], ignore_index=True)

output.sort_values(by='order_date')

#### Add random duplicate rows.

In [None]:
def introduce_duplicate_rows(df, n=1000, random_seed=42):
    """Return a new DataFrame made of ``df`` followed by ``n`` duplicated rows.

    The duplicates are drawn from ``df`` with replacement, so the same source
    row may be repeated more than once. ``df`` itself is not modified.

    Args:
        df: Source DataFrame.
        n: Number of duplicate rows to append.
        random_seed: Seed handed to ``DataFrame.sample`` so the selection is
            reproducible across runs. Pass ``None`` to fall back to pandas'
            default (unseeded) sampling.

    Returns:
        A DataFrame with ``len(df) + n`` rows and a fresh 0..N-1 index.
    """
    duplicates = df.sample(n=n, replace=True, random_state=random_seed).reset_index(drop=True)
    return pd.concat([df, duplicates], ignore_index=True)

# Append 1,000 randomly re-sampled copies of existing rows.
output = introduce_duplicate_rows(output, n=1000)

#### Change random values in random rows to null.

In [None]:
def modify_random_rows(df, random_seed=42):
    """Set random cells to NaN in a random subset of rows.

    Between 1 and ``min(len(df), 1000)`` rows are picked without replacement.
    In each picked row, 1 to 5 randomly chosen columns (never ``datetime64[ns]``
    columns) are overwritten with NaN.

    Args:
        df: DataFrame to corrupt. It is modified **in place**.
        random_seed: Seed passed to ``np.random.seed``. Note that this reseeds
            NumPy's global generator.

    Returns:
        The same ``df`` object that was passed in.
    """
    np.random.seed(random_seed)  # Set the random seed for reproducibility

    # Exclude columns of datetime64[ns] type from modification
    columns_to_consider = df.select_dtypes(exclude=['datetime64[ns]']).columns

    # Nothing can be nulled without rows or eligible columns; returning early
    # also avoids randint/choice raising on an empty range.
    if len(df) == 0 or len(columns_to_consider) == 0:
        return df

    # Number of rows to modify: 1..min(len(df), 1000). randint's upper bound is
    # exclusive, hence the +1 (without it a 1-row frame gives an empty range).
    max_rows = min(len(df), 1000)
    rows_to_modify_count = np.random.randint(1, max_rows + 1)

    # Select random row labels
    random_indices = np.random.choice(df.index, size=rows_to_modify_count, replace=False)

    for index in random_indices:
        # Randomly determine the number of values to change to null, up to 5
        values_to_null_count = np.random.randint(1, 6)

        # Never ask for more columns than are eligible
        columns_to_modify = np.random.choice(
            columns_to_consider,
            size=min(values_to_null_count, len(columns_to_consider)),
            replace=False,
        )
        for column in columns_to_modify:
            df.at[index, column] = np.nan

    return df

# Null out random cells, then display every row that now contains a missing value.
output = modify_random_rows(output)
output[output.isna().any(axis=1)]

#### Swap values between columns.

In [None]:
def swap_values_between_columns(df, col1, col2, random_seed=42, n=500):
    """Swap the values of two columns in ``n`` randomly chosen rows.

    Args:
        df: DataFrame to corrupt. It is modified **in place**.
        col1: Name of the first column.
        col2: Name of the second column.
        random_seed: Seed passed to ``np.random.seed``. Note that this reseeds
            NumPy's global generator.
        n: Number of rows to swap. Clamped to ``len(df)`` so that small frames
            do not raise.

    Returns:
        The same ``df`` object that was passed in. If the two columns had
        different dtypes, both are ``object`` dtype afterwards.
    """
    np.random.seed(random_seed)

    sample_size = min(n, len(df))
    if sample_size == 0:
        return df
    indices = np.random.choice(df.index, size=sample_size, replace=False)

    # Columns of different dtypes (e.g. numeric ids vs. text names) cannot hold
    # each other's values. Cast both to object explicitly rather than relying on
    # pandas' implicit upcasting on assignment, which is deprecated.
    if df[col1].dtype != df[col2].dtype:
        df[col1] = df[col1].astype(object)
        df[col2] = df[col2].astype(object)

    # Swap all selected rows at once; the copy keeps col1's old values safe
    # while col1 is being overwritten.
    col1_values = df.loc[indices, col1].to_numpy(copy=True)
    df.loc[indices, col1] = df.loc[indices, col2].to_numpy()
    df.loc[indices, col2] = col1_values
    return df

# Corrupt 1,500 random rows by exchanging their 'order_id' and 'pizza_name' values.
output = swap_values_between_columns(output, 'order_id', 'pizza_name', n=1500)

In [None]:
# Write the deliberately corrupted dataset to disk, without the index column.
output.to_csv("data/pizza_sales_with_data_quality_issues.csv", index=False)