# Dirty Function

This function intentionally inserts missing values into a clean dataset to produce datasets for data-cleaning practice. Use it only when a dirty dataset is actually needed.

In [None]:
import pandas as pd
import numpy as np
import random 

# Load the clean source dataset from 'Dataset.csv' in the working directory.
original_file = pd.read_csv('Dataset.csv')
# Work on a copy: the missing-value function writes NaN into the DataFrame
# it is given, so original_file keeps the data exactly as loaded.
dirty_file = original_file.copy()

## User Input Section

1) Set the number of cells to convert into missing values
2) List the columns to skip when inserting missing values

In [None]:
# Number of cells to turn into missing values. The function caps this at the
# number of cells available in the non-ignored columns.
n_missing = 100
# Columns that must never receive missing values.
ignore_cols = ['vehicle_id', 'timestamp']

## Dataset Dirty Function

In [None]:
def introduce_missing_values(dirty_file, n_missing, ignore_cols):
    """Replace randomly chosen cells of ``dirty_file`` with NaN.

    Cells are drawn without replacement from every column whose label is not
    in ``ignore_cols``, so each selected cell is distinct.

    Args:
        dirty_file: DataFrame to modify. It is changed IN PLACE (and also
            returned), so pass a copy if the original must be preserved.
        n_missing: Number of cells to blank out. Values larger than the
            number of candidate cells are capped (with a printed warning);
            values below zero are treated as zero.
        ignore_cols: Collection of column labels that must not be modified.

    Returns:
        A ``(dirty_file, log_dirty_file)`` tuple. ``log_dirty_file`` has one
        row per modified cell with the columns ``modified_row_index`` (the
        row's index label) and ``modified_column_name``. Both columns are
        present even when nothing was modified.

    NOTE(review): writing NaN into an integer-typed column forces a dtype
    change; depending on the installed pandas version this may warn or
    raise -- confirm against the version in use.
    """
    # Track candidate columns by position so that cells are addressed purely
    # positionally below; this keeps the function correct for DataFrames
    # whose row index is not the default 0..n-1 range.
    candidate_positions = [
        pos for pos, col in enumerate(dirty_file.columns)
        if col not in ignore_cols
    ]
    n_rows = len(dirty_file)
    n_cols = len(candidate_positions)

    print(f"Starting to introduce {n_missing} missing values...")

    # Validate n_missing against available cells
    total_available_cells = n_rows * n_cols
    if n_missing > total_available_cells:
        print(f"Warning: You asked for {n_missing} missing values,")
        print(f"but there are only {total_available_cells} available cells to modify.")
        print(f"Setting n_missing to {total_available_cells}.")
        n_missing = total_available_cells
    n_missing = int(max(n_missing, 0))

    # Sample distinct flattened cell ids in one pass. Unlike retrying random
    # picks until an unused cell turns up, this does not slow down as
    # n_missing approaches the total number of cells.
    chosen_cells = random.sample(range(total_available_cells), n_missing)

    modification_log = []
    for flat_id in chosen_cells:
        # Decode the flattened id into (row position, candidate column slot).
        row_pos, col_slot = divmod(flat_id, n_cols)
        col_pos = candidate_positions[col_slot]
        dirty_file.iloc[row_pos, col_pos] = np.nan
        # Log the index *label* so the log can be joined back onto the
        # DataFrame by index.
        modification_log.append({
            'modified_row_index': dirty_file.index[row_pos],
            'modified_column_name': dirty_file.columns[col_pos],
        })

    count_added = len(modification_log)
    print(f"Successfully added {count_added} missing values.")

    # Fix the column set explicitly so an empty log still has both columns.
    log_dirty_file = pd.DataFrame(
        modification_log,
        columns=['modified_row_index', 'modified_column_name'],
    )
    return dirty_file, log_dirty_file

## Function Execution Section

In [None]:
# Run the function on the working copy. It returns the modified DataFrame
# together with a log DataFrame describing each cell that was set to NaN.
dirty_file, log_dirty_file = introduce_missing_values(dirty_file, n_missing, ignore_cols)
print("\n--- Info for the new 'dirty' DataFrame ---")
# info() prints its summary itself, so it is not wrapped in print().
dirty_file.info()
print("\n\n--- Head of the Modification Log ---")
print(log_dirty_file.head())

## Export Final Dirty Dataset

In [None]:
# Attach the modification log to the dirty dataset.
# The log holds one row per modified CELL, so a dataset row with several
# blanked cells appears several times in it. Joining the raw log on the row
# index would therefore duplicate those dataset rows. Collapse the log to a
# single entry per row first (modified column names joined with ", ") so the
# exported file keeps exactly one line per original row.
if log_dirty_file.empty:
    # Nothing was modified: keep the output schema, with an empty log column.
    final_dirty_file = dirty_file.assign(modified_column_name=np.nan)
else:
    final_dirty_file = dirty_file.join(
        log_dirty_file
        .groupby('modified_row_index')['modified_column_name']
        .agg(lambda names: ', '.join(map(str, names))),
        how='left'
    )

# Render timestamps as DD/MM/YYYY HH:MM:SS (the format string deliberately
# has no trailing space).
if 'timestamp' in final_dirty_file.columns:
    final_dirty_file['timestamp'] = pd.to_datetime(final_dirty_file['timestamp']).dt.strftime('%d/%m/%Y %H:%M:%S')


print("\n--- Info for the Final Dirty DataFrame ---")
final_dirty_file.info()

# index=False: the row index is not written to the CSV.
final_dirty_file.to_csv('Dirty_Dataset_with_Log.csv', index=False)
print("\nFinal dirty dataset with log saved to 'Dirty_Dataset_with_Log.csv'")