In [1]:
import pandas as pd
import numpy as np

# --- Configuration -----------------------------------------------------------
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable data directory so the notebook runs elsewhere.
file_path = 'C:/Users/Ananya/Downloads/sem 2/DSP/new_loan_approval_data.csv'
output_path = 'C:/Users/Ananya/Downloads/sem 2/DSP/dataset_with_errors.csv'
SEED = 42  # fixed seed: the same cells are corrupted on every Restart & Run All


def _pick_rows(frame, column, frac, used, rng):
    """Pick row labels of `frame` to corrupt in `column` without reusing a cell.

    Rows already recorded in ``used[column]`` are excluded, so a later error
    type never overwrites an earlier one in the same column and every logged
    (row, column) pair is still present in the final data.

    Raises:
        KeyError: if `column` does not exist (assigning through ``.loc`` would
            otherwise silently create a new, mostly empty column).
    """
    if column not in frame.columns:
        raise KeyError(f"cannot inject errors: column {column!r} not found")
    taken = used.setdefault(column, set())
    pool = frame.index[~frame.index.isin(list(taken))]
    # Same sample size rule as DataFrame.sample(frac=...), capped by what is left.
    n_rows = min(int(round(frac * len(frame))), len(pool))
    if n_rows == 0:
        return frame.index[:0]
    chosen = pd.Index(rng.choice(pool.to_numpy(), size=n_rows, replace=False))
    taken.update(chosen)
    return chosen


def inject_errors(clean_df, seed=SEED, frac=0.01, duplicate_frac=0.001):
    """Return a corrupted copy of `clean_df` plus a log of every injected error.

    Injected error types (each cell-level type hits `frac` of the rows):
      1. unknown category labels in 'education' and 'employment'
      2. out-of-range numbers in 'loan_amount'
      3. the string 'not available' in 'loan_amount'
      4. the integer 12345 in 'education'
      5. NaN in 'loan_amount'
      6. the 'annual_income' column is dropped (if present)
      7. `duplicate_frac` of the rows are appended again as exact duplicates

    Args:
        clean_df: input DataFrame; it is not modified.
        seed: seed for the random generator, making the result reproducible.
        frac: fraction of rows corrupted by each cell-level error type.
        duplicate_frac: fraction of rows appended as duplicates.

    Returns:
        (corrupted_df, error_log). Cell-level log entries are
        ``(row_label, column)`` tuples; because the index is a unique 0..N-1
        range and the CSV is written without the index, ``row_label`` is also
        the row position in the exported file.

    Raises:
        KeyError: if 'education', 'employment' or 'loan_amount' is missing.
    """
    rng = np.random.RandomState(seed)

    frame = clean_df.copy().reset_index(drop=True)
    frame.columns = frame.columns.str.strip()  # tolerate padded header names

    error_log = {
        'missing_column': [],
        'missing_values': [],
        'unknown_values': [],
        'wrong_values': [],
        'string_in_numeric': [],
        'duplicated_rows': 0,  # count only; duplicates are appended at the end
        'incorrect_data_type': []
    }
    used = {}  # column name -> set of row labels already corrupted there

    # 1. Unknown category labels
    for column, bogus_label in (('education', 'Unknown'), ('employment', 'Maybe')):
        rows = _pick_rows(frame, column, frac, used, rng)
        frame.loc[rows, column] = bogus_label
        error_log['unknown_values'].extend((idx, column) for idx in rows)

    # 2. Out-of-range numeric values in 'loan_amount'
    wrong_values = [-1000, -500, -100, 0, 10000000, 999999999]
    rows = _pick_rows(frame, 'loan_amount', frac, used, rng)
    frame.loc[rows, 'loan_amount'] = rng.choice(wrong_values, size=len(rows))
    error_log['wrong_values'].extend((idx, 'loan_amount') for idx in rows)

    # 3. Strings in the numeric 'loan_amount' column.
    # The column is deliberately left as object dtype afterwards: coercing it
    # back with pd.to_numeric(errors='coerce') would turn every injected string
    # into NaN and the logged error would not exist in the exported data.
    rows = _pick_rows(frame, 'loan_amount', frac, used, rng)
    frame['loan_amount'] = frame['loan_amount'].astype('object')
    frame.loc[rows, 'loan_amount'] = 'not available'
    error_log['string_in_numeric'].extend((idx, 'loan_amount') for idx in rows)

    # 4. Wrong data type: a number inside the categorical 'education' column.
    # Cast to object first so the integer is stored as-is whatever dtype the
    # reader inferred for the column.
    rows = _pick_rows(frame, 'education', frac, used, rng)
    frame['education'] = frame['education'].astype('object')
    frame.loc[rows, 'education'] = 12345
    error_log['incorrect_data_type'].extend((idx, 'education') for idx in rows)

    # 5. Missing values (NaN) in 'loan_amount'
    rows = _pick_rows(frame, 'loan_amount', frac, used, rng)
    frame.loc[rows, 'loan_amount'] = np.nan
    error_log['missing_values'].extend((idx, 'loan_amount') for idx in rows)

    # 6. Missing column
    required_column = 'annual_income'
    if required_column in frame.columns:
        frame = frame.drop(columns=[required_column])
        error_log['missing_column'].append(required_column)

    # 7. Duplicated rows. Done last, with fresh index labels, so that (a) the
    # copies stay exact duplicates in the output and (b) index labels remain
    # unique, keeping every logged (row, column) pair unambiguous.
    duplicated_sample = frame.sample(frac=duplicate_frac, random_state=rng)
    frame = pd.concat([frame, duplicated_sample], ignore_index=True)
    error_log['duplicated_rows'] += len(duplicated_sample)

    # Use the best possible dtype per column; mixed columns stay object.
    frame = frame.convert_dtypes()
    return frame, error_log


# The guard keeps the file I/O out of the way when the function above is
# imported or tested; inside the notebook __name__ is "__main__", so this runs.
if __name__ == "__main__":
    # Load the dataset and inject the errors
    df, error_log = inject_errors(pd.read_csv(file_path))

    # Print out the error log summary directly in the notebook
    print("Error Log Summary:")
    for error_type, details in error_log.items():
        if error_type == 'duplicated_rows':
            print(f"{error_type}: {details} duplicated rows added")
        else:
            print(f"{error_type}: {len(details)} errors added")
            print(f"Sample errors: {details[:5]}")  # first 5 entries per error type

    # Save the dataset with errors
    df.to_csv(output_path, index=False)
    print(f"Dataset with errors saved to {output_path}")


Error Log Summary:
missing_column: 1 errors added
Sample errors: ['annual_income']
missing_values: 100 errors added
Sample errors: [(2408, 'loan_amount'), (8186, 'loan_amount'), (8683, 'loan_amount'), (7884, 'loan_amount'), (8904, 'loan_amount')]
unknown_values: 200 errors added
Sample errors: [(9818, 'education'), (5502, 'education'), (2789, 'education'), (3667, 'education'), (7934, 'education')]
wrong_values: 100 errors added
Sample errors: [(3584, 'loan_amount'), (2251, 'loan_amount'), (9088, 'loan_amount'), (1483, 'loan_amount'), (2046, 'loan_amount')]
string_in_numeric: 100 errors added
Sample errors: [(7609, 'loan_amount'), (9133, 'loan_amount'), (6019, 'loan_amount'), (6150, 'loan_amount'), (379, 'loan_amount')]
duplicated_rows: 10 duplicated rows added
incorrect_data_type: 100 errors added
Sample errors: [(5528, 'education'), (1974, 'education'), (5071, 'education'), (1840, 'education'), (5362, 'education')]
Dataset with errors saved to C:/Users/Ananya/Downloads/sem 2/DSP/datas