# Data Cleansing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## Data Cleansing

This notebook focuses on improving data quality by handling missing values, outliers, and other data issues.
The goal is to prepare a clean dataset for further analysis and modeling.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


In [None]:
# Load the dataset

In [None]:
# NOTE: 'your_data_file.csv' is a placeholder - point it at the real CSV before running.
# The loaded frame is bound to `df`, which every later cell reads and updates.
df = pd.read_csv(filepath_or_buffer='your_data_file.csv')


In [None]:
# Overview of the loaded frame: dimensions, schema, a short preview and summary statistics
print("Dataset shape:", df.shape)

print("\nDataset info:")
df.info()  # writes its report directly to stdout

# Each summary is computed only after its heading is printed, so output order is unchanged
for heading, summarize in (
    ("\nFirst 5 rows:", df.head),
    ("\nDescriptive statistics:", df.describe),
):
    print(heading)
    print(summarize())


In [None]:
# 1. Handling missing values

In [None]:
# Count the null entries in every column (isna is the same check as isnull)
print("\nMissing values per column:", df.isna().sum(), sep="\n")


In [None]:
# The dataset appears to have no missing values based on the provided info

In [None]:
# But we'll include code for handling missing values as a best practice


In [None]:
# For numeric columns, we could fill with mean/median

In [None]:
# For categorical columns, we could fill with mode

In [None]:
# Example:

In [None]:
# df['Value'] = df['Value'].fillna(df['Value'].median())

In [None]:
# df['Industry_name_NZSIOC'] = df['Industry_name_NZSIOC'].fillna(df['Industry_name_NZSIOC'].mode()[0])


In [None]:
# 2. Fix data type issues

In [None]:
# Convert 'Value' column from object to numeric

In [None]:
# Peek at the raw 'Value' entries to spot anything non-numeric before converting
value_preview = df['Value'].head()
print("\nSample values from 'Value' column:")
print(value_preview)


In [None]:
# Convert 'Value' column to numeric

In [None]:
# Convert 'Value' to a numeric dtype.
# pd.to_numeric cannot parse thousands separators (e.g. "35,140"), so with
# errors='coerce' such entries would silently become NaN. For a non-numeric
# column, commas and surrounding whitespace are therefore stripped before parsing.
# Anything that is still not a number is converted to NaN by errors='coerce'.
value_raw = df['Value']
if not pd.api.types.is_numeric_dtype(value_raw):
    # astype(str) keeps mixed-type object columns safe for the .str accessor
    value_raw = value_raw.astype(str).str.replace(',', '', regex=False).str.strip()
df['Value'] = pd.to_numeric(value_raw, errors='coerce')


In [None]:
# Report how many 'Value' entries are NaN now that the column has been converted
nan_after_conversion = df['Value'].isna().sum()
print("\nNaN values in 'Value' after conversion:", nan_after_conversion)


In [None]:
# If the conversion left NaN values, show a few affected rows and then impute them.
# The inspection and the fill live in ONE cell: a cell cannot start with an indented
# continuation of an `if` from a previous cell (that raises IndentationError).
value_nan_mask = df['Value'].isna()
if value_nan_mask.sum() > 0:
    print("Examples of rows with non-numeric 'Value':")
    print(df[value_nan_mask].head())

    # Fill NaN values with the column median (alternatively, drop the rows after analysis)
    df['Value'] = df['Value'].fillna(df['Value'].median())


In [None]:
# 3. Detecting and handling outliers

In [None]:
# Flag outliers in 'Value' by z-score (more than 3 standard deviations from the mean).
# nan_policy='omit' computes the mean and standard deviation from the non-NaN entries.
# With the default ('propagate'), a single NaN makes every z-score NaN and the
# cell would silently report zero outliers.
# NaN rows keep a NaN z-score, and NaN > 3 is False, so they are never flagged.
z_scores = stats.zscore(df['Value'], nan_policy='omit')
abs_z_scores = np.abs(z_scores)
outliers = (abs_z_scores > 3)  # Threshold of 3 standard deviations

# `outliers` has one boolean per row of df; a later cell stores it as an 'is_outlier' column
print("\nNumber of outliers in 'Value' column:", np.sum(outliers))
print("Percentage of outliers:", (np.sum(outliers) / len(df)) * 100, "%")


In [None]:
# Histogram with a KDE overlay to show how 'Value' is distributed
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Value'], kde=True, ax=ax)
ax.set_title('Distribution of Value')
ax.set_xlabel('Value (in millions)')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Box plot of 'Value'; points drawn beyond the whiskers are the candidate outliers
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(y=df['Value'], ax=ax)
ax.set_title('Box Plot of Value')
ax.set_ylabel('Value (in millions)')
plt.show()


In [None]:
# Handle outliers - cap them at the IQR fences (Q1 - 1.5*IQR and Q3 + 1.5*IQR)

In [None]:
# Capping (rather than dropping) suits financial data, where extreme values may be valid.
# The fences sit 1.5 * IQR below the first quartile and above the third quartile.
value_series = df['Value']
Q1, Q3 = (value_series.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
whisker = 1.5 * IQR
lower_bound, upper_bound = Q1 - whisker, Q3 + whisker

print("\nOutlier thresholds for 'Value':")
print(f"Lower bound: {lower_bound}\nUpper bound: {upper_bound}")


In [None]:
# Snapshot the frame before capping; a deep copy keeps it independent of later edits to df
df_original = df.copy(deep=True)


In [None]:
# Limit 'Value' to [lower_bound, upper_bound] and store the result in a new column,
# leaving the uncapped 'Value' column in place for comparison
capped_values = df['Value'].clip(lower=lower_bound, upper=upper_bound)
df['Value_capped'] = capped_values


In [None]:
# Summary statistics of the uncapped and capped columns, printed one after the other
for heading, column in (
    ("\nOriginal 'Value' statistics:", 'Value'),
    ("\nCapped 'Value' statistics:", 'Value_capped'),
):
    print(heading)
    print(df[column].describe())


In [None]:
# 4. Removing duplicates

In [None]:
# Mark rows that repeat an earlier row in full, then count them
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print("\nNumber of duplicate rows: {}".format(duplicate_count))


In [None]:
# Drop repeated rows only when the count above found some; the first occurrence is kept
if duplicate_count > 0:
    df = df.drop_duplicates(keep='first')
    rows_removed, new_shape = duplicate_count, df.shape
    print(f"Removed {rows_removed} duplicate rows. New shape: {new_shape}")


In [None]:
# 5. Standardizing formats

In [None]:
# Clean up Industry_code_ANZSIC06 - extract just the code part

In [None]:
# Look at a few raw ANZSIC06 entries to see their format before extracting anything
anzsic_preview = df['Industry_code_ANZSIC06'].head()
print("\nSample ANZSIC06 codes:", anzsic_preview, sep="\n")


In [None]:
# Keep the leading part of each ANZSIC06 entry: the literal "ANZSIC06 " prefix followed
# by one run of letters, digits or hyphens. Entries that do not match become NaN.
anzsic_pattern = r'(ANZSIC06 [A-Za-z0-9\-]+)'
df['ANZSIC06_clean'] = df['Industry_code_ANZSIC06'].str.extract(anzsic_pattern)

print("\nCleaned ANZSIC06 codes:", df['ANZSIC06_clean'].head(), sep="\n")


In [None]:
# Standardize Year format (parse each value as a four-digit year, then keep the integer year)

In [None]:
# Parse 'Year' with the four-digit-year format, then store just the year component
year_as_datetime = pd.to_datetime(df['Year'], format='%Y')
df['Year'] = year_as_datetime.dt.year


In [None]:
# Derive the decade by removing the last digit's contribution from the year (2013 -> 2010)
df['Decade'] = df['Year'] - (df['Year'] % 10)
decade_counts = df['Decade'].value_counts().sort_index()
print("\nDecade distribution:", decade_counts, sep="\n")


In [None]:
# 6. Additional data quality checks

In [None]:
# Collect the distinct (industry code, industry name) combinations present in the data
pair_columns = ['Industry_code_NZSIOC', 'Industry_name_NZSIOC']
industry_code_name_pairs = df[pair_columns].drop_duplicates()
pair_count = len(industry_code_name_pairs)
print(f"\nUnique industry code-name pairs: {pair_count}")


In [None]:
# Check if any industry code maps to more than one industry name.
# The report and the per-code examples live in ONE cell: a cell cannot start with an
# indented continuation of an `if` from a previous cell (that raises IndentationError).
code_name_counts = df.groupby('Industry_code_NZSIOC')['Industry_name_NZSIOC'].nunique()
inconsistent_codes = code_name_counts[code_name_counts > 1]

if len(inconsistent_codes) > 0:
    print("\nInconsistent industry codes (multiple names):")
    print(inconsistent_codes)

    # Show examples of inconsistencies (first 3 codes only, to keep the output short)
    for code in inconsistent_codes.index[:3]:
        print(f"\nCode {code} has these different names:")
        print(df[df['Industry_code_NZSIOC'] == code]['Industry_name_NZSIOC'].unique())


In [None]:
# 7. Final cleaned dataset

In [None]:
# Columns of the final dataset, listed in their output order
final_columns = [
    # time
    'Year',
    'Decade',
    # industry classification
    'Industry_aggregation_NZSIOC',
    'Industry_code_NZSIOC',
    'Industry_name_NZSIOC',
    'ANZSIC06_clean',
    # measured variable
    'Variable_code',
    'Variable_name',
    'Variable_category',
    # values
    'Value',
    'Value_capped',
    'Units',
]


In [None]:
# Build the final frame from the selected columns, in the order given by final_columns
df_cleaned = df.loc[:, final_columns]

print("\nFinal cleaned dataset:", df_cleaned.head(), sep="\n")
print("\nCleaned dataset info:")
df_cleaned.info()  # writes its report directly to stdout


In [None]:
# Write the cleaned frame to CSV; index=False keeps the row index out of the file
df_cleaned.to_csv(
    path_or_buf='nz_industry_financial_data_cleaned.csv',
    index=False,
)
print("\nCleaned dataset saved to 'nz_industry_financial_data_cleaned.csv'")


In [None]:
# Optional: attach the z-score outlier flags to the pre-capping snapshot and save it.
# This adds an 'is_outlier' column to df_original itself.
df_original['is_outlier'] = outliers
df_original.to_csv(
    path_or_buf='nz_industry_financial_data_with_outliers_flagged.csv',
    index=False,
)
print("Original dataset with outliers flagged saved to 'nz_industry_financial_data_with_outliers_flagged.csv'")