# Data Transformation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## Data Transformation

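This notebook implements feature engineering and data transformations to prepare the industry dataset (`nz_industry_data.csv`) for modeling.
It covers cleaning the `Value` column, creating derived features, standardization, categorical encoding, PCA, and a few diagnostic plots; the transformed data is saved to `nz_industry_transformed.csv`.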

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the dataset

In [None]:
# Read the raw industry data. The path is relative, so 'nz_industry_data.csv'
# must be present in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='nz_industry_data.csv')


In [None]:
# Structural overview of the raw frame: (rows, columns), then per-column
# dtypes and non-null counts, then the first rows as the cell's output.
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
df.info()
df.head()


In [None]:
# Convert 'Value' column to numeric

In [None]:
# Strip thousands separators and currency symbols, then coerce to numeric.
#
# regex=False makes both replacements literal. Under regex matching (the
# default of Series.str.replace before pandas 2.0) '$' is the end-of-string
# anchor, so dollar signs were never removed and those entries were silently
# coerced to NaN.
#
# The .str accessor is only available on string data, so the cleanup is
# skipped when the column was already parsed as numeric. Casting to str first
# also keeps numeric entries of a mixed-type column instead of turning them
# into NaN (missing entries become the text 'nan', which to_numeric reads
# back as NaN).
value_raw = df['Value']
if not pd.api.types.is_numeric_dtype(value_raw):
    value_raw = (
        value_raw.astype(str)
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
    )
# Anything that still is not a number becomes NaN rather than raising.
df['Value'] = pd.to_numeric(value_raw, errors='coerce')


In [None]:
# Count NaNs per column; entries of 'Value' that could not be parsed as
# numbers show up in this tally.
missing_counts = df.isna().sum()
print("\nMissing values after converting 'Value' to numeric:")
print(missing_counts)


In [None]:
# Impute missing 'Value' entries with the column median. Only 'Value' is
# listed in the fill mapping, so NaNs in other columns are left as they are.
value_median = df['Value'].median()
df = df.fillna(value={'Value': value_median})


In [None]:
# Extract clean industry codes from ANZSIC06

In [None]:
# Keep the leading "ANZSIC06 <token>" part of each industry code string,
# where <token> is one run of letters, digits or hyphens. Strings without
# such a prefix yield NaN.
# NOTE(review): only the first token after "ANZSIC06 " is captured; confirm
# that this is the intended "simple" code for the values present in the data.
anzsic_pattern = r'(ANZSIC06 [A-Za-z0-9\-]+)'
df['Industry_code_simple'] = df['Industry_code_ANZSIC06'].str.extract(anzsic_pattern, expand=False)


In [None]:
# Feature Engineering


In [None]:
# 1. Create financial ratios and metrics

In [None]:
# Reshape from long to wide: one row per (year, industry code, industry name)
# and one column per 'Variable_name', summing 'Value' wherever several input
# rows fall into the same cell. reset_index turns the keys back into columns.
industry_keys = ['Year', 'Industry_code_NZSIOC', 'Industry_name_NZSIOC']
financial_metrics = pd.pivot_table(
    df,
    index=industry_keys,
    columns='Variable_name',
    values='Value',
    aggfunc='sum',
)
financial_metrics = financial_metrics.reset_index()


In [None]:
# Normalise column labels so they are easy to reference: spaces and hyphens
# become underscores and commas are dropped. Labels that are not strings are
# passed through unchanged.
def _clean_label(label):
    if not isinstance(label, str):
        return label
    return label.replace(' ', '_').replace(',', '').replace('-', '_')


financial_metrics.columns = [_clean_label(label) for label in financial_metrics.columns]


In [None]:
# Calculate profit (Total income - Total expenditure) and profit margin.
# Both steps live in one cell so that the margin calculation stays inside the
# guard; an indented continuation placed in its own cell cannot be executed.
if {'Total_income', 'Total_expenditure'}.issubset(financial_metrics.columns):
    total_income = financial_metrics['Total_income']
    financial_metrics['Profit'] = total_income - financial_metrics['Total_expenditure']

    # The margin is undefined when income is zero. Masking those rows yields
    # NaN instead of +/-inf, which would otherwise leak into the scaled
    # features and PCA input built later in the notebook.
    financial_metrics['Profit_Margin'] = (
        financial_metrics['Profit'] / total_income.where(total_income != 0)
    )


In [None]:
# 2. Create time-based features

In [None]:
# Calculate year-over-year growth for key metrics.
# pct_change compares each row with the previous row of its group, so the
# frame has to be in (industry, year) order for BOTH metrics. Sorting is done
# once, outside the per-metric guards, so the expenditure growth no longer
# depends on the income guard having run.
financial_metrics = financial_metrics.sort_values(['Industry_code_NZSIOC', 'Year'])

yoy_targets = {
    'Total_income': 'Income_YoY_Growth',
    'Total_expenditure': 'Expenditure_YoY_Growth',
}
for metric, growth_col in yoy_targets.items():
    if metric in financial_metrics.columns:
        # fill_method=None: do not forward-fill gaps before differencing.
        # The implicit padding is deprecated in pandas and would report 0%
        # growth for a year whose figure is actually missing.
        financial_metrics[growth_col] = (
            financial_metrics
            .groupby('Industry_code_NZSIOC')[metric]
            .pct_change(fill_method=None)
        )


In [None]:
# 3. Create industry size categories based on income.
# The guard and its body are kept in a single cell: an `if` header without a
# body is a syntax error, and the indented body cannot run as its own cell.
SIZE_QUANTILES = [0, 0.25, 0.5, 0.75, 1]
SIZE_LABELS = ['Small', 'Medium', 'Large', 'Very Large']


def _size_bucket(income):
    """Label one year's incomes with their income quartile.

    Parameters
    ----------
    income : pd.Series
        'Total_income' values of a single year (one groupby group).

    Returns
    -------
    pd.Series
        Ordered categorical with the labels in SIZE_LABELS, indexed like
        ``income``. Missing incomes stay missing.

    Notes
    -----
    ``pd.qcut`` with explicit labels raises ``ValueError`` when the quantile
    edges are not distinct (heavy ties). In that case the values are binned
    on their ranks instead, with ties broken by order of appearance. Years
    with fewer than two known incomes cannot be split at all and get an
    undefined size.
    """
    if income.notna().sum() < 2:
        undefined = pd.Categorical([None] * len(income), categories=SIZE_LABELS, ordered=True)
        return pd.Series(undefined, index=income.index)
    try:
        return pd.qcut(income, q=SIZE_QUANTILES, labels=SIZE_LABELS)
    except ValueError:
        # Ranks 1..n are all distinct, so the quantile edges are too.
        return pd.qcut(income.rank(method='first'), q=SIZE_QUANTILES, labels=SIZE_LABELS)


if 'Total_income' in financial_metrics.columns:
    # Quartiles are computed within each year, so an industry's size is
    # relative to the other industries of the same year.
    financial_metrics['Industry_Size'] = (
        financial_metrics.groupby('Year')['Total_income'].transform(_size_bucket)
    )


In [None]:
# 4. Create industry growth classification
# Bands on year-over-year income growth:
#   > 10%        -> 'High Growth'
#   (2%, 10%]    -> 'Moderate Growth'
#   [-2%, 2%]    -> 'Stable'
#   < -2%        -> 'Declining'
# Rows without a growth figure (NaN) match no band and get 'Unknown'.
if 'Income_YoY_Growth' in financial_metrics.columns:
    growth = financial_metrics['Income_YoY_Growth']
    growth_bands = [
        ('High Growth', growth > 0.1),
        ('Moderate Growth', (growth <= 0.1) & (growth > 0.02)),
        ('Stable', (growth <= 0.02) & (growth >= -0.02)),
        ('Declining', growth < -0.02),
    ]
    band_labels = [label for label, _ in growth_bands]
    band_masks = [mask for _, mask in growth_bands]
    financial_metrics['Growth_Category'] = np.select(band_masks, band_labels, default='Unknown')


In [None]:
# 5. Create economic cycle indicators

In [None]:
# Hand-assigned economic phase per calendar year (a simplified example; a
# real analysis would derive this from external economic data).
phase_years = {
    'Recovery': (2013, 2021, 2022),
    'Growth': (2014, 2015, 2016, 2017),
    'Peak': (2018, 2019),
    'Recession': (2020,),  # COVID-19 impact
    'Stabilization': (2023,),
}
# Invert to a year -> phase lookup, ordered by year.
economic_periods = dict(sorted(
    (year, phase)
    for phase, years in phase_years.items()
    for year in years
))
# Years that are not in the lookup end up as NaN.
financial_metrics['Economic_Cycle'] = financial_metrics['Year'].map(economic_periods)


In [None]:
# Merge the engineered features back to the original dataset if needed

In [None]:
# Build a new dataframe holding both the original rows and the engineered
# per-(year, industry) features.
#
# Most engineered columns are created behind `if <column> in ...` guards, so
# any of them may be absent. Selecting only the ones that exist avoids a
# KeyError and keeps this cell consistent with those guards.
merge_keys = ['Year', 'Industry_code_NZSIOC', 'Industry_name_NZSIOC']
engineered_candidates = ['Profit', 'Profit_Margin', 'Income_YoY_Growth',
                         'Industry_Size', 'Growth_Category', 'Economic_Cycle']
engineered_cols = [col for col in engineered_candidates if col in financial_metrics.columns]

df_enriched = df.merge(
    financial_metrics[merge_keys + engineered_cols],
    on=merge_keys,
    how='left',
    # financial_metrics comes from a pivot on exactly these keys, so each key
    # combination must appear once; fail loudly if the merge would ever
    # duplicate rows of the original data.
    validate='many_to_one',
)


In [None]:
# Data Normalization/Standardization


In [None]:
# 1. Select numerical columns for standardization
# Start from the wish list and keep, in the same order, only the columns that
# are actually present in df_enriched.
candidate_numeric = ('Value', 'Profit', 'Profit_Margin', 'Income_YoY_Growth')
available_cols = set(df_enriched.columns)
numeric_cols = [name for name in candidate_numeric if name in available_cols]


In [None]:
# 2. Apply StandardScaler to numeric columns
# Every scaled column is stored next to its source as '<source>_scaled'.
# (Concatenating the list of names with the string '_scaled' is a TypeError;
# the suffix has to be applied per column.)
scaler = StandardScaler()
if numeric_cols:
    # Ratios and growth rates can be +/-inf after a division by zero, and
    # StandardScaler rejects infinite input. Treat them as missing, then
    # apply the same zero-fill as for other missing values.
    scaler_input = (
        df_enriched[numeric_cols]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )
    scaled_values = scaler.fit_transform(scaler_input)
    for position, source_col in enumerate(numeric_cols):
        df_enriched[source_col + '_scaled'] = scaled_values[:, position]


In [None]:
# Encoding Categorical Variables


In [None]:
# 1. Select categorical columns for encoding
# Keep only the candidates that exist in df_enriched (membership on a
# DataFrame tests its column labels).
candidate_categoricals = (
    'Industry_aggregation_NZSIOC', 'Industry_code_NZSIOC', 'Units',
    'Variable_code', 'Variable_category',
    'Industry_Size', 'Growth_Category', 'Economic_Cycle',
)
categorical_cols = [name for name in candidate_categoricals if name in df_enriched]


In [None]:
# 2. Encode ordered categories as integers.
#
# LabelEncoder numbers its classes in sorted (alphabetical) order, which
# would rank 'Large' < 'Medium' < 'Small' < 'Very Large'. Industry_Size has a
# genuine order, so it is mapped to explicit ranks instead; -1 marks rows
# without a size.
# The cast to object also avoids an error when the column is categorical:
# filling a categorical with a value that is not one of its categories raises.
size_rank = {'Small': 0, 'Medium': 1, 'Large': 2, 'Very Large': 3}
if 'Industry_Size' in df_enriched.columns:
    df_enriched['Industry_Size_encoded'] = (
        df_enriched['Industry_Size']
        .astype(object)
        .map(size_rank)
        .fillna(-1)
        .astype(int)
    )

# Economic_Cycle keeps the LabelEncoder codes (alphabetical class order, with
# missing values encoded as their own 'Unknown' class).
# NOTE(review): these codes carry no economic ordering; confirm whether an
# explicit phase order is wanted here as well.
label_encoder = LabelEncoder()
if 'Economic_Cycle' in df_enriched.columns:
    cycle_labels = df_enriched['Economic_Cycle'].astype(object).fillna('Unknown')
    df_enriched['Economic_Cycle_encoded'] = label_encoder.fit_transform(cycle_labels)


In [None]:
# 3. Apply One-Hot Encoding for nominal categories

In [None]:
# One-hot encode a deliberately small subset of the categorical columns so
# the frame does not explode in width. The source columns are kept; the
# indicator columns are appended and named '<source>_<category>'.
ohe_candidates = ['Industry_aggregation_NZSIOC', 'Variable_category', 'Growth_Category']
ohe_cols = [name for name in ohe_candidates if name in df_enriched.columns]

indicator_frames = [
    pd.get_dummies(df_enriched[name], prefix=name)
    for name in ohe_cols
]
df_enriched = pd.concat([df_enriched, *indicator_frames], axis=1)


In [None]:
# Dimensionality Reduction with PCA

In [None]:
# Select the standardized columns as PCA input.
pca_cols = [
    col for col in df_enriched.columns
    if isinstance(col, str) and col.endswith('_scaled')
]

# The guard and everything that depends on it are kept in one cell: an `if`
# header without a body is a syntax error, and indented continuation cells
# cannot run on their own.
if len(pca_cols) >= 2:  # Need at least 2 columns for PCA to be meaningful
    # Keep up to 3 components, but never more than there are input columns.
    n_components = min(len(pca_cols), 3)
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(df_enriched[pca_cols].fillna(0))

    # Add one 'PCA_<k>' column (k starting at 1) per retained component.
    for component_idx in range(pca_result.shape[1]):
        df_enriched['PCA_{}'.format(component_idx + 1)] = pca_result[:, component_idx]

    # Share of the total variance captured by each component, and overall.
    print("\nPCA Explained Variance Ratio:", pca.explained_variance_ratio_)
    print("Cumulative Explained Variance:", np.sum(pca.explained_variance_ratio_))


In [None]:
# Create additional derived features


In [None]:
# 1. Calculate rolling averages for time series analysis

In [None]:
# Per-industry rolling mean of total income over the current and the two
# preceding rows (fewer at the start of each series, via min_periods=1).
# Computation and merge share one cell so the merge stays inside the guard.
# NOTE(review): the window counts rows, so this is a 3-year average only if
# every industry has one row per consecutive year; confirm against the data.
if 'Total_income' in financial_metrics.columns:
    rolling_metrics = financial_metrics.sort_values(['Industry_code_NZSIOC', 'Year'])
    rolling_metrics['Income_3yr_Rolling_Avg'] = (
        rolling_metrics
        .groupby('Industry_code_NZSIOC')['Total_income']
        .transform(lambda income: income.rolling(window=3, min_periods=1).mean())
    )

    # Merge on the full key of financial_metrics (year, code, name). Each key
    # combination occurs once there, so rows of df_enriched are never
    # duplicated. Dropping a previous copy of the column first makes the
    # cell safe to re-run (no '_x'/'_y' suffixed duplicates).
    rolling_keys = ['Year', 'Industry_code_NZSIOC', 'Industry_name_NZSIOC']
    df_enriched = (
        df_enriched
        .drop(columns=['Income_3yr_Rolling_Avg'], errors='ignore')
        .merge(
            rolling_metrics[rolling_keys + ['Income_3yr_Rolling_Avg']],
            on=rolling_keys,
            how='left',
        )
    )


In [None]:
# 2. Calculate industry concentration metrics

In [None]:
# For each year, calculate the share of each industry in that year's summed
# total income. Computation and merge share one cell so the merge stays
# inside the guard.
# NOTE(review): the yearly total sums every row of financial_metrics. The data
# has an 'Industry_aggregation_NZSIOC' column; if rows of several aggregation
# levels are present, the total double counts. Confirm against the data.
if 'Total_income' in financial_metrics.columns:
    yearly_totals = (
        financial_metrics
        .groupby('Year')['Total_income']
        .sum()
        .rename('Year_Total_Income')
        .reset_index()
    )

    concentration_metrics = financial_metrics.merge(yearly_totals, on='Year', how='left')
    year_total = concentration_metrics['Year_Total_Income']
    # A zero yearly total makes the share undefined: NaN rather than +/-inf.
    concentration_metrics['Market_Share'] = (
        concentration_metrics['Total_income'] / year_total.where(year_total != 0)
    )

    # Merge on the full key of financial_metrics (year, code, name) so rows of
    # df_enriched are never duplicated; drop a previous copy of the column
    # first so the cell can be re-run.
    share_keys = ['Year', 'Industry_code_NZSIOC', 'Industry_name_NZSIOC']
    df_enriched = (
        df_enriched
        .drop(columns=['Market_Share'], errors='ignore')
        .merge(
            concentration_metrics[share_keys + ['Market_Share']],
            on=share_keys,
            how='left',
        )
    )


In [None]:
# 3. Create volatility measures

In [None]:
# Coefficient of variation (std / mean) of total income per industry code,
# taken over all of that industry's rows. Computation and merge share one
# cell so the merge stays inside the guard.
if 'Total_income' in financial_metrics.columns:
    income_stats = (
        financial_metrics
        .groupby('Industry_code_NZSIOC')['Total_income']
        .agg(['mean', 'std'])
    )
    mean_income = income_stats['mean']
    # A zero mean makes the ratio undefined: NaN rather than +/-inf.
    volatility = (
        (income_stats['std'] / mean_income.where(mean_income != 0))
        .rename('Income_Volatility')
        .reset_index()
    )

    # One row per industry code, so this left merge cannot duplicate rows.
    # Dropping a previous copy of the column first makes the cell re-runnable.
    df_enriched = (
        df_enriched
        .drop(columns=['Income_Volatility'], errors='ignore')
        .merge(volatility, on='Industry_code_NZSIOC', how='left')
    )


In [None]:
# Summarise the result: final dimensions and the columns of df_enriched that
# are not columns of the cleaned input frame df.
print("\nTransformed dataset shape:", df_enriched.shape)
print("\nNew features created:")
original_cols = set(df.columns)
new_cols = [name for name in df_enriched.columns if name not in original_cols]
print(new_cols)


In [None]:
# Persist the enriched frame as CSV in the working directory, without the
# row index.
df_enriched.to_csv(path_or_buf='nz_industry_transformed.csv', index=False)


In [None]:
# Visualize some of the engineered features in a single 2x2 figure.
#
# The whole figure is built in one cell using explicit axes. With the inline
# backend a figure is rendered and closed when its cell finishes, so panels
# drawn from later cells would end up on fresh figures and the saved PNG
# would contain only the last panel.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
(ax_margin, ax_growth), (ax_share, ax_pca) = axes

# Plot 1: Distribution of Profit Margin by Industry Size
if 'Profit_Margin' in df_enriched.columns and 'Industry_Size' in df_enriched.columns:
    sns.boxplot(x='Industry_Size', y='Profit_Margin', data=df_enriched, ax=ax_margin)
    ax_margin.set_title('Profit Margin by Industry Size')
    ax_margin.tick_params(axis='x', rotation=45)
else:
    ax_margin.set_visible(False)  # leave the slot empty, as if never drawn

# Plot 2: Income Growth Trends Over Time
if 'Income_YoY_Growth' in df_enriched.columns:
    yearly_growth = df_enriched.groupby('Year')['Income_YoY_Growth'].mean().reset_index()
    sns.lineplot(x='Year', y='Income_YoY_Growth', data=yearly_growth, ax=ax_growth)
    ax_growth.set_title('Average Income Growth by Year')
    ax_growth.grid(True)
else:
    ax_growth.set_visible(False)

# Plot 3: Market Share Distribution
if 'Market_Share' in df_enriched.columns:
    sns.histplot(df_enriched['Market_Share'].dropna(), bins=30, kde=True, ax=ax_share)
    ax_share.set_title('Distribution of Industry Market Shares')
else:
    ax_share.set_visible(False)

# Plot 4: PCA Visualization
if 'PCA_1' in df_enriched.columns and 'PCA_2' in df_enriched.columns:
    # Plot at most 1000 points. Capping at the frame length avoids the
    # ValueError raised when sampling more rows than exist, and the fixed
    # seed makes the figure reproducible across runs.
    pca_sample = df_enriched.sample(n=min(1000, len(df_enriched)), random_state=42)
    # Colour by size only when that feature was actually created.
    hue_col = 'Industry_Size' if 'Industry_Size' in df_enriched.columns else None
    sns.scatterplot(x='PCA_1', y='PCA_2', hue=hue_col, data=pca_sample, ax=ax_pca)
    ax_pca.set_title('PCA Visualization of Industries')
else:
    ax_pca.set_visible(False)

fig.tight_layout()
fig.savefig('feature_engineering_visualization.png')
plt.show()