In [14]:
%%writefile testutility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re

################
# File Reading #
################

def read_config_file(filepath):
    with open(filepath, 'r') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            logging.error(exc)

def replacer(string, char):
    pattern = char + '{2,}'
    string = re.sub(pattern, char, string) 
    return string

def col_header_val(df, table_config):
    '''
    replace whitespaces in the column
    and standardized column names
    '''
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace('[^\w]','_',regex=True)
    df.columns = list(map(lambda x: x.strip('_'), list(df.columns)))
    df.columns = list(map(lambda x: replacer(x,'_'), list(df.columns)))
    expected_col = list(map(lambda x: x.lower(),  table_config['columns']))
    expected_col.sort()
    df.columns =list(map(lambda x: x.lower(), list(df.columns)))
    df = df.reindex(sorted(df.columns), axis=1)
    if len(df.columns) == len(expected_col) and list(expected_col) == list(df.columns):
        print("Column name and column length validation passed")
        return 1
    else:
        print("Column name and column length validation failed")
        mismatched_columns_file = list(set(df.columns).difference(expected_col))
        print("Following File columns are not in the YAML file", mismatched_columns_file)
        missing_YAML_file = list(set(expected_col).difference(df.columns))
        print("Following YAML columns are not in the file uploaded", missing_YAML_file)
        logging.info(f'df columns: {df.columns}')
        logging.info(f'expected columns: {expected_col}')
        return 0

def install_packages():
    try:
        # Install Pandas
        subprocess.run(["pip", "install", "--upgrade", "pandas==2.1.*"])

        # Install Modin
        subprocess.run(["pip", "install", "--upgrade", "--force-reinstall", "modin"])

        print("Packages installed successfully.")
    except Exception as e:
        print(f"Error installing packages: {e}")

# Example usage
# install_packages()  # Uncomment this line to install packages

# Read the World Energy Consumption CSV file
file_path = 'C:/Users/Admin/Downloads/World Energy Consumption.csv'
config_file_path = 'file.yaml'

# Read the config file
config = read_config_file(config_file_path)

# Read the CSV file
df = pd.read_csv(file_path)

# Perform column validation
col_header_val(df, config)


Overwriting testutility.py


In [None]:
%%writefile file.yaml
dataset_name: World Energy Consumption.csv
file_type:csv
'file_type': file_type,
        'dataset_name': 'world_energy',
        'file_name': 'world_energy_data.' + file_type,
        'table_name': 'energy_data',
        'inbound_delimiter': ',',
        'outbound_delimiter': '|',
        'skip_leading_rows': skip_leading_rows,
        'columns': columns
    }



In [15]:
import pandas as pd
import yaml

def generate_yaml_config(dataset_path, yaml_path, skip_leading_rows=0):
    # Read the dataset to get column names
    df = pd.read_csv(dataset_path, nrows=5)  # Reading only a few rows for efficiency
    columns = df.columns.tolist()

    # Determine file type based on the extension
    file_type = dataset_path.split('.')[-1]

    # Create YAML configuration dictionary
    config = {
        'file_type': file_type,
        'dataset_name': 'world_energy',
        'file_name': 'world_energy_data.' + file_type,
        'table_name': 'energy_data',
        'inbound_delimiter': ',',
        'outbound_delimiter': '|',
        'skip_leading_rows': skip_leading_rows,
        'columns': columns
    }

    # Write the YAML configuration to a file
    with open(yaml_path, 'w') as yaml_file:
        yaml.dump(config, yaml_file, default_flow_style=False)

# Example usage
dataset_path = 'C:/Users/Admin/Downloads/World Energy Consumption.csv'
yaml_path = 'world_energy_config.yaml'
generate_yaml_config(dataset_path, yaml_path, skip_leading_rows=1)


In [16]:
# Read config file
import testutility as util
config_data = util.read_config_file("file.yaml")

Column name and column length validation failed
Following File columns are not in the YAML file ['coal_cons_per_capita', 'wind_electricity', 'biofuel_elec_per_capita', 'coal_prod_per_capita', 'energy_cons_change_twh', 'low_carbon_share_elec', 'renewables_consumption', 'gas_share_energy', 'renewables_share_elec', 'hydro_elec_per_capita', 'wind_cons_change_pct', 'fossil_elec_per_capita', 'biofuel_cons_change_pct', 'hydro_cons_change_pct', 'gas_cons_change_pct', 'nuclear_cons_change_pct', 'low_carbon_cons_change_twh', 'wind_share_energy', 'oil_share_energy', 'fossil_energy_per_capita', 'coal_production', 'other_renewable_electricity', 'oil_cons_change_twh', 'oil_electricity', 'other_renewables_share_energy', 'coal_consumption', 'oil_prod_change_pct', 'population', 'oil_cons_change_pct', 'other_renewables_elec_per_capita', 'solar_cons_change_pct', 'per_capita_electricity', 'coal_share_elec', 'fossil_share_elec', 'oil_production', 'biofuel_cons_change_twh', 'biofuel_share_energy', 'gas_cons

In [21]:
import logging  # Add this line to import the 'logging' module

# The rest of your functions...


In [23]:
import pandas as pd
import yaml
import os
import gzip
import re

# The existing functions from testutility.py
def read_config_file(filepath):
    with open(filepath, 'r') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            logging.error(exc)

def replacer(string, char):
    pattern = char + '{2,}'
    string = re.sub(pattern, char, string) 
    return string

def col_header_val(df, table_config):
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace('[^\w]','_',regex=True)
    df.columns = list(map(lambda x: x.strip('_'), list(df.columns)))
    df.columns = list(map(lambda x: replacer(x,'_'), list(df.columns)))
    expected_col = list(map(lambda x: x.lower(),  table_config['columns']))
    expected_col.sort()
    df.columns =list(map(lambda x: x.lower(), list(df.columns)))
    df = df.reindex(sorted(df.columns), axis=1)
    if len(df.columns) == len(expected_col) and list(expected_col) == list(df.columns):
        print("Column name and column length validation passed")
        return 1
    else:
        print("Column name and column length validation failed")
        mismatched_columns_file = list(set(df.columns).difference(expected_col))
        print("Following File columns are not in the YAML file", mismatched_columns_file)
        missing_YAML_file = list(set(expected_col).difference(df.columns))
        print("Following YAML columns are not in the file uploaded", missing_YAML_file)
        logging.info(f'df columns: {df.columns}')
        logging.info(f'expected columns: {expected_col}')
        return 0

def validate_columns(df, config):
    expected_columns = config['columns']
    if len(df.columns) == len(expected_columns) and list(df.columns) == expected_columns:
        print("Column validation passed.")
        return True
    else:
        print("Column validation failed.")
        print(f"Expected columns: {expected_columns}")
        print(f"Actual columns: {list(df.columns)}")
        return False

def write_to_pipe_delimited_gzip(df, output_path):
    df.to_csv(output_path, sep='|', index=False, compression='gzip')

def file_summary(df):
    total_rows = len(df)
    total_columns = len(df.columns)
    file_size = os.path.getsize('output_file.csv.gz') / (1024 * 1024)  # File size in MB

    print(f"\nSummary of the file:")
    print(f"Total number of rows: {total_rows}")
    print(f"Total number of columns: {total_columns}")
    print(f"File size: {file_size:.2f} MB")

# Example usage
dataset_path = 'C:/Users/Admin/Downloads/World Energy Consumption.csv'
yaml_path = 'world_energy_config.yaml'
output_file_path = 'output_file.csv.gz'

df = pd.read_csv(dataset_path, skiprows=1)  # Skip leading rows as specified in YAML
config = read_config_file(yaml_path)

# Validate columns
if col_header_val(df, config) and validate_columns(df, config):
    # Write to pipe-separated gzip file
    write_to_pipe_delimited_gzip(df, output_file_path)

    # Create a summary of the file
    file_summary(df)


Column name and column length validation failed
Following File columns are not in the YAML file ['unnamed_20', 'unnamed_65', '0_000', 'unnamed_16', 'unnamed_18', 'unnamed_17', '61_500', 'unnamed_49', '500_231', 'unnamed_108', 'unnamed_91', 'unnamed_8', '295_750', 'unnamed_106', '368_650_1', 'unnamed_48', 'unnamed_59', 'unnamed_92', 'unnamed_98', 'unnamed_23', 'unnamed_57', 'unnamed_30', 'unnamed_120', '163_220', '16_930', '71_030', 'unnamed_107', 'unnamed_54', '6_111', 'unnamed_114', 'unnamed_104', 'unnamed_63', 'unnamed_43', '0_000_1', 'unnamed_62', 'unnamed_21', 'unnamed_37', 'unnamed_99', 'unnamed_87', 'unnamed_110', 'unnamed_113', 'unnamed_40', 'unnamed_123', 'unnamed_70', '80_225', 'unnamed_34', 'unnamed_72', 'unnamed_96', '44_275', 'unnamed_44', 'unnamed_83', 'unnamed_14', 'unnamed_4', 'unnamed_61', 'unnamed_12', 'unnamed_89', 'unnamed_115', 'unnamed_38', 'unnamed_31', 'unnamed_50', '19_775', 'unnamed_102', 'unnamed_118', 'unnamed_41', 'unnamed_55', '4_592', 'unnamed_22', '13_663