In [2]:
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import datetime
import zipfile

def log(message, log_file):
    """Append one timestamped line to the log file.

    Args:
        message: Text to record.
        log_file: Path of the file to append to; it is opened in append
            mode, so it is created if it does not exist yet.

    Each entry is written as ``[YYYY-MM-DD HH:MM:SS] message`` followed by a
    newline, using the current local time.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    entry = f"[{stamp}] {message}\n"
    with open(log_file, 'a') as handle:
        handle.write(entry)

def unzip_file(zip_path, extract_to):
    """Extract every member of a zip archive into a directory.

    Args:
        zip_path: Path to the ``.zip`` archive to read.
        extract_to: Destination directory. It is created (including any
            missing parent directories) when it does not exist.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
    """
    # exist_ok=True replaces the separate "exists?" check, so there is no
    # window in which another process can create the directory between the
    # check and the makedirs call.
    os.makedirs(extract_to, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_to)

def extract_csv(file_path):
    """Load a CSV file into a DataFrame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The parsed DataFrame. If reading fails for any reason, the error is
        printed and an empty DataFrame is returned instead.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        print(f"Error reading CSV file {file_path}: {err}")
        frame = pd.DataFrame()
    return frame

def extract_json(file_path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        The parsed DataFrame. If pandas raises ``ValueError`` while reading,
        the error is printed and an empty DataFrame is returned instead.
    """
    try:
        # lines=True: every line of the file is its own JSON record.
        frame = pd.read_json(file_path, lines=True)
    except ValueError as err:
        print(f"Error reading JSON file {file_path}: {err}")
        frame = pd.DataFrame()
    return frame

def extract_xml(file_path):
    """Load an XML file into a DataFrame, one row per child of the root.

    Each direct child of the root element becomes a row; that child's own
    sub-elements supply the columns (tag -> text). Values are kept as the
    raw element text, so they are strings (or None for empty elements).

    Args:
        file_path: Path of the XML file to read.

    Returns:
        The resulting DataFrame. If the document is not well-formed, the
        parse error is printed and an empty DataFrame is returned instead.
    """
    try:
        root = ET.parse(file_path).getroot()
    except ET.ParseError as err:
        print(f"Error reading XML file {file_path}: {err}")
        return pd.DataFrame()

    records = [
        {field.tag: field.text for field in record}
        for record in root
    ]
    return pd.DataFrame(records)

def extract_data(data_folder):
    """Read every CSV, JSON and XML file in a folder into one DataFrame.

    Files are dispatched on their extension (case-insensitively) to
    ``extract_csv``, ``extract_json`` or ``extract_xml``; files with any
    other extension are skipped. Column names are lowercased per file
    before the frames are combined, and exact duplicate rows are dropped
    from the combined result.

    Args:
        data_folder: Directory whose direct children are scanned.

    Returns:
        The combined, de-duplicated DataFrame, or an empty DataFrame when
        no file yielded any columns.
    """
    readers = {
        '.csv': extract_csv,
        '.json': extract_json,
        '.xml': extract_xml,
    }

    frames = []
    # glob returns entries in arbitrary order; sorting keeps the row order
    # of the result reproducible across platforms and runs.
    for file in sorted(glob.glob(f"{data_folder}/*")):
        print(f"Processing file: {file}")  # Print each file being processed
        reader = readers.get(os.path.splitext(file)[1].lower())
        if reader is None:
            continue
        try:
            data = reader(file)
            # The readers return a column-less DataFrame on failure; there
            # is nothing to merge from those.
            if len(data.columns) == 0:
                continue
            # Lowercase the names BEFORE combining so that e.g. "Name" and
            # "name" from different files land in the same column instead
            # of producing two columns with a duplicated label. str() keeps
            # this safe for non-string column labels.
            frames.append(data.rename(columns=lambda col: str(col).lower()))
        except Exception as e:
            print(f"Error processing file {file}: {e}")

    if not frames:
        return pd.DataFrame()

    # A single concat avoids re-copying the accumulated rows for every file.
    combined = pd.concat(frames, ignore_index=True)

    # Remove duplicates
    return combined.drop_duplicates()

def transform_data(data):
    """Convert the ``height`` and ``weight`` columns to metric units.

    ``height`` is multiplied by 0.0254 (inches to meters) and ``weight`` by
    0.453592 (pounds to kilograms). Each column is handled independently: a
    missing column or a value that cannot be converted to float is reported
    with ``print`` and that column is left as it was, without preventing
    the other column from being converted.

    Args:
        data: DataFrame to transform. It is not modified.

    Returns:
        A new DataFrame with the converted columns.
    """
    factors = {
        'height': 0.0254,    # inches to meters
        'weight': 0.453592,  # pounds to kilograms
    }

    # Work on a copy so the caller's frame is untouched and a repeated call
    # on the same input cannot apply the conversion twice.
    result = data.copy()
    for column, factor in factors.items():
        try:
            result[column] = result[column].astype(float) * factor
        except KeyError as e:
            print(f"Missing column during transformation: {e}")
        except (ValueError, TypeError) as e:
            print(f"Data conversion error during transformation: {e}")
    return result

def load_data(data, output_file):
    """Save a DataFrame to a CSV file without the index column.

    Args:
        data: DataFrame to write.
        output_file: Destination path of the CSV file.

    Returns:
        True when the file was written, False when writing failed. A
        failure is also reported with ``print``; no exception is raised, so
        callers that need to know about a failed save must check the
        return value.
    """
    try:
        data.to_csv(output_file, index=False)
    except Exception as e:
        print(f"Error saving data to {output_file}: {e}")
        return False
    return True

def main(
    zip_file=r'C:\ram\source.zip',
    data_folder=r'C:\ram\unzipped_data',
    log_file=r'C:\ram\log_file.txt',
    output_file=r'C:\ram\transformed_data.csv',
):
    """Run the whole pipeline: unzip, extract, transform, load.

    Progress is appended to ``log_file``, which is deleted first so each
    run starts with a fresh log. Any exception raised by a step is caught,
    written to the log and printed; it is not re-raised.

    Args:
        zip_file: Archive containing the source data files.
        data_folder: Directory the archive is extracted into and then
            scanned for data files.
        log_file: Path of the progress log.
        output_file: Path of the CSV file the transformed data is saved to.
    """
    # Clear log file
    if os.path.exists(log_file):
        os.remove(log_file)

    try:
        # Unzipping happens inside the try block so that a missing or
        # corrupt archive is recorded in the log like any other failure
        # instead of aborting the run with an unlogged traceback.
        log("Unzipping the file.", log_file)
        unzip_file(zip_file, data_folder)

        log("ETL process started.", log_file)

        # Extraction
        log("Starting data extraction.", log_file)
        extracted_data = extract_data(data_folder)
        log("Data extraction completed.", log_file)

        # Print extracted data for debugging
        print("Extracted Data:")
        print(extracted_data.head())

        # Transformation
        log("Starting data transformation.", log_file)
        transformed_data = transform_data(extracted_data)
        log("Data transformation completed.", log_file)

        # Print transformed data for debugging
        print("Transformed Data:")
        print(transformed_data.head())

        # Loading
        log("Starting data loading.", log_file)
        load_data(transformed_data, output_file)
        log("Data loading completed.", log_file)

        log("ETL process completed successfully.", log_file)

    except Exception as e:
        log(f"ETL process failed: {e}", log_file)
        print(f"ETL process failed: {e}")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


Processing file: C:\ram\unzipped_data\source1.csv
Processing file: C:\ram\unzipped_data\source1.json
Processing file: C:\ram\unzipped_data\source1.xml
Processing file: C:\ram\unzipped_data\source2.csv
Processing file: C:\ram\unzipped_data\source2.json
Processing file: C:\ram\unzipped_data\source2.xml
Processing file: C:\ram\unzipped_data\source3.csv
Processing file: C:\ram\unzipped_data\source3.json
Processing file: C:\ram\unzipped_data\source3.xml
Extracted Data:
    name height  weight
0   alex  65.78  112.99
1   ajay  71.52  136.49
2  alice   69.4  153.03
3   ravi  68.22  142.34
4    joe  67.79   144.3
Transformed Data:
    name    height     weight
0   alex  1.670812  51.251360
1   ajay  1.816608  61.910772
2  alice  1.762760  69.413184
3   ravi  1.732788  64.564285
4    joe  1.721866  65.453326
