**Part 4: Data Parsing**

1 - Develop scripts to parse data files:

CSV for survey data:
* Use Python's csv module to read and validate data.
* Check for missing or malformed entries (e.g., empty fields, invalid dates).
* Save clean data into a staging table in the database using SQL Bulk Insert.

JSON for web feedback:
  * Use Python's json module to parse nested structures.
  * Flatten data and extract fields (e.g., comments, ratings, timestamps).
  *  Map JSON keys to database columns and load into the database using SQL scripts.

XML for external reviews:
  * Use Python's xml.etree.ElementTree library to parse XML structures.
  * Validate schema conformity and extract relevant fields.
  * Convert XML data to rows and load them into relational tables.

In [2]:
import csv
import json
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime
import re
import os

# =====================================================
# 1. CSV PARSER - Survey Data with Validation
# =====================================================

def parse_csv_survey(file_path, output_path='customer_survey_clean.csv'):
    """
    Parse CSV survey data, validate each row, and save a cleaned copy.

    Rows are never dropped. Every problem is recorded as a message and the
    offending field is replaced with a default: 'Unknown' for name/region,
    a placeholder address for a missing email, a rating clamped to 1-5 (or
    3 when it is not a number), 'No comment provided' for comments, and
    today's date for an unparseable review_date. A missing review_id and a
    malformed email are reported but left unchanged.

    Args:
        file_path: Path of the CSV file; its first line is used as header.
        output_path: Destination of the cleaned rows (CSV, no index).

    Returns:
        (df, errors): DataFrame of the cleaned rows and the list of issue
        messages. When the file has no data rows, df is empty and nothing
        is written. On failure returns (None, [message]).
    """
    print(f"=== Parsing CSV: {file_path} ===")

    # Compiled once because the same pattern is applied to every row.
    email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')

    clean_data = []
    errors = []
    row_count = 0

    try:
        # newline='' is what the csv module asks for, so that newlines
        # embedded in quoted fields are interpreted by the reader itself.
        with open(file_path, 'r', encoding='utf-8', newline='') as file:
            reader = csv.DictReader(file)

            for idx, row in enumerate(reader, start=2):  # Start at 2 (1 is header)
                row_count += 1
                issues = []

                # Validate review_id (reported only; there is no default)
                if _is_blank(row.get('review_id')):
                    issues.append(f"Row {idx}: Missing review_id")

                # Validate name
                if _is_blank(row.get('name')):
                    issues.append(f"Row {idx}: Missing name")
                    row['name'] = 'Unknown'

                # Validate email. DictReader fills the fields of a short row
                # with None, so fall back to '' before calling strip().
                email = (row.get('email') or '').strip()
                if not email:
                    issues.append(f"Row {idx}: Missing email")
                    row['email'] = f'unknown{idx}@example.com'
                elif not email_pattern.match(email):
                    issues.append(f"Row {idx}: Invalid email format - {email}")

                # Validate region
                if _is_blank(row.get('region')):
                    issues.append(f"Row {idx}: Missing region")
                    row['region'] = 'Unknown'

                # Validate rating. The parsed integer is always stored so the
                # column holds one type instead of a mix of str and int.
                try:
                    rating = int(row.get('rating', 0))
                except (ValueError, TypeError):
                    issues.append(f"Row {idx}: Invalid rating value")
                    rating = 3  # Default to middle rating
                else:
                    if rating < 1 or rating > 5:
                        issues.append(f"Row {idx}: Rating {rating} out of range (1-5)")
                        rating = max(1, min(5, rating))  # Clamp to 1-5
                row['rating'] = rating

                # Validate comments
                if _is_blank(row.get('comments')):
                    issues.append(f"Row {idx}: Missing comments")
                    row['comments'] = 'No comment provided'

                # Validate and standardize date (None-safe, see email above)
                date_str = (row.get('review_date') or '').strip()
                standardized_date = validate_and_convert_date(date_str)
                if standardized_date:
                    row['review_date'] = standardized_date
                else:
                    issues.append(f"Row {idx}: Invalid date format - {date_str}")
                    row['review_date'] = datetime.now().strftime('%Y-%m-%d')

                # Record issues but keep the row (with corrections)
                errors.extend(issues)
                clean_data.append(row)

            # Read while the file is still open; used for the empty case.
            fieldnames = reader.fieldnames

        # Save clean data
        if clean_data:
            df = pd.DataFrame(clean_data)
            df.to_csv(output_path, index=False)
            print(f"Processed {row_count} rows")
            print(f"Saved clean data to: {output_path}")
        else:
            # Header-only or empty file: return an empty frame instead of
            # failing on an unbound variable.
            df = pd.DataFrame(columns=fieldnames)
            print("No data rows found - nothing saved")

        # Report errors
        if errors:
            print(f"\nFound {len(errors)} validation issues:")
            for error in errors[:10]:  # Show first 10 errors
                print(f"  - {error}")
            if len(errors) > 10:
                print(f"  ... and {len(errors) - 10} more issues")

        return df, errors

    except FileNotFoundError:
        print(f"Error: File not found - {file_path}")
        return None, [f"File not found: {file_path}"]
    except Exception as e:
        print(f"Error parsing CSV: {str(e)}")
        return None, [str(e)]


def _is_blank(value):
    """Return True when value is None, empty, or whitespace only."""
    return not value or not value.strip()


# =====================================================
# 2. JSON PARSER - Web Feedback with Nested Structures
# =====================================================

def parse_json_feedback(file_path, output_path='web_feedback_clean.csv'):
    """
    Parse JSON web feedback, flatten it, and save a cleaned CSV copy.

    Steps, in order:
      1. Load the file; if the top level is an object with a 'feedback' or
         'reviews' key, use that value as the record collection.
      2. Flatten nested objects with pandas.json_normalize and rename the
         columns to lower case with '.' replaced by '_'.
      3. Make sure every required column exists. A missing column is copied
         from the first existing column whose name contains the required
         name's last '_'-separated part, otherwise filled with a default.
      4. Clamp ratings, normalize feedback_date to YYYY-MM-DD (today's date
         when it cannot be parsed) and tidy the text columns.

    Args:
        file_path: Path of the JSON file.
        output_path: Destination of the cleaned records (CSV, no index).

    Returns:
        The cleaned DataFrame, or None when the file is missing, is not
        valid JSON, or any other error occurs while processing.
    """
    print(f"\n=== Parsing JSON: {file_path} ===")

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Handle different JSON structures
        if isinstance(data, dict):
            # If JSON is a single object with nested arrays
            if 'feedback' in data:
                data = data['feedback']
            elif 'reviews' in data:
                data = data['reviews']

        # Flatten nested JSON
        df = pd.json_normalize(data)

        print(f"Original columns: {list(df.columns)}")

        # Standardize column names
        # Handle nested column names (e.g., 'customer.id' -> 'customer_id')
        column_mapping = {col: col.replace('.', '_').lower() for col in df.columns}
        df = df.rename(columns=column_mapping)

        # Ensure required columns exist
        required_columns = {
            'customer_id': 0,
            'customer_email': 'unknown@example.com',
            'feedback_text': 'No feedback provided',
            'sentiment': 'unknown',
            'rating': 3,
            'feedback_date': datetime.now().strftime('%Y-%m-%d')
        }

        for col, default in required_columns.items():
            if col not in df.columns:
                # Try to find similar column
                # NOTE(review): this is a substring test, so e.g. 'id' also
                # matches a column named 'valid' - confirm that is intended.
                similar = [c for c in df.columns if col.split('_')[-1] in c]
                if similar:
                    df[col] = df[similar[0]]
                else:
                    df[col] = default

        # Validate ratings if present
        if 'rating' in df.columns:
            df['rating'] = df['rating'].apply(validate_rating)

        # Validate dates if present
        if 'feedback_date' in df.columns:
            df['feedback_date'] = df['feedback_date'].apply(
                lambda x: validate_and_convert_date(str(x)) or datetime.now().strftime('%Y-%m-%d')
            )

        # Clean text fields
        text_columns = ['feedback_text', 'comments']
        for col in text_columns:
            if col in df.columns:
                df[col] = df[col].fillna('No feedback provided')
                # Strip only real strings. The .str accessor raises on a
                # non-string column and turns non-string cells into NaN.
                df[col] = df[col].map(
                    lambda value: value.strip() if isinstance(value, str) else value
                )

        # Save to CSV
        df.to_csv(output_path, index=False)

        print(f"Processed {len(df)} records")
        print(f"Final columns: {list(df.columns)}")
        print(f"Saved to: {output_path}")
        print(f"\nSample data:")
        print(df.head())

        return df

    except FileNotFoundError:
        print(f"Error: File not found - {file_path}")
        return None
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON format - {str(e)}")
        return None
    except Exception as e:
        print(f"Error parsing JSON: {str(e)}")
        return None


# =====================================================
# 3. XML PARSER - External Reviews with Schema Validation
# =====================================================

def parse_xml_reviews(file_path, output_path='external_reviews_clean.csv'):
    """
    Parse XML external reviews, validate each one, and save a cleaned CSV.

    Review elements are looked up as './/review', then './/item', and
    finally the direct children of the root. For each review the fields
    customer_id, product_id, rating, comments and review_date are read
    (several alternative tag spellings are tried for each). Reviews are
    never dropped: bad or missing values are replaced with defaults
    (customer_id 0, product_id 1, rating 3, 'No comment provided', today's
    date). Each replacement is recorded as an issue, except for a missing
    product_id or missing comments, which are defaulted silently.

    Args:
        file_path: Path of the XML file.
        output_path: Destination of the cleaned rows (CSV, no index).

    Returns:
        (df, errors): DataFrame of cleaned reviews and the list of issue
        messages, or (None, [message]) when the file is missing, is not
        well-formed XML, or any other error occurs.
    """
    print(f"\n=== Parsing XML: {file_path} ===")

    try:
        tree = ET.parse(file_path)
        root = tree.getroot()

        print(f"Root element: {root.tag}")

        data = []
        errors = []

        # Find all review elements (handle different possible structures)
        review_elements = root.findall('.//review') or root.findall('.//item') or root

        for idx, review in enumerate(review_elements, start=1):
            row = {}
            issues = []

            # Extract customer_id
            customer_id = get_xml_text(review, ['customer_id', 'customerId', 'CustomerID'])
            if customer_id:
                try:
                    row['customer_id'] = int(customer_id)
                except ValueError:
                    issues.append(f"Review {idx}: Invalid customer_id - {customer_id}")
                    row['customer_id'] = 0
            else:
                issues.append(f"Review {idx}: Missing customer_id")
                row['customer_id'] = 0

            # Extract product_id
            product_id = get_xml_text(review, ['product_id', 'productId', 'ProductID'])
            if product_id:
                try:
                    row['product_id'] = int(product_id)
                except ValueError:
                    issues.append(f"Review {idx}: Invalid product_id - {product_id}")
                    row['product_id'] = 1
            else:
                row['product_id'] = 1  # Default product

            # Extract rating
            rating_text = get_xml_text(review, ['rating', 'Rating', 'score'])
            if not rating_text:
                # Report the substitution rather than inventing a rating
                # silently, as the CSV parser does for a missing rating.
                issues.append(f"Review {idx}: Missing rating, defaulted to 3")
                row['rating'] = 3
            else:
                try:
                    rating = int(rating_text)
                    row['rating'] = validate_rating(rating)
                    if rating != row['rating']:
                        issues.append(f"Review {idx}: Rating {rating} adjusted to {row['rating']}")
                except ValueError:
                    issues.append(f"Review {idx}: Invalid rating - {rating_text}")
                    row['rating'] = 3

            # Extract comments
            comments = get_xml_text(review, ['comments', 'comment', 'Comments', 'text'])
            row['comments'] = comments if comments else 'No comment provided'

            # Extract date; a replaced date is reported, as in the CSV parser
            date_text = get_xml_text(review, ['review_date', 'reviewDate', 'date', 'Date'])
            standardized_date = validate_and_convert_date(date_text)
            if standardized_date:
                row['review_date'] = standardized_date
            else:
                if date_text:
                    issues.append(f"Review {idx}: Invalid date format - {date_text}")
                else:
                    issues.append(f"Review {idx}: Missing review_date")
                row['review_date'] = datetime.now().strftime('%Y-%m-%d')

            # Record errors
            errors.extend(issues)
            data.append(row)

        # Create DataFrame
        df = pd.DataFrame(data)

        # Save to CSV
        df.to_csv(output_path, index=False)

        print(f"Processed {len(df)} reviews")
        print(f"Columns: {list(df.columns)}")
        print(f"Saved to: {output_path}")

        if errors:
            print(f"\nFound {len(errors)} validation issues:")
            for error in errors[:5]:
                print(f"  - {error}")
            # Make it visible that the list above was truncated
            if len(errors) > 5:
                print(f"  ... and {len(errors) - 5} more issues")

        print(f"\nSample data:")
        print(df.head())

        return df, errors

    except FileNotFoundError:
        print(f"Error: File not found - {file_path}")
        return None, [f"File not found: {file_path}"]
    except ET.ParseError as e:
        print(f"Error: Invalid XML format - {str(e)}")
        return None, [f"XML Parse Error: {str(e)}"]
    except Exception as e:
        print(f"Error parsing XML: {str(e)}")
        return None, [str(e)]


# =====================================================
# UTILITY FUNCTIONS
# =====================================================

def validate_and_convert_date(date_str):
    """
    Try to parse various date formats and return YYYY-MM-DD.

    The value is converted with str() and stripped, then matched against
    the patterns in ``formats`` in order. Order matters for ambiguous
    input: '05/10/2025' is read day-first because '%d/%m/%Y' comes before
    '%m/%d/%Y'. If no pattern matches, the value is tried as an ISO 8601
    date/time (e.g. '2025-10-05T14:30:00Z' or '2025-10-05 14:30:00').
    Only the calendar date as written is kept; no timezone conversion is
    applied.

    Args:
        date_str: The raw date value; may be None or a non-string.

    Returns:
        The date as a 'YYYY-MM-DD' string, or None when the value is
        empty, one of the placeholders 'None'/'NULL', or unparseable.
    """
    if not date_str or str(date_str).strip() in ['', 'None', 'NULL']:
        return None

    date_str = str(date_str).strip()

    # Common date formats
    formats = [
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%m/%d/%Y',
        '%Y/%m/%d',
        '%d-%m-%Y',
        '%m-%d-%Y',
        '%Y%m%d',
        '%d.%m.%Y',
        '%Y.%m.%d'
    ]

    for fmt in formats:
        try:
            dt = datetime.strptime(date_str, fmt)
            return dt.strftime('%Y-%m-%d')
        except ValueError:
            continue

    # Fallback for timestamps. A trailing 'Z' is rewritten to '+00:00'
    # because older fromisoformat() implementations reject the 'Z' suffix.
    iso_text = date_str
    if iso_text[-1] in 'Zz':
        iso_text = iso_text[:-1] + '+00:00'
    try:
        return datetime.fromisoformat(iso_text).strftime('%Y-%m-%d')
    except ValueError:
        return None


def validate_rating(rating):
    """
    Ensure rating is between 1 and 5.

    The value is converted with int() and clamped to the range 1-5.

    Args:
        rating: The raw rating; any type is accepted.

    Returns:
        An int from 1 to 5. Values that int() cannot convert (None, text,
        NaN, infinity) yield the middle rating 3.
    """
    try:
        rating = int(rating)
        return max(1, min(5, rating))
    except (ValueError, TypeError, OverflowError):
        # OverflowError is what int() raises for float infinity, which the
        # json module produces for an 'Infinity' literal.
        return 3


def get_xml_text(element, tag_names):
    """
    Try multiple tag names to find text in XML element.

    The tags are tried in the given order as direct children of
    ``element``.

    Args:
        element: The parent XML element to search.
        tag_names: Candidate child tag names, most preferred first.

    Returns:
        The stripped text of the first candidate child that holds
        non-whitespace text, or None when no candidate does.
    """
    for tag in tag_names:
        child = element.find(tag)
        if child is None or not child.text:
            continue
        text = child.text.strip()
        # A whitespace-only element counts as empty, so that a later
        # candidate tag still gets a chance to supply the value.
        if text:
            return text
    return None


# =====================================================
# MAIN EXECUTION
# =====================================================

# Runs the three parsers in sequence against the default input files and
# reports which cleaned CSV files were actually produced.
if __name__ == "__main__":
    print("=" * 60)
    print("DATA PARSING PIPELINE - ETL Process")
    print("=" * 60)

    # 1. Parse CSV
    csv_df, csv_errors = parse_csv_survey('customer_survey.csv')

    # 2. Parse JSON
    json_df = parse_json_feedback('web_feedback.json')

    # 3. Parse XML
    xml_df, xml_errors = parse_xml_reviews('external_reviews.xml')

    print("\n" + "=" * 60)
    print("PARSING COMPLETE")
    print("=" * 60)
    print("\nGenerated files:")

    # Each parser returns None instead of a DataFrame when it fails, so
    # only list an output whose parser succeeded and whose file exists.
    outputs = [
        (csv_df, 'customer_survey_clean.csv'),
        (json_df, 'web_feedback_clean.csv'),
        (xml_df, 'external_reviews_clean.csv'),
    ]
    generated = [
        path for frame, path in outputs
        if frame is not None and os.path.exists(path)
    ]
    for path in generated:
        print(f" {path}")
    if not generated:
        print(" (none - see the errors reported above)")

    print("\nNext step: Load these files into MySQL using LOAD DATA INFILE")

DATA PARSING PIPELINE - ETL Process
=== Parsing CSV: customer_survey.csv ===
Processed 3 rows
Saved clean data to: customer_survey_clean.csv

Found 3 validation issues:
  - Row 2: Missing review_id
  - Row 3: Missing review_id
  - Row 4: Missing review_id

=== Parsing JSON: web_feedback.json ===
Original columns: ['customer_id', 'rating', 'comments']
Processed 3 records
Final columns: ['customer_id', 'rating', 'comments', 'customer_email', 'feedback_text', 'sentiment', 'feedback_date']
Saved to: web_feedback_clean.csv

Sample data:
   customer_id  rating                  comments       customer_email  \
0            1       5        Excellent service!  unknown@example.com   
1            2       4  Good experience overall.  unknown@example.com   
2            3       2            Late delivery.  unknown@example.com   

          feedback_text sentiment feedback_date  
0  No feedback provided   unknown    2025-10-05  
1  No feedback provided   unknown    2025-10-05  
2  No feedback provided   unknown    2025-10-05  

**Part 3: Data Integration**

1 - Import data from multiple formats:

*   Extract XML data (e.g., 'external_reviews.xml') into structured tables using XML parsing tools.


In [3]:
import xml.etree.ElementTree as ET
import pandas as pd

# Read every <review> that is a direct child of the root element and turn
# it into one table row.
tree = ET.parse("external_reviews.xml")
root = tree.getroot()

# Output column name -> child tag that supplies its value
FIELD_TAGS = {
    "CustomerID": "customer_id",
    "Rating": "rating",
    "Comments": "comments",
}

data = []
for review in root.findall("review"):
    row = {}
    for column, tag in FIELD_TAGS.items():
        node = review.find(tag)
        # find() returns None for an absent child; store None for that
        # field rather than failing on `.text` and losing the whole file.
        row[column] = node.text if node is not None else None
    data.append(row)

df = pd.DataFrame(data)

#Export the parsed XML file to CSV to be able to use it in workbench
df.to_csv('external_reviews.csv', index=False)
print(df)

  CustomerID Rating                  Comments
0          1      5        Excellent service!
1          2      4  Good experience overall.
2          3      2            Late delivery.
