In [10]:
import json
import re
import string
import time
import xml.etree.ElementTree as ET
from collections import Counter

import numpy as np
import pandas as pd

# Lab banner: the title framed above and below by a 60-character rule.
print("-"*60, "Lab 2:  DATA VARIETY EXPLORATION", "-"*60, sep="\n")


------------------------------------------------------------
Lab 2:  DATA VARIETY EXPLORATION
------------------------------------------------------------


In [11]:
# Input file for each data format handled by the process_* functions,
# keyed by format name.
file_paths = dict(
    json="employees.json",
    xml="employees.xml",
    csv="employees.csv",
    text="bigdata_text.txt",
)

def process_csv_data(path=None):
    """Load the structured (CSV) data and print a short summary.

    Prints the number of records, the first rows, the mean of the
    ``salary`` column and the elapsed time.

    Args:
        path: CSV file to read. When omitted, ``file_paths["csv"]`` is used.

    Returns:
        tuple: ``(df, processing_time)`` - the loaded DataFrame and the
        elapsed time in seconds.

    Raises:
        KeyError: if the file has no ``salary`` column.
    """
    print("PROCESSING CSV DATA (Structured)")
    print("-"*40)

    if path is None:
        path = file_paths["csv"]

    # perf_counter is a monotonic, high-resolution clock meant for
    # measuring short durations; time.time() can jump with clock changes.
    start_time = time.perf_counter()
    df = pd.read_csv(path)

    # Report the actual row count; the label used to be printed on its own.
    print(f"CSV Records Loaded: {len(df)}")
    print("\nSample data: ")
    print(df.head())
    print(f"\nAverage Salary: {df['salary'].mean():.2f}")
    processing_time = time.perf_counter() - start_time
    print(f"Processing Time: {processing_time:.2f} seconds")
    return df, processing_time


In [12]:
def process_json_data(path=None):
    """Load the semi-structured (JSON) data and print a short summary.

    The document must have a top-level ``"employees"`` list whose items
    carry ``name``, ``city`` and ``salary``. Each item is flattened into
    one row, with its ``skills`` list reduced to a ``skill_count``.

    Args:
        path: JSON file to read. When omitted, ``file_paths["json"]`` is used.

    Returns:
        tuple: ``(df, processing_time)`` - the flattened DataFrame and the
        elapsed time in seconds.

    Raises:
        KeyError: if ``"employees"`` or a required employee field is missing.
    """
    print("PROCESSING JSON DATA (Semi-Structured)")
    print("-"*40)

    if path is None:
        path = file_paths["json"]

    # perf_counter is the clock intended for measuring short durations.
    start_time = time.perf_counter()
    # Decode as UTF-8 explicitly instead of relying on the locale default.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = [
        {
            "name": emp["name"],
            "city": emp["city"],
            "salary": emp["salary"],
            # A missing or null "skills" entry counts as zero skills,
            # matching how the XML loader treats an absent <skills> element.
            "skill_count": len(emp.get("skills") or []),
        }
        for emp in data["employees"]
    ]

    df = pd.DataFrame(records)
    # Report the actual row count; the label used to be printed on its own.
    print(f"JSON Records Loaded: {len(df)}")
    print("\nSample data: ")
    print(df.head())
    print(f"\nAverage Salary: {df['salary'].mean():.2f}")
    processing_time = time.perf_counter() - start_time
    print(f"Processing Time: {processing_time:.2f} seconds")
    return df, processing_time


In [23]:
def process_xml_data(path=None):
    """Load the semi-structured (XML) data and print a short summary.

    Every ``<employee>`` child of the root becomes one row holding its
    ``name``, ``city``, ``salary`` (as float) and the number of
    ``<skill>`` elements under ``<skills>`` (0 when ``<skills>`` is absent).

    Args:
        path: XML file to read. When omitted, ``file_paths["xml"]`` is used.

    Returns:
        tuple: ``(df, processing_time)`` - the flattened DataFrame and the
        elapsed time in seconds.

    Raises:
        AttributeError: if an employee lacks ``name``, ``city`` or ``salary``.
        ValueError: if a ``salary`` text is not a valid number.
    """
    print("PROCESSING XML DATA (Semi-Structured)")
    print("-"*40)

    if path is None:
        path = file_paths["xml"]

    # perf_counter is the clock intended for measuring short durations.
    start_time = time.perf_counter()
    # Give ElementTree the path so it reads the bytes itself and honours the
    # encoding in the XML declaration; a text-mode handle would decode with
    # the locale's default encoding first.
    data = ET.parse(path)
    root = data.getroot()

    records = []
    for emp in root.findall("employee"):
        skills_elem = emp.find("skills")
        skill_count = len(skills_elem.findall("skill")) if skills_elem is not None else 0
        records.append(
            {
                "name": emp.find("name").text,
                "city": emp.find("city").text,
                "salary": float(emp.find("salary").text),
                "skill_count": skill_count
            }
        )

    df = pd.DataFrame(records)
    # Report the actual row count; the label used to be printed on its own.
    print(f"XML Records Loaded: {len(df)}")
    print("\nSample data: ")
    print(df.head())
    print(f"\nAverage Salary: {df['salary'].mean():.2f}")
    processing_time = time.perf_counter() - start_time
    print(f"Processing Time: {processing_time:.2f} seconds")
    return df, processing_time


In [24]:
def process_text_data(path=None):
    """Analyse the unstructured text data and print summary statistics.

    Prints the total token count, the number of distinct words, the number
    of sentences and the five most frequent words longer than two
    characters, followed by the elapsed time.

    Args:
        path: Text file to read. When omitted, ``file_paths["text"]`` is used.

    Returns:
        None. All results are printed.
    """
    print("PROCESSING TEXT DATA (Unstructured)")
    print("-"*40)

    if path is None:
        path = file_paths["text"]

    # perf_counter is the clock intended for measuring short durations.
    start_time = time.perf_counter()
    # Decode as UTF-8 explicitly instead of relying on the locale default.
    with open(path, "r", encoding="utf-8") as f:
        text_content = f.read()

    # Total words = whitespace-separated tokens, as before.
    raw_tokens = text_content.lower().split()
    words_count = len(raw_tokens)

    # Strip surrounding punctuation once and reuse the cleaned words for both
    # the unique count and the frequency table, so "data." and "data" are the
    # same word in both. Tokens that were only punctuation are dropped.
    clean_words = [
        word
        for word in (token.strip(string.punctuation) for token in raw_tokens)
        if word
    ]
    unique_words = len(set(clean_words))

    # A sentence ends at '.', '!' or '?'; empty fragments are ignored.
    sentences = [s.strip() for s in re.split(r"[.!?]+", text_content) if s.strip()]

    # Words of one or two characters are left out of the ranking.
    word_freq = Counter(word for word in clean_words if len(word) > 2)
    top_words = word_freq.most_common(5)

    print("Text Analysis Complete")
    print(f"Total Words: {words_count}")
    print(f"Unique Words: {unique_words}")
    print(f"Total Sentences: {len(sentences)}")
    print("Top 5 Words:")
    for word, freq in top_words:
        print(f"  {word}: {freq}")
    processing_time = time.perf_counter() - start_time
    print(f"Processing Time: {processing_time:.2f} seconds")


In [25]:
# Run every loader in order: CSV (structured), then JSON and XML
# (semi-structured), then plain text (unstructured). The first three return
# a (DataFrame, elapsed-seconds) pair, which is kept for later use.
df_csv, csv_time = process_csv_data()
df_json, json_time = process_json_data()
df_xml, xml_time = process_xml_data()
# The text analysis only prints its results; it returns nothing to keep.
process_text_data()

PROCESSING CSV DATA (Structured)
----------------------------------------
CSV Records Loaded: 

Sample data: 
            name  age         city  salary
0       John Doe   30     New York   75000
1     Jane Smith   25  Los Angeles   68000
2    Bob Johnson   35      Chicago   82000
3    Alice Brown   28      Houston   71000
4  Charlie Davis   32      Phoenix   76000

Average Salary: 74150.00
Processing Time: 0.01 seconds
PROCESSING JSON DATA (Semi-Structured)
----------------------------------------
JSON Records Loaded: 

Sample data: 
            name         city  salary  skill_count
0       John Doe     New York   75000            3
1     Jane Smith  Los Angeles   68000            3
2    Bob Johnson      Chicago   82000            3
3    Alice Brown      Houston   71000            3
4  Charlie Davis      Phoenix   76000            3

Average Salary: 74400.00
Processing Time: 0.00 seconds
PROCESSING XML DATA (Semi-Structured)
----------------------------------------
XML Records Loaded