# HW03: Python Fundamentals

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import sys
import os

# Add the sibling ../src directory (relative to the working directory) to the
# module search path so modules stored there can be imported.
sys.path.append('../src')

## 1. NumPy Operations

This section demonstrates array creation, elementwise operations, and a timing comparison of loop versus vectorized execution.

In [None]:
# Integers 1 through 10 as a one-dimensional array.
data = np.arange(1, 11)

# Elementwise transformations: each one yields a new array of the same shape.
squared = np.square(data)
doubled = 2 * data
log_data = np.log(data)

# Basic attributes of the input array.
print(f"Original array: {data}")
print(f"Array shape: {data.shape}")
print(f"Array dtype: {data.dtype}")

# Results of the elementwise operations.
print(f"Squared: {squared}")
print(f"Doubled: {doubled}")
print(f"Natural log: {log_data}")

# Whole-array reductions. std() uses NumPy's default ddof=0, i.e. the
# population standard deviation.
print(f"\nMean: {data.mean()}")
print(f"SD: {data.std()}")
print(f"Sum: {data.sum()}")
print(f"Max: {data.max()}")
print(f"Min: {data.min()}")

In [None]:
# Random input shared by both implementations so the timings are comparable.
large_array = np.random.rand(100000)

# time.perf_counter() is the clock intended for measuring short intervals:
# it is monotonic and uses the highest resolution available. time.time() is a
# wall clock that can be too coarse to time the vectorized expression and may
# jump if the system clock is adjusted.
start_time = time.perf_counter()
loop_result = []
# Deliberately an explicit Python-level loop: this is the slow baseline that
# the vectorized expression below is compared against.
for value in large_array:
    loop_result.append(value ** 2 + 2 * value)
loop_result = np.array(loop_result)
loop_time = time.perf_counter() - start_time

# Same computation expressed as a single whole-array expression.
start_time = time.perf_counter()
vectorized_result = large_array ** 2 + 2 * large_array
vectorized_time = time.perf_counter() - start_time

# A timing comparison is only meaningful if both approaches compute the same
# values, so verify that before reporting.
results_match = bool(np.allclose(loop_result, vectorized_result))

print(f"Loop execution time: {loop_time:.6f}s")
print(f"Vectorized execution time: {vectorized_time:.6f}s")
print(f"Results match: {results_match}")

## 2. Dataset Loading

In [None]:
# Load the starter dataset; the path is resolved relative to the current
# working directory.
df = pd.read_csv('../starter_data.csv')

print(f"Dataset shape: {df.shape}")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line after
# the report.
df.info()

## 3. Summary Statistics

In [None]:
# Descriptive statistics for the loaded DataFrame. With its default arguments,
# pandas' describe() reports on the numeric columns when the frame has any.
numeric_summary = df.describe()
print(numeric_summary)

In [None]:
# Select the gold price column, grouped by the values of the market column.
gold_by_market = df.groupby('market')['gold_price']

# One row per market, one column per statistic (in the order listed).
market_stats = gold_by_market.agg(
    ['count', 'mean', 'median', 'std', 'min', 'max', 'sum']
)

print(market_stats)

## 4. Save Outputs

In [None]:
# Headline statistics for the gold price column plus the per-market table
# converted to a plain dict.
# NOTE(review): summary_data is built here but is not written to disk in this
# cell -- confirm whether it is consumed elsewhere or should also be saved.
summary_data = {
    'overall_stats': {
        'total_records': len(df),
        'mean_gold_price': df['gold_price'].mean(),
        'median_gold_price': df['gold_price'].median(),
        'std_gold_price': df['gold_price'].std(),
        'min_gold_price': df['gold_price'].min(),
        'max_gold_price': df['gold_price'].max(),
        'total_sum': df['gold_price'].sum()
    },
    'market_breakdown': market_stats.to_dict()
}

# to_csv() does not create missing parent directories, so make sure the output
# folder exists first; exist_ok=True keeps re-running the cell harmless.
summary_path = '../data/processed/summary.csv'
os.makedirs(os.path.dirname(summary_path), exist_ok=True)
market_stats.to_csv(summary_path)

## 5. Reusable Functions

Creating utility functions for data analysis tasks.

In [None]:
def get_summary_stats(df: pd.DataFrame, numeric_col: str = 'gold_price',
                      group_col: str = 'market') -> dict:
    """Summarize one numeric column, both overall and per group.

    Args:
        df: DataFrame containing ``numeric_col`` and ``group_col``.
        numeric_col: Name of the column to summarize.
        group_col: Name of the column whose values define the groups.

    Returns:
        A dict with two entries:

        * ``'overall'`` -- dict with keys ``count``, ``mean``, ``median``,
          ``std``, ``min``, ``max`` and ``sum`` computed over the whole
          column (values are not rounded).
        * ``'by_group'`` -- DataFrame indexed by the values of ``group_col``
          with the same statistics as columns, rounded to 2 decimals.

    Raises:
        KeyError: If ``numeric_col`` or ``group_col`` is not a column of
            ``df``.
    """
    values = df[numeric_col]

    overall_stats = {
        # Count non-missing values only, matching the pandas 'count'
        # aggregation used per group below and the NaN-skipping behaviour of
        # the other statistics. len(df) would also count rows whose value is
        # missing, making the overall count disagree with the group counts.
        'count': int(values.count()),
        'mean': values.mean(),
        'median': values.median(),
        'std': values.std(),
        'min': values.min(),
        'max': values.max(),
        'sum': values.sum()
    }

    grouped_stats = df.groupby(group_col)[numeric_col].agg([
        'count', 'mean', 'median', 'std', 'min', 'max', 'sum'
    ]).round(2)

    return {
        'overall': overall_stats,
        'by_group': grouped_stats
    }