# HW03: Python Fundamentals

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import sys
import os

# Add src directory to path for importing utility functions
sys.path.append('../src')

## 1. NumPy Operations

This section demonstrates array creation and elementwise operations, then compares the execution time of a Python loop with that of the equivalent vectorized expression.

In [3]:
# Build the integers 1..10 as a 1-D array and report its basic properties.
data = np.arange(1, 11)
print(f"Original array: {data}")
print(f"Array shape: {data.shape}")
print(f"Array dtype: {data.dtype}")

# Elementwise transforms: each one yields a new array of the same length.
squared = np.square(data)
doubled = 2 * data
log_data = np.log(data)

print(f"Squared: {squared}")
print(f"Doubled: {doubled}")
print(f"Natural log: {log_data}")

# Whole-array reductions (std is NumPy's default, i.e. ddof=0).
print(f"\nMean: {data.mean()}")
print(f"SD: {data.std()}")
print(f"Sum: {data.sum()}")
print(f"Max: {data.max()}")
print(f"Min: {data.min()}")

Original array: [ 1  2  3  4  5  6  7  8  9 10]
Array shape: (10,)
Array dtype: int64
Squared: [  1   4   9  16  25  36  49  64  81 100]
Doubled: [ 2  4  6  8 10 12 14 16 18 20]
Natural log: [0.         0.69314718 1.09861229 1.38629436 1.60943791 1.79175947
 1.94591015 2.07944154 2.19722458 2.30258509]

Mean: 5.5
SD: 2.8722813232690143
Sum: 55
Max: 10
Min: 1


In [5]:
# Benchmark: evaluate x**2 + 2*x over 100,000 values, once with an explicit
# Python loop and once with a single vectorized NumPy expression.
# A seeded Generator makes the input (and therefore both result arrays)
# identical on every "Restart & Run All".
large_array = np.random.default_rng(42).random(100000)

# Method 1: Using loop
# time.perf_counter() is monotonic and uses the highest-resolution clock
# available, so very short intervals are not rounded to coarse clock ticks
# the way time.time() differences can be.
start_time = time.perf_counter()
loop_result = []
# Deliberately naive element-by-element baseline for the comparison.
for i in range(len(large_array)):
    loop_result.append(large_array[i] ** 2 + 2 * large_array[i])
loop_result = np.array(loop_result)
loop_time = time.perf_counter() - start_time

# Method 2: Using vectorized operations
start_time = time.perf_counter()
vectorized_result = large_array ** 2 + 2 * large_array
vectorized_time = time.perf_counter() - start_time

# The timing comparison is only meaningful if both methods agree.
assert np.allclose(loop_result, vectorized_result)

# results
print(f"Loop execution time: {loop_time:.6f}s")
print(f"Vectorized execution time: {vectorized_time:.6f}s")

Loop execution time: 0.032525s
Vectorized execution time: 0.001000s


## 2. Dataset Loading

In [7]:
# Load the starter dataset; the path is relative to the notebook's location.
df = pd.read_csv('../starter_data.csv')

print(f"Dataset shape: {df.shape}")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()

Dataset shape: (10, 3)
  category  value        date
0        A     10  2025-08-01
1        B     15  2025-08-02
2        A     12  2025-08-03
3        B     18  2025-08-04
4        C     25  2025-08-05
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  10 non-null     object
 1   value     10 non-null     int64 
 2   date      10 non-null     object
dtypes: int64(1), object(2)
memory usage: 368.0+ bytes
None


## 3. Summary Statistics

In [8]:
# Descriptive statistics of df via describe(); for this dataset the result
# covers the `value` column (count, mean, std, min, quartiles, max).
numeric_summary = df.describe()
print(numeric_summary)

           value
count  10.000000
mean   17.600000
std     7.381659
min    10.000000
25%    12.250000
50%    14.500000
75%    23.250000
max    30.000000


In [11]:
# Per-category aggregates of the `value` column: one row per category,
# one column per statistic.
category_stats = (
    df.groupby('category')['value']
      .aggregate(['count', 'mean', 'median', 'std', 'min', 'max', 'sum'])
)

print(category_stats)

          count       mean  median       std  min  max  sum
category                                                   
A             4  11.500000    11.5  1.290994   10   13   46
B             3  15.666667    15.0  2.081666   14   18   47
C             3  27.666667    28.0  2.516611   25   30   83


## 4. Save Outputs

In [None]:
# Collect headline statistics for the whole dataset together with the
# per-category table (as a nested dict).
# NOTE(review): summary_data is not written anywhere in this cell; only the
# per-category table is saved below -- confirm whether it is consumed later.
summary_data = {
    'overall_stats': {
        'total_records': len(df),
        'mean_value': df['value'].mean(),
        'median_value': df['value'].median(),
        'std_value': df['value'].std(),
        'min_value': df['value'].min(),
        'max_value': df['value'].max(),
        'total_sum': df['value'].sum()
    },
    'category_breakdown': category_stats.to_dict()
}

# Save to CSV
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; exist_ok=True keeps the cell safe to re-run.
summary_csv_path = '../data/processed/summary.csv'
os.makedirs(os.path.dirname(summary_csv_path), exist_ok=True)
category_stats.to_csv(summary_csv_path)

## 5. Reusable Functions

Creating utility functions for data analysis tasks.

In [12]:
# Define utility functions
def get_summary_stats(df, numeric_col='value', group_col='category'):
    """Summarise one numeric column of a DataFrame, overall and per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both ``numeric_col`` and ``group_col``.
    numeric_col : str, default 'value'
        Column whose values are summarised.
    group_col : str, default 'category'
        Column whose distinct values define the groups.

    Returns
    -------
    dict
        ``'overall'``: dict with keys count, mean, median, std, min, max and
        sum for the whole column. ``count`` is the number of rows in ``df``
        and the values are not rounded.
        ``'by_group'``: DataFrame indexed by ``group_col`` with the same
        seven statistics as columns, rounded to 2 decimals.
    """
    reducers = ['mean', 'median', 'std', 'min', 'max', 'sum']
    column = df[numeric_col]

    # Whole-column figures: row count first, then each reducer in order.
    overall = {'count': len(df)}
    for stat_name in reducers:
        overall[stat_name] = getattr(column, stat_name)()

    # Same statistics computed separately for every distinct group value.
    per_group = df.groupby(group_col)[numeric_col]
    grouped = per_group.agg(['count'] + reducers).round(2)

    return {'overall': overall, 'by_group': grouped}