In [23]:
import csv

def average_height(csv_file_path):
    """Return the arithmetic mean of the "height" column of a CSV file.

    The file is opened as UTF-8, its first line is treated as the header,
    and records are consumed one at a time, so the whole file is never held
    in memory.

    Args:
        csv_file_path: Path of the CSV file to read. It must have a
            "height" column whose values are accepted by float().

    Returns:
        The mean height as a float, or 0.0 when the file has no data rows.

    Raises:
        KeyError: If the rows have no "height" key.
        ValueError: If a height value is text that float() cannot parse.
    """
    height_sum = 0.0
    n_records = 0

    with open(csv_file_path, "r", encoding="utf-8") as handle:
        # enumerate(start=1) leaves n_records equal to the number of rows read;
        # it stays 0 when the reader yields nothing.
        for n_records, record in enumerate(csv.DictReader(handle), start=1):
            height_sum += float(record["height"])

    # Avoid dividing by zero for a header-only (or completely empty) file.
    return height_sum / n_records if n_records else 0.0


# Example usage: compute the mean of the "height" column over every data row
# of people_1M.csv (a relative path, so it is resolved against the current
# working directory) and print it rounded to one decimal place.
avg = average_height("people_1M.csv")
print("Average height:", round(avg, 1))


Average height: 177.5


In [26]:
import random

def average_height_random_n(csv_file_path, n, repeats=10, seed=None):
    """Estimate the mean "height" from repeated random samples of CSV rows.

    Each run draws ``n`` distinct data rows uniformly at random (without
    replacement), streams the file once to pick those rows up, and averages
    their "height" values. The file is never loaded fully into memory.

    Args:
        csv_file_path: Path of a UTF-8 CSV file with a header row and a
            "height" column whose values are accepted by float().
        n: Number of rows to sample in each run; must be between 1 and the
            number of data records in the file.
        repeats: Number of independent sampling runs; must be at least 1.
        seed: Optional seed for reproducible sampling. When None (default)
            the module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call is still honoured.

    Returns:
        A tuple ``(averages, overall_average)`` where ``averages`` is the
        list of per-run means, each rounded to one decimal place, and
        ``overall_average`` is the mean of the *unrounded* per-run means,
        rounded to one decimal place.

    Raises:
        ValueError: If ``n`` or ``repeats`` is not positive, or if ``n``
            exceeds the number of data records in the file.
        KeyError: If the rows have no "height" key.
    """
    if n <= 0:
        raise ValueError("n must be a positive integer")
    if repeats <= 0:
        raise ValueError("repeats must be a positive integer")

    # Count records once, outside the repeat loop, and count them with the
    # same reader that is used for selection below. Counting physical lines
    # instead would over-count blank lines and quoted multi-line fields,
    # letting sampled indices point past the last real record.
    with open(csv_file_path, "r", encoding="utf-8", newline="") as f:
        total_rows = sum(1 for _ in csv.DictReader(f))

    if n > total_rows:
        raise ValueError("n is larger than number of rows in CSV")

    # Fall back to the shared module-level generator when no seed is given.
    rng = random if seed is None else random.Random(seed)

    exact_averages = []
    for _ in range(repeats):
        # 1-based record numbers of the rows that make up this run's sample.
        selected_indices = set(rng.sample(range(1, total_rows + 1), n))

        heights = []
        with open(csv_file_path, "r", encoding="utf-8", newline="") as f:
            for i, row in enumerate(csv.DictReader(f), start=1):
                if i in selected_indices:
                    heights.append(float(row["height"]))
                    # Stop reading as soon as the whole sample is collected.
                    if len(heights) == n:
                        break

        exact_averages.append(sum(heights) / len(heights))

    # Round only for presentation; the overall mean is taken over the
    # unrounded per-run means so rounding error does not accumulate.
    averages = [round(a, 1) for a in exact_averages]
    overall_average = sum(exact_averages) / repeats

    return averages, round(overall_average, 1)

# Sample 1000 rows per run from people_1M.csv, using the function's default
# number of runs (repeats=10), then print each run's average and the overall
# average. No random seed is set in this cell, so the printed numbers are
# expected to differ from one execution to the next.
n = 1000
avgs, final_avg = average_height_random_n("people_1M.csv", n)

print("Averages from each run:", avgs)
print("Overall average:", final_avg)

Averages from each run: [177.3, 177.2, 176.2, 176.6, 177.2, 177.9, 177.6, 177.6, 177.0, 177.7]
Overall average: 177.2
