In [None]:
# Install Java 8 (headless JDK). Output is silenced so the notebook is not
# flooded with apt repository listings; errors still reach stderr.
!apt-get update -qq > /dev/null
!apt-get install -y -qq openjdk-8-jdk-headless > /dev/null

# Download and unpack Hadoop 3.3.6 into /usr/local/hadoop.
# - archive.apache.org keeps every release permanently, while
#   downloads.apache.org only carries the currently supported ones, so a
#   pinned version eventually disappears from there.
# - The step is skipped when /usr/local/hadoop already exists, so re-running
#   the cell neither downloads a duplicate tarball nor moves a second copy of
#   the tree *inside* the existing install directory.
# - `&&` stops the chain at the first failing command.
![ -d /usr/local/hadoop ] || ( \
    wget -q -O hadoop-3.3.6.tar.gz https://archive.apache.org/dist/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz \
    && tar -xzf hadoop-3.3.6.tar.gz \
    && mv hadoop-3.3.6 /usr/local/hadoop )



Hit:1 https://cli.github.com/packages stable InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,085 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,812 kB]
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [

In [None]:
# Set environment variables so that `hadoop`/`hdfs` and the JDK tools are found
# by every later `!` and `%%bash` cell (child processes inherit os.environ).
import os

JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
HADOOP_HOME = "/usr/local/hadoop"

os.environ["JAVA_HOME"] = JAVA_HOME
os.environ["HADOOP_HOME"] = HADOOP_HOME

# Prepend the tool directories only if they are not on PATH yet, so running
# this cell several times does not keep growing PATH with duplicates.
path_entries = os.environ["PATH"].split(os.pathsep)
tool_dirs = [f"{HADOOP_HOME}/bin", f"{JAVA_HOME}/bin"]
missing_dirs = [d for d in tool_dirs if d not in path_entries]
os.environ["PATH"] = os.pathsep.join(missing_dirs + path_entries)

# Sanity check: prints the Hadoop version if the installation is usable.
!hadoop version


Hadoop 3.3.6
Source code repository https://github.com/apache/hadoop.git -r 1be78238728da9266a4f88195058f08fd012bf9c
Compiled by ubuntu on 2023-06-18T08:22Z
Compiled on platform linux-x86_64
Compiled with protoc 3.7.1
From source with checksum 5652179ad55f76cb287d9c633bb53bbd
This command was run using /usr/local/hadoop/share/hadoop/common/hadoop-common-3.3.6.jar


In [None]:
%%bash
# Write the streaming mapper to mapper.py. The heredoc delimiter is quoted
# ('EOF') so bash copies the Python source verbatim, without expanding anything.
cat > mapper.py <<'EOF'
#!/usr/bin/env python3
"""Hadoop Streaming mapper.

Reads CSV records of the form ``timestamp,temperature`` from stdin, where the
timestamp starts with the date followed by whitespace (e.g.
``2025-10-01 06:00,25``), and writes ``date<TAB>temperature`` to stdout.

Blank lines are ignored. Records that cannot be parsed (a header row, a
missing column, a non-integer temperature) are reported on stderr and skipped
instead of aborting the whole task.
"""
import sys

for raw in sys.stdin:
    record = raw.strip()
    if not record:
        continue
    try:
        fields = record.split(',')
        date = fields[0].split()[0]  # keep the date, drop the time of day
        temp = int(fields[1])
    except (IndexError, ValueError):
        # IndexError: no comma / empty timestamp; ValueError: non-integer reading
        print(f"mapper: skipping malformed record: {record!r}", file=sys.stderr)
        continue
    print(f"{date}\t{temp}")
EOF

# Streaming launches the script directly, so it must be executable.
chmod +x mapper.py
# NOTE: this is still the same bash script as the lines above. A second
# `%%bash` must not appear here: only the first line of a cell is a cell magic;
# further down bash parses `%%` as a job spec and prints "fg: no job control".

# Write the streaming combiner to combiner.py (quoted heredoc: copied verbatim).
cat > combiner.py <<'EOF'
#!/usr/bin/env python3
"""Hadoop Streaming combiner.

Reads ``date<TAB>value`` lines from stdin and writes one
``date<TAB>max,min`` line per date, in first-seen order.

``value`` may be a single temperature (``25``) or an already combined
``max,min`` pair (``30,25``). Accepting both means the combiner can consume
its own output, which is required because Hadoop may apply a combiner zero,
one or several times to the same data.
"""
import sys

extremes = {}  # date -> [highest, lowest] seen so far

for line in sys.stdin:
    line = line.strip()
    if not line:
        continue
    date, value = line.split("\t")
    readings = [int(v) for v in value.split(",")]
    high, low = max(readings), min(readings)
    if date in extremes:
        current = extremes[date]
        current[0] = max(current[0], high)
        current[1] = min(current[1], low)
    else:
        extremes[date] = [high, low]

for date, (high, low) in extremes.items():
    print(f"{date}\t{high},{low}")
EOF

# Streaming launches the script directly, so it must be executable.
chmod +x combiner.py


bash: line 16: fg: no job control


In [None]:
%%bash
# Write the streaming reducer to reducer.py. The heredoc delimiter is quoted
# ('EOF') so bash copies the Python source verbatim.
cat > reducer.py <<'EOF'
#!/usr/bin/env python3
"""Hadoop Streaming reducer.

Reads ``date<TAB>value`` lines from stdin and writes one
``date<TAB>Max: <max>, Min: <min>`` line per date, in first-seen order.

``value`` may be a ``max,min`` pair (``30,25``) or a single temperature
(``25``). Both are accepted because Hadoop does not guarantee that a
combiner runs, so the reducer can receive lines that were never combined.
"""
import sys

extremes = {}  # date -> [highest, lowest] seen so far

for line in sys.stdin:
    line = line.strip()
    if not line:
        continue
    date, value = line.split("\t")
    readings = [int(v) for v in value.split(',')]
    high, low = max(readings), min(readings)
    if date in extremes:
        current = extremes[date]
        current[0] = max(current[0], high)
        current[1] = min(current[1], low)
    else:
        extremes[date] = [high, low]

for date, (overall_max, overall_min) in extremes.items():
    print(f"{date}\tMax: {overall_max}, Min: {overall_min}")
EOF

# Streaming launches the script directly, so it must be executable.
chmod +x reducer.py


In [None]:
%%bash
# Create a small sample dataset for the job: one "timestamp,temperature"
# record per line, three readings for each of two days.
csv_path=input/weather.csv
mkdir -p "$(dirname "$csv_path")"
cat <<'CSV' > "$csv_path"
2025-10-01 06:00,25
2025-10-01 12:00,30
2025-10-01 18:00,28
2025-10-02 06:00,24
2025-10-02 12:00,32
2025-10-02 18:00,29
CSV


In [None]:
# Remove the results of a previous run. `-f` keeps the command quiet and
# successful when `output` does not exist yet (e.g. the first run on a fresh
# runtime), so the cell works under Restart & Run All.
!hdfs dfs -rm -r -f output


2025-10-21 16:27:51,802 INFO Configuration.deprecation: io.bytes.per.checksum is deprecated. Instead, use dfs.bytes-per-checksum
Deleted output


In [None]:
# Run the Hadoop Streaming job: mapper.py -> combiner.py -> reducer.py over the
# sample CSV, writing the result files into ./output.
# The old output directory is deleted first so the cell can be re-run.
!rm -rf output
!hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar \
    -mapper mapper.py \
    -combiner combiner.py \
    -reducer reducer.py \
    -input input/weather.csv \
    -output output


2025-10-21 16:28:12,002 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2025-10-21 16:28:12,113 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2025-10-21 16:28:12,113 INFO impl.MetricsSystemImpl: JobTracker metrics system started
2025-10-21 16:28:12,137 WARN impl.MetricsSystemImpl: JobTracker metrics system already initialized!
2025-10-21 16:28:12,430 INFO mapred.FileInputFormat: Total input files to process : 1
2025-10-21 16:28:12,454 INFO mapreduce.JobSubmitter: number of splits:1
2025-10-21 16:28:12,669 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local2057063979_0001
2025-10-21 16:28:12,669 INFO mapreduce.JobSubmitter: Executing with tokens: []
2025-10-21 16:28:12,883 INFO mapreduce.Job: The url to track the job: http://localhost:8080/
2025-10-21 16:28:12,887 INFO mapreduce.Job: Running job: job_local2057063979_0001
2025-10-21 16:28:12,888 INFO mapred.LocalJobRunner: OutputCommitter set in config null
2025-10

In [None]:
# Show the job result. The glob prints every reducer output file, so the cell
# still shows the complete result if the job is configured with more than one
# reducer; with a single reducer it prints exactly output/part-00000.
!cat output/part-*


2025-10-01	Max: 30, Min: 25
2025-10-02	Max: 32, Min: 24
