In [1]:
# ==============================
# STEP 1: Install Java & Hadoop
# ==============================
!apt-get update -y
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download Hadoop 3.3.6
!wget -q https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz
!tar -xzf hadoop-3.3.6.tar.gz
!mv hadoop-3.3.6 /usr/local/hadoop

# ==============================
# STEP 2: Setup Hadoop Env
# ==============================
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["HADOOP_HOME"] = "/usr/local/hadoop"
os.environ["PATH"] = f"{os.environ['HADOOP_HOME']}/bin:{os.environ['JAVA_HOME']}/bin:" + os.environ["PATH"]

!hadoop version


0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://cli.github.com/packages stable/main amd64 Packages [343 B]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,827 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,858 kB]
Ge

In [2]:

!mkdir input_data

with open("input_data/input1.txt", "w") as f:
    f.write("Hadoop is an open-source framework used for processing large data\n")

with open("input_data/input2.txt", "w") as f:
    f.write("It distributes data across multiple nodes to enable faster computation\n")

!cat input_data/*


Hadoop is an open-source framework used for processing large data
It distributes data across multiple nodes to enable faster computation


In [4]:
%%bash
# ==============================
# STEP 4: Write the streaming scripts
# ==============================
# Each script is written through a quoted heredoc ('EOF'), so the shell performs
# no expansion on the Python source.

cat > mapper.py <<'EOF'
#!/usr/bin/env python3
"""Mapper: emit "<word><TAB>1" for every whitespace-separated token read from stdin."""
import sys

for line in sys.stdin:
    # split() with no argument already ignores leading/trailing whitespace
    # and yields nothing for a blank line.
    for word in line.split():
        print(f"{word}\t1")
EOF

cat > combiner.py <<'EOF'
#!/usr/bin/env python3
"""Combiner: sum the counts of "<word><TAB><count>" records read from stdin."""
import sys
from collections import defaultdict

counts = defaultdict(int)
for line in sys.stdin:
    line = line.strip()
    if not line:
        # A blank line carries no record; skipping it avoids an unpack error.
        continue
    # Split on the last tab only, so the count is always the final field.
    word, count = line.rsplit("\t", 1)
    counts[word] += int(count)

for word, count in counts.items():
    print(f"{word}\t{count}")
EOF

cat > reducer.py <<'EOF'
#!/usr/bin/env python3
"""Reducer: total the counts of consecutive records that share the same word.

Records with equal words must arrive adjacent to each other; a total is
printed each time the word changes and once more at end of input.
"""
import sys

current_word = None
current_count = 0

for line in sys.stdin:
    line = line.strip()
    if not line:
        # A blank line carries no record; skipping it avoids an unpack error.
        continue
    # Split on the last tab only, so the count is always the final field.
    word, count = line.rsplit("\t", 1)
    count = int(count)

    if current_word == word:
        current_count += count
    else:
        # None means "no word seen yet"; compare against it explicitly
        # rather than relying on truthiness.
        if current_word is not None:
            print(f"{current_word}\t{current_count}")
        current_word = word
        current_count = count

# Flush the final group.
if current_word is not None:
    print(f"{current_word}\t{current_count}")
EOF

# The job invokes the scripts by bare file name, so they must be executable.
chmod +x mapper.py combiner.py reducer.py




In [5]:
# ==============================
# STEP 5: Run Hadoop Streaming Job
# ==============================
!hadoop fs -rm -r -f input output

# Put input files into HDFS
!hadoop fs -mkdir -p input
!hadoop fs -put input_data/* input/

# Run Hadoop Streaming
!hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
  -input input \
  -output output \
  -mapper mapper.py \
  -combiner combiner.py \
  -reducer reducer.py \
  -file mapper.py \
  -file combiner.py \
  -file reducer.py


2025-11-14 16:51:01,374 WARN streaming.StreamJob: -file option is deprecated, please use generic option -files instead.
packageJobJar: [mapper.py, combiner.py, reducer.py] [] /tmp/streamjob8814433144072036210.jar tmpDir=null
2025-11-14 16:51:02,100 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2025-11-14 16:51:02,204 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2025-11-14 16:51:02,204 INFO impl.MetricsSystemImpl: JobTracker metrics system started
2025-11-14 16:51:02,227 WARN impl.MetricsSystemImpl: JobTracker metrics system already initialized!
2025-11-14 16:51:02,452 INFO mapred.FileInputFormat: Total input files to process : 2
2025-11-14 16:51:02,475 INFO mapreduce.JobSubmitter: number of splits:2
2025-11-14 16:51:02,659 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local1236364078_0001
2025-11-14 16:51:02,659 INFO mapreduce.JobSubmitter: Executing with tokens: []
2025-11-14 16:51:03,050 INFO mapred.LocalDis

In [6]:
# ==============================
# STEP 6: Show the word counts
# ==============================
# Print the result file part-00000 from the job's output directory.
!hadoop fs -cat output/part-00000

Hadoop	1
It	1
across	1
an	1
computation	1
data	2
distributes	1
enable	1
faster	1
for	1
framework	1
is	1
large	1
multiple	1
nodes	1
open-source	1
processing	1
to	1
used	1
