In [1]:
# Install the headless OpenJDK 8 JDK; -qq plus the redirect to /dev/null hides apt's normal output.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Show which Java is the default before the alternatives are switched.
!java -version

# Point the java, javac and jps alternatives at the OpenJDK 8 binaries.
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!update-alternatives --set javac /usr/lib/jvm/java-8-openjdk-amd64/bin/javac
!update-alternatives --set jps /usr/lib/jvm/java-8-openjdk-amd64/bin/jps
# Confirm which Java is the default after the switch.
!java -version

# Finding the default Java path: resolve the /usr/bin/java symlink and strip the trailing "bin/java".
!readlink -f /usr/bin/java | sed "s:bin/java::"
# Install the OpenSSH server (apt output suppressed) and start the ssh service.
# NOTE(review): presumably required because the Hadoop start scripts run later connect to localhost over SSH - confirm.
!apt-get install openssh-server -qq > /dev/null
!service ssh start

# Show the Port-related lines of the sshd configuration.
!grep Port /etc/ssh/sshd_config

# Creating a new RSA key pair with an empty passphrase.
# The "y" fed on stdin presumably answers ssh-keygen's overwrite prompt when the key already exists.
!ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa <<< y

# See id_rsa.pub content
!more /root/.ssh/id_rsa.pub

# Copying the public key to authorized_keys (">" replaces any existing content of that file)
!cat $HOME/.ssh/id_rsa.pub > $HOME/.ssh/authorized_keys
# Changing the permissions on authorized_keys to owner read/write only
!chmod 0600 ~/.ssh/authorized_keys

# Connecting to the local machine over SSH, with host-key checking disabled, to verify key-based login
!ssh -o StrictHostKeyChecking=no localhost uptime

# Downloading the Hadoop 3.2.3 binary tarball from the Apache archive (-q: no progress output)
!wget -q https://archive.apache.org/dist/hadoop/common/hadoop-3.2.3/hadoop-3.2.3.tar.gz

# Unpacking the tarball into the current working directory
!sudo tar -xzf hadoop-3.2.3.tar.gz
# Removing the tarball now that it has been extracted
!rm hadoop-3.2.3.tar.gz

# Copying the Hadoop files to /usr/local
!cp -r hadoop-3.2.3/ /usr/local/
# -r copies directories recursively

# Adding the JAVA_HOME directory to hadoop-env.sh: sed appends an
# "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64" line after each line that
# matches "export JAVA_HOME=". The file is edited in place (-i).
!sed -i '/export JAVA_HOME=/a export JAVA_HOME=\/usr\/lib\/jvm\/java-8-openjdk-amd64' /usr/local/hadoop-3.2.3/etc/hadoop/hadoop-env.sh

import os

# Environment variables read by the shell commands in the following cells.
os.environ["HADOOP_HOME"] = "/usr/local/hadoop-3.2.3"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["JRE_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64/jre"

# Directories to append to PATH: the Java/JRE binaries and the Hadoop sbin scripts.
hadoop_path_entries = [
    f'{os.environ["JAVA_HOME"]}/bin',
    f'{os.environ["JRE_HOME"]}/bin',
    f'{os.environ["HADOOP_HOME"]}/sbin',
]

# Join with os.pathsep. A plain "+=" without a leading separator would glue the
# first new directory onto the last existing PATH entry and corrupt both.
# Entries that are already present are skipped, so re-running this cell does not
# keep growing PATH.
current_path = os.environ.get("PATH", "")
present_entries = current_path.split(os.pathsep)
missing_entries = [entry for entry in hadoop_path_entries if entry not in present_entries]
os.environ["PATH"] = os.pathsep.join(([current_path] if current_path else []) + missing_entries)

# Format the HDFS NameNode before the daemons are started for the first time.
!$HADOOP_HOME/bin/hdfs namenode -format

# Creating other necessary environment variables before starting nodes:
# each HDFS/YARN daemon user variable is set to root.
os.environ["HDFS_NAMENODE_USER"] = "root"
os.environ["HDFS_DATANODE_USER"] = "root"
os.environ["HDFS_SECONDARYNAMENODE_USER"] = "root"
os.environ["YARN_RESOURCEMANAGER_USER"] = "root"
os.environ["YARN_NODEMANAGER_USER"] = "root"

# Launching HDFS daemons
!$HADOOP_HOME/sbin/start-dfs.sh

# Launching YARN daemons
# nohup causes a process to ignore a SIGHUP signal
!nohup $HADOOP_HOME/sbin/start-yarn.sh

# Listing the running daemons (Java processes reported by jps)
!jps

# Create the HDFS working directory. -p makes this succeed when the directory
# already exists, so the cell can be re-run.
!$HADOOP_HOME/bin/hdfs dfs -mkdir -p /word_count_python

# Upload the input text into HDFS. -f overwrites a copy left by a previous run
# instead of failing. The local file /content/pembukaan_uud1945.txt must already
# be present; nothing in this notebook creates it.
!$HADOOP_HOME/bin/hdfs dfs -put -f /content/pembukaan_uud1945.txt /word_count_python

openjdk version "11.0.24" 2024-07-16
OpenJDK Runtime Environment (build 11.0.24+8-post-Ubuntu-1ubuntu322.04)
OpenJDK 64-Bit Server VM (build 11.0.24+8-post-Ubuntu-1ubuntu322.04, mixed mode, sharing)
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java to provide /usr/bin/java (java) in manual mode
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/javac to provide /usr/bin/javac (javac) in manual mode
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/jps to provide /usr/bin/jps (jps) in manual mode
openjdk version "1.8.0_422"
OpenJDK Runtime Environment (build 1.8.0_422-8u422-b05-1~22.04-b05)
OpenJDK 64-Bit Server VM (build 25.422-b05, mixed mode)
/usr/lib/jvm/java-8-openjdk-amd64/jre/
 * Starting OpenBSD Secure Shell server sshd
   ...done.
#Port 22
#GatewayPorts no
Generating public/private rsa key pair.
Created directory '/root/.ssh'.
Your identification has been saved in /root/.ssh/id_rsa
Your public key has been saved in /root/.ssh

In [2]:
%%writefile mapper.py
#!/usr/bin/env python
# Word-count mapper: for every whitespace-separated token read from standard
# input, write one "<token><TAB>1" record to standard output.

import sys

for raw_line in sys.stdin:
    # strip() drops the trailing newline; split() with no argument splits on any run of whitespace.
    for token in raw_line.strip().split():
        print('%s\t%s' % (token, 1))

Writing mapper.py


In [3]:
%%writefile reducer.py
#!/usr/bin/env python
"""Word-count reducer.

Reads "<word><TAB><count>" records from standard input, sums the counts of
consecutive records that share the same word, and writes one
"<word><TAB><total>" record per group to standard output. Grouping is by
adjacency only, so equal words must arrive next to each other.
"""

import sys


def reduce_counts(lines):
    """Yield one "<word><TAB><total>" string per run of equal words in ``lines``.

    Args:
        lines: iterable of text lines shaped like "<word><TAB><count>".

    Yields:
        A "<word><TAB><total>" string (no trailing newline) for each group.

    Lines with no tab separator (blank lines included) and lines whose count is
    not an integer are skipped. They neither start nor end a group.
    """
    current_word = None
    current_count = 0

    for line in lines:
        word, separator, raw_count = line.strip().partition('\t')
        if not separator:
            # No tab on this line: not a mapper record, so ignore it instead of crashing.
            continue
        try:
            count = int(raw_count)
        except ValueError:
            # Count was not a number: silently ignore this record.
            continue

        if word == current_word:
            current_count += count
        else:
            if current_word is not None:
                yield '%s\t%s' % (current_word, current_count)
            current_word = word
            current_count = count

    # Flush the last group. Testing against None (not against the last word read)
    # avoids printing a bogus "None" record on empty input, and avoids dropping
    # the final group when the trailing input line was skipped.
    if current_word is not None:
        yield '%s\t%s' % (current_word, current_count)


if __name__ == '__main__':
    for record in reduce_counts(sys.stdin):
        print(record)

Writing reducer.py


In [4]:
# Running the MapReduce word-count job with Hadoop Streaming.
# Hadoop refuses to start a job whose output directory already exists, so remove
# any output left by a previous run first (-f: no error when it is absent).
!$HADOOP_HOME/bin/hdfs dfs -rm -r -f /word_count_python/output

# mapper.py and reducer.py are the scripts written to /content by the cells above.
!$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.2.3.jar \
  -input /word_count_python/pembukaan_uud1945.txt \
  -output /word_count_python/output \
  -mapper "python /content/mapper.py" \
  -reducer "python /content/reducer.py"

# Copy the reducer output to the local filesystem; -f overwrites the file left by a previous run.
!$HADOOP_HOME/bin/hdfs dfs -copyToLocal -f /word_count_python/output/part-00000 /content/pembukaan_uud1945-wordcount.txt

2024-08-17 14:30:47,832 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2024-08-17 14:30:48,077 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2024-08-17 14:30:48,077 INFO impl.MetricsSystemImpl: JobTracker metrics system started
2024-08-17 14:30:48,097 WARN impl.MetricsSystemImpl: JobTracker metrics system already initialized!
2024-08-17 14:30:48,507 INFO mapred.FileInputFormat: Total input files to process : 1
2024-08-17 14:30:48,542 INFO mapreduce.JobSubmitter: number of splits:1
2024-08-17 14:30:48,719 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local1341060779_0001
2024-08-17 14:30:48,719 INFO mapreduce.JobSubmitter: Executing with tokens: []
2024-08-17 14:30:48,926 INFO mapreduce.Job: The url to track the job: http://localhost:8080/
2024-08-17 14:30:48,928 INFO mapreduce.Job: Running job: job_local1341060779_0001
2024-08-17 14:30:48,935 INFO mapred.LocalJobRunner: OutputCommitter set in config null
2024-08