# Installing Hadoop on Google Colab
## Installation may take up to 10 min. based on your connection

In [1]:
# The parent directory:
# /content/

In [1]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.7.0-bin-hadoop2.7"
!wget https://archive.apache.org/dist/hadoop/common/hadoop-2.7.0/hadoop-2.7.0.tar.gz
!tar -xzvf hadoop-2.7.0.tar.gz
!cp -r hadoop-2.7.0/ /usr/local/
!readlink -f /usr/bin/java | sed "s:bin/java::"
!/usr/local/hadoop-2.7.0/bin/hadoop fs -ls

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
hadoop-2.7.0/share/doc/hadoop/hadoop-hdfs-httpfs/apidocs/org/apache/hadoop/fs/http/server/FSOperations.FSRemoveAcl.html
hadoop-2.7.0/share/doc/hadoop/hadoop-hdfs-httpfs/apidocs/org/apache/hadoop/fs/http/server/package-use.html
hadoop-2.7.0/share/doc/hadoop/hadoop-hdfs-httpfs/apidocs/org/apache/hadoop/fs/http/server/FSOperations.FSRename.html
hadoop-2.7.0/share/doc/hadoop/hadoop-hdfs-httpfs/apidocs/org/apache/hadoop/fs/http/server/HttpFSParametersProvider.DataParam.html
hadoop-2.7.0/share/doc/hadoop/hadoop-hdfs-httpfs/apidocs/org/apache/hadoop/fs/http/server/FSOperations.FSRemoveDefaultAcl.html
hadoop-2.7.0/share/doc/hadoop/hadoop-hdfs-httpfs/apidocs/org/apache/hadoop/fs/http/server/HttpFSParametersProvider.AccessTimeParam.html
hadoop-2.7.0/share/doc/hadoop/hadoop-hdfs-httpfs/apidocs/org/apache/hadoop/fs/http/client/
hadoop-2.7.0/share/doc/hadoop/hadoop-hdfs-httpfs/apidocs/org/apache/hadoop/fs/http/client/package-tree.html

In [2]:
!/usr/local/hadoop-2.7.0/bin/hadoop fs -ls

Found 4 items
drwxr-xr-x   - root  root        4096 2023-11-10 14:21 .config
drwxr-xr-x   - 10021 10021       4096 2015-04-10 18:51 hadoop-2.7.0
-rw-r--r--   1 root  root   210343364 2015-04-21 16:47 hadoop-2.7.0.tar.gz
drwxr-xr-x   - root  root        4096 2023-11-10 14:22 sample_data


## Now you're ready to run any command using HDFS as follows:

In [3]:
!/usr/local/hadoop-2.7.0/bin/hadoop fs -mkdir /content/lab1/



In [4]:
!/usr/local/hadoop-2.7.0/bin/hadoop fs -mkdir /content/lab1/input/



In [10]:
!/usr/local/hadoop-2.7.0/bin/hadoop fs -copyFromLocal /content/citiestemp.txt /content/lab1/input/



In [6]:
!java -version

openjdk version "11.0.20.1" 2023-08-24
OpenJDK Runtime Environment (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04)
OpenJDK 64-Bit Server VM (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)


# Create Mapper

In [30]:
%%file /content/lab1/mapper.py
import sys
# input comes from standard input
for line in sys.stdin:
    tokens = line.strip().split(',')
    city = tokens[0]
    f_temp = tokens[1]
    try:
        c_temp = (float(f_temp) - 32) * 5.0/9.0
    except ValueError:
        # ignore if count was not a number
        continue
    print(f"{city}, {c_temp}")

Overwriting /content/lab1/mapper.py


# Create Reducer

In [35]:
%%file /content/lab1/reducer.py
import sys

# first existance
current_city = None
total_temp = 0
temp_cnt = 0

for line in sys.stdin:
  city, c_temp = line.split(',')
  c_temp = float(c_temp)
  if current_city == city:
    total_temp += c_temp
    temp_cnt += 1
  else:
    if current_city:
        temp_avg = total_temp / temp_cnt
        print(f'{current_city}, {temp_avg}')

    current_city = city
    total_temp = c_temp
    temp_cnt = 1

Overwriting /content/lab1/reducer.py


# Check everything out

In [36]:
!chmod +x /content/lab1/mapper.py /content/lab1/reducer.py

In [37]:
!python3 /content/lab1/mapper.py < /content/lab1/input/citiestemp.txt | python3 /content/lab1/reducer.py > ans.txt

In [38]:
!/usr/local/hadoop-2.7.0/bin/hadoop fs -rm -r /content/lab1/output

rm: `/content/lab1/output': No such file or directory


# Running MapReduce via Hadoop

In [39]:
#Running MapReduce programs
!/usr/local/hadoop-2.7.0/bin/hadoop jar /content/hadoop-2.7.0/share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar \
  -input /content/lab1/input/citiestemp.txt \
  -output /content/lab1/output \
  -mapper "python3 /content/lab1/mapper.py" \
  -reducer "python3 /content/lab1/reducer.py"


23/11/14 09:30:33 INFO Configuration.deprecation: session.id is deprecated. Instead, use dfs.metrics.session-id
23/11/14 09:30:33 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId=
23/11/14 09:30:33 INFO jvm.JvmMetrics: Cannot initialize JVM Metrics with processName=JobTracker, sessionId= - already initialized
23/11/14 09:30:33 INFO mapred.FileInputFormat: Total input paths to process : 1
23/11/14 09:30:33 INFO mapreduce.JobSubmitter: number of splits:1
23/11/14 09:30:34 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local397362016_0001
23/11/14 09:30:34 INFO mapreduce.Job: The url to track the job: http://localhost:8080/
23/11/14 09:30:34 INFO mapreduce.Job: Running job: job_local397362016_0001
23/11/14 09:30:34 INFO mapred.LocalJobRunner: OutputCommitter set in config null
23/11/14 09:30:34 INFO mapred.LocalJobRunner: OutputCommitter is org.apache.hadoop.mapred.FileOutputCommitter
23/11/14 09:30:34 INFO output.FileOutputCommitter: File 

In [40]:
!/usr/local/hadoop-2.7.0/bin/hadoop dfs -cat /content/lab1/output/*

DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.

Beijing, 14.38888888888889	
Berlin, 8.685185185185185	
Chicago, 5.259259259259259	
Dallas, 24.59259259259259	
Houston, 26.425925925925924	
London, 13.166666666666668	
Los Angeles, 22.388888888888886	
Melbourne, 16.5	
New York, 13.055555555555555	
Paris, 12.388888888888891	
Philadelphia, 16.40740740740741	
Phoenix, 31.61111111111111	
San Antonio, 25.194444444444443	
San Diego, 21.37037037037037	
San Jose, 18.222222222222225	
Shanghai, 17.98148148148148	
Sydney, 21.907407407407405	
Tokyo, 16.833333333333332	
Toronto, 7.3888888888888875	


# Save lab1 folder as zip file

In [42]:
!zip -r /content/lab1.zip /content/lab1


  adding: content/lab1/ (stored 0%)
  adding: content/lab1/input/ (stored 0%)
  adding: content/lab1/input/citiestemp.txt (deflated 63%)
  adding: content/lab1/input/.citiestemp.txt.crc (stored 0%)
  adding: content/lab1/mapper.py (deflated 34%)
  adding: content/lab1/output/ (stored 0%)
  adding: content/lab1/output/._SUCCESS.crc (stored 0%)
  adding: content/lab1/output/part-00000 (deflated 51%)
  adding: content/lab1/output/.part-00000.crc (stored 0%)
  adding: content/lab1/output/_SUCCESS (stored 0%)
  adding: content/lab1/reducer.py (deflated 53%)
