<a href="https://colab.research.google.com/github/Rodeffs/Year4_Programming/blob/master/Big_Data/03_lab/hadoop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Checking the installed Java version

In [1]:
# Show which Java version is installed before setting up Hadoop.
!java --version

openjdk 17.0.16 2025-07-15
OpenJDK Runtime Environment (build 17.0.16+8-Ubuntu-0ubuntu122.04.1)
OpenJDK 64-Bit Server VM (build 17.0.16+8-Ubuntu-0ubuntu122.04.1, mixed mode, sharing)


Installing Hadoop

In [2]:
# Download the Hadoop 3.4.2 binary distribution (about 1 GB).
# -nc (--no-clobber) skips the download when the archive is already present,
# so re-running this cell does not fetch a duplicate "hadoop-3.4.2.tar.gz.1".
!wget -nc https://dlcdn.apache.org/hadoop/common/hadoop-3.4.2/hadoop-3.4.2.tar.gz

--2025-12-04 18:42:28--  https://dlcdn.apache.org/hadoop/common/hadoop-3.4.2/hadoop-3.4.2.tar.gz
Resolving dlcdn.apache.org (dlcdn.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to dlcdn.apache.org (dlcdn.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1065831750 (1016M) [application/x-gzip]
Saving to: ‘hadoop-3.4.2.tar.gz’


2025-12-04 18:43:17 (109 MB/s) - ‘hadoop-3.4.2.tar.gz’ saved [1065831750/1065831750]



In [3]:
# Extract the downloaded Hadoop archive into the current directory.
# HADOOP_HOME is later set to /content/hadoop-3.4.2, so this cell is expected
# to run with /content as the working directory.
!tar -xf hadoop-3.4.2.tar.gz

Downloading the dataset

In [4]:
# Download the papers dataset archive from Kaggle (-L follows redirects)
# and save it as dataset.zip in the current directory.
!curl -L https://www.kaggle.com/api/v1/datasets/download/beta3logic/3m-academic-papers-titles-and-abstracts -o dataset.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1409M  100 1409M    0     0  77.6M      0  0:00:18  0:00:18 --:--:-- 83.3M


In [5]:
# Unpack the archive (it contains cleaned_papers.csv).
# -o overwrites existing files without asking, so re-running the cell does not
# stop at unzip's interactive "replace?" prompt.
!unzip -o dataset.zip

Archive:  dataset.zip
  inflating: cleaned_papers.csv      


Setting the environment variables

In [6]:
import os

# Locations of the JDK and of the Hadoop distribution unpacked earlier.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
# NOTE(review): modern JDK layouts usually have no separate jre/ directory —
# confirm that this path exists (a missing PATH entry is harmless).
os.environ["JRE_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64/jre"
os.environ["HADOOP_HOME"] = "/content/hadoop-3.4.2"

# Python does not expand "$VAR" inside string literals, and PATH entries are
# never expanded when commands are looked up, so the directories must be
# spelled out from the values set above.  Entries already on PATH are skipped
# so that re-running this cell does not keep growing the variable.
_current_path = os.environ.get("PATH", "")
_path_entries = _current_path.split(os.pathsep) if _current_path else []
for _extra_dir in (
    os.path.join(os.environ["JAVA_HOME"], "bin"),
    os.path.join(os.environ["JRE_HOME"], "bin"),
    os.path.join(os.environ["HADOOP_HOME"], "bin"),
    os.path.join(os.environ["HADOOP_HOME"], "sbin"),
):
    if _extra_dir not in _path_entries:
        _path_entries.append(_extra_dir)
os.environ["PATH"] = os.pathsep.join(_path_entries)

Splitting dataset into multiple files

In [7]:
%%writefile splitter.py

import re
import pandas as pd


INPUT_CSV = "/content/cleaned_papers.csv"
# "{}" is replaced by the running file number: dataset0.txt, dataset1.txt, ...
# The target directory is not created here; a separate notebook cell does that.
OUTPUT_PATTERN = "/content/datasets/dataset{}.txt"
MAX_LINES = 100000  # maximum number of papers written to one output file


def normalize_record(title, abstract):
    """Merge a paper's title and abstract into one lower-case line of text.

    Missing values (NaN/None, which pandas produces for empty CSV fields) are
    left out instead of raising ``TypeError`` on ``float + str``.  Line breaks
    inside the text, together with the whitespace around them, are collapsed
    into a single space so that every paper occupies exactly one line.

    Args:
        title: Title cell of the row (normally a string, may be NaN).
        abstract: Abstract cell of the row (normally a string, may be NaN).

    Returns:
        The combined text, without a trailing newline.
    """
    parts = [str(part) for part in (title, abstract) if not pd.isna(part)]
    line = ". ".join(parts).lower()  # join both columns and lower-case them
    return re.sub(r"\s*\n\s*", " ", line)  # remove line breaks inside the text


def split_dataset(df, output_pattern=OUTPUT_PATTERN, max_lines=MAX_LINES):
    """Write the papers in ``df`` to numbered text files, one paper per line.

    A new file is opened lazily every ``max_lines`` rows, so no empty file is
    left behind when the row count is an exact multiple of ``max_lines``.

    Args:
        df: DataFrame with ``title`` and ``abstract`` columns.
        output_pattern: ``str.format`` pattern that receives the file number.
        max_lines: Maximum number of rows per output file (must be >= 1).

    Returns:
        The number of files written.

    Raises:
        ValueError: If ``max_lines`` is smaller than 1.
    """
    if max_lines < 1:
        raise ValueError("max_lines must be at least 1")

    out_file = None
    files_written = 0
    try:
        for row_number, row in enumerate(df.itertuples(index=False)):
            if row_number % max_lines == 0:
                # The current file is full (or this is the first row): roll over.
                if out_file is not None:
                    out_file.close()
                out_file = open(output_pattern.format(files_written), mode="w", encoding="utf-8")
                files_written += 1
            out_file.write(normalize_record(row.title, row.abstract) + "\n")
    finally:
        # Close the last file even when a row fails to be processed.
        if out_file is not None:
            out_file.close()
    return files_written


if __name__ == "__main__":
    split_dataset(pd.read_csv(INPUT_CSV))


Writing splitter.py


In [8]:
# Create the directory that splitter.py writes into (/content/datasets).
# -p makes the cell safe to re-run: an already existing directory is not an error.
!mkdir -p /content/datasets

In [9]:
# Run the splitter script written above; it reads /content/cleaned_papers.csv
# and writes the text files /content/datasets/dataset<N>.txt.
!python splitter.py

Making an HDFS directory and copying the files there

In [12]:
# Create the working directory for the job.
# -p: do not fail when the directory already exists (keeps the cell re-runnable).
!$HADOOP_HOME/bin/hdfs dfs -mkdir -p /popular_topics

In [13]:
# Copy the split text files to /popular_topics/datasets.
# -f overwrites files that are already there, so re-running the cell does not
# fail with "File exists".
!$HADOOP_HOME/bin/hdfs dfs -put -f /content/datasets /popular_topics

Writing a mapper

In [14]:
%%writefile mapper.py

#!/usr/bin/env python

import re
import sys

# Stop words (articles, prepositions, auxiliaries, filler verbs, ...) that end
# one candidate topic and start the next one.
_STOP_WORDS = r"a|an|the|and|or|as|of|in|on|yet|our|than|then|however|at|but|was|were|which|there|this|that|thus|we|to|for|is|are|where|have|has|been|since|with|such|another|also|by|often|can|could|so|from|its|via|will|hence|should|would|shall|what|although|these|those|do|does|did|under|above|else|if|while|when|who|based|way|very|many|much|due|because|onto|into|out|finally|their|they|may|might|up|down|either|neither|nor|within|according|others|about|therefore|no|not|towards|beyond|behind|over|how|both|without|other|another|more|most|moreover|be|furthermore|why|paper|focuses|well|must|consider|using|used|commonly|some|given|among|able|present|his|her|he|she|obtained|makes|give|make|further|use|introduce|employ|uses|show|allows|gives|introduces|considers|through|take|takes|enable|enables|allow|every|each|called|provide|provides|cannot|allowing|even|though|after|around|upon|you|new"

# Everything that separates two topics, compiled once instead of per line:
#   1. any character other than a lower-case letter, whitespace, "'" or "-"
#      (the original class contained stray "^" characters, which made "^"
#      itself part of a topic);
#   2. an apostrophe/hyphen that does not follow a letter;
#   3. an apostrophe/hyphen that is not followed by a letter (the original
#      used "^" where end-of-input "$" was meant);
#   4. a stop word standing on its own, with optional surrounding apostrophes.
# No capturing groups are used, so re.split() returns only the text between
# separators (no None items, no single-character delimiter items).
TOPIC_SEPARATOR = re.compile(
    r"[^a-z\s'-]"
    r"|(?:^|[^a-z])['-]"
    r"|['-](?:$|[^a-z])"
    r"|'*(?<![a-z-])(?:" + _STOP_WORDS + r")(?![a-z-])'*"
)


def extract_topics(line):
    """Return the multi-word topics found in one line of lower-cased text.

    The line is cut at every separator; fragments of at least two words are
    kept.  Whitespace inside a fragment is normalised to single spaces so that
    the same topic always yields the same key (a tab inside the key would also
    be taken as the key/value separator by Hadoop Streaming).

    Args:
        line: One input line; expected to be lower-case already.

    Returns:
        List of topics in order of appearance (duplicates are kept).
    """
    topics = []
    for fragment in TOPIC_SEPARATOR.split(line):
        words = fragment.split()
        if len(words) >= 2:  # only combinations of at least two words count as topics
            topics.append(" ".join(words))
    return topics


def main():
    """Emit "<topic>;1" for every topic found on standard input."""
    for line in sys.stdin:
        for topic in extract_topics(line):
            print(topic + ";1")


if __name__ == "__main__":
    main()


Writing mapper.py


In [15]:
# Make the mapper executable. 755 (owner-writable only) is enough; 777 would
# also let every other user modify the script.
!chmod 755 mapper.py

Writing a reducer

In [16]:
%%writefile reducer.py

#!/usr/bin/env python

import sys

def reduce_counts(lines):
    """Sum the counts of consecutive equal entries.

    Each line is expected to look like "<entry>;<count>".  Surrounding
    whitespace is stripped first.  Lines without a ";" separator, with an
    empty entry, or with a non-integer count are skipped instead of aborting
    the whole reducer.

    The input must be grouped by entry (Hadoop sorts the mapper output by key
    before it reaches the reducer), so a total can be emitted as soon as the
    entry changes.

    Args:
        lines: Iterable of input lines.

    Yields:
        ``(entry, total)`` tuples, one per run of equal entries, including the
        final run (the original script never printed the last entry).
    """
    current_entry = None
    total = 0

    for line in lines:
        # Split at the LAST ";" so the count is always the final field.
        entry, separator, value = line.strip().rpartition(";")
        if not separator or not entry:
            continue

        # Make sure the count can be converted to a number.
        try:
            value = int(value)
        except ValueError:
            continue

        if entry != current_entry:
            if current_entry is not None:
                yield current_entry, total
            current_entry = entry
            total = 0

        total += value

    # Flush the last group once the input is exhausted.
    if current_entry is not None:
        yield current_entry, total


def main():
    """Read "<entry>;<count>" lines from stdin and print "<entry>;<total>"."""
    for entry, total in reduce_counts(sys.stdin):
        print(f"{entry};{total}")


if __name__ == "__main__":
    main()

Writing reducer.py


In [17]:
# Make the reducer executable. 755 (owner-writable only) is enough; 777 would
# also let every other user modify the script.
!chmod 755 reducer.py

Now we can run the Hadoop MapReduce program

In [18]:
# Hadoop refuses to start a job whose output directory already exists, so remove
# the results of a previous run first (-f: no error when there is nothing to delete).
!$HADOOP_HOME/bin/hdfs dfs -rm -r -f /popular_topics/output

# Run the streaming job: every file under /popular_topics/datasets is piped
# through mapper.py, and the sorted mapper output is piped through reducer.py.
!$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.4.2.jar \
-input /popular_topics/datasets \
-output /popular_topics/output \
-mapper "python /content/mapper.py" \
-reducer "python /content/reducer.py"

aliyun-java-sdk-core-4.5.10.jar        hadoop-fs2img-3.4.2.jar
aliyun-java-sdk-kms-2.11.0.jar	       hadoop-gridmix-3.4.2.jar
aliyun-java-sdk-ram-3.1.0.jar	       hadoop-kafka-3.4.2.jar
aliyun-sdk-oss-3.13.2.jar	       hadoop-minicluster-3.4.2.jar
analyticsaccelerator-s3-1.2.1.jar      hadoop-resourceestimator-3.4.2.jar
azure-data-lake-store-sdk-2.3.9.jar    hadoop-rumen-3.4.2.jar
azure-keyvault-core-1.0.0.jar	       hadoop-sls-3.4.2.jar
azure-storage-7.0.1.jar		       hadoop-streaming-3.4.2.jar
bundle-2.29.52.jar		       hamcrest-core-1.3.jar
hadoop-aliyun-3.4.2.jar		       ini4j-0.5.4.jar
hadoop-archive-logs-3.4.2.jar	       jdk.tools-1.8.jar
hadoop-archives-3.4.2.jar	       jdom2-2.0.6.1.jar
hadoop-aws-3.4.2.jar		       junit-4.13.2.jar
hadoop-azure-3.4.2.jar		       kafka-clients-3.9.0.jar
hadoop-azure-datalake-3.4.2.jar        lz4-java-1.7.1.jar
hadoop-client-3.4.2.jar		       ojalgo-43.0.jar
hadoop-datajoin-3.4.2.jar	       opentracing-api-0.33.0.jar
hadoop-distcp-3.4.2.jar		    