## `Mount Google Drive`

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [16]:
import os

# Install Java if not already installed
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Define the URL and file name for Hadoop
hadoop_url = "https://downloads.apache.org/hadoop/common/hadoop-3.4.0/hadoop-3.4.0.tar.gz"
hadoop_file = "hadoop-3.4.0.tar.gz"

# Check if the Hadoop tar.gz file already exists
if not os.path.exists(hadoop_file):
    # Download the Hadoop tar.gz file if it doesn't exist
    !wget -q $hadoop_url
else:
    print("Hadoop tar.gz file already exists. Skipping download.")

# Extract the Hadoop tar.gz file
!tar -xzf $hadoop_file


Hadoop tar.gz file already exists. Skipping download.


### Set Hadoop Environment Variables

In [3]:
import os

# Set the path to your Hadoop installation
hadoop_path = '/content/hadoop-3.4.0'

# Set environment variables
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'
os.environ['HADOOP_HOME'] = hadoop_path
os.environ['PATH'] = os.environ['PATH'] + f':{hadoop_path}/bin:{hadoop_path}/sbin'


In [4]:
# Remove existing directory and its contents
!$hadoop_path/bin/hadoop fs -rm -r -skipTrash /content/drive/MyDrive/My_Data

# Create a new empty directory
!$hadoop_path/bin/hadoop fs -mkdir -p /content/drive/MyDrive/My_Data

Deleted /content/drive/MyDrive/My_Data


### You can also create hd_path and dir_path to access hadoop and the new directory, but it is optional




In [5]:
hd_path = '/content/hadoop-3.4.0/bin/hadoop'
dir_path = '/content/drive/MyDrive/My_Data'

In [6]:
!$hadoop_path/bin/hadoop fs -ls /content/drive/MyDrive/My_Data
##shoud return nothing

### Create a file.txt file and copy it into My_Data directory using a temp file, then remove the temp file.

In [7]:
new_data="house, dog, cat, rat, house, bee"
# Create a temporary file
temp_file="file.txt"
!echo "$new_data" >> "$temp_file"
# Copy the temp_file to the dir_path/file.txt
!$hd_path fs -put -f "$temp_file" "$dir_path/file.txt"
# Clean up temporary file
!rm "$temp_file"

### Show content of the file.txt using -cat

In [8]:
!$hadoop_path/bin/hadoop fs -cat "$dir_path/file.txt"

house, dog, cat, rat, house, bee


### Exploring hdfs

In [9]:
hdfs_path = '/content/hadoop-3.4.0/bin'


### Install Python package ndfs for interacting with HDFS

## `Convert word file to txt`-- optional.
-  When your input file is Word.DOC;
- No need to do covertion if it is a .txt file

In [10]:
!pip install python-docx



In [11]:
import docx
local_file_path = '/content/drive/MyDrive/HW6_Kowarsch.docx'

doc = docx.Document(local_file_path)
text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])

with open('/content/drive/MyDrive/HW6_Kowarsch.txt', 'w') as f:
    f.write(text)

### Use hadoop executable to interact with HDFS, includign creating directories

In [12]:
hadoop_path = '/content/hadoop-3.4.0/bin/hdfs'  # Correct path to HDFS executable
hdfs_path = '/HDFS_Data'  # Folder path in HDFS
local_txt_file = '/content/drive/MyDrive/HW6_Kowarsch.txt'
hdfs_txt_file = f'{hdfs_path}/HW6_Kowarsch.txt'

# Create the directory in HDFS
!{hadoop_path} dfs -mkdir -p {hdfs_path}

# Copy the local text file to HDFS
!{hadoop_path} dfs -put -f {local_txt_file} {hdfs_txt_file}


In [13]:
!pip install pyspark



In [14]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Clear the Spark session
spark.stop()
#reconnect the session
spark = SparkSession.builder.getOrCreate()

# Read the text file from HDFS
hdfs_txt_file = '/HDFS_Data/HW6_Kowarsch.txt' #replace this file with your own txt.file
df = spark.read.text(hdfs_txt_file)

# Perform word count
word_count = (df
              .select("value")
              .withColumn("words", f.split("value", " "))
              .select("words")
              .withColumn("word", f.explode("words"))
              .groupBy("word")
              .count()
              .orderBy("count", ascending=False))

# Display the results
word_count.show(n=20, truncate=False)

+-------+-----+
|word   |count|
+-------+-----+
|the    |76   |
|       |71   |
|of     |47   |
|to     |30   |
|is     |26   |
|a      |26   |
|in     |20   |
|The    |16   |
|that   |16   |
|be     |13   |
|and    |12   |
|privacy|12   |
|query  |10   |
|,      |9    |
|data   |9    |
|patient|9    |
|for    |8    |
|on     |8    |
|or     |8    |
|set    |7    |
+-------+-----+
only showing top 20 rows



### Clear off all directories

In [15]:
hadoop_path = '/content/hadoop-3.4.0/bin/hadoop'  # Path to Hadoop binaries
hdfs_path = '/HDFS_Data'  # Folder path in HDFS
local_path = '/content/drive/MyDrive/My_Data'  # Local folder path in Hadoop

# Delete the directory in HDFS
!{hadoop_path} fs -rm -r {hdfs_path}

# Delete the directory in Hadoop
!rm -rf {local_path}


2024-05-22 23:51:08,053 INFO Configuration.deprecation: io.bytes.per.checksum is deprecated. Instead, use dfs.bytes-per-checksum
Deleted /HDFS_Data
