<a href="https://colab.research.google.com/github/SingaLeCapi/learning_pyspark/blob/main/Spark_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=5b8aaddfa22aebbb68cd91e73a510027d0593051f54a539bf6ab0adfa2098f4c
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [3]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook in local mode, or reuse the one
# that already exists if the cell is re-run.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Leave the session as the last expression so the notebook displays it.
spark

In [4]:
# Colab-only helper for uploading files from the local machine.
from google.colab import files

# Prompt for the book's text file; the ingestion cell below expects it to be
# saved as 1342-0.txt.
uploaded = files.upload()


Saving 1342-0.txt to 1342-0.txt


In [5]:
# Ingest the uploaded text file into a DataFrame with a single string
# column named `value`.
book_path = "/content/1342-0.txt"
book = spark.read.text(book_path)

# Display the DataFrame's summary (column names and types).
book

DataFrame[value: string]

In [6]:
# Print the schema of the DataFrame as a tree (column name, type, nullability).
book.printSchema()
# dtypes gives the same information as a list of (column_name, column_type) tuples.
print(book.dtypes)

root
 |-- value: string (nullable = true)

[('value', 'string')]


In [7]:
# Explore the DataFrame's contents using the show() method.
# Called without arguments, it displays the first 20 rows and truncates long values.
book.show()

+--------------------+
|               value|
+--------------------+
|The Project Guten...|
|                    |
|This eBook is for...|
|almost no restric...|
|re-use it under t...|
|with this eBook o...|
|                    |
|                    |
|Title: Pride and ...|
|                    |
| Author: Jane Austen|
|                    |
|Posting Date: Aug...|
|Release Date: Jun...|
|Last Updated: Mar...|
|                    |
|   Language: English|
|                    |
|Character set enc...|
|                    |
+--------------------+
only showing top 20 rows



In [8]:
# The show() method takes three optional parameters.
# Here we ask for 30 rows and truncate each value to 40 characters.
book.show(30, truncate=40)

+----------------------------------------+
|                                   value|
+----------------------------------------+
|The Project Gutenberg EBook of Pride ...|
|                                        |
|This eBook is for the use of anyone a...|
|almost no restrictions whatsoever.  Y...|
|re-use it under the terms of the Proj...|
|with this eBook or online at www.gute...|
|                                        |
|                                        |
|              Title: Pride and Prejudice|
|                                        |
|                     Author: Jane Austen|
|                                        |
|Posting Date: August 26, 2008 [EBook ...|
|                Release Date: June, 1998|
|            Last Updated: March 10, 2018|
|                                        |
|                       Language: English|
|                                        |
|           Character set encoding: UTF-8|
|                                        |
|*** START 

In [10]:
# First transformation of the `book` DataFrame:
# split every line of text on single spaces, producing one list of words per row.
from pyspark.sql.functions import split

line_tokens = split(book["value"], " ").alias("line")
lines = book.select(line_tokens)
lines.show(5)

+--------------------+
|                line|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
+--------------------+
only showing top 5 rows



In [11]:
# Check the schema of `lines` to understand what split() did behind the scenes:
# the `line` column is now an array of strings.
lines.printSchema()

root
 |-- line: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [14]:
# Continue the transformation: explode the `lines` DataFrame created earlier
# so that every element of each `line` array becomes its own row.
from pyspark.sql.functions import explode, col

exploded_word = explode(col("line")).alias("word")
words = lines.select(exploded_word)

words.show(15)

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     EBook|
|        of|
|     Pride|
|       and|
|Prejudice,|
|        by|
|      Jane|
|    Austen|
|          |
|      This|
|     eBook|
|        is|
+----------+
only showing top 15 rows



In [18]:
# The preview of `words` shows tokens that still need cleaning: "Prejudice,"
# has a comma at the end, and there is an empty entry between "Austen" and "This".
# Our focus in the next cells is to clean this DataFrame up.
# First step: convert every word to lower case.
from pyspark.sql.functions import col, lower

lowercased_word = lower(col("word")).alias("word_lower")
words_lower = words.select(lowercased_word)

words_lower.show()

+----------+
|word_lower|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
|     pride|
|       and|
|prejudice,|
|        by|
|      jane|
|    austen|
|          |
|      this|
|     ebook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
+----------+
only showing top 20 rows



In [20]:
# Use regexp_extract to keep only the alphabetic part of each token, which drops
# trailing punctuation (e.g. "prejudice," becomes "prejudice").
# Note: group 0 of "[a-z]+" is only the FIRST run of lowercase letters, so a
# token such as "re-use" is reduced to "re"; a token with no letters becomes "".
from pyspark.sql.functions import regexp_extract

first_letter_run = regexp_extract(col("word_lower"), "[a-z]+", 0)
words_clean = words_lower.select(first_letter_run.alias("word"))

words_clean.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|         |
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
+---------+
only showing top 20 rows



In [21]:
# All records are now lower case with punctuation removed.
# What remains to clean up are the empty strings; drop those rows using the
# filter() method provided by PySpark.
is_not_empty = col("word") != ""
words_nonnull = words_clean.filter(is_not_empty)

words_nonnull.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
+---------+
only showing top 20 rows

