In [1]:
# Setup Spark SQL
# Note if running locally you need the JVM https://www.oracle.com/java/technologies/downloads/
# Consider running in https://colab.research.google.com/
%pip install pyspark



In [2]:

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("BibleCounter").getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('WARN')

In [3]:
# Download the bible as txt
!curl -L "https://www.gutenberg.org/cache/epub/10/pg10.txt" > bible.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4351k  100 4351k    0     0  7459k      0 --:--:-- --:--:-- --:--:-- 7451k


In [6]:
# Use SparkSQL to list all the words in the bible with their counts sorted descending by count.
# Turn in your code and sample of your results (at least top 20).
# Be sure to ignore case and punctuation and eliminate the blank lines.
from pyspark.sql.functions import lower, regexp_replace, split, explode, col

# 1. 读取全文，并去掉空行
lines = spark.read.text("bible.txt") \
    .filter(col("value") != "")

# 2. 清洗：全转小写、把非 a–z 字符都替成空格；然后切分、explode 成每行一个单词
words = (
    lines
    .select(regexp_replace(lower(col("value")), "[^a-z\\s]", " ").alias("clean_line"))
    .select(explode(split(col("clean_line"), "\\s+")).alias("word"))
    .filter(col("word") != "")
)

# 3. 注册成临时视图，用 Spark SQL 做分组计数、排序、取前 20
words.createOrReplaceTempView("words_view")

top20 = spark.sql("""
  SELECT word,
         COUNT(*) AS cnt
    FROM words_view
   GROUP BY word
   ORDER BY cnt DESC
   LIMIT 20
""")

top20.show(20, False)

+-----+-----+
|word |cnt  |
+-----+-----+
|the  |64309|
|and  |51762|
|of   |34846|
|to   |13680|
|that |12927|
|in   |12727|
|he   |10422|
|shall|9840 |
|unto |8997 |
|for  |8997 |
|i    |8854 |
|his  |8473 |
|a    |8235 |
|lord |7964 |
|they |7378 |
|be   |7030 |
|is   |7012 |
|him  |6659 |
|not  |6624 |
|them |6430 |
+-----+-----+



In [None]:
# What to try something else?