## 5. PySpark

In [1]:
# Install a headless OpenJDK 8 build; all apt output is discarded (-qq plus the
# redirect to /dev/null). A later cell points JAVA_HOME at this JDK.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [2]:
!wget https://dlcdn.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
!tar xf spark-3.3.1-bin-hadoop3.tgz
!rm spark-3.3.1-bin-hadoop3.tgz

--2022-12-11 21:21:10--  https://dlcdn.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
Resolving dlcdn.apache.org (dlcdn.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to dlcdn.apache.org (dlcdn.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 299350810 (285M) [application/x-gzip]
Saving to: ‘spark-3.3.1-bin-hadoop3.tgz’


2022-12-11 21:21:11 (206 MB/s) - ‘spark-3.3.1-bin-hadoop3.tgz’ saved [299350810/299350810]



In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.1-bin-hadoop3"

!pip install -q findspark
import findspark
findspark.init()

In [4]:
# multiprocessing is not used in this cell; the import is kept (moved to the
# top) because other cells may rely on it.
import multiprocessing

from pyspark.sql import SparkSession

# Local-mode session; "local[*]" lets Spark use all available cores.
spark = SparkSession.builder.master("local[*]").getOrCreate()

# This will format our output tables a bit nicer when not using the show() method
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Must stay the LAST expression of the cell: the notebook only renders the
# value of the final expression, and previously a trailing import hid it.
spark

In [5]:
# Short alias for the SparkContext that backs the session created above.
sc = spark.sparkContext

In [6]:
!wget https://csserver.ucd.ie/~thomas/tweets.tsv

--2022-12-11 21:21:41--  https://csserver.ucd.ie/~thomas/tweets.tsv
Resolving csserver.ucd.ie (csserver.ucd.ie)... 193.1.133.60
Connecting to csserver.ucd.ie (csserver.ucd.ie)|193.1.133.60|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22470255 (21M) [text/tab-separated-values]
Saving to: ‘tweets.tsv’


2022-12-11 21:21:43 (15.0 MB/s) - ‘tweets.tsv’ saved [22470255/22470255]



In [7]:
# Load the tab-separated tweet dump: the first row holds the column names and
# Spark infers each column's type from the data.
tweets_reader = (
    spark.read
    .option("delimiter", "\t")
    .option("header", True)
    .option("inferSchema", True)
)
df = tweets_reader.csv("tweets.tsv")

# Preview the first 30 rows (show() truncates long cell values).
df.show(30)

+---------+-------------------+-------------------+---------------+--------------------+
|wordle_id|           tweet_id|         tweet_date| tweet_username|          tweet_text|
+---------+-------------------+-------------------+---------------+--------------------+
|      210|1482553374591660037|2022-01-16 03:20:43|       bpszebes|Wordle 210 4/6  ⬛...|
|      210|1482553387937898499|2022-01-16 03:20:46|     cruisecoup|Wordle 210 4/6  ⬜...|
|      210|1482553422276698113|2022-01-16 03:20:55|     DestroVega|Wordle 210 4/6  ⬜...|
|      210|1482553436910628866|2022-01-16 03:20:58|    brenmardash|Wordle 210 3/6  ⬜...|
|      210|1482553445726908420|2022-01-16 03:21:00|    KatieHowse2|Wordle 210 3/6  ⬛...|
|      210|1482553448025395202|2022-01-16 03:21:01|        iconoco|Wordle 210 4/6  ⬛...|
|      210|1482553451439603720|2022-01-16 03:21:01|   ParickHarmon|Wordle 210 3/6  ?...|
|      210|1482553460251709443|2022-01-16 03:21:04|     Revnan2001|Wordle 210 4/6  ⬜...|
|      210|1482553474

In [9]:
# Turn every Row of the DataFrame into a plain Python list.
rdd = df.rdd.map(lambda row: list(row))

# NOTE(review): collect() materialises the whole dataset on the driver;
# nothing in the visible cells reads rddCollect - confirm it is still needed.
rddCollect = rdd.collect()

In [10]:
# Most tweeted Wordle puzzle: count tweets per wordle_id and show the id with
# the highest count.
tweets_per_puzzle = df.groupBy("wordle_id").count()
tweets_per_puzzle.sort(tweets_per_puzzle["count"].desc()).show(1)

+---------+-----+
|wordle_id|count|
+---------+-----+
|      223|15776|
+---------+-----+
only showing top 1 row



In [11]:
# Number of tweets whose text matches each pattern, reported in descending
# order of count.
#
# Each entry is (regex pattern, label printed in front of the count). The label
# travels with its own count through the sort, so the report stays correct
# whatever the relative sizes of the counts are (previously the labels were
# attached by position, which silently assumed wordle > the > play).
#
# NOTE(review): rlike() is a case-sensitive, unanchored regex match, so "the"
# also matches inside longer words (e.g. "other"), and these are counts of
# matching tweets, not of word occurrences - confirm this is the intent.
searches = [
    ("the", 'count of (the) : '),
    ("play", 'count of (play) :'),
    ("Wordle", 'count of (wordle) : '),
]

# One count per pattern, in the same order as `searches`.
list_of_count = tuple(
    df.filter(df.tweet_text.rlike(pattern)).count() for pattern, _ in searches
)
print (list_of_count)

# Kept for any later cell that reads the bare sorted counts.
sorted_list = sorted(list_of_count, reverse=True)

labelled_counts = [
    (label, count) for (_, label), count in zip(searches, list_of_count)
]
for label, count in sorted(labelled_counts, key=lambda pair: pair[1], reverse=True):
    print(label, count)

(4096, 617, 136964)
count of (wordle) :  136964
count of (the) :  4096
count of (play) : 617


In [15]:
!pip install pyspark


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 38 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 63.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=37b8227d2b4722d823cefcb95630b15c5a67510b56a8ddd21e62ef54b94abee7
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [38]:
# Expose the tweets DataFrame to Spark SQL under a temporary view name.
df.createOrReplaceTempView("most_games_tweets")

# Tweets per day of the week ('E' formats tweet_date as an abbreviated day
# name such as "Fri"), busiest day first.
day_of_week_query = """SELECT DATE_FORMAT(tweet_date,'E') as most_tweeted_day, count(1) 
            FROM most_games_tweets GROUP BY most_tweeted_day ORDER BY count(1) DESC"""
output = spark.sql(day_of_week_query)

# Only the top row is needed: the day with the most tweets.
output.show(1)

+----------------+--------+
|most_tweeted_day|count(1)|
+----------------+--------+
|             Fri|   28737|
+----------------+--------+
only showing top 1 row

