In [1]:
#connecting to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#installing opendjdk-8 for JAVA
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [3]:
#downloading apache spark
!wget -q https://downloads.apache.org/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz

In [4]:
#extracting apache spark
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

In [5]:
#intalling apache spark
!pip install -q findspark

In [7]:
#setting JAVA_HOME and SPARK_HOME variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

In [8]:
#importing findspark library
import findspark
findspark.init()

In [9]:
#finding the spark library path
findspark.find()

'/content/spark-3.1.1-bin-hadoop2.7'

In [10]:
#creating our first sparksession
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [11]:
spark

In [13]:
#code to get the Spark UI
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip
get_ipython().system_raw('./ngrok http 4050 &')
!curl -s http://localhost:4040/api/tunnels

--2021-05-09 15:56:12--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 3.229.173.44, 52.202.162.45, 3.216.229.131, ...
Connecting to bin.equinox.io (bin.equinox.io)|3.229.173.44|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13832437 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip.1’


2021-05-09 15:56:12 (29.9 MB/s) - ‘ngrok-stable-linux-amd64.zip.1’ saved [13832437/13832437]

Archive:  ngrok-stable-linux-amd64.zip
replace ngrok? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ngrok                   
{"tunnels":[{"name":"command_line (http)","uri":"/api/tunnels/command_line%20%28http%29","public_url":"http://4976f4f65aa8.ngrok.io","proto":"http","config":{"addr":"http://localhost:4050","inspect":true},"metrics":{"conns":{"count":0,"gauge":0,"rate1":0,"rate5":0,"rate15":0,"p50":0,"p90":0,"p95":0,"p99":0},"http":{"count":0,"rate1":0,"rate5":0,"rate15":0,"p50":0,"p9

In [14]:
#read dataset to dataframe
df = spark.read.csv("/content/drive/MyDrive/covid19/covid19_tweets.csv", header=True, inferSchema=True)

In [16]:
#Show column details
df.printSchema()

root
 |-- user_name: string (nullable = true)
 |-- user_location: string (nullable = true)
 |-- user_description: string (nullable = true)
 |-- user_created: string (nullable = true)
 |-- user_followers: string (nullable = true)
 |-- user_friends: string (nullable = true)
 |-- user_favourites: string (nullable = true)
 |-- user_verified: string (nullable = true)
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- source: string (nullable = true)
 |-- is_retweet: string (nullable = true)



In [17]:
#Just like in Pandas Dataframe we have the df.head() function, here we have the show() function.
df.show(5)

+---------------+--------------------+--------------------+-------------------+--------------+------------+---------------+-------------+-------------------+--------------------+-----------+-------------------+----------+
|      user_name|       user_location|    user_description|       user_created|user_followers|user_friends|user_favourites|user_verified|               date|                text|   hashtags|             source|is_retweet|
+---------------+--------------------+--------------------+-------------------+--------------+------------+---------------+-------------+-------------------+--------------------+-----------+-------------------+----------+
|        ᏉᎥ☻լꂅϮ|          astroworld|wednesday addams ...|2017-05-26 05:46:42|           624|         950|          18775|        False|2020-07-25 12:27:21|If I smelled the ...|       null| Twitter for iPhone|     False|
|Tom Basile 🇺🇸|        New York, NY|Husband, Father, ...|2009-04-16 20:06:23|          2253|        1677|        

In [18]:
#number of rows in the dataframe, which we would, just use the count() function.
df.count()

323158

In [19]:
#Using the select() function we can mention any columns we want to view.
df.select("user_name","user_location","user_followers","user_favourites").show(5)

+---------------+--------------------+--------------+---------------+
|      user_name|       user_location|user_followers|user_favourites|
+---------------+--------------------+--------------+---------------+
|        ᏉᎥ☻լꂅϮ|          astroworld|           624|          18775|
|Tom Basile 🇺🇸|        New York, NY|          2253|             24|
|Time4fisticuffs|    Pewee Valley, KY|          9275|           7254|
|    ethel mertz|Stuck in the Middle |           197|           1488|
|       DIPR-J&K|   Jammu and Kashmir|        101009|            101|
+---------------+--------------------+--------------+---------------+
only showing top 5 rows



In [20]:
#have a look at the statistics regarding the dataframe. The describe() function is best suited for such purposes.
df.describe().show()

+-------+--------------------+-------------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+------------------+--------------------+
|summary|           user_name|      user_location|    user_description|        user_created|   user_followers|        user_friends|     user_favourites|       user_verified|                date|              text|            hashtags|            source|          is_retweet|
+-------+--------------------+-------------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+------------------+--------------------+
|  count|              323145|             223456|              252729|              235222|           185656|              180727|              179496|              179214|  

In [21]:
#The distinct() will come in handy when we want to determine the unique values in the categorical 
#columns in the dataframe.
df.select("user_location").distinct().show()

+--------------------+
|       user_location|
+--------------------+
|     Gainesville, FL|
| 2011-06-12 18:18:43|
| 2020-03-30 02:14:49|
|      ['dorathians']|
|            Zirakpur|
|𝓘 𝓪𝓲𝓷𝓽 𝓰𝓸?...|
|             Ahch-To|
| Mumbai, Maharashtra|
|Perth, Western Au...|
|           Bangalore|
|ÜT: -26.532499,28...|
| 2015-02-01 18:00:41|
| 2020-05-04 09:37:29|
| 2011-07-25 19:44:07|
|    ['alreadyvideo']|
|['mask', 'bighead...|
|['WithYouEverySte...|
|            Talagang|
|Marburg-Biedenkop...|
| 2020-01-28 13:04:13|
+--------------------+
only showing top 20 rows



In [22]:
#Counting and Removing Null values
from pyspark.sql import functions as F
df.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns]).show()


+---------+-------------+----------------+------------+--------------+------------+---------------+-------------+------+------+--------+------+----------+
|user_name|user_location|user_description|user_created|user_followers|user_friends|user_favourites|user_verified|  date|  text|hashtags|source|is_retweet|
+---------+-------------+----------------+------------+--------------+------------+---------------+-------------+------+------+--------+------+----------+
|       13|        99702|           70429|       87936|        137502|      142431|         143662|       143944|159247|156615|  232780|212314|    213230|
+---------+-------------+----------------+------------+--------------+------------+---------------+-------------+------+------+--------+------+----------+

