In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,regexp_replace
from pyspark.sql.types import DoubleType,IntegerType

In [0]:
spark = SparkSession.builder.appName("Real time End to End PySpark Project").getOrCreate()

In [0]:
google_df = spark.read.csv("/FileStore/tables/googleplaystore.csv",header="true",inferSchema="true")
google_df.show(2)

+--------------------+--------------+------+-------+----+--------+----+-----+--------------+--------------------+----------------+-----------+------------+
|                 App|      Category|Rating|Reviews|Size|Installs|Type|Price|Content Rating|              Genres|    Last Updated|Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+--------------------+----------------+-----------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M| 10,000+|Free|    0|      Everyone|        Art & Design| January 7, 2018|      1.0.0|4.0.3 and up|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967| 14M|500,000+|Free|    0|      Everyone|Art & Design;Pret...|January 15, 2018|      2.0.0|4.0.3 and up|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+--------------------+----------------+-----------+------------+
only showing top 2 rows



In [0]:
google_df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



In [0]:
# Drop columns which are not required for the KPI's
google_df = google_df.drop("Size","Content Rating","Last Updated","Current Ver","Android Ver")
google_df.show()

+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|   Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|    10,000+|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|   500,000+|Free|    0|Art & Design;Pret...|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5,000,000+|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50,000,000+|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|   100,000+|Free|    0|Art & Design;Crea...|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|    50,000+|Free|    0|        Art & Design|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|    50,000+|Free|    0|        Art & Design|
|    Infinite Painter|ART_AND_

In [0]:
google_df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Genres: string (nullable = true)



In [0]:
# Some columns are not in proper format. We will try to convert them.
# Cast Rating directly to DoubleType, and Reviews to IntegerType
# Installs and Price had unnecessary characters in them so we use regexp_replace function from pyspark.sql.functions to replace those chars with "" before casting them to Integer and Double Type resp.

google_df = google_df.withColumn("Rating",col("Rating").cast(DoubleType()))\
    .withColumn("Reviews",col("Reviews").cast(IntegerType()))\
        .withColumn("Installs",regexp_replace("Installs","[^0-9]","").cast(IntegerType()))\
            .withColumn("Price",regexp_replace("Price","[^0-9.]","").cast(DoubleType()))
google_df.show()            

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|  0.0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|  0.0|Art & Design;Pret...|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|  0.0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|  0.0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|  0.0|Art & Design;Crea...|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|   50000|Free|  0.0|        Art & Design|
|Smoke Effect Phot...|ART_AND_DESIGN|   3.8|    178|   50000|Free|  0.0|        Art & Design|
|    Infinite Painter|ART_AND_DESIGN|   4.1|  36815| 1000000

In [0]:
#just check if price is coming up correctly
google_df.filter(google_df.Type!="Free").show()

+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+
|TurboScan: scan d...|      BUSINESS|   4.7|  11442|  100000|Paid| 4.99|            Business|
|Tiny Scanner Pro:...|      BUSINESS|   4.8|  10295|  100000|Paid| 4.99|            Business|
|TurboScan: scan d...|      BUSINESS|   4.7|  11442|  100000|Paid| 4.99|            Business|
|Tiny Scanner Pro:...|      BUSINESS|   4.8|  10295|  100000|Paid| 4.99|            Business|
|  Puffin Browser Pro| COMMUNICATION|   4.0|  18247|  100000|Paid| 3.99|       Communication|
|Moco+ - Chat, Mee...|        DATING|   4.2|   1545|   10000|Paid| 3.99|              Dating|
|          Calculator|        DATING|   2.6|     57|    1000|Paid| 6.99|              Dating|
|   Truth or Dare Pro|        DATING|   NaN|      0|      50

In [0]:
google_df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Installs: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Genres: string (nullable = true)



In [0]:
# Now we will go for KPI's
# We will solve them in SQL and in pyspark dataframes
# For SQL we will create a temperary view

google_df.createOrReplaceTempView("google_view")

In [0]:
%sql select * from google_view limit 10;

App,Category,Rating,Reviews,Installs,Type,Price,Genres
Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0.0,Art & Design
Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0.0,Art & Design;Pretend Play
"U Launcher Lite – FREE Live Cool Themes, Hide Apps",ART_AND_DESIGN,4.7,87510,5000000,Free,0.0,Art & Design
Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0.0,Art & Design
Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0.0,Art & Design;Creativity
Paper flowers instructions,ART_AND_DESIGN,4.4,167,50000,Free,0.0,Art & Design
Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,50000,Free,0.0,Art & Design
Infinite Painter,ART_AND_DESIGN,4.1,36815,1000000,Free,0.0,Art & Design
Garden Coloring Book,ART_AND_DESIGN,4.4,13791,1000000,Free,0.0,Art & Design
Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,10000,Free,0.0,Art & Design;Creativity


In [0]:
# Now we can go for KPI's
# First by spark dataframe
top_reviews = (google_df.groupBy("App").agg({"Reviews":"sum"}).orderBy("sum(Reviews)",ascending=False))
display(top_reviews)


App,sum(Reviews)
Instagram,266241989.0
WhatsApp Messenger,207348304.0
Clash of Clans,179558781.0
Messenger – Text and Video Chat for Free,169932272.0
Subway Surfers,166331958.0
Candy Crush Saga,156993136.0
Facebook,156286514.0
8 Ball Pool,99386198.0
Clash Royale,92530298.0
Snapchat,68045010.0


In [0]:
%sql select App,sum(Reviews) from google_view group by 1 order by 2 desc limit 6;  

App,sum(Reviews)
Instagram,266241989
WhatsApp Messenger,207348304
Clash of Clans,179558781
Messenger – Text and Video Chat for Free,169932272
Subway Surfers,166331958
Candy Crush Saga,156993136


In [0]:
installed_apps = (google_df.groupBy("App","Type").agg({"Installs":"sum"}).orderBy("sum(Installs)",asccending=False))
display(installed_apps)

App,Type,sum(Installs)
Life Made WI-Fi Touchscreen Photo Frame,0,
cronometra-br,Paid,0.0
EG | Explore Folegandros,Paid,0.0
CX Network,Free,0.0
Ak Parti Yardım Toplama,Paid,0.0
Popsicle Launcher for Android P 9.0 launcher,Paid,0.0
Sweden Newspapers,Free,0.0
Ain Arabic Kids Alif Ba ta,Paid,0.0
AP Series Solution Pro,Paid,0.0
Pekalongan CJ,Free,0.0


In [0]:
%sql select App,Type,sum(Installs) from google_view group by 1,2 order by 3 desc limit 6;

App,Type,sum(Installs)
Subway Surfers,Free,6000000000
Google Drive,Free,4000000000
Instagram,Free,4000000000
Hangouts,Free,4000000000
Google Photos,Free,4000000000
Google News,Free,4000000000


In [0]:
category_dist = (google_df.groupBy("Category").agg({"Installs":"sum"}).orderBy("sum(Installs)",ascending=False))
display(category_dist)

Category,sum(Installs)
GAME,35086024415.0
COMMUNICATION,32647276251.0
PRODUCTIVITY,14176091369.0
SOCIAL,14069867902.0
TOOLS,11452771915.0
FAMILY,10258263505.0
PHOTOGRAPHY,10088247655.0
NEWS_AND_MAGAZINES,7496317760.0
TRAVEL_AND_LOCAL,6868887146.0
VIDEO_PLAYERS,6222002720.0


In [0]:
%sql select Category, sum(Installs) from google_view group by 1 order by 2 desc limit 7;

Category,sum(Installs)
GAME,35086024415
COMMUNICATION,32647276251
PRODUCTIVITY,14176091369
SOCIAL,14069867902
TOOLS,11452771915
FAMILY,10258263505
PHOTOGRAPHY,10088247655


In [0]:
top_paid = (google_df.filter(google_df.Type=="Paid").groupBy("App").agg({"Price":"sum"}).orderBy("sum(Price)",ascending=False))
display(top_paid)

App,sum(Price)
I'm Rich - Trump Edition,400.0
I am Rich Plus,399.99
I AM RICH PRO PLUS,399.99
I'm Rich/Eu sou Rico/أنا غني/我很有錢,399.99
I Am Rich Premium,399.99
most expensive app (H),399.99
I Am Rich Pro,399.99
I am rich(premium),399.99
I am Rich,399.99
I am Rich!,399.99


In [0]:
%sql select App,sum(Price) from google_view where Type='Paid' group by 1 order by 2 desc limit 7;

App,sum(Price)
I'm Rich - Trump Edition,400.0
I Am Rich Pro,399.99
I am rich,399.99
I am rich(premium),399.99
I am Rich Plus,399.99
I am Rich,399.99
I'm Rich/Eu sou Rico/أنا غني/我很有錢,399.99


In [0]:
top_rated = (google_df.filter((google_df.Type=="Paid") & (google_df.Rating!="NaN")).groupBy("App").agg({"Rating":"avg"}).orderBy("avg(Rating)",ascending=False))
display(top_rated)

App,avg(Rating)
AC DC Power Monitor,5.0
AJ Gray Dark Icon Pack,5.0
AP Art History Flashcards,5.0
Hey AJ! It's Bedtime!,5.0
Easy Hotspot Ad Free,5.0
DL Image Manager,5.0
FHR 5-Tier 2.0,5.0
Super Hearing Secret Voices Recorder PRO,5.0
AJ Blue Icon Pack,5.0
Mu.F.O.,5.0


In [0]:
%sql select App,avg(Rating) from google_view where Type = 'Paid' and Rating<>'NaN' group by 1 order by 2 desc limit 7;

App,avg(Rating)
DL Image Manager,5.0
P-Home for KLWP,5.0
FHR 5-Tier 2.0,5.0
AJ Gray Dark Icon Pack,5.0
Super Hearing Secret Voices Recorder PRO,5.0
Easy Hotspot Ad Free,5.0
AJ Blue Icon Pack,5.0
