In [4]:
import findspark
findspark.init()

In [6]:
import pyspark
from pyspark.sql.types import StructType,StructField, StringType,IntegerType
from pyspark.sql.functions import *
from pyspark.sql import SparkSession


In [7]:
# Spark Session
spark = (
    SparkSession
    .builder
    .appName("Project")
    .master("local[*]")
    .getOrCreate()
)

spark

In [9]:
df = spark.read.option("header", True).option("inferSchema", True).csv("C:/Users/vaishnavi/Desktop/Pyspark/Google Store Pyspark Project/googleplaystore.csv")


In [15]:
df.show(1)

+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
|                 App|      Category|Rating|Reviews|Size|Installs|Type|Price|Content Rating|      Genres|   Last Updated|Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M| 10,000+|Free|    0|      Everyone|Art & Design|January 7, 2018|      1.0.0|4.0.3 and up|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
only showing top 1 row



In [14]:
df.count()

10841

In [16]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



In [17]:
df.describe()

DataFrame[summary: string, App: string, Category: string, Rating: string, Reviews: string, Size: string, Installs: string, Type: string, Price: string, Content Rating: string, Genres: string, Last Updated: string, Current Ver: string, Android Ver: string]

In [18]:
df = df.drop("Size","Content Rating","Content Rating","Android Ver")

In [19]:
df.show(5)

+--------------------+--------------+------+-------+-----------+----+-----+--------------------+----------------+------------------+
|                 App|      Category|Rating|Reviews|   Installs|Type|Price|              Genres|    Last Updated|       Current Ver|
+--------------------+--------------+------+-------+-----------+----+-----+--------------------+----------------+------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|    10,000+|Free|    0|        Art & Design| January 7, 2018|             1.0.0|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|   500,000+|Free|    0|Art & Design;Pret...|January 15, 2018|             2.0.0|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5,000,000+|Free|    0|        Art & Design|  August 1, 2018|             1.2.4|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50,000,000+|Free|    0|        Art & Design|    June 8, 2018|Varies with device|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|   100,000+|Free|

In [20]:
df = df.drop("Current Ver")

In [22]:
df.show(5)

+--------------------+--------------+------+-------+-----------+----+-----+--------------------+----------------+
|                 App|      Category|Rating|Reviews|   Installs|Type|Price|              Genres|    Last Updated|
+--------------------+--------------+------+-------+-----------+----+-----+--------------------+----------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|    10,000+|Free|    0|        Art & Design| January 7, 2018|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|   500,000+|Free|    0|Art & Design;Pret...|January 15, 2018|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5,000,000+|Free|    0|        Art & Design|  August 1, 2018|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50,000,000+|Free|    0|        Art & Design|    June 8, 2018|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|   100,000+|Free|    0|Art & Design;Crea...|   June 20, 2018|
+--------------------+--------------+------+-------+-----------+----+-----+-------------

In [21]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)



In [32]:
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.types import IntegerType, FloatType

In [33]:
df = df.withColumn("Rating", col("Rating").cast(FloatType())) \
       .withColumn("Reviews", col("Reviews").cast(IntegerType())) \
       .withColumn("Installs", regexp_replace(col("Installs"), "[^0-9]", "")) \
       .withColumn("Installs", col("Installs").cast(IntegerType())) \
       .withColumn("Price", regexp_replace(col("Price"), "[$]", "")) \
       .withColumn("Price", col("Price").cast(FloatType()))

In [34]:
df.show()

+--------------------+--------------+------+-------+--------+----+-----+--------------------+------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|      Last Updated|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|  0.0|        Art & Design|   January 7, 2018|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|  0.0|Art & Design;Pret...|  January 15, 2018|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|  0.0|        Art & Design|    August 1, 2018|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|  0.0|        Art & Design|      June 8, 2018|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|  0.0|Art & Design;Crea...|     June 20, 2018|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|   50000|Free|  0.0|        Art & Design|   

In [40]:
df.createOrReplaceTempView("Gdata")

In [46]:
spark.sql("SELECT * FROM Gdata").show()

+--------------------+--------------+------+-------+--------+----+-----+--------------------+------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|      Last Updated|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|  0.0|        Art & Design|   January 7, 2018|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|  0.0|Art & Design;Pret...|  January 15, 2018|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|  0.0|        Art & Design|    August 1, 2018|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|  0.0|        Art & Design|      June 8, 2018|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|  0.0|Art & Design;Crea...|     June 20, 2018|
|Paper flowers ins...|ART_AND_DESIGN|   4.4|    167|   50000|Free|  0.0|        Art & Design|   

In [48]:
#TOP Reviews given to apps
spark.sql("select App , sum(Reviews) as Count from Gdata group by App order by count desc limit 10").show()

+--------------------+---------+
|                 App|    Count|
+--------------------+---------+
|           Instagram|266241989|
|  WhatsApp Messenger|207348304|
|      Clash of Clans|179558781|
|Messenger – Text ...|169932272|
|      Subway Surfers|166331958|
|    Candy Crush Saga|156993136|
|            Facebook|156286514|
|         8 Ball Pool| 99386198|
|        Clash Royale| 92530298|
|            Snapchat| 68045010|
+--------------------+---------+



In [51]:
#Top 10 Install Apps
spark.sql("select App,Type , sum(Installs)as Count from Gdata group by App,Type order by Count desc ").show()

+--------------------+----+----------+
|                 App|Type|     Count|
+--------------------+----+----------+
|      Subway Surfers|Free|6000000000|
|           Instagram|Free|4000000000|
|        Google Drive|Free|4000000000|
|            Hangouts|Free|4000000000|
|       Google Photos|Free|4000000000|
|         Google News|Free|4000000000|
|    Candy Crush Saga|Free|3500000000|
|  WhatsApp Messenger|Free|3000000000|
|               Gmail|Free|3000000000|
|        Temple Run 2|Free|3000000000|
|Skype - free IM &...|Free|3000000000|
|Google Chrome: Fa...|Free|3000000000|
|Messenger – Text ...|Free|3000000000|
|Maps - Navigate &...|Free|3000000000|
|     Viber Messenger|Free|2500000000|
|   Google Play Games|Free|2000000000|
|            Facebook|Free|2000000000|
|            Snapchat|Free|2000000000|
|imo free video ca...|Free|2000000000|
|  Google Street View|Free|2000000000|
+--------------------+----+----------+
only showing top 20 rows



In [52]:
#Category wise Distribution
spark.sql("select Category, sum(Installs)as Count from Gdata group by Category order by Count desc ").show()

+-------------------+-----------+
|           Category|      Count|
+-------------------+-----------+
|               GAME|35086024415|
|      COMMUNICATION|32647276251|
|       PRODUCTIVITY|14176091369|
|             SOCIAL|14069867902|
|              TOOLS|11452771915|
|             FAMILY|10258263505|
|        PHOTOGRAPHY|10088247655|
| NEWS_AND_MAGAZINES| 7496317760|
|   TRAVEL_AND_LOCAL| 6868887146|
|      VIDEO_PLAYERS| 6222002720|
|           SHOPPING| 3247848785|
|      ENTERTAINMENT| 2869160000|
|    PERSONALIZATION| 2325494782|
|BOOKS_AND_REFERENCE| 1921469576|
|             SPORTS| 1751174498|
| HEALTH_AND_FITNESS| 1582072512|
|           BUSINESS| 1001914865|
|            FINANCE|  876648734|
|          EDUCATION|  871452000|
|MAPS_AND_NAVIGATION|  719281890|
+-------------------+-----------+
only showing top 20 rows



In [57]:
#Top Paid Apps
spark.sql("select App,sum(Price) as Price from Gdata where type='Paid' group by App order by Price desc").show()

+--------------------+------------------+
|                 App|             Price|
+--------------------+------------------+
|I'm Rich - Trump ...|             400.0|
|  I AM RICH PRO PLUS|  399.989990234375|
|   I Am Rich Premium|  399.989990234375|
|      I am Rich Plus|  399.989990234375|
|I'm Rich/Eu sou R...|  399.989990234375|
|most expensive ap...|  399.989990234375|
|       I Am Rich Pro|  399.989990234375|
|  I am rich(premium)|  399.989990234375|
|           I am Rich|  399.989990234375|
|          I am Rich!|  399.989990234375|
|         💎 I'm rich|  399.989990234375|
|I am rich (Most e...|  399.989990234375|
|           I am rich|  399.989990234375|
|         Eu Sou Rico|  394.989990234375|
|           I Am Rich|  389.989990234375|
| I am extremely Rich|  379.989990234375|
|       I am rich VIP|  299.989990234375|
|        EP Cook Book|             200.0|
|Vargo Anesthesia ...|159.97999572753906|
|       cronometra-br|154.99000549316406|
+--------------------+-------------