In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import *

In [0]:
path="/FileStore/tables/googlestore.csv"
Apps=spark.read.load("/FileStore/tables/googlestore.csv", format='csv', header=True, sep=',', escape='"', inferschema=True)
Apps.show(3)

+--------------------+--------------+------+-------+----+----------+----+-----+--------------+--------------------+----------------+-----------+------------+
|                 App|      Category|Rating|Reviews|Size|  Installs|Type|Price|Content Rating|              Genres|    Last Updated|Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+----------+----+-----+--------------+--------------------+----------------+-----------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M|   10,000+|Free|    0|      Everyone|        Art & Design| January 7, 2018|      1.0.0|4.0.3 and up|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967| 14M|  500,000+|Free|    0|      Everyone|Art & Design;Pret...|January 15, 2018|      2.0.0|4.0.3 and up|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|8.7M|5,000,000+|Free|    0|      Everyone|        Art & Design|  August 1, 2018|      1.2.4|4.0.3 and up|
+--------------------+--------------+------+-------+

In [0]:
Apps=Apps.drop("Android Ver","Current Ver","Last Updated","Size","Genres")
Apps.show(10)

+--------------------+--------------+------+-------+--------+----+-----+--------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|Content Rating|
+--------------------+--------------+------+-------+--------+----+-----+--------------+
|Photo Editor & Ca...|ART_AND_DESIGN|     4|    159|   10000|Free|    0|      Everyone|
| Coloring book moana|ART_AND_DESIGN|     3|    967|  500000|Free|    0|      Everyone|
|U Launcher Lite –...|ART_AND_DESIGN|     4|  87510| 5000000|Free|    0|      Everyone|
|Sketch - Draw & P...|ART_AND_DESIGN|     4| 215644|50000000|Free|    0|          Teen|
|Pixel Draw - Numb...|ART_AND_DESIGN|     4|    967|  100000|Free|    0|      Everyone|
|Paper flowers ins...|ART_AND_DESIGN|     4|    167|   50000|Free|    0|      Everyone|
|Smoke Effect Phot...|ART_AND_DESIGN|     3|    178|   50000|Free|    0|      Everyone|
|    Infinite Painter|ART_AND_DESIGN|     4|  36815| 1000000|Free|    0|      Everyone|
|Garden Coloring Book|ART_AND_DE

In [0]:
Apps.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: integer (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)



In [0]:
Apps = Apps.withColumn('Reviews', Apps['Reviews'].cast(IntegerType())).\
    withColumn('Rating', Apps['Rating'].cast(IntegerType()))

In [0]:
from pyspark.sql.functions import col, regexp_replace
Apps = Apps.withColumn("Installs", regexp_replace(col("Installs"),"[^0-9]",""))\
        .withColumn("Installs", col("Installs").cast (IntegerType()))\
        .withColumn("Price", regexp_replace(col("Price"),"[$]",""))\
            .withColumn("Price", col("Price").cast(IntegerType()))
Apps.show(10)

+--------------------+--------------+------+-------+--------+----+-----+--------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|Content Rating|
+--------------------+--------------+------+-------+--------+----+-----+--------------+
|Photo Editor & Ca...|ART_AND_DESIGN|     4|    159|   10000|Free|    0|      Everyone|
| Coloring book moana|ART_AND_DESIGN|     3|    967|  500000|Free|    0|      Everyone|
|U Launcher Lite –...|ART_AND_DESIGN|     4|  87510| 5000000|Free|    0|      Everyone|
|Sketch - Draw & P...|ART_AND_DESIGN|     4| 215644|50000000|Free|    0|          Teen|
|Pixel Draw - Numb...|ART_AND_DESIGN|     4|    967|  100000|Free|    0|      Everyone|
|Paper flowers ins...|ART_AND_DESIGN|     4|    167|   50000|Free|    0|      Everyone|
|Smoke Effect Phot...|ART_AND_DESIGN|     3|    178|   50000|Free|    0|      Everyone|
|    Infinite Painter|ART_AND_DESIGN|     4|  36815| 1000000|Free|    0|      Everyone|
|Garden Coloring Book|ART_AND_DE

In [0]:
Apps.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: integer (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Installs: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Content Rating: string (nullable = true)



In [0]:
Top_reviews=Apps.groupBy("App").sum('Reviews')
Top_reviews.sort(Top_reviews['sum(reviews)'].desc()).show(10)

+--------------------+------------+
|                 App|sum(Reviews)|
+--------------------+------------+
|           Instagram|   266241989|
|  WhatsApp Messenger|   207348304|
|      Clash of Clans|   179558781|
|Messenger – Text ...|   169932272|
|      Subway Surfers|   166331958|
|    Candy Crush Saga|   156993136|
|            Facebook|   156286514|
|         8 Ball Pool|    99386198|
|        Clash Royale|    92530298|
|            Snapchat|    68045010|
+--------------------+------------+
only showing top 10 rows



In [0]:
Top_install=Apps.groupBy("App","Type").sum("Installs")
Top_install.sort(Top_install['sum(installs)'].desc()).show(10)

+------------------+----+-------------+
|               App|Type|sum(Installs)|
+------------------+----+-------------+
|    Subway Surfers|Free|   6000000000|
|      Google Drive|Free|   4000000000|
|       Google News|Free|   4000000000|
|     Google Photos|Free|   4000000000|
|         Instagram|Free|   4000000000|
|          Hangouts|Free|   4000000000|
|  Candy Crush Saga|Free|   3500000000|
|      Temple Run 2|Free|   3000000000|
|WhatsApp Messenger|Free|   3000000000|
|             Gmail|Free|   3000000000|
+------------------+----+-------------+
only showing top 10 rows



In [0]:
category=Apps.groupBy("Category").sum("Installs")
category.sort(category['sum(installs)'].desc()).show(10)

+------------------+-------------+
|          Category|sum(Installs)|
+------------------+-------------+
|              GAME|  35086024415|
|     COMMUNICATION|  32647276251|
|      PRODUCTIVITY|  14176091369|
|            SOCIAL|  14069867902|
|             TOOLS|  11452771915|
|            FAMILY|  10258263505|
|       PHOTOGRAPHY|  10088247655|
|NEWS_AND_MAGAZINES|   7496317760|
|  TRAVEL_AND_LOCAL|   6868887146|
|     VIDEO_PLAYERS|   6222002720|
+------------------+-------------+
only showing top 10 rows



In [0]:
Paid_apps=Apps.groupBy("App").sum("Price")
Paid_apps.sort(Paid_apps['sum(Price)'].desc()).show(10)

+--------------------+----------+
|                 App|sum(Price)|
+--------------------+----------+
|I'm Rich - Trump ...|       400|
|I'm Rich/Eu sou R...|       399|
|       I Am Rich Pro|       399|
|  I AM RICH PRO PLUS|       399|
|           I am Rich|       399|
|most expensive ap...|       399|
|  I am rich(premium)|       399|
|         💎 I'm rich|       399|
|      I am Rich Plus|       399|
|   I Am Rich Premium|       399|
+--------------------+----------+
only showing top 10 rows



In [0]:
Paid_rating_apps=Apps.groupBy("App","Rating").sum("Price")
Paid_rating_apps.sort(Paid_rating_apps['sum(Price)'].desc()).show(10)

+--------------------+------+----------+
|                 App|Rating|sum(Price)|
+--------------------+------+----------+
|I'm Rich - Trump ...|     3|       400|
|           I am rich|     3|       399|
|I'm Rich/Eu sou R...|     0|       399|
|most expensive ap...|     4|       399|
|         💎 I'm rich|     3|       399|
|           I am Rich|     4|       399|
|          I am Rich!|     3|       399|
|I am rich (Most e...|     4|       399|
|       I Am Rich Pro|     4|       399|
|   I Am Rich Premium|     4|       399|
+--------------------+------+----------+
only showing top 10 rows

