In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
from pyspark.sql.functions import * 
from pyspark.sql.functions import regexp_replace, col

In [3]:
spark=SparkSession.builder.appName("play").getOrCreate()

In [6]:
df=spark.read.csv("googleplaystore.csv",inferSchema=True,header=True)

In [9]:
df.show()

+--------------------+--------------+------+-------+----+-----------+----+-----+--------------+--------------------+------------------+------------------+------------+
|                 App|      Category|Rating|Reviews|Size|   Installs|Type|Price|Content Rating|              Genres|      Last Updated|       Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+-----------+----+-----+--------------+--------------------+------------------+------------------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M|    10,000+|Free|    0|      Everyone|        Art & Design|   January 7, 2018|             1.0.0|4.0.3 and up|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967| 14M|   500,000+|Free|    0|      Everyone|Art & Design;Pret...|  January 15, 2018|             2.0.0|4.0.3 and up|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|8.7M| 5,000,000+|Free|    0|      Everyone|        Art & Design|    August 1, 2018|             1.2.4|4.0.3 

In [8]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



In [10]:
df.drop("size","Content Rating","Last Updated","Android Ver")

DataFrame[App: string, Category: string, Rating: string, Reviews: string, Installs: string, Type: string, Price: string, Genres: string, Current Ver: string]

In [11]:
df1=df.withColumn("Reviews",col("Reviews").cast(IntegerType())).withColumn("Installs",regexp_replace(col("Installs"),"[^0-9]",""))\
.withColumn("Installs",col("Installs").cast(IntegerType())).withColumn("Price",regexp_replace(col("Price"),"[$]",""))\
.withColumn("Installs",col("Installs").cast(IntegerType()))

In [20]:
df1.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



In [13]:
df1.show()

+--------------------+--------------+------+-------+----+--------+----+-----+--------------+--------------------+------------------+------------------+------------+
|                 App|      Category|Rating|Reviews|Size|Installs|Type|Price|Content Rating|              Genres|      Last Updated|       Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+--------------------+------------------+------------------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M|   10000|Free|    0|      Everyone|        Art & Design|   January 7, 2018|             1.0.0|4.0.3 and up|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967| 14M|  500000|Free|    0|      Everyone|Art & Design;Pret...|  January 15, 2018|             2.0.0|4.0.3 and up|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|8.7M| 5000000|Free|    0|      Everyone|        Art & Design|    August 1, 2018|             1.2.4|4.0.3 and up|
|Sketch - 

In [22]:
#Average number of installs per category
df1.groupBy("Category").agg(avg("Installs").alias("Average Installs")).orderBy("Average Installs",ascending=False).show()

+-------------------+--------------------+
|           Category|    Average Installs|
+-------------------+--------------------+
|      COMMUNICATION| 8.435988695348836E7|
|             SOCIAL| 4.769446746440678E7|
|      VIDEO_PLAYERS| 3.555430125714286E7|
|       PRODUCTIVITY| 3.343417775707547E7|
|               GAME|3.0669601761363637E7|
|        PHOTOGRAPHY| 3.011417210447761E7|
|   TRAVEL_AND_LOCAL| 2.662359358914729E7|
| NEWS_AND_MAGAZINES|2.6488755335689045E7|
|      ENTERTAINMENT|1.9256107382550336E7|
|              TOOLS|1.3585731809015421E7|
|           SHOPPING|1.2491726096153846E7|
|BOOKS_AND_REFERENCE|   8318050.112554112|
|    PERSONALIZATION|   5932384.647959184|
|          EDUCATION|   5586230.769230769|
|MAPS_AND_NAVIGATION|   5288837.426470588|
|             FAMILY|   5201959.181034483|
|            WEATHER|   5196347.804878049|
| HEALTH_AND_FITNESS|   4653154.447058824|
|             SPORTS|   4560350.255208333|
|            FINANCE|   2395215.120218579|
+----------

In [35]:
#Number of Paid Apps
count=df1.where(col("Type") != "Free").count()
print(f"Total number of Paid apps {count}")

Total number of Paid apps 804


In [41]:
df1.createOrReplaceTempView("play")

In [42]:
spark.sql("select * from play").show()

+--------------------+--------------+------+-------+----+--------+----+-----+--------------+--------------------+------------------+------------------+------------+
|                 App|      Category|Rating|Reviews|Size|Installs|Type|Price|Content Rating|              Genres|      Last Updated|       Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+--------------------+------------------+------------------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M|   10000|Free|    0|      Everyone|        Art & Design|   January 7, 2018|             1.0.0|4.0.3 and up|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967| 14M|  500000|Free|    0|      Everyone|Art & Design;Pret...|  January 15, 2018|             2.0.0|4.0.3 and up|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510|8.7M| 5000000|Free|    0|      Everyone|        Art & Design|    August 1, 2018|             1.2.4|4.0.3 and up|
|Sketch - 

In [44]:
#App with highest number of Installs
spark.sql("select App from play order by Installs DESC limit 1").show()

+-----------------+
|              App|
+-----------------+
|Google Play Books|
+-----------------+



In [53]:
#top 5 Apps with highest Reviews
spark.sql("select App,Reviews from play order by Reviews DESC limit 10").show()

+--------------------+--------+
|                 App| Reviews|
+--------------------+--------+
|            Facebook|78158306|
|            Facebook|78128208|
|  WhatsApp Messenger|69119316|
|  WhatsApp Messenger|69119316|
|  WhatsApp Messenger|69109672|
|           Instagram|66577446|
|           Instagram|66577313|
|           Instagram|66577313|
|           Instagram|66509917|
|Messenger – Text ...|56646578|
+--------------------+--------+

