In [1]:
import sys
import os
import random
from operator import add, mul
from pyspark import SparkContext, SparkConf
from pyspark import SparkFiles
from pyspark.sql import SparkSession, SQLContext

In [2]:
#conf = SparkConf().setAppName("M5-CA1-TGA")
#sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.appName("M5-CA1-TGA").getOrCreate() # singleton instance

In [3]:
sc = spark.sparkContext
spark.sparkContext.applicationId

u'application_1528714825862_139422'

### 1. Load csv into spark as a text file

In [4]:
location = "/user/edureka_672184/use_cases/AppleStore.csv"
apple = sc.textFile(location)

### 2. Parse the data as csv.

In [5]:
# split by comma
# exclude header
import re
pattern = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
apple_lines = apple.map(lambda line: re.split(pattern, line)).filter((lambda cols: cols[0]!='""'))
apple_lines.count()

7197

### 3. Convert bytes to MB and GB in a new column

In [6]:
# divide by 1000
# divide by 1000000
MB = apple_lines.map(lambda x: float(x[3]) / 1000.)
GB = MB.map(lambda x: x / 1000.)

In [7]:
#GB.collect()

### 4. List top 10 trending apps

In [8]:
# id, track_name, rating_count_ver
apple_lines.sortBy(lambda line: float(line[7]), ascending=False).map(lambda line: (line[2], line[7])).take(10)

[(u'"Infinity Blade"', u'177050'),
 (u'"Geometry Dash Meltdown"', u'117470'),
 (u'"My Verizon"', u'107245'),
 (u'"Real Basketball"', u'94315'),
 (u'"Zillow Real Estate - Homes for Sale & for Rent"', u'88478'),
 (u'"WhatsApp Messenger"', u'73088'),
 (u'"Clear Vision (17+)"', u'69225'),
 (u'"Guess My Age \ue020 Math Magic"', u'68841'),
 (u'"Trigger Fist"', u'58269'),
 (u'"Zappos: shop shoes & clothes, fast free shipping"', u'39452')]

### 5. The difference in the average number of screenshots displayed of highest and lowest rating apps.

In [9]:
# "ipadSc_urls.num"

# user rating above 3 and below 3

highest = apple_lines.filter(lambda line: float(line[8]) > 3.0)
higest_sc = highest.map(lambda line: float(line[14])).mean()
lowest = apple_lines.filter(lambda line: float(line[8]) < 3.0)
lowest_sc = lowest.map(lambda line: float(line[14])).mean()

print(higest_sc - lowest_sc)

1.3675834041


### 6. What percentage of high rated apps support multiple languages.

In [10]:
# lang.num
total_higest = highest.count()
print(total_higest)
highest_multiple = highest.map(lambda line: float(line[15])).filter(lambda value: value > 1).count()
print(str(highest_multiple * 100 / total_higest) + " %")

5483
50 %


In [11]:
# location = "/user/edureka_672184/use_cases/appleStore_description.csv"
# # df = spark.read.option("header", "true").option('quote', '"').option('escape', '"').load(location)
# # df = spark.read.csv(location, sep=',', escape='"', header=True)
# #goog_df = spark.read.csv(GOOG_CSV,inferSchema=True,header=True)
# df = spark.read.csv(location, header=True)

### 7. How does app details contribute to user ratings?

### 8. Compare the statistics of different app groups/genres.


In [52]:
apple_lines.take(3)

[[u'"1"',
  u'"281656475"',
  u'"PAC-MAN Premium"',
  u'100788224',
  u'"USD"',
  u'3.99',
  u'21292',
  u'26',
  u'4',
  u'4.5',
  u'"6.3.5"',
  u'"4+"',
  u'"Games"',
  u'38',
  u'5',
  u'10',
  u'1'],
 [u'"2"',
  u'"281796108"',
  u'"Evernote - stay organized"',
  u'158578688',
  u'"USD"',
  u'0',
  u'161065',
  u'26',
  u'4',
  u'3.5',
  u'"8.2.2"',
  u'"4+"',
  u'"Productivity"',
  u'37',
  u'5',
  u'23',
  u'1'],
 [u'"3"',
  u'"281940292"',
  u'"WeatherBug - Local Weather, Radar, Maps, Alerts"',
  u'100524032',
  u'"USD"',
  u'0',
  u'188583',
  u'2822',
  u'3.5',
  u'4.5',
  u'"5.0.0"',
  u'"4+"',
  u'"Weather"',
  u'37',
  u'5',
  u'3',
  u'1']]

In [54]:
genre_prices = apple_lines.map(lambda line: (line[12], float(line[5])))
grp = genre_prices.groupByKey()
# show mean app price by genere
print(list((j[0], sum(list(j[1]))/len(list(j[1])) ) for j in grp.take(10)))

[(u'"Travel"', 1.1203703703703702), (u'"Food & Drink"', 1.552380952380952), (u'"Photo & Video"', 1.4732951289398313), (u'"Music"', 4.835434782608699), (u'"Reference"', 4.836875000000004), (u'"Business"', 5.116315789473686), (u'"Navigation"', 4.1247826086956545), (u'"Lifestyle"', 0.8854166666666657), (u'"Social Networking"', 0.3398802395209582), (u'"Productivity"', 4.330561797752813)]


### 9. Does length of app description contribute to the ratings?

In [19]:
location = "/user/edureka_672184/use_cases/appleStore_description.csv"
description = sc.textFile(location)
pattern = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
#pattern = '(?:((?:"(?:[^"]*"")*[^"]*"(?:,|$))|(?:[^",]*(?:,|$)))|(.+))?'
# split by pattern and ignore header
description = description.map(lambda line: line.split(pattern))
# remove white space
description = description.filter(lambda cols: cols[0] != '')
# remove header
description = description.filter(lambda cols: 'id' not in cols[0])

current_id = 0
def wordCountByID(alist):
    # get global variable
    global current_id
    # split each line by comma
    elements = alist[0].split(",")
    # remove double quotes
    idd = elements[0].replace('"', "")
    # if first element is a digit
    # it is an application id
    if(idd.isdigit()):
        # convert it to int
        current_id = int(idd)
        # return nothing
        return []
    # it is a list full of text
    else:
        # associate words with the current id
        return [[(current_id, (word, 1)) for word in statement.split()] for statement in elements]
# get pairs like this (id, (word,1)    
description_wList = description.map(wordCountByID).filter(lambda listt: len(listt) != 0)
# flatten list of lists and group by key
word_freq = description_wList.flatMap(lambda listt: listt[0]).groupByKey()
# find length of each iteratable per group the sort it by that length in ascending order, then select top 10
top10 = word_freq.map(lambda j: (j[0], len(list(j[1])))).sortBy(lambda line: line[1], ascending=False).take(10)
# find least 10 
least10 = word_freq.map(lambda j: (j[0], len(list(j[1])))).sortBy(lambda line: line[1], ascending=True).take(10)

In [21]:
top10ids = [i[0] for i in top10]
least10ids = [i[0] for i in least10]

In [23]:
least10ids

[976257847,
 391965015,
 289446241,
 519952689,
 1065719308,
 1167873588,
 1185580782,
 1086688310,
 982936366,
 395893124]

In [55]:
# get ratings of top 10
def manipulate1(x):
    # column one is id
    # 1:-1 ignores the first and last quotations
    if int(x[1][1:-1]) in top10ids:
        # column 8 is overall rating
        return x[8]
apple_lines.map(manipulate).filter(lambda x: x).collect()

[u'3', u'2', u'3', u'1.5', u'0', u'4', u'0', u'0', u'0', u'4']

In [56]:
# get ratings of least 10
def manipulate2(x):
    if int(x[1][1:-1]) in least10ids:
        return x[8]
apple_lines.map(manipulate).filter(lambda x: x).collect()

[u'3', u'2', u'3', u'1.5', u'0', u'4', u'0', u'0', u'0', u'4']

Most of the top 10 highly described apps have a rating about 4. Apps with small description are rated below 4, including 2 and 3.

In [59]:
# 10. Create a spark-submit application for the same and print the findings in the log
!spark2-submit /mnt/home/edureka_672184/m5q10.py

19/07/22 06:34:20 INFO spark.SparkContext: Running Spark version 2.1.0.cloudera2
19/07/22 06:34:20 INFO spark.SecurityManager: Changing view acls to: edureka_672184
19/07/22 06:34:20 INFO spark.SecurityManager: Changing modify acls to: edureka_672184
19/07/22 06:34:20 INFO spark.SecurityManager: Changing view acls groups to: 
19/07/22 06:34:20 INFO spark.SecurityManager: Changing modify acls groups to: 
19/07/22 06:34:20 INFO spark.SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(edureka_672184); groups with view permissions: Set(); users  with modify permissions: Set(edureka_672184); groups with modify permissions: Set()
19/07/22 06:34:21 INFO util.Utils: Successfully started service 'sparkDriver' on port 41120.
19/07/22 06:34:21 INFO spark.SparkEnv: Registering MapOutputTracker
19/07/22 06:34:21 INFO spark.SparkEnv: Registering BlockManagerMaster
19/07/22 06:34:22 INFO storage.BlockManagerMasterEndpoint: Using org.apache.s

19/07/22 06:34:34 INFO mapred.FileInputFormat: Total input paths to process : 1
19/07/22 06:34:35 INFO spark.SparkContext: Starting job: sortBy at /mnt/home/edureka_672184/m5q10.py:72
19/07/22 06:34:35 INFO scheduler.DAGScheduler: Registering RDD 5 (groupByKey at /mnt/home/edureka_672184/m5q10.py:70)
19/07/22 06:34:35 INFO scheduler.DAGScheduler: Got job 0 (sortBy at /mnt/home/edureka_672184/m5q10.py:72) with 2 output partitions
19/07/22 06:34:35 INFO scheduler.DAGScheduler: Final stage: ResultStage 1 (sortBy at /mnt/home/edureka_672184/m5q10.py:72)
19/07/22 06:34:35 INFO scheduler.DAGScheduler: Parents of final stage: List(ShuffleMapStage 0)
19/07/22 06:34:35 INFO scheduler.DAGScheduler: Missing parents: List(ShuffleMapStage 0)
19/07/22 06:34:35 INFO scheduler.DAGScheduler: Submitting ShuffleMapStage 0 (PairwiseRDD[5] at groupByKey at /mnt/home/edureka_672184/m5q10.py:70), which has no missing parents
19/07/22 06:34:35 INFO memory.MemoryStore: Block broadcast_2 stored as values in mem

19/07/22 06:34:48 INFO scheduler.TaskSetManager: Finished task 1.0 in stage 3.0 (TID 5) in 246 ms on ip-20-0-31-210.ec2.internal (executor 1) (2/2)
19/07/22 06:34:48 INFO cluster.YarnScheduler: Removed TaskSet 3.0, whose tasks have all completed, from pool 
19/07/22 06:34:48 INFO scheduler.DAGScheduler: ResultStage 3 (sortBy at /mnt/home/edureka_672184/m5q10.py:72) finished in 0.248 s
19/07/22 06:34:48 INFO scheduler.DAGScheduler: Job 1 finished: sortBy at /mnt/home/edureka_672184/m5q10.py:72, took 0.271565 s
19/07/22 06:34:48 INFO spark.SparkContext: Starting job: runJob at PythonRDD.scala:441
19/07/22 06:34:48 INFO scheduler.DAGScheduler: Registering RDD 11 (sortBy at /mnt/home/edureka_672184/m5q10.py:72)
19/07/22 06:34:48 INFO scheduler.DAGScheduler: Got job 2 (runJob at PythonRDD.scala:441) with 1 output partitions
19/07/22 06:34:48 INFO scheduler.DAGScheduler: Final stage: ResultStage 6 (runJob at PythonRDD.scala:441)
19/07/22 06:34:48 INFO scheduler.DAGScheduler: Parents of final

19/07/22 06:34:49 INFO spark.SparkContext: Starting job: sortBy at /mnt/home/edureka_672184/m5q10.py:74
19/07/22 06:34:49 INFO scheduler.DAGScheduler: Got job 4 (sortBy at /mnt/home/edureka_672184/m5q10.py:74) with 2 output partitions
19/07/22 06:34:49 INFO scheduler.DAGScheduler: Final stage: ResultStage 10 (sortBy at /mnt/home/edureka_672184/m5q10.py:74)
19/07/22 06:34:49 INFO scheduler.DAGScheduler: Parents of final stage: List(ShuffleMapStage 9)
19/07/22 06:34:49 INFO scheduler.DAGScheduler: Missing parents: List()
19/07/22 06:34:49 INFO scheduler.DAGScheduler: Submitting ResultStage 10 (PythonRDD[16] at sortBy at /mnt/home/edureka_672184/m5q10.py:74), which has no missing parents
19/07/22 06:34:49 INFO memory.MemoryStore: Block broadcast_8 stored as values in memory (estimated size 8.9 KB, free 92.6 MB)
19/07/22 06:34:49 INFO memory.MemoryStore: Block broadcast_8_piece0 stored as bytes in memory (estimated size 5.2 KB, free 92.5 MB)
19/07/22 06:34:49 INFO storage.BlockManagerInfo:

19/07/22 06:34:50 INFO spark.MapOutputTrackerMasterEndpoint: Asked to send map output locations for shuffle 2 to 20.0.31.210:39858
19/07/22 06:34:50 INFO spark.MapOutputTrackerMaster: Size of output statuses for shuffle 2 is 170 bytes
19/07/22 06:34:50 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 13.0 (TID 15) in 26 ms on ip-20-0-31-210.ec2.internal (executor 1) (1/1)
19/07/22 06:34:50 INFO cluster.YarnScheduler: Removed TaskSet 13.0, whose tasks have all completed, from pool 
19/07/22 06:34:50 INFO scheduler.DAGScheduler: ResultStage 13 (runJob at PythonRDD.scala:441) finished in 0.026 s
19/07/22 06:34:50 INFO scheduler.DAGScheduler: Job 5 finished: runJob at PythonRDD.scala:441, took 0.284706 s
19/07/22 06:34:50 INFO mapred.FileInputFormat: Total input paths to process : 1
19/07/22 06:34:50 INFO spark.SparkContext: Starting job: collect at /mnt/home/edureka_672184/m5q10.py:79
19/07/22 06:34:50 INFO scheduler.DAGScheduler: Got job 6 (collect at /mnt/home/edureka_672184/m5