In [1]:
import findspark
# Locate the local Spark installation and add its Python bindings to sys.path,
# so the `pyspark` import in the next cell works even when pyspark is not
# already importable in this environment.
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext

# Run Spark in local mode under the application name "FriendsByAge".
conf = SparkConf().setMaster("local").setAppName("FriendsByAge")

# getOrCreate() reuses the SparkContext already running in this kernel, so
# re-running this cell does not fail with
# "Cannot run multiple SparkContexts at once".
# NOTE: when a context already exists it is returned as-is and `conf` is not
# re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

In [11]:
def parseLine(line):
    """Turn one comma-separated record into an ``(age, numFriends)`` tuple.

    The fields at index 2 and index 3 of the comma-split line are taken as
    the age and the friend count respectively; both are converted to
    ``float``.

    Raises:
        IndexError: if the line has fewer than four comma-separated fields.
        ValueError: if either of the two fields is not a valid number.
    """
    columns = line.split(',')
    return (float(columns[2]), float(columns[3]))

# Load the data set as an RDD of raw text lines. Despite the .xls extension,
# the file is read as plain text; each line is a comma-separated record
# (e.g. '99,Keiko,69,491').
lines = sc.textFile("../../SparkData/fakefriends.xls")

# Convert every line into an (age, numFriends) tuple via parseLine.
rdd = lines.map(parseLine)

In [12]:
# top(5) returns the five largest elements in descending order (plain string
# comparison for these text lines), not the first five lines of the file.
lines.top(5)

['99,Keiko,69,491',
 '98,Will,44,178',
 '97,Nerys,69,361',
 '96,Ezri,25,233',
 '95,Odo,29,173']

In [17]:
# Five largest (age, numFriends) tuples in descending order; tuples compare by
# age first and then by number of friends.
rdd.top(5)

[(69.0, 491.0), (69.0, 470.0), (69.0, 431.0), (69.0, 361.0), (69.0, 236.0)]

In [20]:
# Pair every friend count with a counter of 1, keeping the age key untouched
# (mapValues only transforms the value). reduceByKey then adds the pairs
# component-wise, giving per age: (total number of friends, number of entries).
totalsByAge = (
    rdd.mapValues(lambda friends: (friends, 1))
       .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)

# Average for each age = total number of friends / number of entries.
averagesByAge = totalsByAge.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)

# Materialise the RDD on the driver as a list of (age, average) tuples.
results = averagesByAge.collect()

In [31]:
# Order the (age, average) pairs by ascending age (first tuple element).
results_by_age = sorted(results, key=lambda pair: pair[0])

In [32]:
# Display the per-age averages, lowest age first.
results_by_age

[(18.0, 343.375),
 (19.0, 213.27272727272728),
 (20.0, 165.0),
 (21.0, 350.875),
 (22.0, 206.42857142857142),
 (23.0, 246.3),
 (24.0, 233.8),
 (25.0, 197.45454545454547),
 (26.0, 242.05882352941177),
 (27.0, 228.125),
 (28.0, 209.1),
 (29.0, 215.91666666666666),
 (30.0, 235.8181818181818),
 (31.0, 267.25),
 (32.0, 207.9090909090909),
 (33.0, 325.3333333333333),
 (34.0, 245.5),
 (35.0, 211.625),
 (36.0, 246.6),
 (37.0, 249.33333333333334),
 (38.0, 193.53333333333333),
 (39.0, 169.28571428571428),
 (40.0, 250.8235294117647),
 (41.0, 268.55555555555554),
 (42.0, 303.5),
 (43.0, 230.57142857142858),
 (44.0, 282.1666666666667),
 (45.0, 309.53846153846155),
 (46.0, 223.69230769230768),
 (47.0, 233.22222222222223),
 (48.0, 281.4),
 (49.0, 184.66666666666666),
 (50.0, 254.6),
 (51.0, 302.14285714285717),
 (52.0, 340.6363636363636),
 (53.0, 222.85714285714286),
 (54.0, 278.0769230769231),
 (55.0, 295.53846153846155),
 (56.0, 306.6666666666667),
 (57.0, 258.8333333333333),
 (58.0, 116.5454545454

In [23]:
# Order the (age, average) pairs by ascending average number of friends
# (second tuple element).
results_by_ave_num_friends = sorted(results, key=lambda pair: pair[1])

In [24]:
# Display the per-age averages, lowest average friend count first.
results_by_ave_num_friends

[(58.0, 116.54545454545455),
 (20.0, 165.0),
 (39.0, 169.28571428571428),
 (49.0, 184.66666666666666),
 (38.0, 193.53333333333333),
 (25.0, 197.45454545454547),
 (60.0, 202.71428571428572),
 (22.0, 206.42857142857142),
 (32.0, 207.9090909090909),
 (28.0, 209.1),
 (35.0, 211.625),
 (19.0, 213.27272727272728),
 (67.0, 214.625),
 (29.0, 215.91666666666666),
 (59.0, 220.0),
 (62.0, 220.76923076923077),
 (53.0, 222.85714285714286),
 (46.0, 223.69230769230768),
 (27.0, 228.125),
 (43.0, 230.57142857142858),
 (47.0, 233.22222222222223),
 (24.0, 233.8),
 (69.0, 235.2),
 (30.0, 235.8181818181818),
 (26.0, 242.05882352941177),
 (34.0, 245.5),
 (23.0, 246.3),
 (36.0, 246.6),
 (37.0, 249.33333333333334),
 (40.0, 250.8235294117647),
 (50.0, 254.6),
 (61.0, 256.22222222222223),
 (57.0, 258.8333333333333),
 (31.0, 267.25),
 (41.0, 268.55555555555554),
 (68.0, 269.6),
 (66.0, 276.44444444444446),
 (54.0, 278.0769230769231),
 (64.0, 281.3333333333333),
 (48.0, 281.4),
 (44.0, 282.1666666666667),
 (55.0

Who knew 63-year-olds were so popular on this social networking site?