## 0. Detecting PySpark

In [1]:
# Executed ahead of the `pyspark` import in the next cell.
# NOTE(review): findspark.init() presumably locates the local Spark installation
# and makes `pyspark` importable from this kernel — confirm against the
# findspark documentation.
import findspark
findspark.init() 

## 1. Initialization

In [2]:
from pyspark import SparkConf, SparkContext

# Run Spark on the local machine under the application name "FriendsByAge".
conf = SparkConf().setMaster("local").setAppName("FriendsByAge")

# getOrCreate() returns the SparkContext that is already active in this kernel
# (in which case `conf` is not applied again) and only starts a new one when
# none exists. Calling the SparkContext constructor directly would raise
# "Cannot run multiple SparkContexts at once" whenever this cell is re-executed.
sc = SparkContext.getOrCreate(conf=conf)

## 2. Parsing data into RDD

In [3]:
def parseLine(line):
    """Convert one comma-separated record into an ``(age, numFriends)`` pair.

    The third field (index 2) is read as the age and the fourth field
    (index 3) as the number of friends; both are converted with ``int``.
    """
    columns = line.split(',')
    # Index 2 -> age, index 3 -> number of friends.
    return (int(columns[2]), int(columns[3]))

## 3. Extracting Data to RDD

In [5]:
# Load the CSV as an RDD of text lines. The path is relative to the working
# directory and written with a forward slash so that it resolves on Windows,
# Linux and macOS alike; a backslash-separated path only resolves on Windows.
lines = sc.textFile("./fakefriends.csv")
# Turn every line into an (age, numFriends) pair using parseLine.
rdd = lines.map(parseLine)

## 4. Aggregation
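- vectorization: map each friend count to a `(numFriends, 1)` pair
- sum: add the pairs component-wise per age
- mean: divide the summed friend count by the number of records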

In [6]:
# Pair every friend count with an occurrence counter of 1, then add the pairs
# component-wise per age, giving (total friends, number of records) for each age.
totalByAge = (
    rdd
    .mapValues(lambda friends: (friends, 1))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)

# Mean per age = total friends / number of records.
averageByAge = totalByAge.mapValues(lambda total: total[0] / total[1])


## 5. Collect and Display 

In [7]:
# Bring every (age, average) pair back to the driver as a Python list.
results = averageByAge.collect()

# Print one (age, average) pair per line.
for age, average in results:
    print((age, average))

(33, 325.3333333333333)
(26, 242.05882352941177)
(55, 295.53846153846155)
(40, 250.8235294117647)
(68, 269.6)
(59, 220.0)
(37, 249.33333333333334)
(54, 278.0769230769231)
(38, 193.53333333333333)
(27, 228.125)
(53, 222.85714285714286)
(57, 258.8333333333333)
(56, 306.6666666666667)
(43, 230.57142857142858)
(36, 246.6)
(22, 206.42857142857142)
(35, 211.625)
(45, 309.53846153846155)
(60, 202.71428571428572)
(67, 214.625)
(19, 213.27272727272728)
(30, 235.8181818181818)
(51, 302.14285714285717)
(25, 197.45454545454547)
(21, 350.875)
(42, 303.5)
(49, 184.66666666666666)
(48, 281.4)
(50, 254.6)
(39, 169.28571428571428)
(32, 207.9090909090909)
(58, 116.54545454545455)
(64, 281.3333333333333)
(31, 267.25)
(52, 340.6363636363636)
(24, 233.8)
(20, 165.0)
(62, 220.76923076923077)
(41, 268.55555555555554)
(44, 282.1666666666667)
(69, 235.2)
(65, 298.2)
(61, 256.22222222222223)
(28, 209.1)
(66, 276.44444444444446)
(46, 223.69230769230768)
(29, 215.91666666666666)
(18, 343.375)
(47, 233.22222222222