In [3]:
from pyspark.sql import SparkSession

# Crear una sesión de Spark
spark = SparkSession.builder.appName("MiApp").getOrCreate()

# Ahora puedes acceder a 'sc' y otros métodos de Spark
sc = spark.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/14 02:02:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/14 02:02:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/08/14 02:02:52 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


### Map function

In [4]:
data = sc.parallelize([1,2,3,4,5,6,7,8])
doble = data.map(lambda x : x * 2)
print("num elementos",doble.count())
print("elementos",doble.collect())
print("media",doble.mean())
print("elementos ordenados",doble.takeOrdered(3))
print("elementos ordenados de mayor a menor",doble.top(5))

print ("La suma de los elementos al cuadrado es %d" % doble.sum())

menor6 = doble.filter(lambda x : x < 6)
print ("Hay %d elementos al cuadrado menores que 6" % menor6.count())

                                                                                

num elementos 8
elementos [2, 4, 6, 8, 10, 12, 14, 16]
media 9.0
elementos ordenados [2, 4, 6]
elementos ordenados de mayor a menor [16, 14, 12, 10, 8]
La suma de los elementos al cuadrado es 72
Hay 2 elementos al cuadrado menores que 6


### Sumatorio de elementos de un vector

In [6]:
def partitionSumatorio(nums):
    sumCount = [0, 0]
    for num in nums:
        sumCount[0] += num
        sumCount[1] += 1
    return [sumCount]


def combinarNumeros(c1, c2):
    return (c1[0] + c2[0], c1[1] + c2[1])


def basicAvg(nums):
    """Obtener la media"""
    sumCount = nums.mapPartitions(partitionSumatorio).reduce(combinarNumeros)
    print("Sumatorio",sumCount[0])
    print("Num elementos",sumCount[1])
    return sumCount[0] / float(sumCount[1])

In [7]:
nums = sc.parallelize([1, 2, 3, 4, 5])
avg = basicAvg(nums)
avg

Sumatorio 15
Num elementos 5


3.0

### Agrupar elementos de un diccionario

In [8]:
def diccionarioAgrupar(nums):
    sumCount = nums.combineByKey((lambda x: (x, 1)),
                                 (lambda x, y: (x[0] + y, x[1] + 1)),
                                 (lambda x, y: (x[0] + y[0], x[1] + y[1])))
    return sumCount.collectAsMap()

In [10]:
diccionario = sc.parallelize([("coffee", 1), ("pandas", 2), ("coffee", 3), ("very", 4)])
resultado = diccionarioAgrupar(diccionario)
print (resultado)

{'coffee': (4, 2), 'pandas': (2, 1), 'very': (4, 1)}


### Accumulator

In [11]:
from pyspark.accumulators import AccumulatorParam

class Sumador(AccumulatorParam):
    def zero(self, initialValue):
        return 1
    def addInPlace(self, v1, v2):
        return v1+v2

In [12]:
acc = sc.accumulator(1,Sumador())
sc.parallelize([1,2,3,4]).foreach(lambda x: acc.add(x))

print("value %d " % acc.value)

value 31 


### Filter /map

In [13]:
def basicSquareNoOnes(nums):
    """Square the numbers"""
    return nums.map(lambda x: x * x).filter(lambda x: x != 1)

In [15]:
nums = sc.parallelize([1, 2, 3, 4])
output = sorted(basicSquareNoOnes(nums).collect())
for num in output:
    print ("%i " % (num))

4 
9 
16 


### flatMapValues

In [21]:
import itertools

def combineIfBothPresent(itrs):
    """Return an iterable of the elements from
    both itr1 and itr2 if there are elements in both itr1 and itr2 otherwise
    return an empty itrable"""
    iter1 = itrs[0].__iter__()
    iter2 = itrs[1].__iter__()
    try:
        e1 = next(iter1)
        e2 = next(iter2)
        return itertools.chain([e1], [e2], iter1, iter2)
    except StopIteration:
        return []


def intersectByKey(rdd1, rdd2):
    """Intersect two RDDs by key"""
    return rdd1.cogroup(rdd2).flatMapValues(combineIfBothPresent)

In [22]:
rdd1 = sc.parallelize([("pyspark", 1), ("pandas", 2), ("pyspark", 3), ("apache", 4)])
rdd2 = sc.parallelize([("pyspark", 20), ("pandas", 21)])
print (intersectByKey(rdd1, rdd2).collect())

[('pandas', 2), ('pandas', 21), ('pyspark', 1), ('pyspark', 20), ('pyspark', 3)]
