<a href="https://colab.research.google.com/github/SilvanaJ90/udemy_data_engineer/blob/main/Pair_RDDs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
pip install pyspark



In [7]:
from pyspark import SparkContext

In [5]:
# Reuse the active SparkContext if one already exists; calling SparkContext()
# again in the same session (e.g. when this cell is re-run) raises a ValueError.
sc = SparkContext.getOrCreate()

**Creando un Pair RDD a partir de una lista de tuplas**

In [8]:
lista_tuplas = [('a', 1), ('b', 2), ('c', 3)]

In [9]:
lista_tuplas

[('a', 1), ('b', 2), ('c', 3)]

In [10]:
pair_rdd = sc.parallelize(lista_tuplas)

In [11]:
pair_rdd.collect()

[('a', 1), ('b', 2), ('c', 3)]

**Utilizando zip()**

In [12]:
# Pair each letter with a number by zipping a list with a range before parallelizing.
pair_rdd2 = sc.parallelize(zip((['a', 'b', 'c']), range(1,4,1))) # range(start, stop, step)

In [13]:
pair_rdd2.collect()

[('a', 1), ('b', 2), ('c', 3)]

Ejemplo utilizando un archivo

In [14]:
# Load the text file as an RDD of lines.
# NOTE(review): the path lives under /content/drive, so presumably Google Drive
# must be mounted in Colab first — no mount cell is visible here; confirm.
rdd_celestina = sc.textFile('/content/drive/MyDrive/dataset/La+Celestina.txt')

In [15]:
# Build (first space-separated token, full line) pairs. The result is neither
# assigned nor collected, so this cell only displays the resulting RDD object.
rdd_celestina.map(lambda x: (x.split(" ")[0],x))

PythonRDD[4] at RDD at PythonRDD.scala:53

In [16]:
rdd_celestina.takeSample(False, 15)

['ni mas rara": "Amici veri maxime in adversis haerent:',
 '',
 '_Etor_ **** Hector, hijo de Priamo y Hecuba.  En "La',
 '',
 '',
 '_incusarla, incusarnos, incuses_ **** incusar: acusar',
 'alrededor de Pleberio cuando este da su discurso final). Y aunque',
 'tan franco, que te dara para esso e para mas.',
 '_estimar_ **** juzgar, creer;',
 '',
 'llaman ypocrita: que diran, sino que andas royendo los sanctos?',
 'matriz (R-P, 159)',
 '_gozasse_ **** gozase',
 '',
 'alegra el corazon cada vez que las veo: porque se que hablan']

**keyBy()**

In [17]:
rdd = sc.parallelize(range(5))

In [18]:
rdd.collect()

[0, 1, 2, 3, 4]

In [19]:
# Key every element by itself plus two; the original element stays as the value.
pair_rdd = rdd.keyBy(lambda element: element + 2)

In [20]:
pair_rdd.collect()

[(2, 0), (3, 1), (4, 2), (5, 3), (6, 4)]

**zipWithIndex()**

In [21]:
rdd = sc.parallelize(['a','b','c','d','e'])

In [22]:
pair_rdd = rdd.zipWithIndex()

In [23]:
pair_rdd.collect()

[('a', 0), ('b', 1), ('c', 2), ('d', 3), ('e', 4)]

**zipWithUniqueId()**

In [24]:
# glom() gathers the elements of each partition into a list, so the output shows
# which unique ids were assigned inside each partition.
rdd.zipWithUniqueId().glom().collect()

[[('a', 0), ('b', 2)], [('c', 1), ('d', 3), ('e', 5)]]

**zip() con dos RDDs**

In [25]:
rdd1 = sc.parallelize(range(5),3)

In [26]:
rdd2 = sc.parallelize(range(100, 105, 1) ,3)

In [27]:
rdd1.glom().collect()

[[0], [1, 2], [3, 4]]

In [28]:
rdd2.glom().collect()

[[100], [101, 102], [103, 104]]

In [29]:
# Pair the elements of both RDDs by position.
# NOTE(review): per the RDD.zip documentation both RDDs need the same number of
# partitions and of elements per partition; rdd1 and rdd2 were each created with
# 3 partitions over 5 elements.
pair_rdd = rdd1.zip(rdd2)

In [30]:
pair_rdd.collect()

[(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)]

**Transformaciones de agregación para Pair RDDs**

reduceByKey() y foldByKey() - son transformaciones y no acciones

In [37]:
tupla = [('a', 1), ('b', 2), ('c', 3), ('a', 4), ('b', 5), ('c', 6),('c',7)]

In [39]:
pair_rdd = sc.parallelize(tupla)

In [40]:
pair_rdd.collect()

[('a', 1), ('b', 2), ('c', 3), ('a', 4), ('b', 5), ('c', 6), ('c', 7)]

In [41]:
from operator import add

In [42]:
rdd_reduce = pair_rdd.reduceByKey(add)

In [43]:
rdd_reduce.collect()

[('b', 7), ('c', 16), ('a', 5)]

In [44]:
# Sum the values per key starting each fold from the zero value 0, which leaves
# the per-key sums the same as the reduceByKey(add) result.
rdd_fold = pair_rdd.foldByKey(0, add)

In [45]:
rdd_fold.collect()

[('b', 7), ('c', 16), ('a', 5)]

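groupByKey() - agrupa los valores de cada clave; es costoso porque envía todos los valores por la red sin combinarlos antes (para agregaciones es preferible reduceByKey())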

In [46]:
rdd_group = pair_rdd.groupByKey()

In [47]:
rdd_group.collect()

[('b', <pyspark.resultiterable.ResultIterable at 0x7b8e0a837dc0>),
 ('c', <pyspark.resultiterable.ResultIterable at 0x7b8df0437970>),
 ('a', <pyspark.resultiterable.ResultIterable at 0x7b8df0434910>)]

In [48]:
# Materialize each key's grouped values as a plain list before collecting,
# so the result is readable instead of showing ResultIterable objects.
lista = rdd_group.mapValues(list).collect()

In [49]:
lista

[('b', [2, 5]), ('c', [3, 6, 7]), ('a', [1, 4])]

combineByKey(createCombiner(), mergeValue(), mergeCombiners()) - parecido a aggregate()

In [50]:
# Accumulate a (sum, count) pair per key so the mean can be derived afterwards.
rdd_suma_cuenta = pair_rdd.combineByKey(
    # createCombiner: start an accumulator from the first value seen for a key.
    lambda value: (value, 1),
    # mergeValue: fold one more value into an existing accumulator.
    lambda acc, value: (acc[0] + value, acc[1] + 1),
    # mergeCombiners: merge two accumulators built for the same key.
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)

In [51]:
rdd_suma_cuenta.collect()

[('b', (7, 2)), ('c', (16, 3)), ('a', (5, 2))]

In [52]:
# Mean per key: divide the accumulated sum by the accumulated count.
rdd_media = rdd_suma_cuenta.mapValues(lambda sum_count: sum_count[0] / sum_count[1])

In [53]:
rdd_media.collect()

[('b', 3.5), ('c', 5.333333333333333), ('a', 2.5)]

**Transformaciones para claves o valores**

In [54]:
pair_rdd.collect()

[('a', 1), ('b', 2), ('c', 3), ('a', 4), ('b', 5), ('c', 6), ('c', 7)]

In [55]:
pair_rdd.keys().collect()

['a', 'b', 'c', 'a', 'b', 'c', 'c']

In [56]:
pair_rdd.values().collect()

[1, 2, 3, 4, 5, 6, 7]

In [57]:
pair_rdd.sortByKey().collect()

[('a', 1), ('a', 4), ('b', 2), ('b', 5), ('c', 3), ('c', 6), ('c', 7)]

mapValues() - aplica la función sobre los valores, manteniendo las claves

flatMapValues() - aplica la función sobre los valores y aplana el resultado: genera un par (clave, elemento) por cada elemento devuelto

In [60]:
pair_rdd.mapValues(lambda x: (x*10)).collect()

[('a', 10), ('b', 20), ('c', 30), ('a', 40), ('b', 50), ('c', 60), ('c', 70)]

In [62]:
# Each value x produces the tuple (x, x*10); flatMapValues emits one
# (key, item) pair per item of that tuple, so every input pair becomes two.
pair_rdd.flatMapValues(lambda x: (x,x*10)).collect()

[('a', 1),
 ('a', 10),
 ('b', 2),
 ('b', 20),
 ('c', 3),
 ('c', 30),
 ('a', 4),
 ('a', 40),
 ('b', 5),
 ('b', 50),
 ('c', 6),
 ('c', 60),
 ('c', 7),
 ('c', 70)]

**Joins**

In [63]:
rdd1 = sc.parallelize([('a', 1), ('b', 2)])

In [65]:
rdd1.collect()

[('a', 1), ('b', 2)]

In [64]:
rdd2 = sc.parallelize([('a', 4), ('b', 5), ('c', 6)])

In [66]:
rdd2.collect()

[('a', 4), ('b', 5), ('c', 6)]

In [67]:
rdd1.join(rdd2).collect() # equivalent to an inner join: only keys present in both RDDs are kept

[('b', (2, 5)), ('a', (1, 4))]

In [68]:
rdd1.leftOuterJoin(rdd2).collect()

[('b', (2, 5)), ('a', (1, 4))]

In [69]:
rdd2.leftOuterJoin(rdd1).collect()

[('b', (5, 2)), ('c', (6, None)), ('a', (4, 1))]

In [70]:
rdd1.rightOuterJoin(rdd2).collect()

[('b', (2, 5)), ('c', (None, 6)), ('a', (1, 4))]

In [71]:
rdd2.rightOuterJoin(rdd1).collect()

[('b', (5, 2)), ('a', (4, 1))]

In [72]:
rdd1.fullOuterJoin(rdd2).collect()
# keeps every key from either RDD; a side with no match is filled with None

[('b', (2, 5)), ('c', (None, 6)), ('a', (1, 4))]

**subtractByKey()**

In [73]:
rdd1.subtractByKey(rdd2).collect()

[]

In [74]:
rdd2.subtractByKey(rdd1).collect()

[('c', 6)]

**cogroup()**

In [75]:
rdd3 = rdd1.cogroup(rdd2)

In [76]:
rdd3.collect()

[('b',
  (<pyspark.resultiterable.ResultIterable at 0x7b8df06f7f70>,
   <pyspark.resultiterable.ResultIterable at 0x7b8df04377c0>)),
 ('c',
  (<pyspark.resultiterable.ResultIterable at 0x7b8df04351e0>,
   <pyspark.resultiterable.ResultIterable at 0x7b8df0436bf0>)),
 ('a',
  (<pyspark.resultiterable.ResultIterable at 0x7b8df0435db0>,
   <pyspark.resultiterable.ResultIterable at 0x7b8df0437040>))]

In [79]:
# Turn both grouped iterables of every key into plain lists so they can be displayed.
rdd3.mapValues(lambda groups: list(map(list, groups))).collect()

[('b', [[2], [5]]), ('c', [[], [6]]), ('a', [[1], [4]])]

In [80]:
rdd1.collectAsMap()

{'a': 1, 'b': 2}

countByKey()

In [81]:
rdd1.countByKey()

defaultdict(int, {'a': 1, 'b': 1})

lookup()

In [82]:
rdd1.lookup('a')

[1]