In [1]:
# Libraries
from pyspark import SparkConf, SparkContext

In [2]:
# Only one SparkContext may be active per process, so calling the constructor
# directly fails when this cell is executed a second time in the same kernel.
# getOrCreate() returns the already-running context if there is one, and
# otherwise creates it with the local master and application name given here.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster('local').setAppName('TransformationsAndActions')
)

In [3]:
# Build a small RDD from an in-memory Python list.
rdd1 = sc.parallelize([1, 2, 3])
# It is important to check the data type of what was just created.
type(rdd1)

pyspark.rdd.RDD

In [4]:
# To visualize the data, collect the RDD's elements into a Python list
rdd1.collect()

[1, 2, 3]

In [5]:
# To confirm that the first job was successful we can use the Spark UI;
# evaluating the context as the cell's last expression displays it.
sc

In [6]:
# List the contents of the directory where the data files live.
# The leading "!" runs a (Linux) shell command from inside the notebook.
!ls /home/lastorder/Documents/curso-apache-spark-platzi/files


deporte.csv	 deportistaError.csv  modelo_relacional.jpg
deportista2.csv  evento.csv	      paises.csv
deportista.csv	 juegos.csv	      resultados.csv


In [7]:
# Base directory of the data files. The trailing slash matters: file names are
# appended to this string directly (e.g. path + "paises.csv").
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another computer.
path = "/home/lastorder/Documents/curso-apache-spark-platzi/files/"

In [8]:
# Creating our first RDD from a file: read paises.csv line by line and turn
# every line into the list of its comma-separated fields.
# The CSV header line is not treated specially; it becomes a regular record.
OlimpicTeamsRDD = (
    sc.textFile(path + "paises.csv")
      .map(lambda row: row.split(","))
)

In [9]:
# take(n) shows a specific number of records; here, the first 15
OlimpicTeamsRDD.take(15)

[['id', 'equipo', 'sigla'],
 ['1', '30. Februar', 'AUT'],
 ['2', 'A North American Team', 'MEX'],
 ['3', 'Acipactli', 'MEX'],
 ['4', 'Acturus', 'ARG'],
 ['5', 'Afghanistan', 'AFG'],
 ['6', 'Akatonbo', 'IRL'],
 ['7', 'Alain IV', 'SUI'],
 ['8', 'Albania', 'ALB'],
 ['9', 'Alcaid', 'POR'],
 ['10', 'Alcyon-6', 'FRA'],
 ['11', 'Alcyon-7', 'FRA'],
 ['12', 'Aldebaran', 'ITA'],
 ['13', 'Aldebaran II', 'ITA'],
 ['14', 'Aletta', 'IRL']]

In [11]:
# To count how many countries we have.
# The country acronym ("sigla") is the third column, i.e. index 2 of [0, 1, 2].
# Similar to SQL's COUNT(DISTINCT sigla): select the column with a lambda,
# deduplicate it, then count.
# The CSV header row is part of the RDD (its third field is the literal 'sigla'),
# so it is filtered out; otherwise it would be counted as one extra country.
OlimpicTeamsRDD.map(lambda x: x[2]).filter(lambda sigla: sigla != 'sigla').distinct().count()


230

In [12]:
# groupByKey works on (key, value) pairs. The first element of the tuple built
# by the lambda (index 2, the acronym) is the key Spark groups by; the second
# (index 1, the team name) ends up in that key's group.
# mapValues(len) then replaces every group with its number of elements.
(
    OlimpicTeamsRDD
    .map(lambda fields: (fields[2], fields[1]))
    .groupByKey()
    .mapValues(len)
    .take(5)
)

[('sigla', 1), ('AUT', 11), ('MEX', 9), ('ARG', 18), ('AFG', 1)]

In [13]:
# With list as the mapValues function, every group is materialised as a plain
# Python list holding all the values (team names) that share the key.
(
    OlimpicTeamsRDD
    .map(lambda fields: (fields[2], fields[1]))
    .groupByKey()
    .mapValues(list)
    .take(5)
)

[('sigla', ['equipo']),
 ('AUT',
  ['30. Februar',
   'Austria',
   'Austria-1',
   'Austria-2',
   'Breslau',
   'Brigantia',
   'Donar III',
   'Evita VI',
   'May-Be 1960',
   '"R.-V. Germania; Leitmeritz"',
   'Surprise']),
 ('MEX',
  ['A North American Team',
   'Acipactli',
   'Chamukina',
   'Mexico',
   'Mexico-1',
   'Mexico-2',
   'Nausikaa 4',
   'Tlaloc',
   'Xolotl']),
 ('ARG',
  ['Acturus',
   'Antares',
   'Arcturus',
   'Ardilla',
   'Argentina',
   'Argentina-1',
   'Argentina-2',
   'Blue Red',
   'Covunco III',
   'Cupidon III',
   'Djinn',
   'Gullvinge',
   'Matrero II',
   'Mizar',
   'Pampero',
   'Rampage',
   'Tango',
   'Wiking']),
 ('AFG', ['Afghanistan'])]

In [15]:
# NOTE: using collect() is not good practice on large data. It gathers the data
# from every machine in the cluster (all the computers that are running Spark),
# so if you had RDDs holding terabytes of data, collect() would pull all of it
# from all the workers and send it to the Spark driver (the central computer).
# That can cause severe problems for the system.

# Compare the acronym column (index 2) explicitly: `'ARG' in row` would also
# match a row whose id or team name happened to be exactly 'ARG'.
ArgTeams = OlimpicTeamsRDD.filter(lambda row: row[2] == 'ARG')
ArgTeams.collect()

# NEVER USE COLLECT UNLESS YOU KNOW THAT THE DATA IS LOCAL AND SMALL !!

[['4', 'Acturus', 'ARG'],
 ['37', 'Antares', 'ARG'],
 ['42', 'Arcturus', 'ARG'],
 ['43', 'Ardilla', 'ARG'],
 ['45', 'Argentina', 'ARG'],
 ['46', 'Argentina-1', 'ARG'],
 ['47', 'Argentina-2', 'ARG'],
 ['119', 'Blue Red', 'ARG'],
 ['238', 'Covunco III', 'ARG'],
 ['252', 'Cupidon III', 'ARG'],
 ['288', 'Djinn', 'ARG'],
 ['436', 'Gullvinge', 'ARG'],
 ['644', 'Matrero II', 'ARG'],
 ['672', 'Mizar', 'ARG'],
 ['774', 'Pampero', 'ARG'],
 ['843', 'Rampage', 'ARG'],
 ['1031', 'Tango', 'ARG'],
 ['1162', 'Wiking', 'ARG']]

In [16]:
# count() might take several minutes if the data is really big; in those cases we can use countApprox
OlimpicTeamsRDD.count()

1185

In [17]:
# countApprox(timeout) waits at most `timeout` MILLISECONDS (not seconds) for the job.
# If the job finishes before the timeout, the result is returned at that moment.
# If the timeout expires while the job is still running, the count accumulated
# up to that moment (a potentially incomplete result) is returned instead.
# 20000 ms is the 20-second cap this example is meant to use.
OlimpicTeamsRDD.countApprox(20000)

1185

In [None]:
# Stop the SparkContext now that the notebook is finished with it
sc.stop()