In [1]:
# Libraries
from pyspark import SparkContext

In [2]:
sc = SparkContext(master='local', appName='TransformationsAndActions')

In [3]:
rdd1 = sc.parallelize([1,2,3])
# In is important to check the datatype.
type(rdd1)

pyspark.rdd.RDD

In [4]:
# To vizualice and collect the data
rdd1.collect()

[1, 2, 3]

In [5]:
# To see that the first Job was successful we can use SparkUI
sc

In [6]:
# reading the path where is the data
# Linux commands inside Anaconda
!ls /home/lastorder/Documents/curso-apache-spark-platzi/files


deporte.csv	 deportistaError.csv  modelo_relacional.jpg
deportista2.csv  evento.csv	      paises.csv
deportista.csv	 juegos.csv	      resultados.csv


In [7]:
path = "/home/lastorder/Documents/curso-apache-spark-platzi/files/"

In [8]:
# Creating our first RDD
OlimpicTeamsRDD = sc.textFile(path+"paises.csv")\
    .map(lambda line : line.split(","))

In [9]:
# Take will show us an specific number of lines
OlimpicTeamsRDD.take(15)

[['id', 'equipo', 'sigla'],
 ['1', '30. Februar', 'AUT'],
 ['2', 'A North American Team', 'MEX'],
 ['3', 'Acipactli', 'MEX'],
 ['4', 'Acturus', 'ARG'],
 ['5', 'Afghanistan', 'AFG'],
 ['6', 'Akatonbo', 'IRL'],
 ['7', 'Alain IV', 'SUI'],
 ['8', 'Albania', 'ALB'],
 ['9', 'Alcaid', 'POR'],
 ['10', 'Alcyon-6', 'FRA'],
 ['11', 'Alcyon-7', 'FRA'],
 ['12', 'Aldebaran', 'ITA'],
 ['13', 'Aldebaran II', 'ITA'],
 ['14', 'Aletta', 'IRL']]

In [11]:
# To count how many countries we have.
# To select the country acronyms, we need to select the third column index:[0,1,2]
# Similar to SQL, we use COUNT DISTINCT and select the column we need with a lambda
OlimpicTeamsRDD.map(lambda x: (x[2])).distinct().count()


231

In [12]:
# Spark group all the data in a key-value format
# the first value of the lambda will be the key value(x[2])(the value spark will use to group the data)
# If we use len in mapvalues, Spark will show us the number of elements of the group list.
OlimpicTeamsRDD.map(lambda x : (x[2],x[1])).groupByKey().mapValues(len).take(5)

[('sigla', 1), ('AUT', 11), ('MEX', 9), ('ARG', 18), ('AFG', 1)]

In [13]:
# If we use list in mapvalues, Spark will show us a list with all the values of the group
OlimpicTeamsRDD.map(lambda x : (x[2],x[1])).groupByKey().mapValues(list).take(5)

[('sigla', ['equipo']),
 ('AUT',
  ['30. Februar',
   'Austria',
   'Austria-1',
   'Austria-2',
   'Breslau',
   'Brigantia',
   'Donar III',
   'Evita VI',
   'May-Be 1960',
   '"R.-V. Germania; Leitmeritz"',
   'Surprise']),
 ('MEX',
  ['A North American Team',
   'Acipactli',
   'Chamukina',
   'Mexico',
   'Mexico-1',
   'Mexico-2',
   'Nausikaa 4',
   'Tlaloc',
   'Xolotl']),
 ('ARG',
  ['Acturus',
   'Antares',
   'Arcturus',
   'Ardilla',
   'Argentina',
   'Argentina-1',
   'Argentina-2',
   'Blue Red',
   'Covunco III',
   'Cupidon III',
   'Djinn',
   'Gullvinge',
   'Matrero II',
   'Mizar',
   'Pampero',
   'Rampage',
   'Tango',
   'Wiking']),
 ('AFG', ['Afghanistan'])]

In [15]:
# NOTE!!: the Use of Collect is not a good practice because when you use it, collect all the data around the
# all the clusters in the system (all computers that are running spark), so if you had a lot of RDDs with
# TB of data, collect will capture all that data from all the CPUS and it will send it to the Spark Driver
# (the central computer), that can couse several problems to the system.

ArgTeams = OlimpicTeamsRDD.filter(lambda l : 'ARG' in l)
ArgTeams.collect()

# NEVER USE COLLECT UNTIL YOU KNOW THAT THE DATA IS LOCAL AND SMALL !!

[['4', 'Acturus', 'ARG'],
 ['37', 'Antares', 'ARG'],
 ['42', 'Arcturus', 'ARG'],
 ['43', 'Ardilla', 'ARG'],
 ['45', 'Argentina', 'ARG'],
 ['46', 'Argentina-1', 'ARG'],
 ['47', 'Argentina-2', 'ARG'],
 ['119', 'Blue Red', 'ARG'],
 ['238', 'Covunco III', 'ARG'],
 ['252', 'Cupidon III', 'ARG'],
 ['288', 'Djinn', 'ARG'],
 ['436', 'Gullvinge', 'ARG'],
 ['644', 'Matrero II', 'ARG'],
 ['672', 'Mizar', 'ARG'],
 ['774', 'Pampero', 'ARG'],
 ['843', 'Rampage', 'ARG'],
 ['1031', 'Tango', 'ARG'],
 ['1162', 'Wiking', 'ARG']]

In [16]:
# Count might take several minutes if the Data is really big, in that cases we can use countApprox
OlimpicTeamsRDD.count()

1185

In [17]:
# countApprox will take a maximum num of second before end the process, if the process end before the number
# of second we give it then it will show the result in that moment.
# If the time ends and the process is still runing, then it will show us the total count until that moment
OlimpicTeamsRDD.countApprox(20)

1185

In [18]:
!ls /home/lastorder/Documents/curso-apache-spark-platzi/files

deporte.csv	 deportistaError.csv  modelo_relacional.jpg
deportista2.csv  evento.csv	      paises.csv
deportista.csv	 juegos.csv	      resultados.csv


In [19]:
# Exporting 2 RDDs, the first one contains the header and the second one contains the data.
OlimpicAthleteRDD = sc.textFile(path+'deportista.csv').map(lambda l : l.split(','))
OlimpicAthleteRDD2 = sc.textFile(path+'deportista2.csv').map(lambda l : l.split(','))

# To make a union between the RDDs we can use:
OlimpicAthleteRDD = OlimpicAthleteRDD.union(OlimpicAthleteRDD2)


In [20]:
# To make sure that the data is not corrupted, we can use count() to verify that spark is working correctly
# with that data
OlimpicAthleteRDD.count()

135572

In [21]:
# To see the first 10 rows of the RDD, we use top function (similar to SQL)
OlimpicAthleteRDD.top(10)

[['deportista_id', 'nombre', 'genero', 'edad', 'altura', 'peso', 'equipo_id'],
 ['99999', 'Alexander Grant Alick Rennie', '1', '32', '182', '71', '967'],
 ['99998', 'Robert John Bob Renney', '1', '21', '178', '90', '66'],
 ['99997', 'Thomas Renner', '1', '24', '183', '86', '71'],
 ['99996', 'Sara Renner', '2', '21', '168', '63', '174'],
 ['99995', 'Robert Renner', '1', '22', '182', '75', '944'],
 ['99994', 'Peter Campbell Renner', '1', '24', '186', '75', '716'],
 ['99993', 'Ingeborg Renner', '2', '22', '168', '60', '1150'],
 ['99992', 'Karlheinz Heinz Renneberg', '1', '25', '182', '87', '399'],
 ['99991', 'Paul Wisner Renne', '1', '24', '177', '73', '1096']]

In [22]:
OlimpicTeamsRDD.top(10)

[['id', 'equipo', 'sigla'],
 ['999', 'Stella-2', 'NOR'],
 ['998', 'State VI', 'CAN'],
 ['997', 'Starlight III', 'GBR'],
 ['996', 'Starita', 'NED'],
 ['995', 'Stade Franais AC-2', 'FRA'],
 ['994', 'St. Rose-2', 'USA'],
 ['993', 'St. Margrite', 'NED'],
 ['992', 'St. Louis Southwest Turnverein #2-3', 'USA'],
 ['991', 'St. Louis Southwest Turnverein #1-2', 'USA']]

In [23]:
# To make a JOIN we select the key value to make de join (the first value in the lambda)
# and the rest of the columns we need, we can use take to see if the join was successful
OlimpicAthleteRDD.map(lambda l: [l[-1],l[:-1]]) \
    .join(OlimpicTeamsRDD.map(lambda x: [x[0],x[2]])).take(10)

[('199', (['1', 'A Dijiang', '1', '24', '180', '80'], 'CHN')),
 ('199', (['2', 'A Lamusi', '1', '23', '170', '60'], 'CHN')),
 ('199', (['602', 'Abudoureheman', '1', '22', '182', '75'], 'CHN')),
 ('199', (['1463', 'Ai Linuer', '1', '25', '160', '62'], 'CHN')),
 ('199', (['1464', 'Ai Yanhan', '2', '14', '168', '54'], 'CHN')),
 ('199', (['3605', 'An Weijiang', '1', '22', '178', '72'], 'CHN')),
 ('199', (['3610', 'An Yulong', '1', '19', '173', '70'], 'CHN')),
 ('199', (['3611', 'An Zhongxin', '2', '23', '170', '65'], 'CHN')),
 ('199', (['4639', 'Ao Changrong', '1', '25', '173', '71'], 'CHN')),
 ('199', (['4641', 'Ao Tegen', '1', '21', '181', '90'], 'CHN'))]

In [24]:
# Another form to see a more representative sample of the JOIN is with takeSample function
# takeSample will take a random sample of the RDD with certains parameters
# the first paramneter is to set if we want to see repetitive values (True or False)
# the second parameter set the number of rows to show
# the third parameter is to set a random seed, to initialize the random process
OlimpicAthleteRDD.map(lambda l: [l[-1],l[:-1]]) \
    .join(OlimpicTeamsRDD.map(lambda x: [x[0],x[2]])).takeSample(False,10,25)

[('273',
  (['56134', 'Niels Christian Kold Jrgensen', '1', '21', '0', '0'], 'DEN')),
 ('624', (['13109', 'Harry Arthur Bonavia', '1', '20', '0', '0'], 'MLT')),
 ('399', (['98241', 'Eberhard Radzik', '1', '25', '168', '75'], 'GER')),
 ('96', (['83781', 'Albert Muylle', '1', '0', '0', '0'], 'BEL')),
 ('1096',
  (['101476', 'Clarence Franklin Robison', '1', '25', '192', '77'], 'USA')),
 ('967', (['18615', 'Lesley Carstens', '2', '27', '0', '0'], 'RSA')),
 ('362', (['78848', 'mile Mercier', '1', '0', '0', '0'], 'FRA')),
 ('810',
  (['94563', 'Andrzej Ryszard Pitkowski', '1', '22', '169', '68'], 'POL')),
 ('1096', (['114820', 'Jerome Steinert', '1', '28', '0', '0'], 'USA')),
 ('576', (['32099', 'Assaf ElMurr', '1', '24', '0', '0'], 'LIB'))]

In [25]:
# Importing a new RDD with the results of the olimpic
results = sc.textFile(path+'resultados.csv').map(lambda l : l.split(','))

In [26]:
# Making a new RDDs with filtering only the results with a metal (results different to NA)
WinnerResults = results.filter(lambda l: 'NA' not in l[1])

In [27]:
# checking the results
WinnerResults.take(10)

[['resultado_id', 'medalla', 'deportista_id', 'juego_id', 'evento_id'],
 ['4', 'Gold', '4', '2', '4'],
 ['38', 'Bronze', '15', '7', '19'],
 ['39', 'Bronze', '15', '7', '20'],
 ['41', 'Bronze', '16', '50', '14'],
 ['42', 'Bronze', '17', '17', '21'],
 ['43', 'Gold', '17', '17', '22'],
 ['45', 'Gold', '17', '17', '24'],
 ['49', 'Gold', '17', '17', '28'],
 ['51', 'Bronze', '17', '19', '22']]

In [28]:
AthleteCountry = OlimpicAthleteRDD.map(lambda l: [l[-1],l[:-1]]) \
    .join(OlimpicTeamsRDD.map(lambda x: [x[0],x[2]]))

In [29]:
AthleteCountry.top(10)

[('999', (['92679', 'Trygve Bjarne Pedersen', '1', '35', '0', '0'], 'NOR')),
 ('999', (['1144', 'Henrik Agersborg', '1', '47', '0', '0'], 'NOR')),
 ('999', (['10765', 'Einar Berntsen', '1', '28', '0', '0'], 'NOR')),
 ('998',
  (['111659', 'G. Bernard Bernie Skinner', '1', '34', '182', '82'], 'CAN')),
 ('996', (['116030', 'Edward Eddy Stutterheim', '1', '39', '0', '0'], 'NED')),
 ('995', (['71728', 'Gordon Frederick Love', '1', '26', '0', '0'], 'FRA')),
 ('995', (['101553', 'Louis Roche', '1', '0', '0', '0'], 'FRA')),
 ('994', (['87856', 'Leo Anthony O Connell', '1', '20', '0', '0'], 'USA')),
 ('994', (['55319', 'Johnson', '1', '0', '0', '0'], 'USA')),
 ('994', (['53520', 'Henry Wood Jameson', '1', '21', '0', '0'], 'USA'))]

In [None]:
sc.stop()