# Creating a spark session

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SparkPractice").master("local[4]").getOrCreate()
spark

24/09/13 06:55:10 WARN Utils: Your hostname, Tulasis-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.29.179 instead (on interface en0)
24/09/13 06:55:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/13 06:55:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
sc = spark.sparkContext

# Create an RDD using parallelize

In [66]:
data = [("data1","data2"),("data3","data4"),("data5","data6"),("data7","data8")]

dataFrame = spark.createDataFrame(data,["name1","name2"])

In [68]:
dataFrame.show()

+-----+-----+
|name1|name2|
+-----+-----+
|data1|data2|
|data3|data4|
|data5|data6|
|data7|data8|
+-----+-----+



In [72]:
numbersRdd = sc.parallelize([1,2,3,4,5,6,7])

In [74]:
numbersRdd

ParallelCollectionRDD[37] at readRDDFromFile at PythonRDD.scala:289

In [76]:
# Get number of partitions

numbersRdd.getNumPartitions()


4

In [78]:
# Get result from Rdd

output = numbersRdd.collect()
print(output)

[1, 2, 3, 4, 5, 6, 7]


In [80]:
# Get first two elements from the Rdd

numbersRdd.take(2)

[1, 2]

In [82]:
# Get first element from the Rdd
# Gets first record from the Rdd where as take(n) returns array of n elements from the Rdd

numbersRdd.first()

1

In [84]:
complexRdd = sc.parallelize([["Tulasi",22,"Chubb"],["Ram",23, "Google"],["Reddy",24,"Microsoft"]])

In [86]:
complexRdd.first()

['Tulasi', 22, 'Chubb']

In [88]:
complexRdd.collect()

[['Tulasi', 22, 'Chubb'], ['Ram', 23, 'Google'], ['Reddy', 24, 'Microsoft']]

In [90]:
complexRdd.take(3)

[['Tulasi', 22, 'Chubb'], ['Ram', 23, 'Google'], ['Reddy', 24, 'Microsoft']]

# Create RDD through a file

In [93]:
data = sc.textFile("/Users/tulasiramreddygade/Downloads/customers-100.csv")

data.getNumPartitions()

2

In [95]:
data2 = sc.textFile("/Users/tulasiramreddygade/Downloads/customers-100.csv",4)

data.getNumPartitions()

2

In [97]:
data.collect()

['Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website',
 '1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/',
 '2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/',
 '3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/',
 '4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/',
 '5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodw

In [99]:
data.first()

'Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website'

# Create RDD from another RDD

In [102]:
rdd1 = data.map(lambda zone: zone.split(",") )

In [104]:
rdd1.first()

['Index',
 'Customer Id',
 'First Name',
 'Last Name',
 'Company',
 'City',
 'Country',
 'Phone 1',
 'Phone 2',
 'Email',
 'Subscription Date',
 'Website']

In [106]:
rdd2 = rdd1.filter(lambda zone : zone[5]=="East Leonard" or zone[4]=="Vega-Gentry")
rdd2.collect()

[['1',
  'DD37Cf93aecA6Dc',
  'Sheryl',
  'Baxter',
  'Rasmussen Group',
  'East Leonard',
  'Chile',
  '229.077.5154',
  '397.884.0519x718',
  'zunigavanessa@smith.info',
  '2020-08-24',
  'http://www.stephenson.com/'],
 ['2',
  '1Ef7b82A4CAAD10',
  'Preston',
  'Lozano',
  'Vega-Gentry',
  'East Jimmychester',
  'Djibouti',
  '5153435776',
  '686-620-1820x944',
  'vmata@colon.com',
  '2021-04-23',
  'http://www.hobbs.com/']]

# Pair RDD

In [109]:
import math

numbersRDD = sc.parallelize([2,3,4,5,6,7,8])

numWithSquarootPairRDD = numbersRDD.map(lambda num : (num,math.sqrt(num)))

pairedRDD.collect()

[(2, 1.4142135623730951),
 (3, 1.7320508075688772),
 (4, 2.0),
 (5, 2.23606797749979),
 (6, 2.449489742783178),
 (7, 2.6457513110645907),
 (8, 2.8284271247461903)]

# Calculate count of records by Borough

In [115]:
organizationData = sc.textFile("/Users/tulasiramreddygade/Downloads/organizations-10000.csv")

organizationData.take(5)

['Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees',
 '1,522816eF8fdBE6d,Mckinney PLC,http://soto.com/,Sri Lanka,Synergized global system engine,1988,Dairy,3930',
 '2,70C7FBD7e6Aa3Ea,Cunningham LLC,http://harding-duffy.com/,Namibia,Team-oriented fault-tolerant adapter,2018,Library,7871',
 '3,428B397eA2d7290,Ruiz-Walls,http://www.atkins.biz/,Iran,Re-contextualized bifurcated moderator,2003,Hospital / Health Care,3095',
 '4,9D234Ae8Cc51C1c,"Parrish, Osborne and Clarke",http://salazar.info/,British Indian Ocean Territory (Chagos Archipelago),Fully-configurable next generation concept,1989,Supermarkets,5422']

In [151]:
# Split organization data as it is string csv

import re

organizationDataSplit = organizationData.map(lambda zone : zone.split(","))

organizationDataSplit.take(3)

[['Index',
  'Organization Id',
  'Name',
  'Website',
  'Country',
  'Description',
  'Founded',
  'Industry',
  'Number of employees'],
 ['1',
  '522816eF8fdBE6d',
  'Mckinney PLC',
  'http://soto.com/',
  'Sri Lanka',
  'Synergized global system engine',
  '1988',
  'Dairy',
  '3930'],
 ['2',
  '70C7FBD7e6Aa3Ea',
  'Cunningham LLC',
  'http://harding-duffy.com/',
  'Namibia',
  'Team-oriented fault-tolerant adapter',
  '2018',
  'Library',
  '7871']]

In [153]:

# make a pair RDD from the splitted organization data

orgCountryNoOfEmployeesPairRDD = organizationDataSplit.map(lambda zone : (zone[4],zone[8]))

orgCountryNoOfEmployeesPairRDD.take(2)

[('Country', 'Number of employees'), ('Sri Lanka', '3930')]

In [155]:
orgCountryNoOfEmployeesPairRDD.collect()

[('Country', 'Number of employees'),
 ('Sri Lanka', '3930'),
 ('Namibia', '7871'),
 ('Iran', '3095'),
 ('http://salazar.info/', 'Supermarkets'),
 ('https://www.brooks-scott.net/', 'Nanotechnology'),
 ('Ecuador', '7233'),
 ('Sierra Leone', '6022'),
 ('Zimbabwe', '4580'),
 ('Ecuador', '3245'),
 ('Timor-Leste', '1785'),
 ('Ukraine', '2985'),
 ('United Kingdom', '839'),
 ('Thailand', '3135'),
 ('Turkey', '7261'),
 ('Kiribati', '2427'),
 ('Canada', '6477'),
 ('Colombia', '4263'),
 ('Suriname', '2546'),
 ('Burundi', '219'),
 ('Kazakhstan', '581'),
 ('Malta', '6796'),
 ('Andorra', '7859'),
 ('Yemen', '3670'),
 ('Central African Republic', '236'),
 ('http://marquez.org/', 'Design'),
 ('Brazil', '4905'),
 ('https://olsen.com/', 'Commercial Real Estate'),
 ('http://mathews.com/', 'Textiles'),
 ('Isle of Man', '9958'),
 ('Timor-Leste', '1884'),
 ('http://www.ortega.com/', 'Environmental Services'),
 ('Iraq', '358'),
 ('Kenya', '5154'),
 ('http://www.williamson-mahoney.com/', 'Photography'),
 ('ht

In [157]:
boroughCountRDD = orgCountryNoOfEmployeesPairRDD.reduceByKey(lambda value1, value2: value1 + value2)

In [159]:
boroughCountRDD.take(3)

[('Country', 'Number of employees'),
 ('Sri Lanka',
  '393094718106921948289946367588074809040708872926362615890389106617740676771531351935893599471386158931828415373199995997348749169664933840'),
 ('Ecuador',
  '723332452712486126595726979545751881209563682156601583664137562041576915608338893461494809214062189711586')]

In [161]:
boroughCountRDD.collect()

[('Country', 'Number of employees'),
 ('Sri Lanka',
  '393094718106921948289946367588074809040708872926362615890389106617740676771531351935893599471386158931828415373199995997348749169664933840'),
 ('Ecuador',
  '723332452712486126595726979545751881209563682156601583664137562041576915608338893461494809214062189711586'),
 ('Sierra Leone',
  '6022440077335529953270657976853904943116928918331675684535371484234954212564395596878151103'),
 ('Ukraine',
  '298510288422995835629294524103290136928339522125306159148902440507715745612812493611326661430'),
 ('Kiribati',
  '242753913062844811011866687762328186801251667876577486725267736142426169165826667552536'),
 ('Andorra',
  '785987615701728639146112825325679415119571921175479875812630748875757667961541199625763877513'),
 ('https://olsen.com/', 'Commercial Real EstateTobacco'),
 ('http://mathews.com/', 'TextilesPrinting'),
 ('http://www.ortega.com/', 'Environmental Services'),
 ('Kenya',
  '5154260165994574865733656118049809348377086307976886579

In [167]:
# sort RDD by keys

boroughCountRDD.sortByKey().first()

('Afghanistan',
 '24248175337985253880209787132937539444952587963539247911825276034669752277312615311385132573424')

In [173]:
boroughCountRDD.sortByKey().keys().take(10)

['Afghanistan',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica (the territory South of 60 deg S)',
 'Antigua and Barbuda',
 'Argentina']

In [175]:
boroughCountRDD.values().take(10)

['Number of employees',
 '393094718106921948289946367588074809040708872926362615890389106617740676771531351935893599471386158931828415373199995997348749169664933840',
 '723332452712486126595726979545751881209563682156601583664137562041576915608338893461494809214062189711586',
 '6022440077335529953270657976853904943116928918331675684535371484234954212564395596878151103',
 '298510288422995835629294524103290136928339522125306159148902440507715745612812493611326661430',
 '242753913062844811011866687762328186801251667876577486725267736142426169165826667552536',
 '785987615701728639146112825325679415119571921175479875812630748875757667961541199625763877513',
 'Commercial Real EstateTobacco',
 'TextilesPrinting',
 'Environmental Services']

In [177]:
boroughCountRDD.distinct().take(10)

[('Country', 'Number of employees'),
 ('Sierra Leone',
  '6022440077335529953270657976853904943116928918331675684535371484234954212564395596878151103'),
 ('Kiribati',
  '242753913062844811011866687762328186801251667876577486725267736142426169165826667552536'),
 ('http://www.williamson-mahoney.com/', 'Photography'),
 ('http://www.zavala.com/', 'Venture Capital / VC'),
 ('http://riggs.com/', 'CosmeticsRailroad Manufacture'),
 ('Korea',
  '856339345707237313037009573773277682941653632803113767317181195559166455437100917524856417109262563707900843523345617336629595500310624831807846063148260679412663472162822783923766141878828138331513817658862142051179978436086258'),
 ('Morocco',
  '45641970940158246084926360674475378124547055375774722441867220041527324732598781791644437796643013362836687643971321053427379388025735'),
 ('https://www.middleton.com/', 'Fine Art'),
 ('Macao',
  '228440287445429335644700508360892922514196547429904419214662351244647091798514555838385248889322251367578032789482

# Identify transformation and Action operation

In [7]:
numbersRDD = sc.parallelize([1,2,3,4,5,6])

In [9]:
numbersRDD.first()

                                                                                

1

In [196]:
taxiZonesRDD = sc.textFile("/Users/tulasiramreddygade/Downloads/apache-spark-3-fundamentals/DataFiles/Raw/TaxiZones.csv")

In [200]:
taxiZonesRDD.first()

'1,EWR,Newark Airport,EWR'

In [206]:
taxiZonesRDDSplit = taxiZonesRDD.map(lambda zone: zone.split(",") )
taxiZonesRDDSplit.take(3)

[['1', 'EWR', 'Newark Airport', 'EWR'],
 ['2', 'Queens', 'Jamaica Bay', 'Boro Zone'],
 ['3', 'Bronx', 'Allerton/Pelham Gardens', 'Boro Zone']]

In [210]:
taxiZonesRDDSplit.map(lambda zone: zone[1]).distinct().collect()

['Queens', 'Bronx', 'Manhattan', 'Staten Island', 'Brooklyn', 'Unknown', 'EWR']

In [35]:
# Read data from the csv file

taxiZonesRDD = sc.textFile("/Users/tulasiramreddygade/Downloads/apache-spark-3-fundamentals/DataFiles/Raw/TaxiZones.csv")

# split the data 

taxiZonesRDDSplit = taxiZonesRDD.map(lambda zone1 : zone1.split(","))


# create a pair RDD with <Borough,1>

taxiZonesPairRDD = taxiZonesRDDSplit.map(lambda zone2 : (zone2[1],1))

# reduceByKey

taxiZonesReduceByKey = taxiZonesPairRDD.reduceByKey(lambda value1, value2 : value1 + value2)


# filter

taxiZonesFiltered = taxiZonesReduceByKey.filter(lambda zone3 : zone3[1]>10)



In [37]:
taxiZonesFiltered.collect()

[('Queens', 69),
 ('Bronx', 43),
 ('Manhattan', 69),
 ('Staten Island', 20),
 ('Brooklyn', 61)]

In [41]:
taxiZonesMutliplied = taxiZonesFiltered.map(lambda x : (x,x[1]*12))

taxiZonesMutliplied.collect()

[(('Queens', 69), 828),
 (('Bronx', 43), 516),
 (('Manhattan', 69), 828),
 (('Staten Island', 20), 240),
 (('Brooklyn', 61), 732)]