# Installing PySpark in Google Colab

In [None]:
!pip install pyspark py4j


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Creating new SparkContext

In [None]:
import pyspark
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

In [None]:
sc

# Creating an RDD from a list

In [None]:
rdd = sc.parallelize([1,2,3,4,5,6])

In [None]:
rdd.collect()

[1, 2, 3, 4, 5, 6]

In [None]:
rdd.sum()


21

In [None]:
rdd.max()

6

In [None]:
x = sc.parallelize(['A','A','B','B','C'])

In [None]:
x.collect()

['A', 'A', 'B', 'B', 'C']

In [None]:
type(x)

pyspark.rdd.RDD

In [None]:
# to know methods on rdd
dir(x)            

['__add__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_computeFractionForSampleSize',
 '_defaultReducePartitions',
 '_id',
 '_is_barrier',
 '_is_protocol',
 '_jrdd',
 '_jrdd_deserializer',
 '_memory_limit',
 '_pickled',
 '_reserialize',
 '_to_java_object_rdd',
 'aggregate',
 'aggregateByKey',
 'barrier',
 'cache',
 'cartesian',
 'checkpoint',
 'cleanShuffleDependencies',
 'coalesce',
 'cogroup',
 'collect',
 'collectAsMap',
 'collectWithJobGroup',
 'combineByKey',
 'context',
 'count',
 'countApprox',
 'countApproxDistinct',
 'countByKey',
 'countByValue',
 'ctx',
 'dist

# Help

In [None]:
help(x)   # show the RDD documentation, including the exact signature of each method

Help on RDD in module pyspark.rdd object:

class RDD(typing.Generic)
 |  RDD(*args, **kwds)
 |  
 |  A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
 |  Represents an immutable, partitioned collection of elements that can be
 |  operated on in parallel.
 |  
 |  Method resolution order:
 |      RDD
 |      typing.Generic
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __add__(self: 'RDD[T]', other: 'RDD[U]') -> 'RDD[Union[T, U]]'
 |      Return the union of this RDD and another one.
 |      
 |      Examples
 |      --------
 |      >>> rdd = sc.parallelize([1, 1, 2, 3])
 |      >>> (rdd + rdd).collect()
 |      [1, 1, 2, 3, 1, 1, 2, 3]
 |  
 |  __getnewargs__(self) -> NoReturn
 |  
 |  __init__(self, jrdd: 'JavaObject', ctx: 'SparkContext', jrdd_deserializer: pyspark.serializers.Serializer = AutoBatchedSerializer(CloudPickleSerializer()))
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __repr__(self) -> str
 |     

# RDD Actions 

In [None]:

l= [1,2,3,4,5,6,7,8,9,10,12,]
rdd2= sc.parallelize(l)

In [None]:
#Actions : collect (convert RDD to in-memory list)

rdd2.collect()          

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12]

In [None]:
# Action: take(n) returns the first n elements of the RDD as a list.

rdd2.take(5)

[1, 2, 3, 4, 5]

In [None]:
# Action: top(n) returns the n largest elements, in descending order.

rdd2.top(5)

[12, 10, 9, 8, 7]

In [None]:
# Action: takeSample(withReplacement, num) returns a random sample of `num` elements.
# withReplacement=True allows the same element to be picked more than once;
# False (used here) picks each element at most once.

rdd2.takeSample(False,4)

[8, 12, 9, 7]

In [None]:
# Actions - Aggregate functions - gives single output value
#Action - count( no.of elements)

rdd2.count()

11

In [None]:
# Actions - Aggregate functions - gives single output value
# Action - min,max

print(rdd2.min())
print(rdd2.max())


1
12


In [None]:
# Actions - Aggregate functions - gives single output value
#Actions - sum(),mean(),stdev

print(rdd2.sum())
print(rdd2.mean())
print(rdd2.stdev())

67
6.090909090909091
3.3153786416019035


In [None]:
# Actions - stats- complete info about count,mean,stdev,max,min
rdd2.stats()

(count: 11, mean: 6.090909090909091, stdev: 3.3153786416019035, max: 12.0, min: 1.0)

In [None]:
# Actions - Aggregate functions - gives single output value
# Actions- reduce that aggregates a data set(RDD) element using function

print(rdd2.reduce(lambda x,y: x*y))
print(rdd2.reduce(lambda x,y: x+y))


43545600
67


In [None]:
#Actions - CountByValue - count of same values

rdd2.countByValue()

defaultdict(int,
            {1: 1,
             2: 1,
             3: 1,
             4: 1,
             5: 1,
             6: 1,
             7: 1,
             8: 1,
             9: 1,
             10: 1,
             12: 1})

In [None]:
#Actions - CountByKey 

l1= [('r',1),('k',2),('p',2),('q',2),('r',1),('i',2),('r',1),('i',2)]

rdd3=sc.parallelize(l1)
rdd3.countByKey()

defaultdict(int, {'r': 3, 'k': 1, 'p': 1, 'q': 1, 'i': 2})

In [None]:
#Actions - CountByValue - count of same values
rdd3.countByValue()

defaultdict(int,
            {('r', 1): 3, ('k', 2): 1, ('p', 2): 1, ('q', 2): 1, ('i', 2): 2})

In [None]:
# Action: fold(zeroValue, op) aggregates the elements of each partition, then
# merges the per-partition results; every one of those steps starts from zeroValue.
# NOTE: zeroValue should be the neutral element of op (0 for add). The 1 used
# here is added once per partition plus once more in the final merge, which is
# why the result is larger than rdd2.sum().
from operator import add  # import only the name that is used instead of `import *`

rdd2.fold(1,add)

70

In [None]:
# ACtions- range of even numbers

R= sc.parallelize(range(0,20,2))
R.collect()

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

In [None]:
# Action: variance() - population variance (squared deviations from the mean divided by n)
R.variance()

33.0

In [None]:
# Action: sampleVariance() - sample variance (divides by n - 1 instead of n)

R.sampleVariance()

36.666666666666664

In [None]:
# Read file csv file using RDD

rdd1 = sc.textFile("/content/emp.csv")
rdd1.collect()

['EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO,UPDATED_DATE',
 '7369,SMITH,CLERK,7902,17-12-1980,800,300,20,01-01-2022',
 '7499,ALLEN,SALESMAN,7698,20-02-1981,1600,300,30,01-01-2022',
 '7521,WARD,SALESMAN,7698,22-02-1981,1250,500,30,01-01-2022',
 '7566,JONES,MANAGER,7839,04-02-1981,2975,null,20,05-01-2022',
 '7654,MARTIN,SALESMAN,7698,21-09-1981,1250,1400,30,03-01-2022',
 '7698,SGR,null,7839,05-01-1981,2850,1600,30,04-01-2022',
 '7782,RAVI,MANAGER,7839,06-09-1981,2450,100,null,02-01-2022',
 '7788,SCOTT,ANALYST,7566,19-04-1987,null,null,20,02-01-2022',
 'null,null,PRESIDENT,null,01-11-1981,null,null,null,02-01-2022',
 '7844,TURNER,SALESMAN,7698,09-08-1981,1500,null,30,02-01-2022',
 '7876,ADAMS,CLERK,7788,23-05-1987,1100,null,20,03-01-2022',
 '7900,JAMES,CLERK,7698,12-03-1981,950,null,30,03-01-2022',
 '7902,FORD,ANALYST,7566,12-03-1981,3000,null,20,03-01-2022',
 '7369,SMITH,CLERK,7902,17-12-1980,800,null,20,04-01-2022',
 '7499,ALLEN,SALESMAN,7698,20-02-1981,1600,300,30,04-01-2022',
 '752

In [None]:
#Actions - saveAsTextFile

rdd1.coalesce(3).saveAsTextFile('/content/sample_data/rdd1')

In [None]:
#Actions - saveAsPickleFile

rdd1.coalesce(2).saveAsPickleFile('/content/sample_data/r2')

# RDD transformations

In [None]:
# map - Return new distributed dataset formed by passing each element of source through a function 

x= rdd.map(lambda x : (x,x*3))

print( 'Values:',rdd.collect())
print('Values:',x.collect())

Values: [1, 2, 3, 4, 5, 6]
Values: [(1, 3), (2, 6), (3, 9), (4, 12), (5, 15), (6, 18)]


In [None]:
# flatMap - similar to map, but each input item can be mapped to 0 or more output items;
# the sequences returned by the function are flattened into a single RDD.

y = rdd.flatMap(lambda x : (x, x*4,x/4,x**4))

y.collect()

[1,
 4,
 0.25,
 1,
 2,
 8,
 0.5,
 16,
 3,
 12,
 0.75,
 81,
 4,
 16,
 1.0,
 256,
 5,
 20,
 1.25,
 625,
 6,
 24,
 1.5,
 1296]

In [None]:
# filter - Return a new dataset formed by selecting those elements of source

z= rdd.filter( lambda x : x%2 == 1)

z.collect()

[1, 3, 5]

In [None]:
#Partition given list and print partition lists

l = [1,2,3,4,5,6,7,8,9,10]

P = sc.parallelize(l,4)

P.glom().collect()

[[1, 2], [3, 4], [5, 6], [7, 8, 9, 10]]

In [None]:

# mapPartitions - similar to map, but the function runs once per partition and receives an iterator over that partition's elements

def f(iterator):
    """Yield a single value: the sum of the elements of one partition."""
    partition_total = sum(iterator)
    yield partition_total

M = P.mapPartitions(f)

print(P.glom().collect())
print(M.glom().collect())

[[1, 2], [3, 4], [5, 6], [7, 8, 9, 10]]
[[3], [7], [11], [34]]


In [None]:
# mapPartitionsWithIndex - similar to mapPartitions, but the function also receives the integer index of the partition

def ff(Index, iterator):
    """Yield a single (partition index, sum of the partition's elements) pair."""
    partition_total = sum(iterator)
    yield Index, partition_total

I = P.mapPartitionsWithIndex(ff)

print(P.glom().collect())
print(I.glom().collect())

[[1, 2], [3, 4], [5, 6], [7, 8, 9, 10]]
[[(0, 3)], [(1, 7)], [(2, 11)], [(3, 34)]]


In [None]:
# sample - returns a sampled subset of the data, with or without replacement, using an optional random seed

# sample(withReplacement, fraction, seed): withReplacement=True allows the same element to appear more than once; False picks each element at most once
# No seed is passed below, so the sampled output can differ between runs.

s = sc.parallelize([1,2,3,4,5,6,7,8,9,10])

S=s.sample(True,1)

print(s.collect())
print(S.collect())



[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[2, 3, 4, 5, 5, 6, 7]


In [None]:
#union,intersection

A = sc.parallelize(range(20))
B = sc.parallelize(range(0,20,2))
print(A.collect())
print(B.collect())

C = A.union(B)
D = A.intersection(B)

print(C.collect())
print(D.collect())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
[0, 4, 8, 12, 16, 2, 6, 10, 14, 18]


In [None]:
# distinct 

D = C.distinct()
print(D.collect())

[0, 4, 8, 12, 16, 1, 5, 9, 13, 17, 2, 6, 10, 14, 18, 3, 7, 11, 15, 19]


In [None]:
# groupByKey

L = [('A',1),('A',2),('B',3),('B',4),('C',5),('C',6)]

g = sc.parallelize(L)
y= g.groupByKey()


print([(j[0],[i for i in j[1]]) for j in y.collect()] )

[('C', [5, 6]), ('A', [1, 2]), ('B', [3, 4])]


# Spark Caching 



In [None]:
# Caching - cache() keeps the data (RDD/DataFrame/Dataset) in cluster-wide memory; for an RDD the default storage level is MEMORY_ONLY. Suited to small amounts of data. Very useful when the same data is accessed repeatedly, such as querying a small data set or running an iterative algorithm.

rdd = sc.textFile('/content/sample_data/california_housing_test.csv')


In [None]:
# Getting total time taken for count before cache Enabled
# print(t.timeit(number =5))

import timeit

start = timeit.default_timer()

rdd.count()
rdd.min()
rdd.max()
rdd.collect()


end = timeit.default_timer()

print('elapsed time:',(end-start))

elapsed time: 0.6071388719999504


In [None]:
# Cache the RDD.
# cache() stores the dataset in memory by default.
# NOTE(review): per the Spark docs cache() is lazy - the data is only materialized by the next action - confirm.
rdd.cache()


/content/sample_data/california_housing_test.csv MapPartitionsRDD[85] at textFile at NativeMethodAccessorImpl.java:0

In [None]:
# Getting the total time taken for the same actions after cache is enabled
# print(t.timeit(number =5))

import timeit

start = timeit.default_timer()

rdd.count()
rdd.min()
rdd.max()
rdd.collect()


end = timeit.default_timer()

print('elapsed time:',(end-start))

elapsed time: 0.7174905510000826


# persist

In [None]:
# persist() - like cache() (stores data in memory), but the storage level is user-defined
# (memory only, disk only, memory and disk, etc.), so it can also hold larger amounts of data.

#rdd.persist(pyspark.StorageLevel.MEMORY_ONLY)
 
#rdd.persist(pyspark.StorageLevel.DISK_ONLY)

# rdd was already cached above with the default MEMORY_ONLY level. Spark refuses to
# change the storage level of an RDD that already has one, so release it first.
rdd.unpersist()
rdd.persist(pyspark.StorageLevel.MEMORY_AND_DISK)

/content/sample_data/california_housing_test.csv MapPartitionsRDD[85] at textFile at NativeMethodAccessorImpl.java:0

In [None]:
# clears cache and persist data manually
rdd.unpersist()

/content/sample_data/california_housing_test.csv MapPartitionsRDD[85] at textFile at NativeMethodAccessorImpl.java:0

# Advantages of cache and persist

In [None]:
#Cost efficient – Spark computations are very expensive hence reusing the computations are used to save cost.
#Time efficient – Reusing the repeated computations saves lots of time.
#Execution time – Saves execution time of the job and we can perform more jobs on the same cluster.

# Broadcast variables

In [None]:
# Broadcast variables allow the programmer to keep a read-only variable cached on each machine rather than shipping a copy of it with tasks.
# Spark uses efficient broadcast algorithms to reduce communication cost.

Board = sc.broadcast([1,2,3,4,5,6,7,8,9])

print(Board.value)
print(type(Board))


In [None]:
# Clear the broadcast variable by using unpersist() or destroy()

#Board.unpersist()

# destroy() must be called; the bare attribute `Board.destroy` only references the method and releases nothing.
Board.destroy()

# Accumulators

In [None]:
# Accumulators are variables that are only "added" to through an associative and commutative operation and can therefore be efficiently supported in parallel.
# They can be used to implement counters or sums.
# Spark natively supports accumulators of numeric types, and programmers can add support for new types.

accum = sc.accumulator(100)

sc.parallelize([1,2,3,4,5,6,7,8,9]).foreach(lambda x : accum.add(x))

accum.value

In [None]:
help(accum)

# Transformations  - Joins

In [None]:
#join - takes two pair of rdd ,return only matched records from both RDD's

J1 = sc.parallelize([('A',2),('B',3),('C',4),('D',5),('E',6),('F',7)])

J2 = sc.parallelize([('C',2),('D',3),('E',4),('F',5),('G',6),('H',7)])

join = J1.join(J2)

print(join.collect())


[('F', (7, 5)), ('C', (4, 2)), ('D', (5, 3)), ('E', (6, 4))]


In [None]:
# leftouterjoin  - returns matched records from both rdd and unmatched records from left rdd

left = J1.leftOuterJoin(J2)
print( left.collect())

[('A', (2, None)), ('B', (3, None)), ('F', (7, 5)), ('C', (4, 2)), ('D', (5, 3)), ('E', (6, 4))]


In [None]:
#RightOuterJoin - returns matched records from both rdd and unmatched from right rdd

right = J1.rightOuterJoin(J2)
print(right.collect())

[('F', (7, 5)), ('H', (None, 7)), ('C', (4, 2)), ('D', (5, 3)), ('E', (6, 4)), ('G', (None, 6))]


In [None]:
#fullOuterJoin - returned all records from both RDD'S

full = J1.fullOuterJoin(J2)
print(full.collect())

[('A', (2, None)), ('B', (3, None)), ('F', (7, 5)), ('H', (None, 7)), ('C', (4, 2)), ('D', (5, 3)), ('E', (6, 4)), ('G', (None, 6))]


In [None]:
#cartesian join - cross product of both elements of RDD

cert = J1.cartesian(J2)
print(cert.collect())

[(('A', 2), ('C', 2)), (('A', 2), ('D', 3)), (('A', 2), ('E', 4)), (('B', 3), ('C', 2)), (('B', 3), ('D', 3)), (('B', 3), ('E', 4)), (('C', 4), ('C', 2)), (('C', 4), ('D', 3)), (('C', 4), ('E', 4)), (('A', 2), ('F', 5)), (('A', 2), ('G', 6)), (('A', 2), ('H', 7)), (('B', 3), ('F', 5)), (('B', 3), ('G', 6)), (('B', 3), ('H', 7)), (('C', 4), ('F', 5)), (('C', 4), ('G', 6)), (('C', 4), ('H', 7)), (('D', 5), ('C', 2)), (('D', 5), ('D', 3)), (('D', 5), ('E', 4)), (('E', 6), ('C', 2)), (('E', 6), ('D', 3)), (('E', 6), ('E', 4)), (('F', 7), ('C', 2)), (('F', 7), ('D', 3)), (('F', 7), ('E', 4)), (('D', 5), ('F', 5)), (('D', 5), ('G', 6)), (('D', 5), ('H', 7)), (('E', 6), ('F', 5)), (('E', 6), ('G', 6)), (('E', 6), ('H', 7)), (('F', 7), ('F', 5)), (('F', 7), ('G', 6)), (('F', 7), ('H', 7))]


# Spark-submit local


**wc.py :**

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('wcout').getOrCreate()

<<<<< logic here >>>>>>


**syntax: spark-submit in local**

spark-submit "configuration_options" "programfile_path" file:input_path file:output_path

EX:

spark-submit /home/cloudera/wc.py file:///home/cloudera/csvfile file:///home/cloudera/wcout






# Spark-Submit Yarn cluster

**wc.py:**

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('yarn').appName('wcout').getOrCreate()

<<<<< logic here >>>>>>

**syntax: spark-submit in yarn**

spark-submit "configuration_options" "programfile_path" hdfs:input_path hdfs:output_path

spark-submit /home/cloudera/wc.py hdfs://localhost:8020/file1 hdfs://localhost:8020/wcout

# spark-submit without configuration values :


Default values on a YARN cluster:

driver memory   :  1GB memory, 
executor memory :  1GB memory, 
number of cores per executor : 1 

# FIFO/FAIR


Spark supports two schedulers:

1. FIFO (first in, first out): only one application runs at a time. A second application stays in the waiting state until the first one completes.

2. FAIR: to allow multiple applications to run on the cluster at the same time, the applications must be submitted to the Fair scheduler.


