<a href="https://colab.research.google.com/github/TABEYWICKRAMA/BigData/blob/main/PySpark_HandsOnExperience_V5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src='https://drive.google.com/uc?id=1SK8a3TPK2Q7kDmmVjDXn21CwDcNxrh9j'>

<img src='https://drive.google.com/uc?id=155Mc13qa5TcgX4JZ-FaPW-eQkarWFgIv'>

In [106]:
#install PySpark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [107]:
# Import SparkContext and SparkConf
from pyspark import SparkContext, SparkConf

In [108]:
# Configuration for a Spark application named "PysparkBasics" with master set to "local".
conf = SparkConf().setAppName("PysparkBasics").setMaster("local")

# getOrCreate reuses an already-running SparkContext if one exists, so re-running
# this cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [109]:
# Uncomment and run the line below only when you are finished, to stop the SparkContext:
# sc.stop()

In [110]:
# Build an RDD from a local Python list; collect() returns its elements as a Python list.
num = sc.parallelize([5,5,4,3,2,9,2])
num.collect()

[5, 5, 4, 3, 2, 9, 2]

In [111]:
# map applies the function to every element; here each value is doubled.
num.map(lambda value: value * 2).collect()

[10, 10, 8, 6, 4, 18, 4]

In [112]:
# Square every element of the RDD.
num.map(lambda value: value ** 2).collect()

[25, 25, 16, 9, 4, 81, 4]

In [113]:
# RDD of strings, reused by the map, filter and groupBy examples in this notebook.
names = sc.parallelize(["Bills","Mark","Brain","Mick"])

In [114]:
# map also works on strings: prepend a title to every name.
names.map(lambda name: "Mr. " + name).collect()

['Mr. Bills', 'Mr. Mark', 'Mr. Brain', 'Mr. Mick']

## **FlatMap**

In [115]:
# Small integer RDD used as the input of the flatMap example below.
rdd = sc.parallelize([ 2,3,4])
rdd.collect()

[2, 3, 4]

In [116]:
# Plain-Python reminder: range(1, 3) yields 1 and 2 -- the stop value is excluded.
a = range(1,3)
for i in a:
  print(i)

1
2


In [117]:
# flatMap flattens the iterable produced for each element into one RDD:
# 2 -> [1], 3 -> [1, 2], 4 -> [1, 2, 3].
rdd.flatMap(lambda upper: range(1, upper)).collect()

[1, 1, 2, 1, 2, 3]

In [118]:
# Integer RDD used by the flatMap example in the next cell.
# Note: this rebinds `a`, which held a plain Python range in an earlier cell.
a = sc.parallelize([1,2,3])

In [119]:
# Each element yields the pair (n, n * 10.57); flatMap flattens the pairs,
# so the result alternates original values with their scaled counterparts.
b = a.flatMap(lambda n: (n, n * 10.57))
b.collect()

[1, 10.57, 2, 21.14, 3, 31.71]

## **Filter**

In [120]:
# Show the source RDD again for reference before filtering it.
num.collect()

[5, 5, 4, 3, 2, 9, 2]

In [121]:
# filter keeps the elements for which the predicate is true: the even numbers here.
num.filter(lambda n: n % 2 == 0).collect()

[4, 2, 2]

In [122]:
# Show the names RDD for reference before filtering it.
names.collect()

['Bills', 'Mark', 'Brain', 'Mick']

In [123]:
# Keep only the names that contain an upper-case "B" (the test is case-sensitive).
names.filter(lambda name: "B" in name).collect()

['Bills', 'Brain']

## **Union**

In [124]:
# First operand of the union example.
num.collect()

[5, 5, 4, 3, 2, 9, 2]

In [125]:
# Second integer RDD, used as the other operand in the union, intersection
# and subtract examples.
num2 = sc.parallelize([1,7,9,4,10,15])
num2.collect()

[1, 7, 9, 4, 10, 15]

In [126]:
# union concatenates both RDDs without removing duplicates:
# 4 and 9 occur in both inputs and therefore twice in the result.
num.union(num2).collect()

[5, 5, 4, 3, 2, 9, 2, 1, 7, 9, 4, 10, 15]

In [127]:
# Two small RDDs created with an explicit number of partitions (numSlices).
x = sc.parallelize([1, 2, 3], numSlices=2)
y = sc.parallelize([3, 4], numSlices=1)

In [128]:
# Union of x and y; the value 3 is present in both and is kept twice.
z = x.union(y)
z.collect()

[1, 2, 3, 3, 4]

## **Sample**

In [129]:
# RDD of the integers 1 to 9 (the range stop value is excluded), used by the sampling examples.
parallel = sc.parallelize(range(1,10))
parallel.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [130]:
# Random sample with replacement. No seed is passed, so every run may return
# different elements, and the number of elements returned varies as well.
parallel.sample(withReplacement=True, fraction=0.2).collect()

[3, 5]

In [131]:
# Unseeded sample with replacement, repeated to show that the result changes between runs.
parallel.sample(withReplacement=True, fraction=0.2).collect()

[3, 6]

In [132]:
# Unseeded sample with replacement, repeated to show that the result changes between runs.
parallel.sample(withReplacement=True, fraction=0.2).collect()

[5, 8]

In [133]:
# Unseeded sample with replacement; the size of the result is not fixed either.
parallel.sample(withReplacement=True, fraction=0.2).collect()

[5]

In [134]:
# Unseeded sample with replacement, repeated to show that the result changes between runs.
parallel.sample(withReplacement=True, fraction=0.2).collect()

[7, 8]

In [135]:
# Unseeded sample with replacement; because elements are drawn with replacement,
# the same value can show up more than once in the result.
parallel.sample(withReplacement=True, fraction=0.2).collect()

[2, 2, 7]

In [136]:
# Unseeded sample with replacement, repeated to show that the result changes between runs.
parallel.sample(withReplacement=True, fraction=0.2).collect()

[2, 4]

In [137]:
# Sample without replacement, passing an explicit seed for the random generator.
parallel.sample(withReplacement=False, fraction=0.2, seed=23).collect()

[4, 5]

## **Wide Transformation**

### 01. GroupBy

In [138]:
# Show the names RDD for reference before grouping it.
names.collect()

['Bills', 'Mark', 'Brain', 'Mick']

In [139]:
# Group the names by their first character; the key function returns that character.
names_gr = names.groupBy(lambda name: name[0])

In [140]:
# collect() returns (key, ResultIterable) pairs: the grouped values are not shown
# directly, so the next cell converts each group with list().
names_gr.collect()

[('B', <pyspark.resultiterable.ResultIterable at 0x7fdde2ff87c0>),
 ('M', <pyspark.resultiterable.ResultIterable at 0x7fdde2fa6dc0>)]

In [141]:
# Convert each group's ResultIterable to a list so that its members are printed.
for initial, members in names_gr.collect():
    print(initial, list(members))

B ['Bills', 'Brain']
M ['Mark', 'Mick']


In [142]:
# Group the numbers by parity: key 1 for odd values, key 0 for even values.
aa = sc.parallelize([1, 1, 2, 3, 5, 8])
result = aa.groupBy(lambda n: n % 2).collect()


In [143]:
# Print every parity key together with the values that were grouped under it.
for parity, members in result:
    print(parity, list(members))

1 [1, 1, 3, 5]
0 [2, 8]


### 02. Intersection

In [144]:
# First operand of the intersection and subtract examples.
num.collect()

[5, 5, 4, 3, 2, 9, 2]

In [145]:
# Second operand of the intersection and subtract examples.
num2.collect()

[1, 7, 9, 4, 10, 15]

In [146]:
# intersection keeps the values that are present in both RDDs.
num.intersection(num2).collect()

[4, 9]

In [147]:
# Same operation with the operands swapped; the result shown is the same.
num2.intersection(num).collect()

[4, 9]

In [148]:
# subtract keeps the elements of num that do not occur in num2;
# duplicates in num (5 and 2) are preserved in the result.
num.subtract(num2).collect()

[2, 2, 5, 5, 3]

In [149]:
# subtract is not symmetric: this keeps the elements of num2 that do not occur in num.
num2.subtract(num).collect()

[10, 1, 7, 15]

### 03. Distinct

In [150]:
# Source RDD; note that 5 and 2 each occur twice.
num.collect()

[5, 5, 4, 3, 2, 9, 2]

In [151]:
# distinct removes duplicate values: 5 and 2 each appear only once in the result.
num.distinct().collect()

[5, 4, 3, 2, 9]