# Filter, withColumn and when usage examples

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
spark = SparkSession.builder.appName('transformations').getOrCreate()
data = [(1, 30), (2, 25), (3, 35), (4, 40), (5, 29)]
df = spark.createDataFrame(data, schema= ['a', 'b'])

# note usage of col
filtered_df = df.filter(col('b') > 30)

filtered_df.show()

with_df = df.withColumn('c', when(col('b') > 30, '1').otherwise('0'))

with_df.show()
spark.stop()

+---+---+
|  a|  b|
+---+---+
|  3| 35|
|  4| 40|
+---+---+

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1| 30|  0|
|  2| 25|  0|
|  3| 35|  1|
|  4| 40|  1|
|  5| 29|  0|
+---+---+---+



# Way to find number of rows in a dataframe

In [14]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('display').getOrCreate()
df = spark.read.csv('data.csv', header = True)
print(df.count())
df.show()
spark.stop()

3
+---+---+
|  a|  b|
+---+---+
|  1|  2|
|  3|  4|
|  5|  6|
+---+---+



# Aggregation examples using sum, min, max and avg

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum, min, max
spark = SparkSession.builder.appName('aggregate').getOrCreate()
df = spark.read.csv('data.csv', header = True)
summ = df.agg(sum('b'))
summ.show()
minn = df.agg(min('b'))
minn.show()
maxx = df.agg(max('b'))
maxx.show()
avg = df.agg(avg('b'))
avg.show()
spark.stop()

24/10/02 18:13:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


+------+
|sum(b)|
+------+
|  12.0|
+------+

+------+
|min(b)|
+------+
|     2|
+------+

+------+
|max(b)|
+------+
|     6|
+------+

+------+
|avg(b)|
+------+
|   4.0|
+------+



# Way to save data into a csv file

In [10]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('write').getOrCreate()
data = [(1, ), (2, ), (3, ), (4, )]
df = spark.createDataFrame(data, schema= ['no'])
df.write.format('csv').mode('overwrite').option('header', 'true').save('./data1.csv')
spark.stop()

24/10/02 18:13:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

# Word count using RDD and collections module

In [16]:
from pyspark import SparkContext
from pyspark import SparkConf
from collections import Counter
sc = SparkContext.getOrCreate(SparkConf().setMaster('local[*]'))
line = sc.textFile('text.txt')
count = line.flatMap(lambda x: x.split()).collect()
c = Counter(count)
print(c)
sc.stop()

24/10/04 18:10:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

Counter({'this': 2, 'is': 2, 'a': 1, 'test.': 1, 'a.': 1})


# Displaying how parallelize, flatmap and reduceByKey work

In [15]:
from pyspark import SparkContext, SparkConf

spark = SparkContext.getOrCreate(SparkConf().setMaster('local[*]'))
rdd = spark.parallelize([['abc'], ['efg'], ['abc'], ['efg']])
print(rdd.collect())
a = rdd.flatMap(lambda x: x)
print(a.collect())
test_flat = a.flatMap(lambda x: x)
print(test_flat.collect())
b = a.map(lambda x: (x, (1, 1)))
print(b.collect())
c = b.reduceByKey(lambda x, y: x + y)
print(c.collect())
print(a)
spark.stop()

24/10/04 17:57:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


[['abc'], ['efg'], ['abc'], ['efg']]


                                                                                

['abc', 'efg', 'abc', 'efg']
['a', 'b', 'c', 'e', 'f', 'g', 'a', 'b', 'c', 'e', 'f', 'g']
[('abc', (1, 1)), ('efg', (1, 1)), ('abc', (1, 1)), ('efg', (1, 1))]
[('efg', (1, 1, 1, 1)), ('abc', (1, 1, 1, 1))]
PythonRDD[1] at collect at /tmp/ipykernel_252917/3387076186.py:7
