In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the Spark session for this notebook via getOrCreate().
spark = (
    SparkSession.builder
    .appName('transformations')
    .getOrCreate()
)

In [3]:
from pyspark.sql import Window
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [4]:
# Read the sample CSV with the first row as header and inferred column types.
# NOTE(review): absolute, machine-specific path — the cell only runs where
# this file exists.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('C:/Users/acer/Downloads/test1.csv')
)

In [5]:
# Preview the rows loaded from the CSV.
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [6]:
# Add a 'value' column computed per row as floor(rand()*9 + 1).
# NOTE(review): rand() is called without a seed, so a fresh run will
# presumably produce different values than the outputs recorded here.
df=df.withColumn('value',F.floor(F.rand()*9+1))

In [7]:
# Show the frame again, now including the 'value' column.
df.show()

+---------+---+----------+------+-----+
|     Name|age|Experience|Salary|value|
+---------+---+----------+------+-----+
|    Krish| 31|        10| 30000|    5|
|Sudhanshu| 30|         8| 25000|    2|
|    Sunny| 29|         4| 20000|    2|
|     Paul| 24|         3| 20000|    9|
|   Harsha| 21|         1| 15000|    9|
|  Shubham| 23|         2| 18000|    9|
+---------+---+----------+------+-----+



DataFrame Transformation

In [8]:
def convert(df):
    """Return ``df`` with its ``name`` column replaced by the upper-cased text."""
    upper_name = F.upper('name')
    return df.withColumn('name', upper_name)
def sal_bonus(df):
    """Return ``df`` with the ``Salary`` column multiplied by 1.2."""
    raised_salary = df['Salary'] * 1.2
    return df.withColumn('Salary', raised_salary)

In [9]:
# Apply both helper functions in sequence through DataFrame.transform.
u_df = (
    df
    .transform(convert)
    .transform(sal_bonus)
)
u_df.show()

+---------+---+----------+-------+-----+
|     name|age|Experience| Salary|value|
+---------+---+----------+-------+-----+
|    KRISH| 31|        10|36000.0|    5|
|SUDHANSHU| 30|         8|30000.0|    2|
|    SUNNY| 29|         4|24000.0|    2|
|     PAUL| 24|         3|24000.0|    9|
|   HARSHA| 21|         1|18000.0|    9|
|  SHUBHAM| 23|         2|21600.0|    9|
+---------+---+----------+-------+-----+



In [10]:
def convert(df):
    """Upper-case ``name``, add ``Salary_bonus`` (Salary * 1.2) and drop ``value``.

    NOTE(review): this re-uses the name ``convert`` from an earlier cell, so
    the earlier definition is no longer reachable once this cell has run.
    """
    return (
        df.withColumn('name', F.upper('name'))
        .withColumn('Salary_bonus', df.Salary * 1.2)
        .drop('value')
    )

In [11]:
# Apply the combined transformation and display it.
# The result is deliberately not bound to `udf`: later cells import
# pyspark.sql.functions.udf under that name and use it as a decorator, so a
# DataFrame called `udf` would break those cells when run out of order.
converted_df = df.transform(convert)
converted_df.show()

+---------+---+----------+------+------------+
|     name|age|Experience|Salary|Salary_bonus|
+---------+---+----------+------+------------+
|    KRISH| 31|        10| 30000|     36000.0|
|SUDHANSHU| 30|         8| 25000|     30000.0|
|    SUNNY| 29|         4| 20000|     24000.0|
|     PAUL| 24|         3| 20000|     24000.0|
|   HARSHA| 21|         1| 15000|     18000.0|
|  SHUBHAM| 23|         2| 18000|     21600.0|
+---------+---+----------+------+------------+



Array-Type Column Transformation

In [12]:
# Sample rows: a name paired with a list of integer scores.
data = [
    ("Alia", [80, 70, 60, 40]),
    ("Abhi", [95, 90, 80, 97]),
    ("Charlie", [65, 50, 90, 87]),
]
schema = ['Name', 'Scores']

# Build the DataFrame from the rows above and display it.
df2 = spark.createDataFrame(data, schema=schema)
df2.show()

+-------+----------------+
|   Name|          Scores|
+-------+----------------+
|   Alia|[80, 70, 60, 40]|
|   Abhi|[95, 90, 80, 97]|
|Charlie|[65, 50, 90, 87]|
+-------+----------------+



In [13]:
from pyspark.sql.functions import transform
# Add 2 to every element of the Scores array; expose it as 'new_scores'.
df3 = df2.select(
    'Name',
    F.transform('Scores', lambda score: score + 2).alias('new_scores'),
)

In [14]:
# Display the names with their incremented scores.
df3.show()

+-------+----------------+
|   Name|      new_scores|
+-------+----------------+
|   Alia|[82, 72, 62, 42]|
|   Abhi|[97, 92, 82, 99]|
|Charlie|[67, 52, 92, 89]|
+-------+----------------+



In [15]:
# Sample rows: a name paired with a list of skill strings.
data = [
    ("Alia", ['azure', 'cloud']),
    ("Balu", ['java', 'spring']),
    ("Charlie", ['python', 'sql']),
]
schema = ['Name', 'skill']

# Build the DataFrame from the rows above and display it.
d = spark.createDataFrame(data, schema=schema)
d.show()

+-------+--------------+
|   Name|         skill|
+-------+--------------+
|   Alia|[azure, cloud]|
|   Balu|[java, spring]|
|Charlie| [python, sql]|
+-------+--------------+



In [16]:
def upp(x):
    """Return ``F.upper`` applied to the column expression ``x``."""
    upper_cased = F.upper(x)
    return upper_cased
# Apply upp to each element of the skill array and show it as 'New_Skill'.
# The column is referenced as 'Skill' although it was created as 'skill';
# the recorded output shows the lookup still resolving.
d.select('Name',transform('Skill',upp).alias('New_Skill')).show()

+-------+--------------+
|   Name|     New_Skill|
+-------+--------------+
|   Alia|[AZURE, CLOUD]|
|   Balu|[JAVA, SPRING]|
|Charlie| [PYTHON, SQL]|
+-------+--------------+



In [53]:
def grace(x):
    """Add 5 grace marks to the score column ``x`` without exceeding 100.

    Args:
        x: Column expression for a single score (the array element that
            ``transform`` hands to this callback).

    Returns:
        Column: ``x + 5`` when that is at most 100, otherwise 100.
        A null score stays null.
    """
    # Keep the signature to the single parameter ``x``: this function is
    # passed to transform() as its per-element callback.
    grace_marks = 5
    max_score = 100
    boosted = x + grace_marks
    return (
        # Testing the boosted value (rather than a hand-derived threshold)
        # keeps fractional scores such as 95.5 from ending up above the cap.
        F.when(boosted <= max_score, boosted)
        # Only non-null scores are capped; with no otherwise() branch a null
        # input matches neither condition and the result is null.
        .when(x.isNotNull(), F.lit(max_score))
    )

# Apply grace to every element of the Scores array and show the result.
df2.select('Name',transform('Scores',grace).alias('Scores_with_grace')).show()

+-------+------------------+
|   Name| Scores_with_grace|
+-------+------------------+
|   Alia|  [85, 75, 65, 45]|
|   Abhi|[100, 95, 85, 100]|
|Charlie|  [70, 55, 95, 92]|
+-------+------------------+



UDF

In [18]:
# Derive df_ud from df by adding a 'bonus' column: floor(rand()*5000 + 1000).
# NOTE(review): rand() has no seed here either; the 'bonus' column is also
# replaced again in a later cell.
df_ud=df.withColumn('bonus',F.floor(F.rand()*5000+1000))
df_ud.show()

+---------+---+----------+------+-----+-----+
|     Name|age|Experience|Salary|value|bonus|
+---------+---+----------+------+-----+-----+
|    Krish| 31|        10| 30000|    5| 2834|
|Sudhanshu| 30|         8| 25000|    2| 4062|
|    Sunny| 29|         4| 20000|    2| 3080|
|     Paul| 24|         3| 20000|    9| 5225|
|   Harsha| 21|         1| 15000|    9| 4485|
|  Shubham| 23|         2| 18000|    9| 3505|
+---------+---+----------+------+-----+-----+



In [19]:
def total(s, v):
    """Return the product of ``s`` and ``v``."""
    product = s * v
    return product

In [20]:
from pyspark.sql.functions import udf
# Inline registration: wrap total() in a lambda and pass it to udf() with an
# integer return type.
tol=udf(lambda x,y:total(x,y),T.IntegerType())

In [21]:
# Apply the tol UDF to the Salary and value columns; show it as 'Total_Hike'.
df_ud.select('Name',tol(df_ud['Salary'],df_ud['value']).alias('Total_Hike')).show()

+---------+----------+
|     Name|Total_Hike|
+---------+----------+
|    Krish|    150000|
|Sudhanshu|     50000|
|    Sunny|     40000|
|     Paul|    180000|
|   Harsha|    135000|
|  Shubham|    162000|
+---------+----------+



In [22]:
# Replace the existing 'bonus' column with floor(rand()*4000 + 1000).
df_ud=df_ud.withColumn('bonus',F.floor(F.rand()*4000+1000))
df_ud.show()

+---------+---+----------+------+-----+-----+
|     Name|age|Experience|Salary|value|bonus|
+---------+---+----------+------+-----+-----+
|    Krish| 31|        10| 30000|    5| 1766|
|Sudhanshu| 30|         8| 25000|    2| 3892|
|    Sunny| 29|         4| 20000|    2| 1550|
|     Paul| 24|         3| 20000|    9| 1330|
|   Harsha| 21|         1| 15000|    9| 2972|
|  Shubham| 23|         2| 18000|    9| 4213|
+---------+---+----------+------+-----+-----+



In [23]:
# Decorator form: the function is wrapped as a UDF where it is defined.
@udf(returnType=T.IntegerType())
def bonus(s, v):
    """Return ``s + v``; declared to Spark with an integer return type."""
    combined = s + v
    return combined

In [24]:
# Apply the bonus UDF to Salary and bonus; show it as 'Sal_with_bon'.
df_ud.select('Name',bonus(df_ud['Salary'],df_ud['bonus']).alias('Sal_with_bon')).show()

+---------+------------+
|     Name|Sal_with_bon|
+---------+------------+
|    Krish|       31766|
|Sudhanshu|       28892|
|    Sunny|       21550|
|     Paul|       21330|
|   Harsha|       17972|
|  Shubham|       22213|
+---------+------------+



In [25]:
# UDFs in SQL queries: first expose df_ud as the temporary view 'peop' so it
# can be queried through spark.sql.
df_ud.createOrReplaceTempView('peop')

In [26]:
# Check the view by selecting every row from it.
spark.sql('select * from peop').show()

+---------+---+----------+------+-----+-----+
|     Name|age|Experience|Salary|value|bonus|
+---------+---+----------+------+-----+-----+
|    Krish| 31|        10| 30000|    5| 1766|
|Sudhanshu| 30|         8| 25000|    2| 3892|
|    Sunny| 29|         4| 20000|    2| 1550|
|     Paul| 24|         3| 20000|    9| 1330|
|   Harsha| 21|         1| 15000|    9| 2972|
|  Shubham| 23|         2| 18000|    9| 4213|
+---------+---+----------+------+-----+-----+



In [27]:
# Register the Python function total as the SQL function 'Total_hike' with an
# integer return type.
spark.udf.register('Total_hike',total,T.IntegerType())

<function __main__.total(s, v)>

In [28]:
# Call the registered Total_hike function from a SQL query over the peop view.
spark.sql('select name,Total_hike(Salary,value) as total_hike from peop').show()

+---------+----------+
|     name|total_hike|
+---------+----------+
|    Krish|    150000|
|Sudhanshu|     50000|
|    Sunny|     40000|
|     Paul|    180000|
|   Harsha|    135000|
|  Shubham|    162000|
+---------+----------+



In [29]:
#Activity to get range
from pyspark.sql.functions import udf
@udf(T.StringType())
def sal_rang(s):
    """Classify a salary into a band label.

    Args:
        s: Salary value for one row; ``None`` when the column is null.

    Returns:
        ``'Low'`` for ``s <= 18000``, ``'Medium'`` for ``18000 < s < 25000``,
        ``'High'`` for ``s >= 25000``, and ``None`` for a null salary.
    """
    # A null column value arrives as None; comparing None with an int raises
    # TypeError, so pass nulls through instead of failing.
    if s is None:
        return None
    if s <= 18000:
        return 'Low'
    # Reaching this point already implies s > 18000.
    if s < 25000:
        return 'Medium'
    return 'High'

In [30]:
# Add a 'Range' column computed by the sal_rang UDF from Salary.
df_ud.withColumn('Range',sal_rang(df_ud.Salary)).show()

+---------+---+----------+------+-----+-----+------+
|     Name|age|Experience|Salary|value|bonus| Range|
+---------+---+----------+------+-----+-----+------+
|    Krish| 31|        10| 30000|    5| 1766|  High|
|Sudhanshu| 30|         8| 25000|    2| 3892|  High|
|    Sunny| 29|         4| 20000|    2| 1550|Medium|
|     Paul| 24|         3| 20000|    9| 1330|Medium|
|   Harsha| 21|         1| 15000|    9| 2972|   Low|
|  Shubham| 23|         2| 18000|    9| 4213|   Low|
+---------+---+----------+------+-----+-----+------+



In [31]:
# Keep rows whose Salary is between 20000 and 50000; the recorded output shows
# the rows at exactly 20000 are kept.
df.filter(df.Salary.between(20000,50000)).show()

+---------+---+----------+------+-----+
|     Name|age|Experience|Salary|value|
+---------+---+----------+------+-----+
|    Krish| 31|        10| 30000|    5|
|Sudhanshu| 30|         8| 25000|    2|
|    Sunny| 29|         4| 20000|    2|
|     Paul| 24|         3| 20000|    9|
+---------+---+----------+------+-----+



RDD Operations

In [32]:
from pyspark import SparkContext

In [33]:
# Obtain a SparkContext via getOrCreate() for the RDD examples that follow.
sc=SparkContext.getOrCreate()

In [34]:
#map
# Build an RDD of integers, then add 10 to every element.
data = sc.parallelize([10, 20, 34, 55, 10, 35])
data.map(lambda value: value + 10).collect()

[20, 30, 44, 65, 20, 45]

In [35]:
#filter
# Keep only the elements divisible by 2.
data.filter(lambda value: value % 2 == 0).collect()

[10, 20, 34, 10]

In [36]:
#union
# Split the RDD by parity, print each half, then concatenate odd + even.
odd = data.filter(lambda value: value % 2 != 0)
even = data.filter(lambda value: value % 2 == 0)
print(odd.collect())
print(even.collect())
odd.union(even).collect()

[55, 35]
[10, 20, 34, 10]


[55, 35, 10, 20, 34, 10]

In [37]:
#flatmap
# A one-element RDD holding a sentence; flatMap flattens the pieces produced
# by splitting it, first on spaces and then on commas.
fmp = sc.parallelize(['Hello everyone, I am Varun, this is my friend Nandi'])
print(fmp.flatMap(lambda sentence: sentence.split(' ')).collect())
print(fmp.flatMap(lambda sentence: sentence.split(',')).collect())

['Hello', 'everyone,', 'I', 'am', 'Varun,', 'this', 'is', 'my', 'friend', 'Nandi']
['Hello everyone', ' I am Varun', ' this is my friend Nandi']


In [38]:
#actions
# collect(): all elements; count(): number of elements;
# take(5): the first five elements; first(): the first element.
print(data.collect())
print(data.count())
print(data.take(5))
data.first()

[10, 20, 34, 55, 10, 35]
6
[10, 20, 34, 55, 10]


10

In [39]:
#actions-reduce
# Fold the RDD into a single value by adding elements pairwise.
data.reduce(lambda left, right: left + right)

164

In [40]:
#map over (name, number) pairs: upper-case the first element, keep the second unchanged
data2=sc.parallelize([('varun',25),('karan',35),('naveen',45),('tharun',45),('varun',35),('karan',45),('naveen',35),('tharun',55)])
data2.map(lambda x:(x[0].upper(),x[1])).collect()

[('VARUN', 25),
 ('KARAN', 35),
 ('NAVEEN', 45),
 ('THARUN', 45),
 ('VARUN', 35),
 ('KARAN', 45),
 ('NAVEEN', 35),
 ('THARUN', 55)]

Pair RDD operations

In [41]:
# data2 is rebuilt here with the same pairs as in the previous cell.
data2=sc.parallelize([('varun',25),('karan',35),('naveen',45),('tharun',45),('varun',35),('karan',45),('naveen',35),('tharun',55)])
# Sum the values that share a key.
data2.reduceByKey(lambda x,y:x+y).collect()

[('varun', 60), ('karan', 80), ('naveen', 80), ('tharun', 100)]

In [42]:
# Add 10 to every value; the keys are left as they are.
data2.mapValues(lambda x:x+10).collect()

[('varun', 35),
 ('karan', 45),
 ('naveen', 55),
 ('tharun', 55),
 ('varun', 45),
 ('karan', 55),
 ('naveen', 45),
 ('tharun', 65)]

In [43]:
#collect the pairs as a Python dict; a repeated key keeps only one value
#(the recorded output shows the later one for each key)
data2.collectAsMap()

{'varun': 35, 'karan': 45, 'naveen': 35, 'tharun': 55}

In [44]:
# Sort the pairs by key, then separately sort them by value in descending order.
print(data2.sortByKey().collect())
data2.sortBy(lambda x:x[1],ascending=False).collect()

[('karan', 35), ('karan', 45), ('naveen', 45), ('naveen', 35), ('tharun', 45), ('tharun', 55), ('varun', 25), ('varun', 35)]


[('tharun', 55),
 ('naveen', 45),
 ('tharun', 45),
 ('karan', 45),
 ('karan', 35),
 ('varun', 35),
 ('naveen', 35),
 ('varun', 25)]

In [45]:
#groupby: gather the values of each key, then print every group as a list
res=data2.groupByKey().collect()
for key,value in res:
    print(key,list(value))

varun [25, 35]
karan [35, 45]
naveen [45, 35]
tharun [45, 55]


In [46]:
#count: number of elements per key, returned as a dict-like object
data2.countByKey()

defaultdict(int, {'varun': 2, 'karan': 2, 'naveen': 2, 'tharun': 2})

In [47]:
# Run countByKey() once and reuse its items for both the summary print and
# the per-key loop; the same action was previously executed twice.
r = data2.countByKey().items()
print(r)
for key, val in r:
    print(key, val)

dict_items([('varun', 2), ('karan', 2), ('naveen', 2), ('tharun', 2)])
varun 2
karan 2
naveen 2
tharun 2


In [48]:
#getting the values for a specific key (returned as a list, one entry per matching pair)
data2.lookup('varun')

[25, 35]

In [49]:
#join on keys: each shared key is paired with a (left value, right value) tuple
rdd1 = sc.parallelize([("a", 1), ("b", 2)])
rdd2 = sc.parallelize([("a", 3), ("b", 4)])
rdd1.join(rdd2).collect()

[('a', (1, 3)), ('b', (2, 4))]

Broadcast Variables and Accumulators

In [50]:
# Broadcast the grade lookup dict, then map each key of the RDD to its grade.
grades={'a':50,'b':40,'c':30,'d':20}
broad_grad=sc.broadcast(grades)
d = sc.parallelize(["a", "b", "c", "d","e"])
# Keys missing from the dict (here "e") fall back to 0.
d.map(lambda x:broad_grad.value.get(x,0)).collect()

[50, 40, 30, 20, 0]

In [51]:
#accumulators
# Sum the elements of `data` into an accumulator and print the final value.
# The accumulator is not called `total`: that name is the function used by the
# UDF cells earlier in the notebook, and rebinding it would break them on re-run.
running_total = sc.accumulator(0)
def rdd_sum(x):
    """Add one RDD element ``x`` to the ``running_total`` accumulator."""
    # `+=` rebinds the name, so it has to be declared global.
    global running_total
    running_total += x
data.foreach(rdd_sum)
print(running_total.value)

164


In [55]:
# Same accumulator pattern on a fresh five-element RDD.
accum = sc.accumulator(0)
data = sc.parallelize([1, 2, 3, 4, 5])


def add_to_accum(x):
    """Add ``x`` to the module-level ``accum`` accumulator."""
    accum.add(x)


data.foreach(add_to_accum)
accum.value

15