In [0]:
# Creating a SparkSession

from pyspark.sql import SparkSession
from pyspark import SparkContext
# Build the session through the builder chain (or pick up an existing one);
# the bare name on the last line makes the notebook display the session object.
spark = (
    SparkSession.builder
    .appName('PySpark Coding challenge')
    .getOrCreate()
)
spark

In [0]:
# Build the employee DataFrame from in-memory rows.
# Each tuple follows the order of `columns`.
columns = ["Name", "Department", "State", "Salary", "Age", "Bonus"]

data = [
    ("Sona", "Sales", "NY", 90000, 34, 10000),
    ("Mona", "Sales", "NY", 86000, 56, 20000),
    ("Bannu", "Sales", "CA", 81000, 30, 23000),
    ("Chinnu", "Finance", "CA", 90000, 24, 23000),
    ("Sunny", "Finance", "CA", 99000, 40, 24000),
    ("Raju", "Finance", "NY", 83000, 36, 19000),
    ("ravi", "Finance", "NY", 79000, 53, 15000),
    ("Dhana", "Marketing", "CA", 80000, 25, 18000),
    ("Ram", "Marketing", "NY", 91000, 50, 21000),
]

df1 = spark.createDataFrame(data, schema=columns)
df1.show()

+------+----------+-----+------+---+-----+
|  Name|Department|State|Salary|Age|Bonus|
+------+----------+-----+------+---+-----+
|  Sona|     Sales|   NY| 90000| 34|10000|
|  Mona|     Sales|   NY| 86000| 56|20000|
| Bannu|     Sales|   CA| 81000| 30|23000|
|Chinnu|   Finance|   CA| 90000| 24|23000|
| Sunny|   Finance|   CA| 99000| 40|24000|
|  Raju|   Finance|   NY| 83000| 36|19000|
|  ravi|   Finance|   NY| 79000| 53|15000|
| Dhana| Marketing|   CA| 80000| 25|18000|
|   Ram| Marketing|   NY| 91000| 50|21000|
+------+----------+-----+------+---+-----+



In [0]:
############ Manipulating

# 1) Print the schema: each column's name, data type and nullability
df1.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- Age: long (nullable = true)
 |-- Bonus: long (nullable = true)



In [0]:
# 2) Display only the first two records
df1.show(n=2)

+----+----------+-----+------+---+-----+
|Name|Department|State|Salary|Age|Bonus|
+----+----------+-----+------+---+-----+
|Sona|     Sales|   NY| 90000| 34|10000|
|Mona|     Sales|   NY| 86000| 56|20000|
+----+----------+-----+------+---+-----+
only showing top 2 rows



In [0]:
# 3) Count the number of rows in the DataFrame
print(df1.count())

# 4) Count the number of columns
# (df1.columns is a plain Python list of column names, so len() gives the count)
print(len(df1.columns))

# 5) Names of the columns (displayed as the cell's result)
df1.columns



9
6


['Name', 'Department', 'State', 'Salary', 'Age', 'Bonus']

In [0]:
# 6) Adding new columns
from pyspark.sql.functions import lit
# Append a constant-valued column "Marks"; lit() wraps the Python value as a Column.
# withColumn returns a new DataFrame, so df1 itself is left unchanged.
marks_column = lit(90)
df1.withColumn("Marks", marks_column).show()

# 7) select(): project a subset of columns, referenced here by name
df1.select("Department", "State").show()

+------+----------+-----+------+---+-----+-----+
|  Name|Department|State|Salary|Age|Bonus|Marks|
+------+----------+-----+------+---+-----+-----+
|  Sona|     Sales|   NY| 90000| 34|10000|   90|
|  Mona|     Sales|   NY| 86000| 56|20000|   90|
| Bannu|     Sales|   CA| 81000| 30|23000|   90|
|Chinnu|   Finance|   CA| 90000| 24|23000|   90|
| Sunny|   Finance|   CA| 99000| 40|24000|   90|
|  Raju|   Finance|   NY| 83000| 36|19000|   90|
|  ravi|   Finance|   NY| 79000| 53|15000|   90|
| Dhana| Marketing|   CA| 80000| 25|18000|   90|
|   Ram| Marketing|   NY| 91000| 50|21000|   90|
+------+----------+-----+------+---+-----+-----+

+----------+-----+
|Department|State|
+----------+-----+
|     Sales|   NY|
|     Sales|   NY|
|     Sales|   CA|
|   Finance|   CA|
|   Finance|   CA|
|   Finance|   NY|
|   Finance|   NY|
| Marketing|   CA|
| Marketing|   NY|
+----------+-----+



In [0]:
######## Dropping
# 1) Drop a single column from the DataFrame

# drop() returns a new DataFrame; build it once and inspect it twice.
without_age = df1.drop("Age")
without_age.printSchema()
without_age.show()

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- Bonus: long (nullable = true)

+------+----------+-----+------+-----+
|  Name|Department|State|Salary|Bonus|
+------+----------+-----+------+-----+
|  Sona|     Sales|   NY| 90000|10000|
|  Mona|     Sales|   NY| 86000|20000|
| Bannu|     Sales|   CA| 81000|23000|
|Chinnu|   Finance|   CA| 90000|23000|
| Sunny|   Finance|   CA| 99000|24000|
|  Raju|   Finance|   NY| 83000|19000|
|  ravi|   Finance|   NY| 79000|15000|
| Dhana| Marketing|   CA| 80000|18000|
|   Ram| Marketing|   NY| 91000|21000|
+------+----------+-----+------+-----+



In [0]:
# 2) Drop several columns in one call
without_name_dept = df1.drop("Name", "Department")
without_name_dept.printSchema()
without_name_dept.show()

root
 |-- State: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- Age: long (nullable = true)
 |-- Bonus: long (nullable = true)

+-----+------+---+-----+
|State|Salary|Age|Bonus|
+-----+------+---+-----+
|   NY| 90000| 34|10000|
|   NY| 86000| 56|20000|
|   CA| 81000| 30|23000|
|   CA| 90000| 24|23000|
|   CA| 99000| 40|24000|
|   NY| 83000| 36|19000|
|   NY| 79000| 53|15000|
|   CA| 80000| 25|18000|
|   NY| 91000| 50|21000|
+-----+------+---+-----+



In [0]:
############# Sorting
# Both sort() and orderBy() order a PySpark DataFrame, ascending or descending,
# by one or several columns.

# 1) sort()

# i) one column, ascending — the direction is expressed on the column itself
df1.sort(df1["Salary"].asc()).show()

+------+----------+-----+------+---+-----+
|  Name|Department|State|Salary|Age|Bonus|
+------+----------+-----+------+---+-----+
|  ravi|   Finance|   NY| 79000| 53|15000|
| Dhana| Marketing|   CA| 80000| 25|18000|
| Bannu|     Sales|   CA| 81000| 30|23000|
|  Raju|   Finance|   NY| 83000| 36|19000|
|  Mona|     Sales|   NY| 86000| 56|20000|
|  Sona|     Sales|   NY| 90000| 34|10000|
|Chinnu|   Finance|   CA| 90000| 24|23000|
|   Ram| Marketing|   NY| 91000| 50|21000|
| Sunny|   Finance|   CA| 99000| 40|24000|
+------+----------+-----+------+---+-----+



In [0]:
# ii) one column, descending — the direction is expressed on the column itself

df1.sort(df1["Salary"].desc()).show()

+------+----------+-----+------+---+-----+
|  Name|Department|State|Salary|Age|Bonus|
+------+----------+-----+------+---+-----+
| Sunny|   Finance|   CA| 99000| 40|24000|
|   Ram| Marketing|   NY| 91000| 50|21000|
|  Sona|     Sales|   NY| 90000| 34|10000|
|Chinnu|   Finance|   CA| 90000| 24|23000|
|  Mona|     Sales|   NY| 86000| 56|20000|
|  Raju|   Finance|   NY| 83000| 36|19000|
| Bannu|     Sales|   CA| 81000| 30|23000|
| Dhana| Marketing|   CA| 80000| 25|18000|
|  ravi|   Finance|   NY| 79000| 53|15000|
+------+----------+-----+------+---+-----+



In [0]:
# Order by several columns with orderBy(): Department first, then Salary within it

df1.orderBy(["Department", "Salary"]).show()

+------+----------+-----+------+---+-----+
|  Name|Department|State|Salary|Age|Bonus|
+------+----------+-----+------+---+-----+
|  ravi|   Finance|   NY| 79000| 53|15000|
|  Raju|   Finance|   NY| 83000| 36|19000|
|Chinnu|   Finance|   CA| 90000| 24|23000|
| Sunny|   Finance|   CA| 99000| 40|24000|
| Dhana| Marketing|   CA| 80000| 25|18000|
|   Ram| Marketing|   NY| 91000| 50|21000|
| Bannu|     Sales|   CA| 81000| 30|23000|
|  Mona|     Sales|   NY| 86000| 56|20000|
|  Sona|     Sales|   NY| 90000| 34|10000|
+------+----------+-----+------+---+-----+



In [0]:
# Group-by and aggregations
# 1) Total salary per department (dict form: column name -> aggregate name)
df1.groupBy("Department").agg({"Salary": "sum"}).show()

+----------+-----------+
|Department|sum(Salary)|
+----------+-----------+
|     Sales|     257000|
|   Finance|     351000|
| Marketing|     171000|
+----------+-----------+



In [0]:
# 2) Number of employees in each department
(
    df1
    .groupBy("Department")
    .count()
    .show()
)

+----------+-----+
|Department|count|
+----------+-----+
|     Sales|    3|
|   Finance|    4|
| Marketing|    2|
+----------+-----+



In [0]:
# 3) Lowest salary in each department (dict form: column name -> aggregate name)
df1.groupBy("Department").agg({"Salary": "min"}).show()

+----------+-----------+
|Department|min(Salary)|
+----------+-----------+
|     Sales|      81000|
|   Finance|      79000|
| Marketing|      80000|
+----------+-----------+



In [0]:
# 4) Highest salary in each department (dict form: column name -> aggregate name)
df1.groupBy("Department").agg({"Salary": "max"}).show()

+----------+-----------+
|Department|max(Salary)|
+----------+-----------+
|     Sales|      90000|
|   Finance|      99000|
| Marketing|      91000|
+----------+-----------+



In [0]:
# 5) Average salary of each department
(
    df1
    .groupBy("Department")
    .avg("Salary")
    .show()
)

+----------+-----------------+
|Department|      avg(Salary)|
+----------+-----------------+
|     Sales|85666.66666666667|
|   Finance|          87750.0|
| Marketing|          85500.0|
+----------+-----------------+



In [0]:
# 6) Mean salary per department
# (the resulting column is labelled avg(Salary), same as in the avg() example)
(
    df1
    .groupBy("Department")
    .mean("Salary")
    .show()
)

+----------+-----------------+
|Department|      avg(Salary)|
+----------+-----------------+
|     Sales|85666.66666666667|
|   Finance|          87750.0|
| Marketing|          85500.0|
+----------+-----------------+



In [0]:
# 7) Group by several columns:
# total Salary and total Bonus for every (Department, State) pair
(
    df1
    .groupBy("Department", "State")
    .sum("Salary", "Bonus")
    .show()
)



+----------+-----+-----------+----------+
|Department|State|sum(Salary)|sum(Bonus)|
+----------+-----+-----------+----------+
|     Sales|   NY|     176000|     30000|
|     Sales|   CA|      81000|     23000|
|   Finance|   CA|     189000|     47000|
|   Finance|   NY|     162000|     34000|
| Marketing|   NY|      91000|     21000|
| Marketing|   CA|      80000|     18000|
+----------+-----+-----------+----------+



In [0]:
# 8) Running several aggregates in a single agg() call
# The functions module is imported under an alias rather than importing
# sum/min/max/... by name: a by-name import would replace Python's built-in
# sum(), min() and max() for every later cell in the notebook.
from pyspark.sql import functions as F

df1.groupBy("Department").agg(
    F.sum("Salary").alias("sum_salary"),
    F.avg("Salary").alias("avg_salary"),
    F.min("Salary").alias("Min_salary"),
    F.max("Salary").alias("Max_Salary"),
    F.mean("Salary").alias("Mean Salary"),  # same values as avg_salary
    F.count("*").alias("Total count"),      # rows per department
).show()

+----------+----------+-----------------+----------+----------+-----------------+-----------+
|Department|sum_salary|       avg_salary|Min_salary|Max_Salary|      Mean Salary|Total count|
+----------+----------+-----------------+----------+----------+-----------------+-----------+
|     Sales|    257000|85666.66666666667|     81000|     90000|85666.66666666667|          3|
|   Finance|    351000|          87750.0|     79000|     99000|          87750.0|          4|
| Marketing|    171000|          85500.0|     80000|     91000|          85500.0|          2|
+----------+----------+-----------------+----------+----------+-----------------+-----------+



In [0]:
# Joins

# Left-hand input for the join examples: employee personal details.
# NOTE: this rebinds `df1` (and `data`), replacing the salary DataFrame used
# in the earlier cells.
cols = ["ID", "Name", "Company"]
data = [
    ("1", "Sona", "TCS"),
    ("2", "Mona", "Infosys"),
    ("3", "Bannu", "Delloite"),
    ("4", "Sunny", "TCS"),
    ("5", "Chinnu", "Accenture"),
]
df1 = spark.createDataFrame(data=data, schema=cols)
df1.show()

+---+------+---------+
| ID|  Name|  Company|
+---+------+---------+
|  1|  Sona|      TCS|
|  2|  Mona|  Infosys|
|  3| Bannu| Delloite|
|  4| Sunny|      TCS|
|  5|Chinnu|Accenture|
+---+------+---------+



In [0]:
# Right-hand input for the join examples: employee salary details.
# IDs 1, 2 and 5 also occur in df1; ID 6 exists only here.
cols = ["ID", "Salary", "Department"]
data = [
    ("1", 45000, "HR"),
    ("2", 90000, "Manager"),
    ("6", 78000, "IT"),
    ("5", 50000, "Sales"),
]

df2 = spark.createDataFrame(data, schema=cols)
df2.show()

+---+------+----------+
| ID|Salary|Department|
+---+------+----------+
|  1| 45000|        HR|
|  2| 90000|   Manager|
|  6| 78000|        IT|
|  5| 50000|     Sales|
+---+------+----------+



In [0]:
# 1) Inner join: keep only the IDs present in both frames
df1.join(df2, on=df1.ID == df2.ID, how="inner").show()

+---+------+---------+---+------+----------+
| ID|  Name|  Company| ID|Salary|Department|
+---+------+---------+---+------+----------+
|  1|  Sona|      TCS|  1| 45000|        HR|
|  2|  Mona|  Infosys|  2| 90000|   Manager|
|  5|Chinnu|Accenture|  5| 50000|     Sales|
+---+------+---------+---+------+----------+



In [0]:
# 2) Outer join: keep every ID from either frame, NULL-filling the missing side
df1.join(df2, on=df1.ID == df2.ID, how="outer").show()


+----+------+---------+----+------+----------+
|  ID|  Name|  Company|  ID|Salary|Department|
+----+------+---------+----+------+----------+
|   1|  Sona|      TCS|   1| 45000|        HR|
|   2|  Mona|  Infosys|   2| 90000|   Manager|
|   3| Bannu| Delloite|NULL|  NULL|      NULL|
|   4| Sunny|      TCS|NULL|  NULL|      NULL|
|   5|Chinnu|Accenture|   5| 50000|     Sales|
|NULL|  NULL|     NULL|   6| 78000|        IT|
+----+------+---------+----+------+----------+



In [0]:
# 3) Left join: keep every row of df1, NULL-filling unmatched df2 columns
df1.join(df2, on=df1.ID == df2.ID, how="leftouter").show()

+---+------+---------+----+------+----------+
| ID|  Name|  Company|  ID|Salary|Department|
+---+------+---------+----+------+----------+
|  1|  Sona|      TCS|   1| 45000|        HR|
|  2|  Mona|  Infosys|   2| 90000|   Manager|
|  3| Bannu| Delloite|NULL|  NULL|      NULL|
|  4| Sunny|      TCS|NULL|  NULL|      NULL|
|  5|Chinnu|Accenture|   5| 50000|     Sales|
+---+------+---------+----+------+----------+



In [0]:
# 4) Right join: keep every row of df2, NULL-filling unmatched df1 columns
df1.join(df2, on=df1.ID == df2.ID, how="right").show()

+----+------+---------+---+------+----------+
|  ID|  Name|  Company| ID|Salary|Department|
+----+------+---------+---+------+----------+
|   1|  Sona|      TCS|  1| 45000|        HR|
|   2|  Mona|  Infosys|  2| 90000|   Manager|
|NULL|  NULL|     NULL|  6| 78000|        IT|
|   5|Chinnu|Accenture|  5| 50000|     Sales|
+----+------+---------+---+------+----------+



In [0]:
# 5) Left semi join: df1 rows that have a match in df2 (only df1's columns are returned)
df1.join(df2, on=df1.ID == df2.ID, how="leftsemi").show()

+---+------+---------+
| ID|  Name|  Company|
+---+------+---------+
|  1|  Sona|      TCS|
|  2|  Mona|  Infosys|
|  5|Chinnu|Accenture|
+---+------+---------+



In [0]:
# 6) Left anti join: df1 rows that have no match in df2
df1.join(df2, on=df1.ID == df2.ID, how="leftanti").show()

+---+-----+--------+
| ID| Name| Company|
+---+-----+--------+
|  3|Bannu|Delloite|
|  4|Sunny|     TCS|
+---+-----+--------+



In [0]:
# Question-2
from pyspark.sql import SparkSession
# NOTE(review): a session was already created in the first cell; getOrCreate()
# presumably hands that one back instead of starting a new one — confirm whether
# the new appName is meant to take effect.
spark = (
    SparkSession.builder
    .appName("SparkCreateTableExample")
    .getOrCreate()
)

In [0]:
# Create the database `db` unless it already exists.
# NOTE(review): the tables created in the following cells are not qualified
# with `db` and no `USE db` is issued, so they appear to be created in the
# session's current database rather than in `db` — confirm the intent.
create_db_sql = "CREATE DATABASE IF NOT EXISTS db;"
spark.sql(create_db_sql)


DataFrame[]

In [0]:
# Create the Employee3 table (skipped when it already exists)
create_employee_sql = "CREATE TABLE IF NOT EXISTS Employee3(emp_id Int, emp_name String,Salary INT,age INT,city String)"
spark.sql(create_employee_sql)

DataFrame[]

In [0]:

# Load the seven employee rows into Employee3.
# INSERT OVERWRITE replaces the table's contents instead of appending, so
# re-running this cell (or the whole notebook against the already-existing
# table) always leaves exactly these rows rather than piling up duplicates.
spark.sql("""
    INSERT OVERWRITE TABLE Employee3 VALUES
        (1, 'Sona', 80000, 22, 'Hyderabad'),
        (2, 'Sunny', 70000, 23, 'Pune'),
        (3, 'Mona', 90000, 19, 'Hyderabad'),
        (7, 'Chinnu', 45000, 25, 'Mumbai'),
        (10, 'Bannu', 12, 27, 'Pune'),
        (6, 'Raju', 70000, 45, 'Mumbai'),
        (18, 'Dhana', 89999, 34, 'Kolkata')
""")

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Show every row currently stored in employee3
employee_query = " select * from employee3"
spark.sql(employee_query).show()

+------+--------+------+---+---------+
|emp_id|emp_name|Salary|age|     city|
+------+--------+------+---+---------+
|     2|   Sunny| 70000| 23|     Pune|
|     3|    Mona| 90000| 19|Hyderabad|
|     7|  Chinnu| 45000| 25|   Mumbai|
|    10|   Bannu|    12| 27|     Pune|
|     6|    Raju| 70000| 45|   Mumbai|
|    18|   Dhana| 89999| 34|  Kolkata|
|     1|    Sona| 80000| 22|Hyderabad|
+------+--------+------+---+---------+



In [0]:
# Joins
# Create a second table, Department1, to join against Employee3
spark.sql("create table if not exists Department1(dept_id int,emp_id int,dept_name string)")

# Load the department rows.
# INSERT OVERWRITE replaces the table's contents instead of appending, so
# re-running this cell never duplicates rows in the already-existing table.
spark.sql("""
    INSERT OVERWRITE TABLE Department1 VALUES
        (100, 1, 'IT'),
        (102, 6, 'HR'),
        (103, 13, 'Manager'),
        (109, 7, 'Developer'),
        (110, 10, 'Tester')
""")


DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Show every row currently stored in department1
department_query = " select * from department1"
spark.sql(department_query).show()

+-------+------+---------+
|dept_id|emp_id|dept_name|
+-------+------+---------+
|    100|     1|       IT|
|    102|     6|       HR|
|    103|    13|  Manager|
|    109|     7|Developer|
|    110|    10|   Tester|
+-------+------+---------+



In [0]:
# Inner join: employees that have a matching department row
inner_join_query = "select * from employee3  inner join  department1 on employee3.emp_id=department1.emp_id"
spark.sql(inner_join_query).show()

+------+--------+------+---+---------+-------+------+---------+
|emp_id|emp_name|Salary|age|     city|dept_id|emp_id|dept_name|
+------+--------+------+---+---------+-------+------+---------+
|     7|  Chinnu| 45000| 25|   Mumbai|    109|     7|Developer|
|    10|   Bannu|    12| 27|     Pune|    110|    10|   Tester|
|     6|    Raju| 70000| 45|   Mumbai|    102|     6|       HR|
|     1|    Sona| 80000| 22|Hyderabad|    100|     1|       IT|
+------+--------+------+---+---------+-------+------+---------+



In [0]:
# Outer join
# The same full outer join written with both spellings: FULL OUTER JOIN and FULL JOIN
full_outer_queries = (
    "select * from employee3  full outer join  department1 on employee3.emp_id=department1.emp_id",
    "select * from employee3  full join  department1 on employee3.emp_id=department1.emp_id",
)
for query in full_outer_queries:
    spark.sql(query).show()

+------+--------+------+----+---------+-------+------+---------+
|emp_id|emp_name|Salary| age|     city|dept_id|emp_id|dept_name|
+------+--------+------+----+---------+-------+------+---------+
|     1|    Sona| 80000|  22|Hyderabad|    100|     1|       IT|
|     2|   Sunny| 70000|  23|     Pune|   NULL|  NULL|     NULL|
|     3|    Mona| 90000|  19|Hyderabad|   NULL|  NULL|     NULL|
|     6|    Raju| 70000|  45|   Mumbai|    102|     6|       HR|
|     7|  Chinnu| 45000|  25|   Mumbai|    109|     7|Developer|
|    10|   Bannu|    12|  27|     Pune|    110|    10|   Tester|
|  NULL|    NULL|  NULL|NULL|     NULL|    103|    13|  Manager|
|    18|   Dhana| 89999|  34|  Kolkata|   NULL|  NULL|     NULL|
+------+--------+------+----+---------+-------+------+---------+

+------+--------+------+----+---------+-------+------+---------+
|emp_id|emp_name|Salary| age|     city|dept_id|emp_id|dept_name|
+------+--------+------+----+---------+-------+------+---------+
|     1|    Sona| 80000|

In [0]:
# Left join
# The same left join written with both spellings: LEFT JOIN and LEFT OUTER JOIN
left_join_queries = (
    "select * from employee3  left join  department1 on employee3.emp_id=department1.emp_id",
    "select * from employee3  left outer join  department1 on employee3.emp_id=department1.emp_id",
)
for query in left_join_queries:
    spark.sql(query).show()

+------+--------+------+---+---------+-------+------+---------+
|emp_id|emp_name|Salary|age|     city|dept_id|emp_id|dept_name|
+------+--------+------+---+---------+-------+------+---------+
|     2|   Sunny| 70000| 23|     Pune|   NULL|  NULL|     NULL|
|     3|    Mona| 90000| 19|Hyderabad|   NULL|  NULL|     NULL|
|     7|  Chinnu| 45000| 25|   Mumbai|    109|     7|Developer|
|    10|   Bannu|    12| 27|     Pune|    110|    10|   Tester|
|     6|    Raju| 70000| 45|   Mumbai|    102|     6|       HR|
|    18|   Dhana| 89999| 34|  Kolkata|   NULL|  NULL|     NULL|
|     1|    Sona| 80000| 22|Hyderabad|    100|     1|       IT|
+------+--------+------+---+---------+-------+------+---------+

+------+--------+------+---+---------+-------+------+---------+
|emp_id|emp_name|Salary|age|     city|dept_id|emp_id|dept_name|
+------+--------+------+---+---------+-------+------+---------+
|     2|   Sunny| 70000| 23|     Pune|   NULL|  NULL|     NULL|
|     3|    Mona| 90000| 19|Hyderabad| 

In [0]:
# Right join
# The same right join written with both spellings: RIGHT OUTER JOIN and RIGHT JOIN
right_join_queries = (
    "select * from employee3  right outer join  department1 on employee3.emp_id=department1.emp_id",
    "select * from employee3  right join  department1 on employee3.emp_id=department1.emp_id",
)
for query in right_join_queries:
    spark.sql(query).show()

+------+--------+------+----+---------+-------+------+---------+
|emp_id|emp_name|Salary| age|     city|dept_id|emp_id|dept_name|
+------+--------+------+----+---------+-------+------+---------+
|     1|    Sona| 80000|  22|Hyderabad|    100|     1|       IT|
|     6|    Raju| 70000|  45|   Mumbai|    102|     6|       HR|
|  NULL|    NULL|  NULL|NULL|     NULL|    103|    13|  Manager|
|     7|  Chinnu| 45000|  25|   Mumbai|    109|     7|Developer|
|    10|   Bannu|    12|  27|     Pune|    110|    10|   Tester|
+------+--------+------+----+---------+-------+------+---------+

+------+--------+------+----+---------+-------+------+---------+
|emp_id|emp_name|Salary| age|     city|dept_id|emp_id|dept_name|
+------+--------+------+----+---------+-------+------+---------+
|     1|    Sona| 80000|  22|Hyderabad|    100|     1|       IT|
|     6|    Raju| 70000|  45|   Mumbai|    102|     6|       HR|
|  NULL|    NULL|  NULL|NULL|     NULL|    103|    13|  Manager|
|     7|  Chinnu| 45000|

In [0]:
# Left semi join: employee rows with a matching department (employee columns only)
left_semi_query = "select * from employee3  left semi join  department1 on employee3.emp_id=department1.emp_id"
spark.sql(left_semi_query).show()

+------+--------+------+---+---------+
|emp_id|emp_name|Salary|age|     city|
+------+--------+------+---+---------+
|     7|  Chinnu| 45000| 25|   Mumbai|
|    10|   Bannu|    12| 27|     Pune|
|     6|    Raju| 70000| 45|   Mumbai|
|     1|    Sona| 80000| 22|Hyderabad|
+------+--------+------+---+---------+



In [0]:
# Left anti join: employee rows with no matching department
left_anti_query = "select * from employee3  left anti join  department1 on employee3.emp_id=department1.emp_id"
spark.sql(left_anti_query).show()

+------+--------+------+---+---------+
|emp_id|emp_name|Salary|age|     city|
+------+--------+------+---+---------+
|     2|   Sunny| 70000| 23|     Pune|
|     3|    Mona| 90000| 19|Hyderabad|
|    18|   Dhana| 89999| 34|  Kolkata|
+------+--------+------+---+---------+



In [0]:
# b) Applying Functions in pandas DF
# 1) transform()

import pandas as pd
def pandas_plus(count):
    """Return ``count`` increased by 10.

    Passed to ``DataFrame.transform`` below, so the result must keep the same
    length as the input.
    """
    shifted = count + 10
    return shifted


psdf = pd.DataFrame(data={'a': [1, 2, 3], 'b': [4, 5, 6]})
print(psdf)

# Add 10 to every element of the frame
psdf.transform(func=pandas_plus)

   a  b
0  1  4
1  2  5
2  3  6


Unnamed: 0,a,b
0,11,14
1,12,15
2,13,16


In [0]:
# 2) apply

def pandas_plus(x):
    """Return only the odd entries of ``x``.

    Unlike a ``transform`` function, a function given to ``apply`` may return
    a result whose length differs from its input.

    NOTE: this shares its name with the helper defined in the transform()
    example and replaces it once this cell has run.
    """
    is_odd = (x % 2) == 1
    return x[is_odd]


psdf = pd.DataFrame(data={'a': [1, 2, 3], 'b': [5, 6, 7]})

psdf.apply(func=pandas_plus)

Unnamed: 0,a,b
0,1,5
2,3,7


In [0]:
# Example

import numpy as np
# Import pandas-on-Spark under its own alias (`ps`) so that `pd` keeps
# referring to plain pandas, as imported in the earlier pandas example.
import pyspark.pandas as ps

technologies = {
    'Fee': [20000, 25000, 30000, 22000, np.nan],  # np.nan: the np.NaN alias was removed in NumPy 2.0
    'Discount': [1000, 2500, 1500, 1200, 3000],
}

psdf = ps.DataFrame(technologies)
print(type(psdf))
print(psdf)


def add(data):
    """Return the sum of the entries of ``data`` labelled 0 and 1.

    With ``DataFrame.apply`` and its default axis, each column is passed in
    turn, so the result holds one value per column: the sum of that column's
    first two rows.
    NOTE(review): if a row-wise Fee + Discount was intended, this needs
    ``axis=1`` and column labels instead — confirm.
    """
    return data[0] + data[1]


addDF = psdf.apply(add)
print(addDF)

<class 'pyspark.pandas.frame.DataFrame'>
       Fee  Discount
0  20000.0      1000
1  25000.0      2500
2  30000.0      1500
3  22000.0      1200
4      NaN      3000
Fee         45000.0
Discount     3500.0
dtype: float64
