In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook (or reuse one that is already
# running) under the application name "question-2".
spark = (
    SparkSession.builder
    .appName("question-2")
    .getOrCreate()
)

In [3]:
# Load the employee and department tables. Both files carry a header row,
# and column types are inferred from the data rather than declared.
df_emp = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("employee.csv")
)
df_dpt = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("dept.csv")
)

In [4]:
# Print a preview of each table: employees first, then departments.
for frame in (df_emp, df_dpt):
    frame.show()

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|
|     2|    Rose|              1|       2010|         20|     M|  4000|
|     3|Williams|              1|       2010|         10|     M|  1000|
|     4|   Jones|              2|       2005|         10|     F|  2000|
|     5|   Brown|              2|       2010|         40|  NULL|    -1|
|     6|   Brown|              2|       2010|         50|  NULL|    -1|
+------+--------+---------------+-----------+-----------+------+------+

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+



In [5]:
# Print the inferred schema of each table: employees first, then departments.
for frame in (df_emp, df_dpt):
    frame.printSchema()

root
 |-- emp_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- superior_emp_id: integer (nullable = true)
 |-- year_joined: integer (nullable = true)
 |-- emp_dept_id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: integer (nullable = true)



In [6]:
# JOINS IN PYSPARK

In [7]:
# INNER JOIN

In [8]:
# Inner join: keep only the employee/department pairs whose ids match.
# With the sample data, the employee in dept 50 and the Sales department
# (dept 30) have no partner and are dropped.
df_emp.join(
    other=df_dpt,
    on=df_emp.emp_dept_id == df_dpt.dept_id,
    how="inner",
).show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     5|   Brown|              2|       2010|         40|  NULL|    -1|       IT|     40|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [18]:
# OUTER JOIN

In [9]:
# Outer join: keep every row from both tables. Rows without a partner on the
# other side (dept 30 with no employee, the employee in dept 50 with no
# department) get NULLs in the missing columns.
df_emp.join(
    other=df_dpt,
    on=df_emp.emp_dept_id == df_dpt.dept_id,
    how="outer",
).show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|  NULL|    NULL|           NULL|       NULL|       NULL|  NULL|  NULL|    Sales|     30|
|     5|   Brown|              2|       2010|         40|  NULL|    -1|       IT|     40|
|     6|   Brown|              2|       2010|         50|  NULL|    -1|     NULL|   NULL|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [17]:
# LEFT JOIN

In [10]:
# Left join: keep every employee. An employee whose emp_dept_id has no
# matching dept_id (dept 50 in the sample data) gets NULL department columns.
df_emp.join(
    other=df_dpt,
    on=df_emp.emp_dept_id == df_dpt.dept_id,
    how="left",
).show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     5|   Brown|              2|       2010|         40|  NULL|    -1|       IT|     40|
|     6|   Brown|              2|       2010|         50|  NULL|    -1|     NULL|   NULL|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [16]:
# RIGHT JOIN

In [11]:
# Right join: keep every department. A department with no employees
# (Sales, dept 30, in the sample data) gets NULL employee columns.
df_emp.join(
    other=df_dpt,
    on=df_emp.emp_dept_id == df_dpt.dept_id,
    how="right",
).show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|  NULL|    NULL|           NULL|       NULL|       NULL|  NULL|  NULL|    Sales|     30|
|     5|   Brown|              2|       2010|         40|  NULL|    -1|       IT|     40|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [19]:
# LEFT SEMI JOIN

In [12]:
# Left semi join: keep the employees that have a matching department, but
# return only the employee columns (no department columns are added).
df_emp.join(
    other=df_dpt,
    on=df_emp.emp_dept_id == df_dpt.dept_id,
    how="leftsemi",
).show()

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|
|     2|    Rose|              1|       2010|         20|     M|  4000|
|     3|Williams|              1|       2010|         10|     M|  1000|
|     4|   Jones|              2|       2005|         10|     F|  2000|
|     5|   Brown|              2|       2010|         40|  NULL|    -1|
+------+--------+---------------+-----------+-----------+------+------+



In [20]:
# LEFT ANTI JOIN

In [13]:
# Left anti join: keep only the employees that have NO matching department
# (the employee in dept 50 in the sample data); only employee columns are
# returned.
df_emp.join(
    other=df_dpt,
    on=df_emp.emp_dept_id == df_dpt.dept_id,
    how="leftanti",
).show()

+------+-----+---------------+-----------+-----------+------+------+
|emp_id| name|superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+-----+---------------+-----------+-----------+------+------+
|     6|Brown|              2|       2010|         50|  NULL|    -1|
+------+-----+---------------+-----------+-----------+------+------+



In [21]:
# FUNCTIONS IN PANDAS DATAFRAME

In [22]:
import pandas as pd

In [23]:
# Sample records keyed by column name; each list holds that column's values
# in row order.
data = {
    "Name": ["Tom", "nick", "krish", "jack"],
    "Age": [20, 21, 19, 18],
}

In [24]:
# creating data frame in pandas

In [27]:
# Build the DataFrame from the column-name -> values mapping; ending the
# cell with the bare name renders the frame as the cell output.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age
0,Tom,20
1,nick,21
2,krish,19
3,jack,18


In [28]:
# SELECTING A PARTICULAR COLUMN

In [29]:
# Pull out the "Name" column; the result is a Series, not a DataFrame.
df.loc[:, "Name"]

0      Tom
1     nick
2    krish
3     jack
Name: Name, dtype: object

In [30]:
# HEAD FUNCTION

In [31]:
# Show up to the first 5 rows; this frame has only 4, so all of it appears.
df.head(n=5)

Unnamed: 0,Name,Age
0,Tom,20
1,nick,21
2,krish,19
3,jack,18


In [32]:
# TAIL FUNCTION

In [33]:
# Show up to the last 5 rows; this frame has only 4, so all of it appears.
df.tail(n=5)

Unnamed: 0,Name,Age
0,Tom,20
1,nick,21
2,krish,19
3,jack,18


In [34]:
# Getting type

In [35]:
# Confirm the Python type of df (a pandas DataFrame, per the cell output).
type(df)

pandas.core.frame.DataFrame

In [37]:
# Display the frame without the row labelled 3. The result is not assigned,
# so df itself still holds all four rows.
df.drop(index=3)

Unnamed: 0,Name,Age
0,Tom,20
1,nick,21
2,krish,19
