# Joins in `pyspark`

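Joins are performed with `df_left.join(df_right, on=join_condition, how=type_str)`, where `type_str` names the join type (e.g. `'inner'`, `'left'`, `'right'`, `'outer'`).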

In [11]:
from pyspark.sql import SparkSession
from more_pyspark import to_pandas
import pyspark.sql.functions as fn
from pyspark.sql.functions import col, isnan

In [1]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (SparkSession
         .builder
         .appName('Ops')
         .getOrCreate())

# Read the department table: the first CSV row is the header and column
# types are inferred from the data.
deptk = spark.read.csv(
    "./data/department.csv",
    header=True,
    inferSchema=True,
)
# Pull the rows to the driver and display them as a pandas table.
deptk.collect() >> to_pandas

Unnamed: 0,DeptID,DeptName
0,31,Sales
1,33,Engineering
2,34,Clerical
3,35,Marketing


In [2]:
# Read the employee table with the same settings as the department table
# (header row present, column types inferred).
emplk = spark.read.csv(
    "./data/employee.csv",
    header=True,
    inferSchema=True,
)
# Pull the rows to the driver and display them as a pandas table.
emplk.collect() >> to_pandas

Unnamed: 0,DeptID,LastName
0,31.0,Rafferty
1,33.0,Jones
2,33.0,Heisenberg
3,34.0,Robinson
4,34.0,Smith
5,,Williams


#### Inner join

In [3]:
# Inner join: keep only employee/department pairs whose DeptID values match.
emplk.join(
    deptk,
    on=emplk.DeptID == deptk.DeptID,
    how='inner',
).collect() >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31,Rafferty,Sales
1,33,Jones,Engineering
2,33,Heisenberg,Engineering
3,34,Robinson,Clerical
4,34,Smith,Clerical


#### Left join

In [4]:
# Left join: keep every employee row, matched to a department where one exists.
emplk.join(
    deptk,
    on=emplk.DeptID == deptk.DeptID,
    how='left',
).collect() >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31.0,Rafferty,Sales
1,33.0,Jones,Engineering
2,33.0,Heisenberg,Engineering
3,34.0,Robinson,Clerical
4,34.0,Smith,Clerical
5,,Williams,


#### Right join

In [5]:
# Right join: keep every department row, matched to employees where any exist.
emplk.join(
    deptk,
    on=emplk.DeptID == deptk.DeptID,
    how='right',
).collect() >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31,Rafferty,Sales
1,33,Heisenberg,Engineering
2,33,Jones,Engineering
3,34,Smith,Clerical
4,34,Robinson,Clerical
5,35,,Marketing


#### Outer join

In [6]:
# Outer join: keep all rows from both tables, matched on DeptID where possible.
emplk.join(
    deptk,
    on=emplk.DeptID == deptk.DeptID,
    how='outer',
).collect() >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,,Williams,
1,35.0,,Marketing
2,34.0,Robinson,Clerical
3,34.0,Smith,Clerical
4,31.0,Rafferty,Sales
5,33.0,Jones,Engineering
6,33.0,Heisenberg,Engineering


## <font color="red"> Exercise 2 </font>

Determine all the players that have hit more than 50 home runs in a season. The final table should include each player's proper name, as well as the team name.

**Hint:** You will need to join the files listed below.  To get credit for this exercise, use the `pyspark` join methods presented above.

In [8]:
# CSV inputs for the exercise, in the order they are unpacked later:
# batting statistics, player details, team details.
files = (
    "./data/baseball/core/Batting.csv",
    "./data/baseball/core/People.csv",
    "./data/baseball/core/Teams.csv",
)

In [9]:
# Load each CSV in `files` as a DataFrame (header row present, column types
# inferred) and bind them, in order, to batting / people / teams.
batting, people, teams = (
    spark.read.csv(csv_path, header=True, inferSchema=True)
    for csv_path in files
)

In [19]:
# Keep only the columns needed to total home runs.
hr_rows = batting.select([batting.playerID, batting.teamID, batting.yearID, batting.HR])

# Sum HR for each (player, season, team) combination.
hr_totals = (hr_rows
             .groupby(batting.playerID, batting.yearID, batting.teamID)
             .agg(fn.sum('HR').alias('yearHRS')))

# Retain the totals that reach the 50 home-run mark.
# NOTE(review): the exercise text says "more than 50", but this filter also
# keeps totals of exactly 50 — confirm which threshold is intended.
batting_select = hr_totals.where(fn.col('yearHRS') >= 50)

In [23]:
#batting_select.collect() >> to_pandas

In [24]:
# Player lookup table: the join key plus first and last name.
people_select = people.select(
    people.playerID,
    people.nameFirst,
    people.nameLast,
)

In [25]:
# Team lookup table: team key, team name, and the season the row refers to.
team_select = teams.select(
    teams.teamID,
    teams.name,
    teams.yearID,
)

In [26]:
# Attach each player's name and the team's name to the rows in `batting_select`.
#
# The joins are keyed on column *names* rather than on equality expressions.
# A name-based join keeps a single copy of each key column, so the result has
# no duplicate `playerID` / `teamID` / `yearID` columns to disambiguate, and
# the second join does not have to refer back to `batting_select` columns
# after an intermediate `drop`.
players_with_50ormore_HRs = (batting_select
                             # Left join: keep every qualifying batting row even if no name is found.
                             .join(people_select, on='playerID', how='left')
                             .drop('playerID')
                             # `team_select` carries a `yearID`, so match on both team and season.
                             .join(team_select, on=['teamID', 'yearID'], how='left')
                             .drop('teamID')
                             .collect()
                            ) >> to_pandas
# Preview the first rows of the resulting table.
players_with_50ormore_HRs.head()

Unnamed: 0,yearID,yearHRS,nameFirst,nameLast,name
0,1955,51,Willie,Mays,New York Giants
1,1965,52,Willie,Mays,San Francisco Giants
2,2017,52,Aaron,Judge,New York Yankees
3,1999,65,Mark,McGwire,St. Louis Cardinals
4,2010,54,Jose,Bautista,Toronto Blue Jays


In [27]:
# Number of rows in the final table (assumes a pandas DataFrame, as the
# earlier `.head()` call suggests).
players_with_50ormore_HRs.shape[0]

44

## Up Next

Stuff