In [0]:
%scala
val AddressType = spark.read.format("csv").option("Header","true").option("InferSchema","true").load("dbfs:/FileStore/tables/AddressType_Person_.csv")
val Address = spark.read.format("csv").option("Header","true").option("InferSchema","true").load("dbfs:/FileStore/tables/Address_Person_.csv")
val BusinessEntity = spark.read.format("csv").option("Header","true").option("InferSchema","true").load("dbfs:/FileStore/tables/BusinessEntity__Person_.csv")
val BusinessEntityAddress = spark.read.format("csv").option("Header","true").option("InferSchema","true").load("dbfs:/FileStore/tables/BusinessEntityAddress__Person_.csv")
val CountryRegion = spark.read.format("csv").option("Header","true").option("InferSchema","true").load("dbfs:/FileStore/tables/CountryRegion_Person_.csv")
val CountryRegionCurrency = spark.read.format("csv").option("Header","true").option("InferSchema","true").load("dbfs:/FileStore/tables/CountryRegionCurrency_Sales_.csv")
val Employee = spark.read.format("csv").option("Header","true").option("InferSchema","true").load("dbfs:/FileStore/tables/Employee_HumanResources_.csv")
val EmployeeDepartmentHistory = spark.read.format("csv").option("Header","true").option("InferSchema","true").load("dbfs:/FileStore/tables/EmployeeDepartmentHistory_HumanResources_.csv")
val EmployeePayHistory = spark.read.format("csv").option("Header","true").option("InferSchema","true").load("dbfs:/FileStore/tables/EmployeePayHistory_HumanResources_.csv")
val EmailAddress = spark.read.format("csv").option("Header","true").option("InferSchema","true").load("dbfs:/FileStore/tables/EmailAddress_Person_.csv")
val Person = spark.read.format("csv").option("Header","true").option("InferSchema","true").load("dbfs:/FileStore/tables/Person_Person_.csv")
val StateProvince = spark.read.format("csv").option("Header","true").option("InferSchema","true").load("dbfs:/FileStore/tables/StateProvice_Person_.csv")

In [0]:
%scala
AddressType.createOrReplaceTempView("AT")
Address.createOrReplaceTempView("A")
BusinessEntityAddress.createOrReplaceTempView("BEA")
CountryRegion.createOrReplaceTempView("CR")
CountryRegionCurrency.createOrReplaceTempView("CRC")
BusinessEntity.createOrReplaceTempView("BE")
Employee.createOrReplaceTempView("E")
EmployeeDepartmentHistory.createOrReplaceTempView("EDH")
EmployeePayHistory.createOrReplaceTempView("EPH")
EmailAddress.createOrReplaceTempView("EA")
Person.createOrReplaceTempView("P")
StateProvince.createOrReplaceTempView("SP")

In [0]:
%sql
SELECT * FROM EDH

BusinessEntityID,DepartmentID,ShiftID,StartDate,EndDate,ModifiedDate
1,16,1,2009-01-14T00:00:00.000+0000,,2009-01-13T00:00:00.000+0000
2,1,1,2008-01-31T00:00:00.000+0000,,2008-01-30T00:00:00.000+0000
3,1,1,2007-11-11T00:00:00.000+0000,,2007-11-10T00:00:00.000+0000
4,1,1,2007-12-05T00:00:00.000+0000,2010-05-30T00:00:00.000+0000,2010-05-28T00:00:00.000+0000
4,2,1,2010-05-31T00:00:00.000+0000,,2010-05-30T00:00:00.000+0000
5,1,1,2008-01-06T00:00:00.000+0000,,2008-01-05T00:00:00.000+0000
6,1,1,2008-01-24T00:00:00.000+0000,,2008-01-23T00:00:00.000+0000
7,6,1,2009-02-08T00:00:00.000+0000,,2009-02-07T00:00:00.000+0000
8,6,1,2008-12-29T00:00:00.000+0000,,2008-12-28T00:00:00.000+0000
9,6,1,2009-01-16T00:00:00.000+0000,,2009-01-15T00:00:00.000+0000


In [0]:
%scala
val prefered_Hours_df = spark.sql("""SELECT e.Gender, edh.ShiftID, COUNT(edh.ShiftID) AS preferedhour, RANK() OVER(PARTITION BY e.Gender ORDER BY COUNT(edh.ShiftID) DESC) AS RANKS FROM EDH edh JOIN E e ON edh.BusinessEntityID = e.BusinessEntityID GROUP BY e.Gender, edh.ShiftID""")
prefered_Hours_df.createOrReplaceTempView("preferedHours_v")

In [0]:
%sql
SELECT * FROM preferedHours_v

Gender,ShiftID,preferedhour,RANKS
F,1,59,1
F,3,19,2
F,2,9,3
M,1,123,1
M,2,53,2
M,3,33,3


In [0]:
%sql
--Female, Male Employees Stats
SELECT emp.Gender, COUNT(emp.Gender) AS Total_Employees, 
CASE WHEN phv.ShiftID = 1 THEN "Morning Shift" 
     WHEN phv.ShiftID = 2 THEN "Noon Shift" 
     WHEN phv.ShiftID = 3 THEN "Night Shift" END AS not_prefered_Shift, SUM(emp.VacationHours+emp.SickLeaveHours) AS LeaveHours, ROUND(SUM(emp.VacationHours+emp.SickLeaveHours)/COUNT(emp.Gender),1) AS Avg_LeaveHours, ROUND(SUM(eph.Rate),2) AS PayRate_Increased, ROUND(SUM(eph.Rate)/COUNT(emp.Gender),3) AS Avg_PayRate_Increased FROM E emp
JOIN EPH eph ON emp.BusinessEntityID = eph.BusinessEntityID
JOIN preferedHours_v phv ON phv.Gender = emp.Gender
WHERE phv.RANKS = 3
GROUP BY emp.Gender, phv.ShiftID

Gender,Total_Employees,not_prefered_Shift,LeaveHours,Avg_LeaveHours,PayRate_Increased,Avg_PayRate_Increased
F,88,Noon Shift,8445,96.0,1735.21,19.718
M,228,Night Shift,21116,92.6,3876.57,17.003


Output can only be rendered in Databricks

In [0]:
%sql
--Emails of Employees who's leave hours are highest and their experience
SELECT CONCAT(p.FirstName, ' ', p.LastName) AS Name, ea.EmailAddress, (2016-YEAR(e.BirthDate)) AS Age, (2016-YEAR(e.HireDate)) AS Years_Experienced, ROUND((e.VacationHours+ e.SickLeaveHours)/8) AS Total_Leave_Days FROM E e
JOIN EA ea ON e.BusinessEntityID = ea.BusinessEntityID
JOIN P p ON p.BusinessEntityID = e.BusinessEntityID
WHERE (VacationHours+ SickLeaveHours)/8 > 15
ORDER BY Years_Experienced, Total_Leave_Days DESC

Name,EmailAddress,Age,Years_Experienced,Total_Leave_Days
Danielle Tiedt,danielle0@adventure-works.com,30,6,21.0
Patrick Wedge,patrick0@adventure-works.com,30,6,21.0
Tom Vande Velde,tom0@adventure-works.com,30,6,21.0
Christopher Hill,christopher0@adventure-works.com,30,6,20.0
Jo Berry,jo1@adventure-works.com,62,6,20.0
Kimberly Zimmerman,kimberly0@adventure-works.com,30,6,20.0
John Kane,john4@adventure-works.com,30,6,20.0
Bonnie Kearney,bonnie0@adventure-works.com,30,6,19.0
Lori Penor,lori1@adventure-works.com,46,6,19.0
Stuart Macrae,stuart1@adventure-works.com,45,6,19.0


In [0]:
%sql
--Employees who got their salary rate increased long ago
SELECT CONCAT(p.FirstName,' ', p.LastName) AS NAME, DATE(eph.RateChangeDate) AS Last_rate_increased, ROUND(eph.Rate,2) AS RATE FROM EPH eph
JOIN E e ON eph.BusinessEntityID = e.BusinessEntityID
JOIN P p ON e.BusinessEntityID = p.BusinessEntityID
WHERE (2016-YEAR(eph.RateChangeDate)) >= 8
ORDER BY Last_rate_increased

NAME,Last_rate_increased,RATE
Guy Gilbert,2006-06-30,12.45
Kevin Brown,2007-01-26,13.46
Roberto Tamburello,2007-11-11,43.27
Rob Walters,2007-12-05,8.62
Thierry D'Hers,2007-12-11,25.0
David Bradley,2007-12-20,24.0
JoLynn Dobney,2007-12-26,25.0
Gail Erickson,2008-01-06,32.69
Ruth Ellerbrock,2008-01-06,13.45
Barry Johnson,2008-01-07,13.45


In [0]:
%sql
--departmentIDs of all employees based on their age groups
SELECT CASE WHEN 2016-YEAR(e.BirthDate)<20 THEN "<20"
            WHEN 2016-YEAR(e.BirthDate)>=20 AND 2016-YEAR(e.BirthDate)<30 THEN "20-30"
            WHEN 2016-YEAR(e.BirthDate)>=30 AND 2016-YEAR(e.BirthDate)<40 THEN "30-40" 
            WHEN 2016-YEAR(e.BirthDate)>=40 AND 2016-YEAR(e.BirthDate)<50 THEN "40-50" 
            WHEN 2016-YEAR(e.BirthDate)>=50 AND 2016-YEAR(e.BirthDate)<60 THEN "50-60" 
            WHEN 2016-YEAR(e.BirthDate)>=60 THEN ">60" END AS AGE_GROUP,
            (edh.DepartmentID),COUNT(edh.DepartmentID) AS DepartmentGroup FROM E e
            JOIN EDH edh ON e.BusinessEntityID = edh.BusinessEntityID
            GROUP BY AGE_GROUP,edh.DepartmentID
            ORDER BY AGE_GROUP,edh.DepartmentID

AGE_GROUP,DepartmentID,DepartmentGroup
20-30,2,1
20-30,4,2
20-30,6,1
20-30,7,45
20-30,8,1
20-30,11,2
20-30,12,3
20-30,13,1
30-40,1,1
30-40,2,1


Output can only be rendered in Databricks

In [0]:
%sql
--Hiring trends and how it affected salary rates
SELECT YEAR(e.HireDate) AS Year, COUNT(e.BusinessEntityID) AS no_of_hired, ROUND(SUM(eph.Rate),2) AS rate_change FROM E e
JOIN EPH eph ON e.BusinessEntityID = eph.BusinessEntityID
GROUP BY Year
ORDER BY Year

Year,no_of_hired,rate_change
2006,1,12.45
2007,10,259.17
2008,78,1323.23
2009,164,2767.04
2010,38,574.0
2011,18,464.31
2012,4,117.33
2013,3,94.25


Output can only be rendered in Databricks

In [0]:
%sql
SELECT e.MaritalStatus, SUM(e.VacationHours+e.SickLeaveHours) AS Leave_Hours FROM E e
GROUP BY e.MaritalStatus

MaritalStatus,Leave_Hours
M,13446
S,14371
