In [1]:
sc

In [2]:
spark

In [3]:
sc.stop()

In [4]:
spark.stop()

In [5]:
from pyspark import SparkConf, SparkContext
# setMaster sets spark ContextManager which is loca[cpu cores]
config = SparkConf().setMaster('local[2]').setAppName("ETL Pipeline")
sc = SparkContext(conf=config)

In [6]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('ETL Pipeline').getOrCreate()

In [7]:
spark

In [12]:
hremployeeDF = spark.read.format('jdbc')\
.option('url', 'jdbc:mysql://localhost:3306/hremployeedb')\
.option('dbtable', 'HR_Employee').option('user', 'root').option('password', 'hadoop@123')\
.option('driver', 'com.mysql.cj.jdbc.Driver').load()

In [13]:
hremployeeDF

DataFrame[EmployeeID: int, Department: string, JobRole: string, Attrition: string, Gender: string, Age: int, MaritalStatus: string, Education: string, EducationField: string, BusinessTravel: string, JobInvolvement: string, JobLevel: int, JobSatisfaction: string, Hourlyrate: int, Income: int, Salaryhike: int, OverTime: string, Workex: int, YearsSinceLastPromotion: int, EmpSatisfaction: string, TrainingTimesLastYear: int, WorkLifeBalance: string, Performance_Rating: string]

In [14]:
# show physical plan of execution, which is nkown as DAG
hremployeeDF.explain()

== Physical Plan ==
*(1) Scan JDBCRelation(HR_Employee) [numPartitions=1] [EmployeeID#0,Department#1,JobRole#2,Attrition#3,Gender#4,Age#5,MaritalStatus#6,Education#7,EducationField#8,BusinessTravel#9,JobInvolvement#10,JobLevel#11,JobSatisfaction#12,Hourlyrate#13,Income#14,Salaryhike#15,OverTime#16,Workex#17,YearsSinceLastPromotion#18,EmpSatisfaction#19,TrainingTimesLastYear#20,WorkLifeBalance#21,Performance_Rating#22] PushedFilters: [], ReadSchema: struct<EmployeeID:int,Department:string,JobRole:string,Attrition:string,Gender:string,Age:int,Mar...


### Materialized view of Table

In [16]:
hremployeeDF.createOrReplaceTempView('hremployee')

### Q1: Display shape of hremployee table
#### show number of rows and number of columns

In [43]:
#spark.sql("""
#    SELECT * FROM (
#        SELECT COUNT(*) AS rows FROM hremployee
#        SELECT COUNT(*) FROM INFORMATION_SCHEMA.COLUMNS WHERE type='table'
#    );
#""")

rows = spark.sql("""
    SELECT COUNT(*) AS rows FROM hremployee
""").show()

num_of_cols = len(hremployeeDF.columns)


print(rows)
print(num_of_cols)

+----+
|rows|
+----+
|1469|
+----+

None
23


In [38]:
spark.sql(f"""
    SELECT COUNT(*) AS row_count, {num_of_cols} AS column_count FROM hremployee
""").show()

+---------+------------+
|row_count|column_count|
+---------+------------+
|     1469|          23|
+---------+------------+



In [47]:
spark.sql('DESCRIBE hremployee').show(100)

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|          EmployeeID|      int|   null|
|          Department|   string|   null|
|             JobRole|   string|   null|
|           Attrition|   string|   null|
|              Gender|   string|   null|
|                 Age|      int|   null|
|       MaritalStatus|   string|   null|
|           Education|   string|   null|
|      EducationField|   string|   null|
|      BusinessTravel|   string|   null|
|      JobInvolvement|   string|   null|
|            JobLevel|      int|   null|
|     JobSatisfaction|   string|   null|
|          Hourlyrate|      int|   null|
|              Income|      int|   null|
|          Salaryhike|      int|   null|
|            OverTime|   string|   null|
|              Workex|      int|   null|
|YearsSinceLastPro...|      int|   null|
|     EmpSatisfaction|   string|   null|
|TrainingTimesLast...|      int|   null|
|     WorkLifeBa

### Q2: Write a query to show first 3 employee from each jobrole to join the company

In [72]:
spark.sql("""
    SELECT EmployeeID, JobRole, ranks AS Seniority_in_Department
    FROM (
        SELECT EmployeeID, JobRole,
        DENSE_RANK() OVER(PARTITION BY JobRole ORDER BY EmployeeID) as ranks
        FROM hremployee
    ) AS _
    WHERE ranks<=3
""").show(100)

+----------+--------------------+-----------------------+
|EmployeeID|             JobRole|Seniority_in_Department|
+----------+--------------------+-----------------------+
|         1|     Sales Executive|                      1|
|        28|     Sales Executive|                      2|
|        40|     Sales Executive|                      3|
|         9|Manufacturing Dir...|                      1|
|        16|Manufacturing Dir...|                      2|
|        21|Manufacturing Dir...|                      3|
|         3|Laboratory Techni...|                      1|
|         5|Laboratory Techni...|                      2|
|         6|Laboratory Techni...|                      3|
|        22|Sales Representative|                      1|
|        34|Sales Representative|                      2|
|        37|Sales Representative|                      3|
|        10|Healthcare Repres...|                      1|
|        29|Healthcare Repres...|                      2|
|        32|He

### Q3: Write a query to show top3 employees from each job role earning highest salary

In [78]:
spark.sql("""
    SELECT *
    FROM (
        SELECT EmployeeID, JobRole, Income,
        DENSE_RANK() OVER(PARTITION BY JobRole ORDER BY Income DESC) as ranks
        FROM hremployee
    ) AS _
    WHERE ranks<=3
""").show(30)

+----------+--------------------+------+-----+
|EmployeeID|             JobRole|Income|ranks|
+----------+--------------------+------+-----+
|        99|     Sales Executive| 13872|    1|
|       545|     Sales Executive| 13770|    2|
|       839|     Sales Executive| 13758|    3|
|       722|Manufacturing Dir...| 13973|    1|
|       628|Manufacturing Dir...| 13826|    2|
|       744|Manufacturing Dir...| 13726|    3|
|       678|Laboratory Techni...|  7403|    1|
|       817|Laboratory Techni...|  6782|    2|
|       945|Laboratory Techni...|  6674|    3|
|       565|Sales Representative|  6632|    1|
|      1308|Sales Representative|  5405|    2|
|      1220|Sales Representative|  4502|    3|
|      1181|Healthcare Repres...| 13966|    1|
|       317|Healthcare Repres...| 13964|    2|
|       190|Healthcare Repres...| 13734|    3|
|        68|  Research Scientist|  9724|    1|
|      1315|  Research Scientist|  6962|    2|
|      1305|  Research Scientist|  6854|    3|
|       191| 

### Q4: Show top 3 highest package from overall Job role

In [85]:
spark.sql("""
    SELECT EmployeeID, JobRole, Income
    FROM hremployee
    ORDER BY Income DESC
""").show(3)

+----------+-----------------+------+
|EmployeeID|          JobRole|Income|
+----------+-----------------+------+
|       191|          Manager| 19999|
|       747|Research Director| 19973|
|       852|          Manager| 19943|
+----------+-----------------+------+
only showing top 3 rows



### Q5: Write a Spark SQL query to sho employees in ascending order wrt employee income compared to previous income for each job role

In [104]:
spark.sql("""
    SELECT EmployeeID, JobRole, Income, (Income-lagged_income) AS diff
    FROM (
        SELECT EmployeeID, JobRole, Income,
        LAG(Income, 1) OVER(PARTITION BY JobRole ORDER BY JobRole) as lagged_income
        FROM hremployee
        WHERE Income IS NOT NULL
    ) AS _
    ORDER BY JobRole,diff
""").show(2500)

+----------+--------------------+------+-----+
|EmployeeID|             JobRole|Income| diff|
+----------+--------------------+------+-----+
|        10|Healthcare Repres...|  5237| null|
|       285|Healthcare Repres...|  4741|-8755|
|      1183|Healthcare Repres...|  6842|-7124|
|      1157|Healthcare Repres...|  4148|-7097|
|       205|Healthcare Repres...|  6673|-7061|
|       677|Healthcare Repres...|  4014|-6538|
|       397|Healthcare Repres...|  4522|-6443|
|       833|Healthcare Repres...|  5731|-6438|
|      1065|Healthcare Repres...|  4035|-6431|
|       745|Healthcare Repres...|  4777|-6222|
|       736|Healthcare Repres...|  4240|-6148|
|      1098|Healthcare Repres...|  4069|-6055|
|        89|Healthcare Repres...|  4152|-5944|
|       489|Healthcare Repres...|  4089|-5735|
|       929|Healthcare Repres...|  7978|-5599|
|       105|Healthcare Repres...|  5163|-5510|
|       267|Healthcare Repres...|  5582|-5356|
|      1231|Healthcare Repres...|  5562|-5186|
|       555|H

In [108]:
# a variation of above query, not same but for comparison
spark.sql("""
    SELECT *, (Income-prev_income) AS diff
    FROM (
        SELECT EmployeeID, JobRole, Income,
        LAG(Income, 1) OVER(PARTITION BY JobRole ORDER BY JobRole) as prev_income
        FROM hremployee
        WHERE Income IS NOT NULL
    ) AS _
""").show()

+----------+---------------+------+-----------+-----+
|EmployeeID|        JobRole|Income|prev_income| diff|
+----------+---------------+------+-----------+-----+
|         1|Sales Executive|  5993|       null| null|
|        28|Sales Executive|  6825|       5993|  832|
|        40|Sales Executive|  5376|       6825|-1449|
|        44|Sales Executive|  8726|       5376| 3350|
|        47|Sales Executive|  4568|       8726|-4158|
|        49|Sales Executive|  5772|       4568| 1204|
|        53|Sales Executive|  5454|       5772| -318|
|        55|Sales Executive|  4157|       5454|-1297|
|        57|Sales Executive|  9069|       4157| 4912|
|        64|Sales Executive|  7637|       9069|-1432|
|        71|Sales Executive|  5473|       7637|-2164|
|        77|Sales Executive|  4312|       5473|-1161|
|        83|Sales Executive| 10239|       4312| 5927|
|        90|Sales Executive|  9619|      10239| -620|
|        92|Sales Executive|  5441|       9619|-4178|
|        93|Sales Executive|

### Q6: 

In [112]:
spark.sql("""
    SELECT employeeID, department, JobRole, age, gender, income,
    LEAD(Income, 2, 0) OVER (ORDER BY EmployeeID) as next_income
    FROM hremployee
""").show()

+----------+--------------------+--------------------+---+------+------+-----------+
|employeeID|          department|             JobRole|age|gender|income|next_income|
+----------+--------------------+--------------------+---+------+------+-----------+
|         1|               Sales|     Sales Executive| 41|Female|  5993|       2090|
|         2|Research & Develo...|  Research Scientist| 49|  Male|  5130|       2909|
|         3|Research & Develo...|Laboratory Techni...| 37|  Male|  2090|       3468|
|         4|Research & Develo...|  Research Scientist| 33|Female|  2909|       3068|
|         5|Research & Develo...|Laboratory Techni...| 27|  Male|  3468|       2670|
|         6|Research & Develo...|Laboratory Techni...| 32|  Male|  3068|       2693|
|         7|Research & Develo...|Laboratory Techni...| 59|Female|  2670|       9526|
|         8|Research & Develo...|Laboratory Techni...| 30|  Male|  2693|       5237|
|         9|Research & Develo...|Manufacturing Dir...| 38|  Male|

### Q7: NTILE() - dividing records into percentiles

In [114]:
spark.sql("""
    SELECT employeeID, department, JobRole, age, gender, income,
    NTILE(4) OVER (ORDER BY INCOME) as salary_quartiles
    FROM hremployee
""").show()

+----------+--------------------+--------------------+---+------+------+----------------+
|employeeID|          department|             JobRole|age|gender|income|salary_quartiles|
+----------+--------------------+--------------------+---+------+------+----------------+
|       514|Research & Develo...|  Research Scientist| 20|  Male|  1009|               1|
|       728|Research & Develo...|  Research Scientist| 18|  Male|  1051|               1|
|       765|               Sales|Sales Representative| 28|  Male|  1052|               1|
|      1338|               Sales|Sales Representative| 30|  Male|  1081|               1|
|      1365|               Sales|Sales Representative| 29|  Male|  1091|               1|
|       178|Research & Develo...|Laboratory Techni...| 19|  Male|  1102|               1|
|       912|               Sales|Sales Representative| 25|  Male|  1118|               1|
|      1402|Research & Develo...|Laboratory Techni...| 31|Female|  1129|               1|
|       30

### Q8: Find number of employees in each percentile group 0-25th, 25th-50th, 50-75, and 75to100th [order by income and partition by department] using PERCENT_RANK and create a new category using CASE-WHEN 

In [146]:
spark.sql("""
    SELECT 
    CASE 
        WHEN percentile<=0.25 THEN "0.00-0.25"
        WHEN percentile<=0.50 THEN "0.25-0.50"
        WHEN percentile<=0.75 THEN "0.50-0.75"
        ELSE "0.75-1.00"
    END AS category,
    COUNT(*) AS FREQUENCY
    
    FROM (
        SELECT Income,
        PERCENT_RANK() OVER(ORDER BY Income) AS percentile
        FROM hremployee
    ) AS _
    GROUP BY category
    ORDER BY category
""").show()

+---------+---------+
| category|FREQUENCY|
+---------+---------+
|0.00-0.25|      368|
|0.25-0.50|      367|
|0.50-0.75|      367|
|0.75-1.00|      367|
+---------+---------+



## HIVE Integration with PySpark

In [147]:
spark.stop()

In [148]:

!jps

18032 Jps
27057 NodeManager
26306 NameNode
26500 DataNode
13591 SparkSubmit
26732 SecondaryNameNode
26893 ResourceManager


In [73]:
# spark integration with Hive [hadoop services must be running]
# config for hive integration: property-name "spark.sql.warehouse.dir", value "/user/hive/warehouse"

spark = (SparkSession.builder.appName('pyspark-hive-Integration')
        .config('spark.sql.warehouse.dir', '/user/hive/warehouse')
        .enableHiveSupport().getOrCreate())

In [74]:
spark

In [3]:
spark.sql('show databases').show()

+------------+
|databaseName|
+------------+
|     default|
+------------+



In [4]:
spark.sql("""
    CREATE database IF NOT EXISTS airlines
""")

DataFrame[]

In [6]:
spark.sql("""
    use airlines
""")

DataFrame[]

In [8]:
spark.sql('show tables').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [11]:
spark.sql("""
    CREATE TABLE IF NOT EXISTS flights (
        DayofMonth int,
        DayofWeek int,
        Carrier VARCHAR(10),
        OriginAirportID int,
        DestAirportID int,
        DepDelay int,
        ArrDelay int
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
    STORED AS TEXTFILE
    TBLPROPERTIES('skip.header.line.count'='1')
""")

DataFrame[]

In [12]:
spark.sql('show tables').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|airlines|  flights|      false|
+--------+---------+-----------+



In [22]:
spark.sql("""load data local inpath '/home/hadoop/Downloads/raw_flight_data_1.csv'
overwrite into table flights""")

DataFrame[]

In [23]:
spark.sql("""
    CREATE TABLE IF NOT EXISTS airports (
        airport_id int, 
        city VARCHAR(50), 
        state VARCHAR(50), 
        name VARCHAR(50)
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
    STORED AS TEXTFILE
    TBLPROPERTIES('skip.header.line.count'='1')
""")

DataFrame[]

In [24]:
spark.sql("""load data local inpath '/home/hadoop/Downloads/airports_1.csv'
overwrite into table airports""")

DataFrame[]

In [25]:
spark.sql('show tables').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|airlines| airports|      false|
|airlines|  flights|      false|
+--------+---------+-----------+



In [26]:
spark.sql("""
    SELECT * FROM airports
""").show()

+----------+-----------+-----+--------------------+
|airport_id|       city|state|                name|
+----------+-----------+-----+--------------------+
|     10165|Adak Island|   AK|                Adak|
|     10299|  Anchorage|   AK|Ted Stevens Ancho...|
|     10304|      Aniak|   AK|       Aniak Airport|
|     10754|     Barrow|   AK|Wiley Post/Will R...|
|     10551|     Bethel|   AK|      Bethel Airport|
|     10926|    Cordova|   AK|Merle K Mudhole S...|
|     14709|  Deadhorse|   AK|   Deadhorse Airport|
|     11336| Dillingham|   AK|  Dillingham Airport|
|     11630|  Fairbanks|   AK|Fairbanks Interna...|
|     11997|   Gustavus|   AK|    Gustavus Airport|
|     12523|     Juneau|   AK|Juneau International|
|     12819|  Ketchikan|   AK|Ketchikan Interna...|
|     10245|King Salmon|   AK| King Salmon Airport|
|     10170|     Kodiak|   AK|      Kodiak Airport|
|     13970|   Kotzebue|   AK| Ralph Wien Memorial|
|     13873|       Nome|   AK|        Nome Airport|
|     14256|

In [27]:
spark.sql("""
    SELECT * FROM flights
""").show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayofWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          11193|        12892|      -6|     -11|
|        19|        5|     DL|          10397|        15016|      -1|     -19|
|        19|        5|     DL|          15016|        10397|       0|      -1|
|        19|        5|     DL|          10397|        14869|      15|      24|
|        19|        5|     DL|          10397|        10423|      33|      34|
|        19|        5|     DL|          11278|      

### STEP 1 - Extract

In [28]:
flights_df = spark.table('airlines.flights')
airports_df = spark.table('airlines.airports')

In [29]:
airports_df.show(2)

+----------+-----------+-----+--------------------+
|airport_id|       city|state|                name|
+----------+-----------+-----+--------------------+
|     10165|Adak Island|   AK|                Adak|
|     10299|  Anchorage|   AK|Ted Stevens Ancho...|
+----------+-----------+-----+--------------------+
only showing top 2 rows



In [30]:
flights_df.show(2)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayofWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 2 rows



In [31]:
### STEP 2 - Transform

In [33]:
flights_join = flights_df.join(airports_df, 
                               on=(airports_df.airport_id==flights_df.OriginAirportID), 
                               how='inner')

In [35]:
flights_join.show()

+----------+---------+-------+---------------+-------------+--------+--------+----------+-----------------+-----+--------------------+
|DayofMonth|DayofWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|airport_id|             city|state|                name|
+----------+---------+-------+---------------+-------------+--------+--------+----------+-----------------+-----+--------------------+
|        19|        5|     DL|          11433|        13303|      -3|       1|     11433|          Detroit|   MI|Detroit Metro Way...|
|        19|        5|     DL|          14869|        12478|       0|      -8|     14869|   Salt Lake City|   UT|Salt Lake City In...|
|        19|        5|     DL|          14057|        14869|      -4|     -15|     14057|         Portland|   OR|Portland Internat...|
|        19|        5|     DL|          15016|        11433|      28|      24|     15016|        St. Louis|   MO|Lambert-St. Louis...|
|        19|        5|     DL|          11193|        1

In [37]:
### STEP 3 - Load

In [42]:
flights_join = flights_join.repartition(4)

In [43]:
flights_join.write.parquet('file:///home/hadoop/Downloads/flights')

In [45]:
# read from parquet
flights_parquet_df = spark.read.parquet('file:///home/hadoop/Downloads/flights/')

In [46]:
# write to HDFS

flights_join.write.parquet('/flights1')

In [47]:
# partitioning data 
flights_join.write.partitionBy('Carrier').parquet('/airlines')

#### 04-09-2024

In [48]:
flights_join

DataFrame[DayofMonth: int, DayofWeek: int, Carrier: string, OriginAirportID: int, DestAirportID: int, DepDelay: int, ArrDelay: int, airport_id: int, city: string, state: string, name: string]

In [53]:
flights_join.write.bucketBy(col='state', numBuckets=50).format('csv').saveAsTable('bucketed_table')

In [54]:
flights_join.write.partitionBy('Carrier').bucketBy(col='state', numBuckets=30).format('parquet').saveAsTable('partition_bucket_table')

In [57]:
spark.sql("""
    SELECT carrier, COUNT(*)
    FROM partition_bucket_table
    GROUP BY carrier
""").show()

+-------+--------+
|carrier|count(1)|
+-------+--------+
|     UA|  122443|
|     AA|  124037|
|     EV|   46563|
|     B6|   51381|
|     DL|  134724|
|     OO|   69785|
|     F9|    9811|
|     YV|   14612|
|     US|  100668|
|     MQ|   45926|
|     HA|    4962|
|     AS|   28796|
|     FL|   28053|
|     VX|   14683|
|     WN|  216101|
|     9E|   36030|
+-------+--------+



#### Load on MySQL

In [77]:
connection_properties = {
    'user': 'root',
    'password': 'hadoop@123',
    'driver': 'com.mysql.cj.jdbc.Driver'
}

flights_join.write.jdbc(url="jdbc:mysql://localhost:3306/flights", table='airlines', mode='overwrite', properties=connection_properties)

Py4JJavaError: An error occurred while calling o1356.jdbc.
: org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:226)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:146)
	at org.apache.spark.sql.execution.InputAdapter.doExecuteBroadcast(WholeStageCodegenExec.scala:387)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:149)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:145)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.executeBroadcast(SparkPlan.scala:145)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareBroadcast(BroadcastHashJoinExec.scala:117)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.codegenInner(BroadcastHashJoinExec.scala:211)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doConsume(BroadcastHashJoinExec.scala:101)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:189)
	at org.apache.spark.sql.execution.FilterExec.consume(basicPhysicalOperators.scala:87)
	at org.apache.spark.sql.execution.FilterExec.doConsume(basicPhysicalOperators.scala:212)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:189)
	at org.apache.spark.sql.execution.InputAdapter.consume(WholeStageCodegenExec.scala:374)
	at org.apache.spark.sql.execution.InputAdapter.doProduce(WholeStageCodegenExec.scala:403)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.InputAdapter.produce(WholeStageCodegenExec.scala:374)
	at org.apache.spark.sql.execution.FilterExec.doProduce(basicPhysicalOperators.scala:127)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.FilterExec.produce(basicPhysicalOperators.scala:87)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:96)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:544)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:598)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:136)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.prepareShuffleDependency(ShuffleExchangeExec.scala:92)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:128)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:136)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.DeserializeToObjectExec.doExecute(objects.scala:89)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:136)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:81)
	at org.apache.spark.sql.Dataset.rdd$lzycompute(Dataset.scala:3043)
	at org.apache.spark.sql.Dataset.rdd(Dataset.scala:3041)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.saveTable(JdbcUtils.scala:838)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:63)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:136)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:81)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:696)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:696)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:696)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:305)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:291)
	at org.apache.spark.sql.DataFrameWriter.jdbc(DataFrameWriter.scala:535)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.lang.reflect.Constructor.newInstance(Constructor.java:423)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.GatewayConnection.run(GatewayConnection.java:238)
java.lang.Thread.run(Thread.java:745)

The currently active SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.lang.reflect.Constructor.newInstance(Constructor.java:423)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.GatewayConnection.run(GatewayConnection.java:238)
java.lang.Thread.run(Thread.java:745)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:100)
	at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1486)
	at org.apache.spark.sql.hive.HadoopTableReader.<init>(TableReader.scala:89)
	at org.apache.spark.sql.hive.execution.HiveTableScanExec.org$apache$spark$sql$hive$execution$HiveTableScanExec$$hadoopReader$lzycompute(HiveTableScanExec.scala:105)
	at org.apache.spark.sql.hive.execution.HiveTableScanExec.org$apache$spark$sql$hive$execution$HiveTableScanExec$$hadoopReader(HiveTableScanExec.scala:105)
	at org.apache.spark.sql.hive.execution.HiveTableScanExec$$anonfun$10.apply(HiveTableScanExec.scala:188)
	at org.apache.spark.sql.hive.execution.HiveTableScanExec$$anonfun$10.apply(HiveTableScanExec.scala:188)
	at org.apache.spark.util.Utils$.withDummyCallSite(Utils.scala:2470)
	at org.apache.spark.sql.hive.execution.HiveTableScanExec.doExecute(HiveTableScanExec.scala:187)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:136)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:391)
	at org.apache.spark.sql.execution.FilterExec.inputRDDs(basicPhysicalOperators.scala:123)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:627)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:136)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:160)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:157)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:132)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:252)
	at org.apache.spark.sql.execution.SparkPlan.executeCollectIterator(SparkPlan.scala:311)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anonfun$relationFuture$1$$anonfun$apply$1.apply(BroadcastExchangeExec.scala:79)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anonfun$relationFuture$1$$anonfun$apply$1.apply(BroadcastExchangeExec.scala:76)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withExecutionId$1.apply(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withExecutionId(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anonfun$relationFuture$1.apply(BroadcastExchangeExec.scala:75)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anonfun$relationFuture$1.apply(BroadcastExchangeExec.scala:75)
	at scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)
	at scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [78]:
spark.stop()

In [79]:
sc.stop()

In [80]:
!jps

27057 NodeManager
26306 NameNode
26500 DataNode
18278 SparkSubmit
26732 SecondaryNameNode
26893 ResourceManager
28878 Jps
