# pyspark installation

In [None]:
!pip install pyspark py4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 KB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark


 # Create spark session with hive enabled

In [None]:
from os.path import abspath

from pyspark.sql import SparkSession

# Directory (made absolute from the current working directory) that holds
# the data of managed databases and tables.
warehouse_location = abspath('hive-warehouse')

# Build (or reuse) a local Spark session with Hive support enabled.
spark = (
    SparkSession.builder
    .master('local')
    .appName('demo')
    .config('spark.sql.warehouse.dir', warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)

# Keep the session as the cell's last expression so the notebook displays it.
spark


# show databases

In [None]:
# List the databases currently known to the catalog.
(
    spark.sql('show databases')
    .show()
)

# creating new database

In [None]:
# Create the demo database; "if not exists" keeps the cell safe to re-run.
spark.sql(
    'create database if not exists test3'
).show()

# creating table using specified file format:

In [None]:
# Create a table in test3 that uses the CSV file format.
spark.sql(
    'create table if not exists test3.Employee_csv '
    '(Id Int ,Name string ,loc string) using csv'
)

In [None]:
# Print the table's DDL; truncate=False shows the whole statement.
spark.sql(
    'show create table test3.Employee_csv '
).show(truncate=False)

In [None]:
# Append a single row to the CSV-backed table.
spark.sql(
    "insert into test3.employee_csv values(1,'Satish','hyd')"
)

In [8]:
ls /content/hive-warehouse/test3.db/employee_csv/

part-00000-26fb33ac-f84a-43c2-aa0b-5f5c4727a18e-c000.csv  _SUCCESS


In [9]:
# Read back every row of the table.
(
    spark.sql("select * from test3.employee_csv ")
    .show()
)

+---+------+---+
| Id|  Name|loc|
+---+------+---+
|  1|Satish|hyd|
+---+------+---+



# show tables in database

In [10]:
# List the tables registered in the test3 database.
spark.sql(
    'show tables in test3'
).show()

+---------+------------+-----------+
|namespace|   tableName|isTemporary|
+---------+------------+-----------+
|    test3|employee_csv|      false|
+---------+------------+-----------+



# describe database

In [11]:
# Show the database metadata (name, comment, location, owner) without truncation.
(
    spark.sql("describe database test3")
    .show(truncate=False)
)

+--------------+-------------------------------------+
|info_name     |info_value                           |
+--------------+-------------------------------------+
|Namespace Name|test3                                |
|Comment       |                                     |
|Location      |file:/content/hive-warehouse/test3.db|
|Owner         |root                                 |
+--------------+-------------------------------------+



# create and describe database in specified location

In [12]:
# Create a database whose files live at an explicitly chosen location.
spark.sql(
    "create database if not exists sandeep "
    "location '/content/ext_db'"
)

DataFrame[]

In [13]:
# Show the database metadata; the Location row reflects the custom path.
(
    spark.sql("describe database sandeep")
    .show(truncate=False)
)

+--------------+--------------------+
|info_name     |info_value          |
+--------------+--------------------+
|Namespace Name|sandeep             |
|Comment       |                    |
|Location      |file:/content/ext_db|
|Owner         |root                |
+--------------+--------------------+



# internal table 

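Internal (managed) table: the data is stored in the default warehouse location (in Hive, `/user/hive/warehouse/<db>.db/<table_name>`; in this notebook, under the `hive-warehouse` directory set via `spark.sql.warehouse.dir`).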

In [14]:


# Create a table with no location clause (an internal / managed table).
spark.sql(
    "create table if not exists student_int"
    "(id int ,Name string ,rollno int) "
)

DataFrame[]

In [15]:
# rollno is an int column, so the leading zero of 0382 is not kept (stored as 382).
spark.sql(
    "insert into student_int values(1,'sandeep',0382)"
)

DataFrame[]

In [16]:
# rollno is an int column, so the leading zero of 0561 is not kept (stored as 561).
spark.sql(
    "insert into table student_int values(2,'satish',0561)"
)

DataFrame[]

In [17]:
# Display all rows of the internal table.
(
    spark.sql("select * from student_int")
    .show(truncate=False)
)

+---+-------+------+
|id |Name   |rollno|
+---+-------+------+
|2  |satish |561   |
|1  |sandeep|382   |
+---+-------+------+



In [18]:
# Print the DDL of the internal table.
spark.sql(
    "show create table student_int"
).show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------+
|createtab_stmt                                                                                                                                        |
+------------------------------------------------------------------------------------------------------------------------------------------------------+
|CREATE TABLE default.student_int (\n  id INT,\n  Name STRING,\n  rollno INT)\nUSING text\nTBLPROPERTIES (\n  'transient_lastDdlTime' = '1678549352')\n|
+------------------------------------------------------------------------------------------------------------------------------------------------------+



# External table creation


1. An external table can be created by specifying a `location`.
2. An external table can also be created with the `external` keyword together with a `location`.



In [19]:
# Giving a location makes this an external table.
# "if not exists" matches the other create-table cells and lets this cell be
# re-run without failing when the table is already registered.
spark.sql(
    "create table if not exists student_ext(id int ,name string ,rollno int) "
    "location '/content/External/student_ext'"
)

DataFrame[]

In [20]:
# List the tables in the current database.
(
    spark.sql("show tables")
    .show()
)

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|  default|student_ext|      false|
|  default|student_int|      false|
+---------+-----------+-----------+



In [21]:
# Show the column names, types and comments of the external table.
spark.sql(
    "describe table default.student_ext "
).show(truncate=False)

+--------+---------+-------+
|col_name|data_type|comment|
+--------+---------+-------+
|id      |int      |null   |
|name    |string   |null   |
|rollno  |int      |null   |
+--------+---------+-------+



In [22]:
# Append the first row; the int column stores 0382 as 382.
spark.sql(
    "insert into table student_ext values(1,'sandeep',0382)"
)

DataFrame[]

In [23]:
# Append the second row; the int column stores 0561 as 561.
spark.sql(
    "insert into table student_ext values(2,'satish',0561)"
)

DataFrame[]

In [24]:
# Display all rows of the external table.
(
    spark.sql("select * from student_ext")
    .show(truncate=False)
)

+---+-------+------+
|id |name   |rollno|
+---+-------+------+
|1  |sandeep|382   |
|2  |satish |561   |
+---+-------+------+



In [25]:
# Print the DDL of the external table (it includes the LOCATION clause).
spark.sql(
    'show create table student_ext'
).show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|createtab_stmt                                                                                                                                                                                       |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|CREATE TABLE default.student_ext (\n  id INT,\n  name STRING,\n  rollno INT)\nUSING text\nLOCATION 'file:/content/External/student_ext'\nTBLPROPERTIES (\n  'transient_lastDdlTime' = '1678549355')\n|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


In [26]:
# Create an external table using the explicit "external" keyword plus a location.
spark.sql(
    "create external table if not exists family_ext "
    "(sno int ,name string ,age int ) "
    "location '/content/External/family_ext'"
)

DataFrame[]

In [27]:
# List the tables in the current database, now including family_ext.
(
    spark.sql("show tables")
    .show()
)

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|  default| family_ext|      false|
|  default|student_ext|      false|
|  default|student_int|      false|
+---------+-----------+-----------+



In [28]:
# Append one row to the external table.
spark.sql(
    "insert into table family_ext values(1,'Krishna',60)"
)

DataFrame[]

In [29]:
# Display all rows of family_ext.
(
    spark.sql('select * from family_ext')
    .show(truncate=False)
)

+---+-------+---+
|sno|name   |age|
+---+-------+---+
|1  |Krishna|60 |
+---+-------+---+



In [30]:
# Print the DDL of family_ext.
spark.sql(
    'show create table family_ext'
).show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|createtab_stmt                                                                                                                                                                                   |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|CREATE TABLE default.family_ext (\n  sno INT,\n  name STRING,\n  age INT)\nUSING text\nLOCATION 'file:/content/External/family_ext'\nTBLPROPERTIES (\n  'transient_lastDdlTime' = '1678549357')\n|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



# Difference between internal and external table

If we drop an internal (managed) table, both the table metadata and the data are deleted (the data lives under /content/hive-warehouse).

If we drop an external table, only the table metadata is dropped; the data remains in the external location (/content/External).

In [31]:
# Drop the internal (managed) table.
# "if exists" lets the cell be re-run after the table is already gone.
spark.sql('drop table if exists student_int')

DataFrame[]

In [32]:
# Drop the external table.
# "if exists" lets the cell be re-run after the table is already gone.
spark.sql('drop table if exists student_ext')

DataFrame[]

# Creating Views

View: a window onto a table or query; a view does not store any data itself.

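Types of views:

1. global temporary view
2. temporary view
3. permanent view (stored in the catalog) — the kind created by the `create or replace view` cells below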

In [46]:
# External table that backs the view examples.
# "if not exists" is required for re-runs: the table is never dropped in this
# notebook, so a plain "create" fails the second time the cell is executed.
spark.sql(
    "create external table if not exists test3.emp"
    "(id int,fname string,lname string,salary int,exp int) "
    "location '/content/External/emp'"
)

DataFrame[]

In [47]:
# Load the six demo employees with one multi-row insert instead of six
# single-row statements, so Spark runs one job instead of six.
# "insert into" appends, so re-running this cell adds the same rows again.
# NOTE(review): the saved outputs of the select cells show swathi (id 6) with
# salary 100, not the 10 inserted here -- confirm which value is intended.
spark.sql("""
    insert into test3.emp(id,fname,lname,salary,exp) values
        (1,'Satish','Nookala',100000,7),
        (2,'Sandeep','Nookala',50000,4),
        (3,'krishna','Nookala',10000,2),
        (4,'padma','Nookala',1000,2),
        (5,'anusha','Nookala',100,1),
        (6,'swathi','Nookala',10,0)
""")

DataFrame[]

In [48]:
# Display the employee rows; no ordering is requested here.
(
    spark.sql('select * from test3.emp')
    .show()
)

+---+-------+-------+------+---+
| id|  fname|  lname|salary|exp|
+---+-------+-------+------+---+
|  4|  padma|Nookala|  1000|  2|
|  2|Sandeep|Nookala| 50000|  4|
|  1| Satish|Nookala|100000|  7|
|  6| swathi|Nookala|   100|  0|
|  3|krishna|Nookala| 10000|  2|
|  5| anusha|Nookala|   100|  1|
+---+-------+-------+------+---+



In [49]:
# Display the employee rows sorted by id.
spark.sql(
    'select * from test3.emp order by id'
).show()

+---+-------+-------+------+---+
| id|  fname|  lname|salary|exp|
+---+-------+-------+------+---+
|  1| Satish|Nookala|100000|  7|
|  2|Sandeep|Nookala| 50000|  4|
|  3|krishna|Nookala| 10000|  2|
|  4|  padma|Nookala|  1000|  2|
|  5| anusha|Nookala|   100|  1|
|  6| swathi|Nookala|   100|  0|
+---+-------+-------+------+---+



In [58]:
# Create (or replace) a view over a select query that sorts employees by id.
spark.sql(
    'create or replace view emp_view_id as '
    'select * from test3.emp order by id'
)

DataFrame[]

In [61]:
# Create (or replace) a view over a select query that sorts employees by salary, highest first.
spark.sql(
    'create or replace view emp_view_salary as '
    'select * from test3.emp order by salary desc'
)


DataFrame[]

In [62]:
# Query the salary-ordered view.
(
    spark.sql('select * from emp_view_salary')
    .show(truncate=False)
)

+---+-------+-------+------+---+
|id |fname  |lname  |salary|exp|
+---+-------+-------+------+---+
|1  |Satish |Nookala|100000|7  |
|2  |Sandeep|Nookala|50000 |4  |
|3  |krishna|Nookala|10000 |2  |
|4  |padma  |Nookala|1000  |2  |
|6  |swathi |Nookala|100   |0  |
|5  |anusha |Nookala|100   |1  |
+---+-------+-------+------+---+



In [63]:
# Query the id-ordered view.
(
    spark.sql('select * from emp_view_id')
    .show(truncate=False)
)

+---+-------+-------+------+---+
|id |fname  |lname  |salary|exp|
+---+-------+-------+------+---+
|1  |Satish |Nookala|100000|7  |
|2  |Sandeep|Nookala|50000 |4  |
|3  |krishna|Nookala|10000 |2  |
|4  |padma  |Nookala|1000  |2  |
|5  |anusha |Nookala|100   |1  |
|6  |swathi |Nookala|100   |0  |
+---+-------+-------+------+---+



In [64]:
# DDL of the view ("show create table" is used here on a view).
spark.sql(
    'show create table emp_view_salary'
).show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|createtab_stmt                                                                                                                                                                                    |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|CREATE VIEW default.emp_view_salary (\n  id,\n  fname,\n  lname,\n  salary,\n  exp)\nTBLPROPERTIES (\n  'transient_lastDdlTime' = '1678551453')\nAS select * from test3.emp order by salary desc\n|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

