In [1]:
# Import Libraries
import os

from pyspark.sql import SparkSession

## Spark Session

In [2]:
# Build (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("SQLClass1App")
    .getOrCreate()
)

In [3]:
spark.version

'3.5.5'

In [4]:
# mysql connector property for java com.mysql:mysql-connector-j:9.0.0

In [5]:
# Define MySQL connection settings
mysql_url = "jdbc:mysql://localhost:3306/class_1"
mysql_user = "root"
# Never commit a password to the notebook: export MYSQL_PASSWORD in the
# environment that launches Jupyter. If it is unset the password is empty and
# the first JDBC read fails with an authentication error.
mysql_password = os.environ.get("MYSQL_PASSWORD", "")
mysql_driver = "com.mysql.cj.jdbc.Driver"

# MySQL connection with Spark

In [6]:
# create table

In [7]:
query = "select * from CITY"

In [8]:
# Read the result of `query` from MySQL over JDBC.
# The driver class comes from `mysql_driver` (same value as before) so the
# connection settings are defined in exactly one place.
df = (
    spark.read
    .format("jdbc")
    .option("url", mysql_url)
    .option("user", mysql_user)
    .option("query", query)
    .option("password", mysql_password)
    .option("driver", mysql_driver)
    .load()
)

In [9]:
def mysql_read_action(query):
    """
    Run a SQL query against MySQL over JDBC and return the loaded result.

    The connection uses the notebook-level `spark` session together with
    `mysql_url`, `mysql_user`, `mysql_password` and `mysql_driver`.
    Only a read is performed (`spark.read`); nothing is written to MySQL.

    Parameters
    ----------
    query : str
        SQL text passed to the JDBC reader as its "query" option.

    Returns
    -------
    The object produced by the JDBC reader's `load()` call.
    """
    jdbc_options = {
        "url": mysql_url,
        "user": mysql_user,
        "query": query,
        "password": mysql_password,
        "driver": mysql_driver,
    }
    reader = spark.read.format("jdbc")
    for option_name, option_value in jdbc_options.items():
        reader = reader.option(option_name, option_value)
    return reader.load()

In [10]:
df = mysql_read_action(query)

In [11]:
df.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



# SQL and Pyspark Practice NoteBook

#### Q1. Query all columns for all American cities in the CITY table with populations larger than 100000. The CountryCode for America is USA. The CITY table is described as follows:

## pyspark solution

In [None]:
from pyspark.sql.functions import col

In [None]:
df = mysql_read_action(query)

In [None]:
df_filtered = df.filter(col('POPULATION')>100000).where(col("COUNTRYCODE")=="USA")

In [None]:
df_filtered.show()

In [None]:
# another way in pyspark
df_filtered_1 = df.filter((col('POPULATION')>100000) & (col("COUNTRYCODE")=="USA"))

In [None]:
df_filtered_1.show()

In [None]:
# describe() only builds the summary DataFrame; show() is needed to display it.
df_filtered.describe().show()

1. Both filter() and where() do the same job in the DataFrame API.
2. where() is just an alias for filter(), provided for SQL-like syntax consistency.
3. You can use column expressions or SQL-like string conditions in both.

## SQL solution

In [None]:
sql_query = "select * from CITY where COUNTRYCODE='USA' and population>100000"

In [None]:
df_sql = mysql_read_action(sql_query)

In [None]:
df_sql.show()

###  Q2. Query the NAME field for all American cities in the CITY table with populations larger than 120000. The CountryCode for America is USA. The CITY table is described as follows:

In [None]:
# pyspark solution

In [None]:
df.show()

In [None]:
# Q2 asks for the NAME of American cities only, so restrict to COUNTRYCODE
# "USA" as well as POPULATION > 120000, then project just the NAME column.
df_filtered_2 = (
    df
    .filter((col("POPULATION") > 120000) & (col("COUNTRYCODE") == "USA"))
    .select(col("NAME"))
)

In [None]:
df_filtered_2.show()

In [None]:
# SQL solution

In [None]:
# Q2: NAME of American cities (countrycode 'USA') with population above 120000.
sql_query_2 = "select name from city where countrycode='USA' and population>120000"

In [None]:
df_sql_2 = mysql_read_action(sql_query_2)

In [None]:
df_sql_2.show()

### Q3. Query all columns (attributes) for every row in the CITY table.

In [None]:
# spark solution

In [None]:
df.show()

In [None]:
df_filtered_3 = df.select(col("*"))

In [None]:
df_filtered_3.show()

In [None]:
# SQL solution

In [None]:
sql_query_3 = "select * from city"

df_sql_3 = mysql_read_action(sql_query_3)

In [None]:
df_sql_3.show()

### Q4. Query all columns for a city in CITY with the ID 1661.

In [None]:
 # spark solution

In [None]:
df.show()

In [None]:
df_filtered_4 = df.where(col("ID")==1661)

In [None]:
df_filtered_4.show()

In [None]:
# sql solution

In [None]:
sql_query_4 = "select * from city where id=1661"

In [None]:
df_sql4 = mysql_read_action(sql_query_4)

In [None]:
df_sql4.show()

### Q5. Query all attributes of every Japanese city in the CITY table. The COUNTRYCODE for Japan is  JPN.

In [None]:
# spark solution

In [None]:
df.show()

In [None]:
df_filtered_5 = df.select(col("*")).filter(col("COUNTRYCODE")=="JPN")

In [None]:
df_filtered_5.show()

In [None]:
# sql solution

In [None]:
sql_query_5 = "select * from city where countrycode='JPN'"

In [None]:
df_sql_5 = mysql_read_action(sql_query_5)

In [None]:
df_sql_5.show()

###  Q6. Query the names of all the Japanese cities in the CITY table. The COUNTRYCODE for Japan is JPN.

In [None]:
# sql solution

In [None]:
sql_query_6 = "select name from CITY where COUNTRYCODE='JPN'"

In [None]:
df_sql_6 = mysql_read_action(sql_query_6)

In [None]:
df_sql_6.show()

In [None]:
# pyspark solution

In [None]:
df.show()

In [None]:
df_filtered_6 = df.select(col('name')).where(col("countrycode")=="JPN")

In [None]:
df_filtered_6.show()

###  Q7. Query a list of CITY and STATE from the STATION table

In [None]:
station_table_query = "select * from station"

In [None]:
station_df = mysql_read_action(station_table_query)

In [None]:
station_df.show()

In [None]:
station_df.columns

In [None]:
sql_query_7 = " select city, state from station"

In [None]:
df_sql_7 = mysql_read_action(sql_query_7)

In [None]:
df_sql_7.show()

In [None]:
# spark solution

In [None]:
df_filtered_7 = station_df.select(col("city"), col("state"))

In [None]:
df_filtered_7.show()

In [None]:
df_filtered_7 = station_df.selectExpr("city", "state")

In [None]:
df_filtered_7.show()

 ### Q8. Query a list of CITY names from STATION for cities that have an even ID number. Print the results in any order, but exclude duplicates from the answer.

In [None]:
sql_query_8 = " select distinct city from station where ID%2=0"

In [None]:
df_sql_8 = mysql_read_action(sql_query_8)

In [None]:
df_sql_8.show()

In [None]:
# spark solution

In [None]:
station_df.show()

In [None]:
# Filter on ID while the column still exists; after select("city").distinct()
# the ID column is gone and the filter cannot be resolved.
df_filtered_8 = (
    station_df
    .filter(col("ID") % 2 == 0)
    .select("city")
    .distinct()
)

In [None]:
df_filtered_8.show()

### Q9. Find the difference between the total number of CITY entries in the table and the number of distinct CITY entries in the table.

In [None]:
sql_query_9 = "select count(city) as total_number_of_cities, count(distinct city) as unique_cities, (count(city) - count(distinct city)) as difference from station"

In [None]:
df_sql_9 = mysql_read_action(sql_query_9)

In [None]:
df_sql_9.show()

In [None]:
# pyspark solution

In [None]:
df_filtered_9 = station_df.selectExpr("count(city) as total_cities", 
                                     "count(distinct city) as unique_cities",
                                     "count(city) - count(distinct city) as difference")

In [None]:
df_filtered_9.show()

###  Q10. Query the two cities in STATION with the shortest and longest CITY names, as well as their
### respective lengths (i.e.: number of characters in the name). If there is more than one smallest or
### largest city, choose the one that comes first when ordered alphabetically.

In [None]:
# sql query 

In [None]:
# Longest city name; ties are broken alphabetically as the question requires.
sql_query_9_1 = "select city, length(city) as city_length from station order by length(city) desc, city asc limit 1"

In [None]:
df_sql_9_1 = mysql_read_action(sql_query_9_1)

In [None]:
df_sql_9_1.show()

In [None]:
sql_query_9_2 = "select city, length(city) as city_length from station order by length(city), city asc limit 1"

In [None]:
df_sql_9_2 = mysql_read_action(sql_query_9_2)

In [None]:
df_sql_9_2.show()

In [None]:
# pyspark solution

In [None]:
from pyspark.sql.functions import length
# Longest city name; the secondary sort on city picks the alphabetically
# first name when several cities share the maximum length.
df_filtered_9_1 = (
    station_df
    .select(col("city"), length(col("city")).alias("city_length"))
    .orderBy(col("city_length").desc(), col("city").asc())
    .limit(1)
)

In [None]:
df_filtered_9_1.show()

In [None]:
# Shortest city name: sort by length first (ascending), then alphabetically
# to break ties. Sorting by city alone returns the alphabetically first city,
# not the shortest one.
df_filtered_9_2 = (
    station_df
    .select(col("city"), length(col("city")).alias("city_length"))
    .orderBy(col("city_length").asc(), col("city").asc())
    .limit(1)
)

In [None]:
df_filtered_9_2.show()

###  Q11. Query the list of CITY names starting with vowels (i.e., a, e, i, o, or u) from STATION. Your result cannot contain duplicates.

In [None]:
# sql_solution

In [None]:
sql_query_10 = "SELECT DISTINCT(CITY) AS DISTINCT_CITY_NAME FROM STATION WHERE lower(SUBSTR(city,1,1)) in ('a','e','i','o','u')"

In [None]:
df_sql_10 = mysql_read_action(sql_query_10)

In [None]:
df_sql_10.show()

In [None]:
# pyspark solution

In [None]:
from pyspark.sql.functions import lower
df_filtered_11 = station_df.select(col("city")).distinct().filter(lower(col('city')).substr(1, 1).isin("a", "e", "i", "o", "u"))

In [None]:
df_filtered_11.show()

### Q12. Query the list of CITY names ending with vowels (a, e, i, o, u) from STATION. Your result cannot contain duplicates.

In [None]:
sql_query_11 = "SELECT DISTINCT(CITY) AS DISTINCT_CITY_NAME FROM STATION WHERE lower(SUBSTR(city,-1,1)) in ('a','e','i','o','u')"

In [None]:
df_sql_11 = mysql_read_action(sql_query_11)

In [None]:
df_sql_11.show()

In [None]:
# pyspark solution
df_filtered_12 = station_df.select(col("city")).distinct().filter(lower(col('city')).substr(-1, 1).isin("a", "e", "i", "o", "u"))

In [None]:
df_filtered_12.show()

### Q13. Query the list of CITY names from STATION that do not start with vowels. Your result cannot contain duplicates.

In [None]:
# Q13: exclude cities whose first letter is a vowel (a, e, i, o, u).
sql_query_13 = "select distinct city from station where lower(substr(city, 1, 1)) not in ('a', 'e', 'i', 'o', 'u')"

In [None]:
df_sql_13 = mysql_read_action(sql_query_13)

In [None]:
df_sql_13.show()

In [None]:
from pyspark.sql.functions import col, lower, substr

df_filtered_13 = station_df \
    .select("city") \
    .distinct() \
    .filter(~lower(col("city")).substr(1, 1).isin("a", "e", "i", "o", "u"))

In [None]:
df_filtered_13.show()

 ### Q14. Query the list of CITY names from STATION that do not end with vowels. Your result cannot contain duplicates.

In [None]:
# sql_solution

In [None]:
sql_query_14 = "select distinct city from station where lower(substr(city, -1, 1)) not in ('a', 'e', 'i', 'o', 'u')"

In [None]:
df_query_14 = mysql_read_action(sql_query_14)

In [None]:
df_query_14.show()

In [None]:
# spark solution

In [None]:
df_filtered_14 = station_df.select(col('city'))\
                           .distinct()\
                           .filter(~lower(col('city')).substr(-1,1).isin('a', 'e', 'i', 'o', 'u'))

In [None]:
df_filtered_14.show()

### Q15. Query the list of CITY names from STATION that do not start with vowels and do not end with vowels. Your result cannot contain duplicates.

In [None]:
# Q15: DISTINCT is required because the result must not contain duplicates.
sql_query_15 = "select distinct city from station where substr(lower(city), 1, 1) not in ('a', 'e', 'i', 'o', 'u') and substr(lower(city), -1, 1) not in  ('a', 'e', 'i', 'o', 'u')"

In [None]:
df_sql_15 = mysql_read_action(sql_query_15)

In [None]:
df_sql_15.show()

In [None]:
# pyspark solution

In [None]:
df_filtered_15 = station_df.select(col('city'))\
                           .distinct()\
                           .filter(~lower(col('city')).substr(1,1).isin('a', 'e', 'i', 'o', 'u') & ~lower(col("city")).substr(-1, 1).isin('a', 'e', 'i', 'o', 'u'))

In [None]:
df_filtered_15.show()

### Q16. Query the list of CITY names from STATION that do not start with vowels or do not end with vowels. Your result cannot contain duplicates.

In [None]:
# Q16: DISTINCT is required because the result must not contain duplicates.
sql_query_16 = "select distinct city from station where substr(lower(city), 1, 1) not in ('a', 'e', 'i', 'o', 'u') or substr(lower(city), -1, 1) not in  ('a', 'e', 'i', 'o', 'u')"

In [None]:
df_sql_16 = mysql_read_action(sql_query_16)

In [None]:
df_sql_16.show()

In [None]:
# pyspark solution

In [None]:
df_filtered_16 = station_df.select(col('city'))\
                           .distinct()\
                           .filter(~lower(col('city')).substr(1,1).isin('a', 'e', 'i', 'o', 'u') | ~lower(col("city")).substr(-1, 1).isin('a', 'e', 'i', 'o', 'u'))

In [None]:
df_filtered_16.show()

### Q17.
Table: Product

| Column Name  | Type    |
|--------------|---------|
| product_id   | int     |
| product_name | varchar |
| unit_price   | int     |

product_id is the primary key of this table.
Each row of this table indicates the name and the price of each product.

Table: Sales

| Column Name | Type |
|-------------|------|
| seller_id   | int  |
| product_id  | int  |
| buyer_id    | int  |
| sale_date   | date |
| quantity    | int  |
| price       | int  |

This table has no primary key, it can have repeated rows.
product_id is a foreign key to the Product table.
Each row of this table contains some information about one sale.

Write an SQL query that reports the products that were only sold in the first quarter of 2019. That is, between 2019-01-01 and 2019-03-31 inclusive.
Return the result table in any order.
The query result format is in the following example.

Input:

Product table:

| product_id | product_name | unit_price |
|------------|--------------|------------|
| 1          | S8           | 1000       |
| 2          | G4           | 800        |
| 3          | iPhone       | 1400       |

Sales table:

| seller_id | product_id | buyer_id | sale_date  | quantity | price |
|-----------|------------|----------|------------|----------|-------|
| 1         | 1          | 1        | 2019-01-21 | 2        | 2000  |
| 1         | 2          | 2        | 2019-02-17 | 1        | 800   |
| 2         | 2          | 3        | 2019-06-02 | 1        | 800   |
| 3         | 3          | 4        | 2019-05-13 | 2        | 2800  |

Output:

| product_id | product_name |
|------------|--------------|
| 1          | S8           |

 Explanation:
 The product with id 1 was only sold in the spring of 2019.
 The product with id 2 was sold in the spring of 2019 but was also sold after the spring of 2019.
 The product with id 3 was sold after spring 2019.
 We return only product 1 as it is the product that was only sold in the spring of 2019

### 17. Write an SQL query that reports the products that were only sold in the first quarter of 2019. That is, between 2019-01-01 and 2019-03-31 inclusive.

In [None]:
# sql query

In [None]:
# Products whose every sale falls inside Q1 2019.
# The inner join to sales drops products that were never sold at all; an
# anti-join on "sales outside the range" alone would wrongly report them.
# MIN/MAX of sale_date per product then guarantee no sale lies outside the
# inclusive range 2019-01-01 .. 2019-03-31.
sql_query_17 = """SELECT p.product_id, p.product_name
FROM product p
JOIN sales s ON p.product_id = s.product_id
GROUP BY p.product_id, p.product_name
HAVING MIN(s.sale_date) >= '2019-01-01'
   AND MAX(s.sale_date) <= '2019-03-31'"""

# select product_id, product_name from product where product_id not in (select product_id from sales where sale_date not between "2019-01-01" and "2019-03-31")

In [None]:
df_sql_17 = mysql_read_action(sql_query_17)

In [None]:
df_sql_17.show()

In [None]:
product_query = "select * from product"

In [None]:
sales_query = "select * from sales"

In [None]:
product_df = mysql_read_action(product_query)

In [None]:
product_df.show()

In [None]:
sales_df = mysql_read_action(sales_query)

In [None]:
sales_df.show()

In [None]:
# Step 1: Products that have at least one sale OUTSIDE the date range
sales_outside = sales_df.filter(~col("sale_date").between("2019-01-01", "2019-03-31")) \
                        .select("product_id") \
                        .distinct()

# Step 2: Products that were sold at least once (any date)
products_sold = sales_df.select("product_id").distinct()

# Step 3: Keep products that were sold (left_semi) and have no sale outside
# the range (left_anti). Without the semi join, products that were never sold
# would also be reported as "only sold in Q1".
df_filtered_17 = product_df.join(products_sold, on="product_id", how="left_semi") \
                      .join(sales_outside, on="product_id", how="left_anti") \
                      .select("product_id", "product_name")


In [None]:
df_filtered_17.show()

### 18. Write an SQL query to find all the authors that viewed at least one of their own articles. Return the result table sorted by id in ascending order.

In [None]:
sql_query_18 = "select distinct author_id from views where author_id=viewer_id order by author_id asc"

In [None]:
df_sql_18 = mysql_read_action(sql_query_18)

In [None]:
df_sql_18.show()

In [None]:
# pyspark solution

In [None]:
views_query = "select * from views"
views_df = mysql_read_action(views_query)
views_df.show()

In [None]:
# Authors who viewed their own article: keep rows where author and viewer
# match, project the author id, de-duplicate and sort ascending.
self_views = views_df.where(col("author_id") == col("viewer_id"))
df_filtered_18 = (
    self_views
    .select(col("author_id"))
    .distinct()
    .orderBy(col("author_id").asc())
)

In [None]:
df_filtered_18.show()

### 19. delivery_id is the primary key of this table. The table holds information about food delivery to customers that make orders at some date and specify a preferred delivery date (on the same order date or after it). If the customer's preferred delivery date is the same as the order date, then the order is called immediately; otherwise, it is called scheduled.
### Write an SQL query to find the percentage of immediate orders in the table, rounded to 2 decimal places.

In [None]:
sql_query_19 = """
SELECT 
    ROUND(
        100.0 * SUM(CASE WHEN order_date = customer_pref_delivery_date THEN 1 ELSE 0 END) 
        / COUNT(*), 
        2
    ) AS immediate_percentage
FROM Delivery
"""

In [None]:
df_sql_19 = mysql_read_action(sql_query_19)

In [None]:
df_sql_19.show()

In [None]:
# spark solution

In [None]:
delivery_query = "select * from delivery"

In [None]:
delivery_df = mysql_read_action(delivery_query)

In [None]:
delivery_df.show()

In [None]:
from pyspark.sql.functions import when, sum, count, round
delivery_df1 = delivery_df.withColumn("is_immediate",
                                     when(col('order_date') == col('customer_pref_delivery_date'), 1).otherwise(0))

In [None]:
delivery_df1.show()

In [None]:
df_filtered_19 = delivery_df1.select(round(100*(sum(col("is_immediate"))/count('delivery_id')), 2).alias("delivery_percentage"))

In [None]:
df_filtered_19.show()

### 20. Write an SQL query to find the ctr of each Ad. Round ctr to two decimal points.Return the result table ordered by ctr in descending order and by ad_id in ascending order in case of a tie.

In [None]:
# CTR per ad = 100 * clicks / (clicks + views), 0 when the ad has neither.
# The final ORDER BY implements the required ordering: ctr descending, then
# ad_id ascending to break ties.
sql_query_20 = """select 
b.ad_id,
round(case when (b.total_clicks+b.total_views) = 0 then 0 else 100*(b.total_clicks/(b.total_clicks+b.total_views)) end,2) as ctr
from
(select 
ad_id,
sum(case when action="Clicked" then 1 else 0 end) as total_clicks,
sum(case when action="Viewed" then 1 else 0 end) as total_views
from ads group by ad_id) as b
order by ctr desc, b.ad_id asc"""

In [None]:
df_sql_20 = mysql_read_action(sql_query_20)

In [None]:
df_sql_20.show()

In [None]:
# spark solution

In [None]:
ads_query = "select * from ads"

In [None]:
ads_df = mysql_read_action(ads_query)

In [None]:
ads_df.show()

In [None]:
# spark solution

In [None]:
from pyspark.sql.functions import when, sum as _sum
df_filtered_20_1 = ads_df.groupby(col("ad_id")).agg(_sum(when(col("action") =="Clicked", 1).otherwise(0)).alias("total_clicks"))
df_filtered_20_2 = ads_df.groupby(col("ad_id")).agg(_sum(when(col("action") =="Viewed", 1).otherwise(0)).alias("total_views"))

In [None]:
df_filtered_20_1.show()

In [None]:
df_filtered_20_2.show()

In [None]:
df_filtered_20_3 = df_filtered_20_1.join(df_filtered_20_2, on="ad_id", how="left").select(col("ad_id"), col("total_clicks"), col("total_views"))

In [None]:
df_filtered_20_3.show()

In [None]:
# ctr = 100 * clicks / (clicks + views), rounded to 2 decimals; 0 when an ad
# has neither clicks nor views (avoids dividing by zero).
# The result is ordered by ctr descending and ad_id ascending on ties, as the
# question requires.
df_filtered_20 = (
    df_filtered_20_3
    .withColumn(
        "ctr",
        round(
            when(col("total_clicks") + col("total_views") == 0, 0)
            .otherwise(100 * col("total_clicks") / (col("total_clicks") + col("total_views"))),
            2,
        ),
    )
    .orderBy(col("ctr").desc(), col("ad_id").asc())
)

In [None]:
df_filtered_20.show()

### 21.  Write an SQL query to find the team size of each of the employees.

In [12]:
sql_query_21 = """
select 
employee_id,
count(employee_id) over (partition by team_id) as team_size
from employee order by team_size desc
"""

In [13]:
df_query_21 = mysql_read_action(sql_query_21)

In [14]:
df_query_21.show()

+-----------+---------+
|employee_id|team_size|
+-----------+---------+
|          1|        3|
|          2|        3|
|          3|        3|
|          5|        2|
|          6|        2|
|          4|        1|
+-----------+---------+



In [15]:
# spark solution

In [16]:
employee_query = "select * from employee"

In [17]:
employee_df = mysql_read_action(employee_query)

In [18]:
employee_df.show()

+-----------+-------+
|employee_id|team_id|
+-----------+-------+
|          1|      8|
|          2|      8|
|          3|      8|
|          4|      7|
|          5|      9|
|          6|      9|
+-----------+-------+



In [22]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, count, sum
window_fun_21 = Window.partitionBy("team_id")
df_filtered_21 = employee_df.withColumn("team_size", count("team_id").over(window_fun_21)).orderBy("employee_id")

In [23]:
df_filtered_21.show()

+-----------+-------+---------+
|employee_id|team_id|team_size|
+-----------+-------+---------+
|          1|      8|        3|
|          2|      8|        3|
|          3|      8|        3|
|          4|      7|        1|
|          5|      9|        2|
|          6|      9|        2|
+-----------+-------+---------+



### 22. Write an SQL query to find the type of weather in each country for November 2019.