In [1]:
# Import Libraries
from pyspark.sql import SparkSession

## Spark Session

In [2]:
spark = SparkSession.builder \
        .appName("SQLClass1App") \
        .getOrCreate()

In [3]:
spark.version

'3.5.5'

In [4]:
# mysql connector property for java com.mysql:mysql-connector-j:9.0.0

In [5]:
# Define MySQL credentials
mysql_url = "jdbc:mysql://localhost:3306/class_1"
mysql_user = "root"
mysql_password = "sarath254@"
mysql_driver = "com.mysql.cj.jdbc.Driver"

# MySQL cnnection with spark

In [6]:
# create table

In [7]:
query = "select * from CITY"

In [8]:
df = spark.read \
    .format("jdbc") \
    .option("url", mysql_url) \
    .option("user", mysql_user) \
    .option("query", query) \
    .option("password", mysql_password) \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .load()

In [9]:
def mysql_read_action(query):
    """
    Perform MySQL actions like read, write, update and delete
    """
    return spark.read \
    .format("jdbc") \
    .option("url", mysql_url) \
    .option("user", mysql_user) \
    .option("query", query) \
    .option("password", mysql_password) \
    .option("driver", mysql_driver) \
    .load()

In [10]:
df = mysql_read_action(query)

In [11]:
df.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



# SQL and Pyspark Practice NoteBook

#### Q1. Query all columns for all American cities in the CITY table with populations larger than 100000.The CountryCode for America is USA. The CITY table is described as follows:

## pyspark solution

In [29]:
from pyspark.sql.functions import col

In [30]:
df = mysql_read_action(query)

In [31]:
df_filtered = df.filter(col('POPULATION')>100000).where(col("COUNTRYCODE")=="USA")

In [32]:
df_filtered.show()

+----+-------------+-----------+----------+----------+
|  ID|         NAME|COUNTRYCODE|  DISTRICT|POPULATION|
+----+-------------+-----------+----------+----------+
|3878|   Scottsdale|        USA|   Arizona|    202705|
|3965|       Corona|        USA|California|    124966|
|3973|      Concord|        USA|California|    121780|
|3977| Cedar Rapids|        USA|      Iowa|    120758|
|3982|Coral Springs|        USA|   Florida|    117549|
+----+-------------+-----------+----------+----------+



In [33]:
# another way in pyspark
df_filtered_1 = df.filter((col('POPULATION')>100000) & (col("COUNTRYCODE")=="USA"))

In [34]:
df_filtered_1.show()

+----+-------------+-----------+----------+----------+
|  ID|         NAME|COUNTRYCODE|  DISTRICT|POPULATION|
+----+-------------+-----------+----------+----------+
|3878|   Scottsdale|        USA|   Arizona|    202705|
|3965|       Corona|        USA|California|    124966|
|3973|      Concord|        USA|California|    121780|
|3977| Cedar Rapids|        USA|      Iowa|    120758|
|3982|Coral Springs|        USA|   Florida|    117549|
+----+-------------+-----------+----------+----------+



In [35]:
df_filtered.describe()

DataFrame[summary: string, ID: string, NAME: string, COUNTRYCODE: string, DISTRICT: string, POPULATION: string]

1. Both filter() and where() do the same job in the DataFrame API.
2. where() is just an alias for filter(), provided for SQL-like syntax consistency.
3. You can use column expressions or SQL-like string conditions in both.

## SQL solution

In [36]:
sql_query = "select * from CITY where COUNTRYCODE='USA' and population>100000"

In [37]:
df_sql = mysql_read_action(sql_query)

In [38]:
df_sql.show()

+----+-------------+-----------+----------+----------+
|  ID|         NAME|COUNTRYCODE|  DISTRICT|POPULATION|
+----+-------------+-----------+----------+----------+
|3878|   Scottsdale|        USA|   Arizona|    202705|
|3965|       Corona|        USA|California|    124966|
|3973|      Concord|        USA|California|    121780|
|3977| Cedar Rapids|        USA|      Iowa|    120758|
|3982|Coral Springs|        USA|   Florida|    117549|
+----+-------------+-----------+----------+----------+



###  Q2. Query the NAME field for all American cities in the CITY table with populations larger than 120000.The CountryCode for America is USA. The CITY table is described as follows:

In [46]:
# pyspark solution

In [39]:
df.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



In [43]:
df_filtered_2 = df.select(col("NAME"), col("POPULATION")).filter(col("POPULATION")>120000)

In [44]:
df_filtered_2.show()

+------------+----------+
|        NAME|POPULATION|
+------------+----------+
|   Rotterdam|    593321|
|  Scottsdale|    202705|
|      Corona|    124966|
|     Concord|    121780|
|Cedar Rapids|    120758|
+------------+----------+



In [45]:
# SQL solution

In [47]:
sql_query_2 = "select name, population from city where population>120000"

In [48]:
df_sql_2 = mysql_read_action(sql_query_2)

In [49]:
df_sql_2.show()

+------------+----------+
|        name|population|
+------------+----------+
|   Rotterdam|    593321|
|  Scottsdale|    202705|
|      Corona|    124966|
|     Concord|    121780|
|Cedar Rapids|    120758|
+------------+----------+



### Q3. Query all columns (attributes) for every row in the CITY table.

In [50]:
# spark solution

In [51]:
df.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



In [54]:
df_filtered_3 = df.select(col("*"))

In [55]:
df_filtered_3.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



In [56]:
# SQL solution

In [57]:
sql_query_3 = "select * from city"

df_sql_3 = mysql_read_action(sql_query_3)

In [58]:
df_sql_3.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



### Q4. Query all columns for a city in CITY with the ID 1661.

In [69]:
 # spark solution

In [59]:
df.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



In [67]:
df_filtered_4 = df.where(col("ID")==1661)

In [68]:
df_filtered_4.show()

+---+----+-----------+--------+----------+
| ID|NAME|COUNTRYCODE|DISTRICT|POPULATION|
+---+----+-----------+--------+----------+
+---+----+-----------+--------+----------+



In [70]:
# sql solution

In [77]:
sql_query_4 = "select * from city where id=1661"

In [78]:
df_sql4 = mysql_read_action(sql_query_4)

In [79]:
df_sql4.show()

+---+----+-----------+--------+----------+
| ID|NAME|COUNTRYCODE|DISTRICT|POPULATION|
+---+----+-----------+--------+----------+
+---+----+-----------+--------+----------+



### Q5. Query all attributes of every Japanese city in the CITY table. The COUNTRYCODE for Japan is  JPN.

In [80]:
# spark solution

In [81]:
df.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



In [82]:
df_filtered_5 = df.select(col("*")).filter(col("COUNTRYCODE")=="JPN")

In [83]:
df_filtered_5.show()

+---+----+-----------+--------+----------+
| ID|NAME|COUNTRYCODE|DISTRICT|POPULATION|
+---+----+-----------+--------+----------+
+---+----+-----------+--------+----------+



In [84]:
# sql solution

In [85]:
sql_query_5 = "select * from city where countrycode='JPN'"

In [86]:
df_sql_5 = mysql_read_action(sql_query_5)

In [87]:
df_sql_5.show()

+---+----+-----------+--------+----------+
| ID|NAME|COUNTRYCODE|DISTRICT|POPULATION|
+---+----+-----------+--------+----------+
+---+----+-----------+--------+----------+

