In [3]:
# Import Libraries
from pyspark.sql import SparkSession

## Spark Session

In [4]:
spark = SparkSession.builder \
        .appName("SQLClass1App") \
        .getOrCreate()

In [5]:
spark.version

'3.5.5'

In [6]:
# mysql connector property for java com.mysql:mysql-connector-j:9.0.0

In [7]:
# Define MySQL credentials
mysql_url = "jdbc:mysql://localhost:3306/class_1"
mysql_user = "root"
mysql_password = "sarath254@"
mysql_driver = "com.mysql.cj.jdbc.Driver"

# MySQL cnnection with spark

In [8]:
# create table

In [9]:
query = "select * from CITY"

In [10]:
df = spark.read \
    .format("jdbc") \
    .option("url", mysql_url) \
    .option("user", mysql_user) \
    .option("query", query) \
    .option("password", mysql_password) \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .load()

In [11]:
def mysql_read_action(query):
    """
    Perform MySQL actions like read, write, update and delete
    """
    return spark.read \
    .format("jdbc") \
    .option("url", mysql_url) \
    .option("user", mysql_user) \
    .option("query", query) \
    .option("password", mysql_password) \
    .option("driver", mysql_driver) \
    .load()

In [12]:
df = mysql_read_action(query)

In [13]:
df.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



# SQL and Pyspark Practice NoteBook

#### Q1. Query all columns for all American cities in the CITY table with populations larger than 100000.The CountryCode for America is USA. The CITY table is described as follows:

## pyspark solution

In [14]:
from pyspark.sql.functions import col

In [15]:
df = mysql_read_action(query)

In [16]:
df_filtered = df.filter(col('POPULATION')>100000).where(col("COUNTRYCODE")=="USA")

In [17]:
df_filtered.show()

+----+-------------+-----------+----------+----------+
|  ID|         NAME|COUNTRYCODE|  DISTRICT|POPULATION|
+----+-------------+-----------+----------+----------+
|3878|   Scottsdale|        USA|   Arizona|    202705|
|3965|       Corona|        USA|California|    124966|
|3973|      Concord|        USA|California|    121780|
|3977| Cedar Rapids|        USA|      Iowa|    120758|
|3982|Coral Springs|        USA|   Florida|    117549|
+----+-------------+-----------+----------+----------+



In [18]:
# another way in pyspark
df_filtered_1 = df.filter((col('POPULATION')>100000) & (col("COUNTRYCODE")=="USA"))

In [19]:
df_filtered_1.show()

+----+-------------+-----------+----------+----------+
|  ID|         NAME|COUNTRYCODE|  DISTRICT|POPULATION|
+----+-------------+-----------+----------+----------+
|3878|   Scottsdale|        USA|   Arizona|    202705|
|3965|       Corona|        USA|California|    124966|
|3973|      Concord|        USA|California|    121780|
|3977| Cedar Rapids|        USA|      Iowa|    120758|
|3982|Coral Springs|        USA|   Florida|    117549|
+----+-------------+-----------+----------+----------+



In [20]:
df_filtered.describe()

DataFrame[summary: string, ID: string, NAME: string, COUNTRYCODE: string, DISTRICT: string, POPULATION: string]

1. Both filter() and where() do the same job in the DataFrame API.
2. where() is just an alias for filter(), provided for SQL-like syntax consistency.
3. You can use column expressions or SQL-like string conditions in both.

## SQL solution

In [21]:
sql_query = "select * from CITY where COUNTRYCODE='USA' and population>100000"

In [22]:
df_sql = mysql_read_action(sql_query)

In [23]:
df_sql.show()

+----+-------------+-----------+----------+----------+
|  ID|         NAME|COUNTRYCODE|  DISTRICT|POPULATION|
+----+-------------+-----------+----------+----------+
|3878|   Scottsdale|        USA|   Arizona|    202705|
|3965|       Corona|        USA|California|    124966|
|3973|      Concord|        USA|California|    121780|
|3977| Cedar Rapids|        USA|      Iowa|    120758|
|3982|Coral Springs|        USA|   Florida|    117549|
+----+-------------+-----------+----------+----------+



###  Q2. Query the NAME field for all American cities in the CITY table with populations larger than 120000.The CountryCode for America is USA. The CITY table is described as follows:

In [24]:
# pyspark solution

In [25]:
df.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



In [26]:
df_filtered_2 = df.select(col("NAME"), col("POPULATION")).filter(col("POPULATION")>120000)

In [27]:
df_filtered_2.show()

+------------+----------+
|        NAME|POPULATION|
+------------+----------+
|   Rotterdam|    593321|
|  Scottsdale|    202705|
|      Corona|    124966|
|     Concord|    121780|
|Cedar Rapids|    120758|
+------------+----------+



In [28]:
# SQL solution

In [29]:
sql_query_2 = "select name, population from city where population>120000"

In [30]:
df_sql_2 = mysql_read_action(sql_query_2)

In [31]:
df_sql_2.show()

+------------+----------+
|        name|population|
+------------+----------+
|   Rotterdam|    593321|
|  Scottsdale|    202705|
|      Corona|    124966|
|     Concord|    121780|
|Cedar Rapids|    120758|
+------------+----------+



### Q3. Query all columns (attributes) for every row in the CITY table.

In [32]:
# spark solution

In [33]:
df.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



In [34]:
df_filtered_3 = df.select(col("*"))

In [35]:
df_filtered_3.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



In [36]:
# SQL solution

In [37]:
sql_query_3 = "select * from city"

df_sql_3 = mysql_read_action(sql_query_3)

In [38]:
df_sql_3.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



### Q4. Query all columns for a city in CITY with the ID 1661.

In [39]:
 # spark solution

In [40]:
df.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



In [41]:
df_filtered_4 = df.where(col("ID")==1661)

In [42]:
df_filtered_4.show()

+---+----+-----------+--------+----------+
| ID|NAME|COUNTRYCODE|DISTRICT|POPULATION|
+---+----+-----------+--------+----------+
+---+----+-----------+--------+----------+



In [43]:
# sql solution

In [44]:
sql_query_4 = "select * from city where id=1661"

In [45]:
df_sql4 = mysql_read_action(sql_query_4)

In [46]:
df_sql4.show()

+---+----+-----------+--------+----------+
| ID|NAME|COUNTRYCODE|DISTRICT|POPULATION|
+---+----+-----------+--------+----------+
+---+----+-----------+--------+----------+



### Q5. Query all attributes of every Japanese city in the CITY table. The COUNTRYCODE for Japan is  JPN.

In [47]:
# spark solution

In [48]:
df.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



In [49]:
df_filtered_5 = df.select(col("*")).filter(col("COUNTRYCODE")=="JPN")

In [50]:
df_filtered_5.show()

+---+----+-----------+--------+----------+
| ID|NAME|COUNTRYCODE|DISTRICT|POPULATION|
+---+----+-----------+--------+----------+
+---+----+-----------+--------+----------+



In [51]:
# sql solution

In [52]:
sql_query_5 = "select * from city where countrycode='JPN'"

In [53]:
df_sql_5 = mysql_read_action(sql_query_5)

In [54]:
df_sql_5.show()

+---+----+-----------+--------+----------+
| ID|NAME|COUNTRYCODE|DISTRICT|POPULATION|
+---+----+-----------+--------+----------+
+---+----+-----------+--------+----------+



###  Q6. Query the names of all the Japanese cities in the CITY table. The COUNTRYCODE for Japan is JPN.

In [55]:
# sql solution

In [56]:
sql_query_6 = "select name from CITY where COUNTRYCODE='JPN'"

In [68]:
df_sql_6 = mysql_read_action(sql_query_6)

In [69]:
df_sql_6.show()

+----+
|name|
+----+
+----+



In [59]:
# pyspark solution

In [60]:
df.show()

+----+-------------+-----------+-------------+----------+
|  ID|         NAME|COUNTRYCODE|     DISTRICT|POPULATION|
+----+-------------+-----------+-------------+----------+
|   6|    Rotterdam|        NLD| Zuid-Holland|    593321|
|3878|   Scottsdale|        USA|      Arizona|    202705|
|3965|       Corona|        USA|   California|    124966|
|3973|      Concord|        USA|   California|    121780|
|3977| Cedar Rapids|        USA|         Iowa|    120758|
|3982|Coral Springs|        USA|      Florida|    117549|
|4054|    Fairfield|        USA|   California|     92256|
|4058|      Boulder|        USA|     Colorado|     91238|
|4061|   Fall River|        USA|Massachusetts|     90555|
+----+-------------+-----------+-------------+----------+



In [64]:
df_filtered_6 = df.select(col('name')).where(col("countrycode")=="JPN")

In [65]:
df_filtered_6.show()

+----+
|name|
+----+
+----+



###  Q7. Query a list of CITY and STATE from the STATION table

In [77]:
station_table_query = "select * from station"

In [78]:
station_df = mysql_read_action(station_table_query)

In [79]:
station_df.show()

+---+--------------+-----+-----+------+
| ID|          CITY|STATE|LAT_N|LONG_W|
+---+--------------+-----+-----+------+
|794|  Kissee Mills|   MO|  139|    73|
|824|      Loma Mar|   CA|   48|   130|
|603|    Sandy Hook|   CT|   72|   148|
|478|        Tipton|   IN|   33|    97|
|619|     Arlington|   CO|   75|    92|
|711|        Turner|   AR|   50|   101|
|839|       Slidell|   LA|   85|   151|
|411|       Negreet|   LA|   98|   105|
|588|       Glencoe|   KY|   46|   136|
|665|       Chelsea|   IA|   98|    59|
|342|Chignik Lagoon|   AK|  103|   153|
|733|   Pelahatchie|   MS|   38|    28|
|811|      Dorrance|   KS|  102|   121|
|698|        Albany|   CA|   49|    80|
|325|      Monument|   KS|   70|   141|
|414|    Manchester|   MD|   73|    37|
|113|      Prescott|   IA|   39|    65|
|971|   Graettinger|   IA|   94|   150|
|266|        Cahone|   CO|  116|   127|
+---+--------------+-----+-----+------+



In [83]:
station_df.columns

['ID', 'CITY', 'STATE', 'LAT_N', 'LONG_W']

In [90]:
sql_query_7 = " select city, state from station"

In [91]:
df_sql_7 = mysql_read_action(sql_query_7)

In [92]:
df_sql_7.show()

+--------------+-----+
|          city|state|
+--------------+-----+
|  Kissee Mills|   MO|
|      Loma Mar|   CA|
|    Sandy Hook|   CT|
|        Tipton|   IN|
|     Arlington|   CO|
|        Turner|   AR|
|       Slidell|   LA|
|       Negreet|   LA|
|       Glencoe|   KY|
|       Chelsea|   IA|
|Chignik Lagoon|   AK|
|   Pelahatchie|   MS|
|      Dorrance|   KS|
|        Albany|   CA|
|      Monument|   KS|
|    Manchester|   MD|
|      Prescott|   IA|
|   Graettinger|   IA|
|        Cahone|   CO|
+--------------+-----+



In [93]:
# spark solution

In [96]:
df_filtered_7 = station_df.select(col("city"), col("state"))

In [97]:
df_filtered_7.show()

+--------------+-----+
|          city|state|
+--------------+-----+
|  Kissee Mills|   MO|
|      Loma Mar|   CA|
|    Sandy Hook|   CT|
|        Tipton|   IN|
|     Arlington|   CO|
|        Turner|   AR|
|       Slidell|   LA|
|       Negreet|   LA|
|       Glencoe|   KY|
|       Chelsea|   IA|
|Chignik Lagoon|   AK|
|   Pelahatchie|   MS|
|      Dorrance|   KS|
|        Albany|   CA|
|      Monument|   KS|
|    Manchester|   MD|
|      Prescott|   IA|
|   Graettinger|   IA|
|        Cahone|   CO|
+--------------+-----+



In [98]:
df_filtered_7 = station_df.selectExpr("city", "state")

In [99]:
df_filtered_7.show()

+--------------+-----+
|          city|state|
+--------------+-----+
|  Kissee Mills|   MO|
|      Loma Mar|   CA|
|    Sandy Hook|   CT|
|        Tipton|   IN|
|     Arlington|   CO|
|        Turner|   AR|
|       Slidell|   LA|
|       Negreet|   LA|
|       Glencoe|   KY|
|       Chelsea|   IA|
|Chignik Lagoon|   AK|
|   Pelahatchie|   MS|
|      Dorrance|   KS|
|        Albany|   CA|
|      Monument|   KS|
|    Manchester|   MD|
|      Prescott|   IA|
|   Graettinger|   IA|
|        Cahone|   CO|
+--------------+-----+



 ### Q8. Query a list of CITY names from STATION for cities that have an even ID number. Print the results in any order, but exclude duplicates from the answer.

In [102]:
sql_query_8 = " select distinct city from station where ID%2=0"

In [103]:
df_sql_8 = mysql_read_action(sql_query_8)

In [104]:
df_sql_8.show()

+--------------+
|          city|
+--------------+
|  Kissee Mills|
|      Loma Mar|
|        Tipton|
|       Glencoe|
|Chignik Lagoon|
|        Albany|
|    Manchester|
|        Cahone|
+--------------+



In [105]:
# spark solution

In [106]:
station_df.show()

+---+--------------+-----+-----+------+
| ID|          CITY|STATE|LAT_N|LONG_W|
+---+--------------+-----+-----+------+
|794|  Kissee Mills|   MO|  139|    73|
|824|      Loma Mar|   CA|   48|   130|
|603|    Sandy Hook|   CT|   72|   148|
|478|        Tipton|   IN|   33|    97|
|619|     Arlington|   CO|   75|    92|
|711|        Turner|   AR|   50|   101|
|839|       Slidell|   LA|   85|   151|
|411|       Negreet|   LA|   98|   105|
|588|       Glencoe|   KY|   46|   136|
|665|       Chelsea|   IA|   98|    59|
|342|Chignik Lagoon|   AK|  103|   153|
|733|   Pelahatchie|   MS|   38|    28|
|811|      Dorrance|   KS|  102|   121|
|698|        Albany|   CA|   49|    80|
|325|      Monument|   KS|   70|   141|
|414|    Manchester|   MD|   73|    37|
|113|      Prescott|   IA|   39|    65|
|971|   Graettinger|   IA|   94|   150|
|266|        Cahone|   CO|  116|   127|
+---+--------------+-----+-----+------+



In [115]:
df_filtered_8 = station_df.selectExpr("city").distinct().filter(col("ID")%2==0)

In [116]:
df_filtered_8.show()

+--------------+
|          city|
+--------------+
|    Manchester|
|        Tipton|
|      Loma Mar|
|        Albany|
|        Cahone|
|       Glencoe|
|  Kissee Mills|
|Chignik Lagoon|
+--------------+



### Q9. Find the difference between the total number of CITY entries in the table and the number of distinct CITY entries in the table.

In [129]:
sql_query_9 = "select count(city) as total_number_of_cities, count(distinct city) as unique_cities, (count(city) - count(distinct city)) as difference from station"

In [130]:
df_sql_9 = mysql_read_action(sql_query_9)

In [131]:
df_sql_9.show()

+----------------------+-------------+----------+
|total_number_of_cities|unique_cities|difference|
+----------------------+-------------+----------+
|                    19|           19|         0|
+----------------------+-------------+----------+



In [132]:
# pyspark solution

In [133]:
df_filtered_9 = station_df.selectExpr("count(city) as total_cities", 
                                     "count(distinct city) as unique_cities",
                                     "count(city) - count(distinct city) as difference")

In [134]:
df_filtered_9.show()

+------------+-------------+----------+
|total_cities|unique_cities|difference|
+------------+-------------+----------+
|          19|           19|         0|
+------------+-------------+----------+



###  Q10. Query the two cities in STATION with the shortest and longest CITY names, as well as their
### respective lengths (i.e.: number of characters in the name). If there is more than one smallest or
### largest city, choose the one that comes first when ordered alphabetically.

In [135]:
# sql query 

In [136]:
sql_query_9_1 = "select city, length(city) as city_length from station order by length(city) desc limit 1"

In [137]:
df_sql_9_1 = mysql_read_action(sql_query_9_1)

In [138]:
df_sql_9_1.show()

+--------------+-----------+
|          city|city_length|
+--------------+-----------+
|Chignik Lagoon|         14|
+--------------+-----------+



In [144]:
sql_query_9_2 = "select city, length(city) as city_length from station order by length(city), city asc limit 1"

In [145]:
df_sql_9_2 = mysql_read_action(sql_query_9_2)

In [146]:
df_sql_9_2.show()

+------+-----------+
|  city|city_length|
+------+-----------+
|Albany|          6|
+------+-----------+



In [147]:
# pyspark solution

In [158]:
from pyspark.sql.functions import length
df_filtered_9_1 = station_df.select(col("city"), length(col("city")).alias("city_length")).orderBy(col("city_length").desc()).limit(1)

In [159]:
df_filtered_9_1.show()

+--------------+-----------+
|          city|city_length|
+--------------+-----------+
|Chignik Lagoon|         14|
+--------------+-----------+



In [160]:
df_filtered_9_2 = station_df.select(col("city"), length(col("city")).alias("city_length")).orderBy(col("city")).limit(1)

In [161]:
df_filtered_9_2.show()

+------+-----------+
|  city|city_length|
+------+-----------+
|Albany|          6|
+------+-----------+



###  Q11. Query the list of CITY names starting with vowels (i.e., a, e, i, o, or u) from STATION. Your result cannot contain duplicates.

In [162]:
# sql_solution

In [165]:
sql_query_10 = "SELECT DISTINCT(CITY) AS DISTINCT_CITY_NAME FROM STATION WHERE lower(SUBSTR(city,1,1)) in ('a','e','i','o','u')"

In [168]:
df_sql_10 = mysql_read_action(sql_query_10)

In [169]:
df_sql_10.show()

+------------------+
|DISTINCT_CITY_NAME|
+------------------+
|         Arlington|
|            Albany|
+------------------+



In [174]:
# pyspark solution

In [178]:
from pyspark.sql.functions import lower
df_filtered_11 = station_df.select(col("city")).distinct().filter(lower(col('city')).substr(1, 1).isin("a", "e", "i", "o", "u"))

In [179]:
df_filtered_11.show()

+---------+
|     city|
+---------+
|   Albany|
|Arlington|
+---------+



### Q12. Query the list of CITY names ending with vowels (a, e, i, o, u) from STATION. Your result cannot contain duplicates.

In [171]:
sql_query_11 = "SELECT DISTINCT(CITY) AS DISTINCT_CITY_NAME FROM STATION WHERE lower(SUBSTR(city,-1,1)) in ('a','e','i','o','u')"

In [172]:
df_sql_11 = mysql_read_action(sql_query_11)

In [173]:
df_sql_11.show()

+------------------+
|DISTINCT_CITY_NAME|
+------------------+
|           Glencoe|
|           Chelsea|
|       Pelahatchie|
|          Dorrance|
|            Cahone|
+------------------+



In [180]:
# pyspark solution
df_filtered_12 = station_df.select(col("city")).distinct().filter(lower(col('city')).substr(-1, 1).isin("a", "e", "i", "o", "u"))

In [181]:
df_filtered_12.show()

+-----------+
|       city|
+-----------+
|Pelahatchie|
|     Cahone|
|    Glencoe|
|   Dorrance|
|    Chelsea|
+-----------+



### Q13. Query the list of CITY names from STATION that do not start with vowels. Your result cannot contain duplicates.

In [182]:
sql_query_13 = "select distinct city from station where lower(substr(city, 1, 1)) not in ('a', 'b', 'c', 'd', 'e')"

In [183]:
df_sql_13 = mysql_read_action(sql_query_13)

In [184]:
df_sql_13.show()

+------------+
|        city|
+------------+
|Kissee Mills|
|    Loma Mar|
|  Sandy Hook|
|      Tipton|
|      Turner|
|     Slidell|
|     Negreet|
|     Glencoe|
| Pelahatchie|
|    Monument|
|  Manchester|
|    Prescott|
| Graettinger|
+------------+



In [193]:
from pyspark.sql.functions import col, lower, substr

df_filtered_13 = station_df \
    .select("city") \
    .distinct() \
    .filter(~lower(col("city")).substr(1, 1).isin("a", "e", "i", "o", "u"))

In [194]:
df_filtered_13.show()

+--------------+
|          city|
+--------------+
|    Manchester|
|        Tipton|
|   Pelahatchie|
|      Loma Mar|
|        Cahone|
|       Glencoe|
|    Sandy Hook|
|  Kissee Mills|
|      Dorrance|
|       Negreet|
|Chignik Lagoon|
|        Turner|
|       Slidell|
|       Chelsea|
|      Monument|
|   Graettinger|
|      Prescott|
+--------------+



 ### Q14. Query the list of CITY names from STATION that do not end with vowels. Your result cannot contain duplicates.

In [195]:
# sql_solution

In [198]:
sql_query_14 = "select distinct city from station where lower(substr(city, -1, 1)) not in ('a', 'e', 'i', 'o', 'u')"

In [199]:
df_query_14 = mysql_read_action(sql_query_14)

In [200]:
df_query_14.show()

+--------------+
|          city|
+--------------+
|  Kissee Mills|
|      Loma Mar|
|    Sandy Hook|
|        Tipton|
|     Arlington|
|        Turner|
|       Slidell|
|       Negreet|
|Chignik Lagoon|
|        Albany|
|      Monument|
|    Manchester|
|      Prescott|
|   Graettinger|
+--------------+



In [201]:
# spark solution

In [204]:
df_filtered_14 = station_df.select(col('city'))\
                           .distinct()\
                           .filter(~lower(col('city')).substr(-1,1).isin('a', 'e', 'i', 'o', 'u'))

In [205]:
df_filtered_14.show()

+--------------+
|          city|
+--------------+
|    Manchester|
|        Tipton|
|      Loma Mar|
|        Albany|
|    Sandy Hook|
|     Arlington|
|  Kissee Mills|
|       Negreet|
|Chignik Lagoon|
|        Turner|
|       Slidell|
|      Monument|
|   Graettinger|
|      Prescott|
+--------------+



### Q15. Query the list of CITY names from STATION that either do not start with vowels and do not end with vowels. Your result cannot contain duplicates.

In [209]:
sql_query_15 = "select city from station where substr(lower(city), 1, 1) not in ('a', 'e', 'i', 'o', 'u') and substr(lower(city), -1, 1) not in  ('a', 'e', 'i', 'o', 'u')"

In [210]:
df_sql_15 = mysql_read_action(sql_query_15)

In [211]:
df_sql_15.show()

+--------------+
|          city|
+--------------+
|  Kissee Mills|
|      Loma Mar|
|    Sandy Hook|
|        Tipton|
|        Turner|
|       Slidell|
|       Negreet|
|Chignik Lagoon|
|      Monument|
|    Manchester|
|      Prescott|
|   Graettinger|
+--------------+



In [212]:
# pyspsark solution

In [214]:
df_filtered_15 = station_df.select(col('city'))\
                           .distinct()\
                           .filter(~lower(col('city')).substr(1,1).isin('a', 'e', 'i', 'o', 'u') & ~lower(col("city")).substr(-1, 1).isin('a', 'e', 'i', 'o', 'u'))

In [215]:
df_filtered_15.show()

+--------------+
|          city|
+--------------+
|    Manchester|
|        Tipton|
|      Loma Mar|
|    Sandy Hook|
|  Kissee Mills|
|       Negreet|
|Chignik Lagoon|
|        Turner|
|       Slidell|
|      Monument|
|   Graettinger|
|      Prescott|
+--------------+



### Q16. Query the list of CITY names from STATION that do not start with vowels or do not end with vowels. Your result cannot contain duplicates.

In [216]:
sql_query_16 = "select city from station where substr(lower(city), 1, 1) not in ('a', 'e', 'i', 'o', 'u') or substr(lower(city), -1, 1) not in  ('a', 'e', 'i', 'o', 'u')"

In [217]:
df_sql_16 = mysql_read_action(sql_query_16)

In [218]:
df_sql_16.show()

+--------------+
|          city|
+--------------+
|  Kissee Mills|
|      Loma Mar|
|    Sandy Hook|
|        Tipton|
|     Arlington|
|        Turner|
|       Slidell|
|       Negreet|
|       Glencoe|
|       Chelsea|
|Chignik Lagoon|
|   Pelahatchie|
|      Dorrance|
|        Albany|
|      Monument|
|    Manchester|
|      Prescott|
|   Graettinger|
|        Cahone|
+--------------+



In [219]:
# pyspark solution

In [221]:
df_filtered_16 = station_df.select(col('city'))\
                           .distinct()\
                           .filter(~lower(col('city')).substr(1,1).isin('a', 'e', 'i', 'o', 'u') | ~lower(col("city")).substr(-1, 1).isin('a', 'e', 'i', 'o', 'u'))

In [222]:
df_filtered_16.show()

+--------------+
|          city|
+--------------+
|    Manchester|
|        Tipton|
|   Pelahatchie|
|      Loma Mar|
|        Albany|
|        Cahone|
|       Glencoe|
|    Sandy Hook|
|     Arlington|
|  Kissee Mills|
|      Dorrance|
|       Negreet|
|Chignik Lagoon|
|        Turner|
|       Slidell|
|       Chelsea|
|      Monument|
|   Graettinger|
|      Prescott|
+--------------+

