In [1]:
import os

import pandas as pd
from sqlalchemy import text
from sqlalchemy.engine import create_engine

In [2]:
# dialect+driver://username:password@host:port/database_name
# The connection URL is taken from the DATABASE_URL environment variable so the
# database password is not stored in the notebook. The fallback URL carries no
# password; with the default PostgreSQL driver the password is then expected to
# come from libpq's own lookup (PGPASSWORD or ~/.pgpass).
engine = create_engine(
    os.environ.get("DATABASE_URL", "postgresql://postgres@127.0.0.1:5432/postgres")
)

## Types of set operations
##### UNION : returns only unique records
##### UNION ALL : returns all records (potentially duplicates) in both tables
##### INTERSECT : returns only records appearing in both tables
##### EXCEPT : returns only records in the first table that do not appear in the second table (think EXCEPTional)

## Union

##### Combine these two tables into one table containing all of the fields in economies2010.

##### Sort this resulting single table by country code and then by year, both in ascending order.

In [3]:
# Stack the 2010 and 2015 economies tables into a single result, sorted by
# country code and year. UNION keeps only unique rows, and the result takes its
# column names from the first SELECT.
sql = """
          -- Select fields from 2010 table (listed explicitly so both
          -- branches of the UNION line up column for column)
        SELECT code, year, income_group, gross_savings
          -- From 2010 table
        FROM economies2010
          -- Set theory clause
        UNION
          -- Select fields from 2015 table
        SELECT code, year, income_group, gross_savings
          -- From 2015 table
        FROM economies2015
          -- Order by code and year
        ORDER BY code, year;
      """
df = pd.read_sql(sql, engine)
print(df.shape)
df.head(10)

(380, 4)


Unnamed: 0,code,year,income_group,gross_savings
0,AFG,2010,Low income,37.133
1,AFG,2015,Low income,21.466
2,AGO,2010,Upper middle income,23.534
3,AGO,2015,Upper middle income,-0.425
4,ALB,2010,Upper middle income,20.011
5,ALB,2015,Upper middle income,13.84
6,ARE,2010,High income,27.073
7,ARE,2015,High income,34.106
8,ARG,2010,Upper middle income,17.361
9,ARG,2015,Upper middle income,14.111


## Union (2)

##### UNION can also be used to determine all occurrences of a field across multiple tables.

##### Determine all (non-duplicated) country codes in either the cities or the currencies table. The result should be a table with only one field called country_code.

##### Sort by country_code in alphabetical order.

In [4]:
# Distinct country codes found in cities.country_code or currencies.code.
# The single result column is named after the first SELECT (country_code).
sql = """
          -- Select field
        SELECT country_code
          -- From cities
        FROM cities
          -- Set theory clause
        UNION
          -- Select field
        SELECT code
          -- From currencies
        FROM currencies
          -- Order by country_code
        ORDER BY country_code;
      """
df = pd.read_sql_query(sql, con=engine)
print(df.shape)
df.head(10)

(205, 1)


Unnamed: 0,country_code
0,ABW
1,AFG
2,AGO
3,AIA
4,ALB
5,AND
6,ARE
7,ARG
8,ARM
9,ATG


## Union all

##### As you saw, duplicates were removed from the previous two exercises by using UNION. 
##### To include duplicates, you can use UNION ALL.

##### Determine all combinations (include duplicates) of country code and year that exist in either the economies or the populations tables. Order by code then year.

In [5]:
# Every (code, year) pair in economies, sorted by code and then year.
sql = """
          -- Select fields
        SELECT code, year
          -- From economies
        FROM economies
          -- Order by code, year
        ORDER BY code, year;
      """
df = pd.read_sql_query(sql, con=engine)
print(df.shape)
df.head(5)

(380, 2)


Unnamed: 0,code,year
0,AFG,2010
1,AFG,2015
2,AGO,2010
3,AGO,2015
4,ALB,2010


In [6]:
# Every (country_code, year) pair in populations, sorted by country code and
# then year.
sql = """
          -- Select fields
        SELECT country_code, year
          -- From populations
        FROM populations
          -- Order by country_code, year
        ORDER BY country_code, year;
      """
df = pd.read_sql(sql, engine)
print(df.shape)
df.head(5)

(434, 2)


Unnamed: 0,country_code,year
0,ABW,2010
1,ABW,2015
2,AFG,2010
3,AFG,2015
4,AGO,2010


In [7]:
# Stack the (code, year) pairs of economies on top of the (country_code, year)
# pairs of populations. UNION ALL keeps duplicates, so a pair present in both
# tables shows up twice.
sql = """
          -- Select fields
        SELECT code, year
          -- From economies
        FROM economies
          -- Set theory clause
        UNION ALL
          -- Select fields
        SELECT country_code, year
          -- From populations
        FROM populations
          -- Order by code, year
        ORDER BY code, year;
      """
df = pd.read_sql_query(sql, con=engine)
print(df.shape)
df.head(10)

(814, 2)


Unnamed: 0,code,year
0,ABW,2010
1,ABW,2015
2,AFG,2010
3,AFG,2010
4,AFG,2015
5,AFG,2015
6,AGO,2010
7,AGO,2010
8,AGO,2015
9,AGO,2015


## Intersect

##### Repeat the previous UNION ALL exercise, this time looking at the records in common for country code and year for the economies and populations tables.

##### The difference between UNION ALL (above) and INTERSECT (below) for the same pair of queries is that INTERSECT returns fewer rows, since it keeps only the records that appear in both tables.

In [8]:
# Keep only the (code, year) pairs that occur in both economies and
# populations, sorted by code and then year.
sql = """
          -- Select fields
        SELECT code, year
          -- From economies
        FROM economies
          -- Set theory clause
        INTERSECT
          -- Select fields
        SELECT country_code, year
          -- From populations
        FROM populations
          -- Order by code and year
        ORDER BY code, year;
      """
df = pd.read_sql_query(sql, con=engine)
print(df.shape)
df.head(10)

(380, 2)


Unnamed: 0,code,year
0,AFG,2010
1,AFG,2015
2,AGO,2010
3,AGO,2015
4,ALB,2010
5,ALB,2015
6,ARE,2010
7,ARE,2015
8,ARG,2010
9,ARG,2015


## Intersect (2)

##### Which countries also have a city with the same name as their country name?

In [9]:
# (code, name) pairs from countries that also exist as (country_code, name) in
# cities, i.e. countries that have a city carrying the country's own name.
sql = """
          -- Select fields
        SELECT code, name
          -- From countries
        FROM countries
          -- Set theory clause
        INTERSECT
          -- Select fields
        SELECT country_code, name
          -- From cities
        FROM cities;
      """
df = pd.read_sql_query(sql, con=engine)
print(df.shape)
df.head(10)

(1, 2)


Unnamed: 0,code,name
0,SGP,Singapore


## Except

##### Get the names of cities in cities which are not noted as capital cities in countries as a single field result.

##### Note that there are some countries in the world that are not included in the countries table, which will result in some cities not being labeled as capital cities when in fact they are.

In [10]:
# City names from cities that are not listed as a capital in countries,
# returned as one column sorted by name.
sql = """
          -- Select field
        SELECT name
          -- From cities
        FROM cities
          -- Set theory clause
        EXCEPT
          -- Select field
        SELECT capital
          -- From countries
        FROM countries
          -- Order by result
        ORDER BY name;
      """
df = pd.read_sql_query(sql, con=engine)
print(df.shape)
df.head(10)

(170, 1)


Unnamed: 0,name
0,Abidjan
1,Ahmedabad
2,Alexandria
3,Almaty
4,Auckland
5,Bandung
6,Barcelona
7,Barranquilla
8,Basra
9,Belo Horizonte


## Except (2)

##### Now you will complete the previous query in reverse!

##### Determine the names of capital cities that are not listed in the cities table.

In [11]:
# The reverse of the previous query: capitals from countries that have no
# matching name in cities, sorted by capital.
sql = """
        SELECT capital
        FROM countries
        EXCEPT
        SELECT name
        FROM cities
        ORDER BY capital
      """
df = pd.read_sql_query(sql, con=engine)
print(df.shape)
df.head(10)

(136, 1)


Unnamed: 0,capital
0,Agana
1,Amman
2,Amsterdam
3,Andorra la Vella
4,Antananarivo
5,Apia
6,Ashgabat
7,Asmara
8,Astana
9,Asuncion


## Semi-join

#### Step 1 :

In [12]:
# Inspect the countries whose region is 'Middle East'. All columns are
# returned here; the semi-join in Step 3 only needs the code column.
sql = """
        -- Select all fields
        SELECT *
          -- From countries
          FROM countries
        -- Where region is Middle East
        WHERE region = 'Middle East';
      """
df = pd.read_sql(sql, engine)
print(df.shape)
df.head(10)

(18, 11)


Unnamed: 0,code,name,continent,region,surface_area,indep_year,local_name,gov_form,capital,cap_long,cap_lat
0,ARE,United Arab Emirates,Asia,Middle East,83600.0,1971.0,Al-Imarat al-´Arabiya al-Muttahida,Emirate Federation,Abu Dhabi,54.3705,24.4764
1,ARM,Armenia,Asia,Middle East,29800.0,1991.0,Hajastan,Republic,Yerevan,44.509,40.1596
2,AZE,Azerbaijan,Asia,Middle East,86600.0,1991.0,Azarbaycan,Federal Republic,Baku,49.8932,40.3834
3,BHR,Bahrain,Asia,Middle East,694.0,1971.0,Al-Bahrayn,Monarchy (Emirate),Manama,50.5354,26.1921
4,GEO,Georgia,Asia,Middle East,69700.0,1991.0,Sakartvelo,Republic,Tbilisi,44.793,41.71
5,IRQ,Iraq,Asia,Middle East,438317.0,1932.0,Al-´Iraq,Republic,Baghdad,44.394,33.3302
6,ISR,Israel,Asia,Middle East,21056.0,1948.0,Yisrael/Israil,Republic,,35.2035,31.7717
7,YEM,Yemen,Asia,Middle East,527968.0,1918.0,Al-Yaman,Republic,Sana'a,44.2075,15.352
8,JOR,Jordan,Asia,Middle East,88946.0,1946.0,Al-Urdunn,Constitutional Monarchy,Amman,35.9263,31.9497
9,KWT,Kuwait,Asia,Middle East,17818.0,1961.0,Al-Kuwayt,Constitutional Monarchy (Emirate),Kuwait City,47.9824,29.3721


#### Step 2 :

##### Select the distinct language names from languages, ordered by name. This is the outer query that Step 3 restricts with a subquery on Middle East countries.

In [13]:
# Unique language names across the whole languages table, sorted by name.
# DISTINCT is a clause of SELECT rather than a function, so it is written
# without parentheses.
sql = """
        -- Select field
        SELECT DISTINCT name
          -- From languages
          FROM languages
        -- Order by name
        ORDER BY name;
      """
df = pd.read_sql(sql, engine)
print(df.shape)
df.head(10)

(396, 1)


Unnamed: 0,name
0,Afar
1,Afrikaans
2,Akyem
3,Albanian
4,Alsatian
5,Amerindian
6,Amharic
7,Angolar
8,Antiguan creole
9,Arabic


#### Step 3 :

In [14]:
# Semi-join: unique language names restricted to the languages whose code
# belongs to a country in the 'Middle East' region. DISTINCT is a clause of
# SELECT rather than a function, so it is written without parentheses.
sql = """
        -- Select distinct fields
        SELECT DISTINCT name
          -- From languages
          FROM languages
        -- Where in statement
        WHERE code IN
          -- Subquery
          (SELECT code
           FROM countries
           WHERE region = 'Middle East')
        -- Order by name
        ORDER BY name;
      """
df = pd.read_sql(sql, engine)
print(df.shape)
df.head(10)

(27, 1)


Unnamed: 0,name
0,Arabic
1,Aramaic
2,Armenian
3,Azerbaijani
4,Azeri
5,Baluchi
6,Bulgarian
7,Circassian
8,English
9,Farsi


## Diagnosing problems using anti-join

##### Another powerful join in SQL is the anti-join. It is particularly useful in identifying which records are causing an incorrect number of records to appear in join queries.

##### You will also see another example of a subquery here, as you saw in the first exercise on semi-joins. Your goal is to identify the currencies used in Oceanian countries!

#### Step 1 :

##### Begin by determining the number of countries in countries that are listed in Oceania 

In [15]:
# Count the rows of countries (with a non-NULL code) whose continent is
# 'Oceania'. This is the baseline the inner join below is compared against.
sql = """
        -- Select statement
        SELECT COUNT(code)
          -- From countries
          FROM countries
        -- Where continent is Oceania
        WHERE continent = 'Oceania';
      """
df = pd.read_sql_query(sql, con=engine)
print(df.shape)
df.head(10)

(1, 1)


Unnamed: 0,count
0,19


#### Step 2 :

##### Complete an inner join with countries AS c1 on the left and currencies AS c2 on the right to get the different currencies used in the countries of Oceania.

In [16]:
# Inner join of countries (c1) with currencies (c2) on code, limited to
# Oceania. Countries without a row in currencies drop out of this result.
sql = """
        -- 5. Select fields (with aliases)
        SELECT c1.code, c1.name, c2.basic_unit AS currency
          -- 1. From countries (alias as c1)
          FROM countries AS c1
            -- 2. Join with currencies (alias as c2)
            INNER JOIN currencies AS c2
            -- 3. Match on code
            USING (code)
        -- 4. Where continent is Oceania (qualified with c1 so the filter
        --    stays unambiguous in the join)
        WHERE c1.continent = 'Oceania';
      """
df = pd.read_sql(sql, engine)
print(df.shape)
df.head(10)

(15, 3)


Unnamed: 0,code,name,currency
0,AUS,Australia,Australian dollar
1,PYF,French Polynesia,CFP franc
2,KIR,Kiribati,Australian dollar
3,MHL,Marshall Islands,United States dollar
4,NRU,Nauru,Australian dollar
5,NCL,New Caledonia,CFP franc
6,NZL,New Zealand,New Zealand dollar
7,PLW,Palau,United States dollar
8,PNG,Papua New Guinea,Papua New Guinean kina
9,WSM,Samoa,Samoan tala


#### Step 3 :

##### Note that not all countries in Oceania were listed in the resulting inner join with currencies. Use an anti-join to determine which countries were not included!

In [17]:
# Anti-join: Oceanian countries whose code has no entry in currencies, i.e.
# the rows missing from the inner join in Step 2.
sql = """
        -- 3. Select fields
        SELECT code, name
          -- 4. From Countries
          FROM countries
          -- 5. Where continent is Oceania
          WHERE continent = 'Oceania'
            -- 1. And code not in
            AND code NOT IN
            -- 2. Subquery (NULL codes are filtered out: a single NULL in the
            --    list would make NOT IN match no rows at all)
            (SELECT code
             FROM currencies
             WHERE code IS NOT NULL);
      """
df = pd.read_sql(sql, engine)
print(df.shape)
df.head(10)

(5, 2)


Unnamed: 0,code,name
0,ASM,American Samoa
1,FJI,Fiji Islands
2,GUM,Guam
3,FSM,"Micronesia, Federated States of"
4,MNP,Northern Mariana Islands


## Set theory challenge

##### Identify the country codes that are included in either economies or currencies but not in populations, then use that result to get the names of the cities located in those countries.

In [18]:
# Names of cities whose country_code is in economies or currencies but not in
# populations. UNION and EXCEPT are applied left to right, so the subquery
# reads as (economies UNION currencies) EXCEPT populations.
sql = """
        -- Select the city name
        SELECT c1.name
          -- Alias the table where city name resides
          FROM cities AS c1
          -- Choose only records matching the result of multiple set theory clauses
          WHERE c1.country_code IN
        (
            -- Select appropriate field from economies AS e
            SELECT e.code
            FROM economies AS e
            -- Get all additional (unique) values of the field from currencies AS c2  
            UNION
            SELECT c2.code
            FROM currencies AS c2
            -- Exclude those appearing in populations AS p
            EXCEPT
            SELECT p.country_code
            FROM populations AS p
        );
      """
df = pd.read_sql_query(sql, con=engine)
print(df.shape)
df.head(10)

(6, 1)


Unnamed: 0,name
0,Bucharest
1,Kaohsiung
2,New Taipei City
3,Taichung
4,Tainan
5,Taipei
