In [1]:
import os

import pandas as pd
from sqlalchemy import text
from sqlalchemy.engine import create_engine

In [2]:
# The connection URL (which embeds the database password) is read from the
# environment so that no credentials are stored in the notebook. Set it before
# starting Jupyter, for example:
#   export DATABASE_URL='postgresql://<user>:<password>@127.0.0.1:5432/postgres'
DATABASE_URL = os.environ.get("DATABASE_URL")
if not DATABASE_URL:
    raise RuntimeError(
        "DATABASE_URL is not set; expected something like "
        "postgresql://<user>:<password>@127.0.0.1:5432/postgres"
    )

# Shared engine used by every pd.read_sql call in this notebook.
engine = create_engine(DATABASE_URL)

# Left and Right Join

## Left Join

In [3]:
# Inner join of countries and languages on their shared `code` column
# (via USING); only the first five rows of the result are displayed.
sql = """
         -- 4. Select fields
        SELECT c.name AS country, c.continent, l.name AS language, l.official
          -- 1. From countries (alias as c)
          FROM countries AS c
          -- 2. Join to languages (as l)
          INNER JOIN languages AS l
            -- 3. Match using code
            USING (code);
      """

pd.read_sql(sql, con=engine).head(5)

Unnamed: 0,country,continent,language,official
0,Afghanistan,Asia,Dari,True
1,Afghanistan,Asia,Pashto,True
2,Afghanistan,Asia,Turkic,False
3,Afghanistan,Asia,Other,False
4,Albania,Europe,Albanian,True


## Left join (2)

In [4]:
# INNER JOIN version of the countries/languages query, ordered by country
# name descending. Note that the SQL text uses a multi-line /* ... */ comment
# in addition to the single-line -- comments.
sql = """
        /*
        5. Select country name AS country, the country's local name,
        the language name AS language, and
        the percent of the language spoken in the country
        */
        SELECT c.name AS country, local_name, l.name AS language, percent
        -- 1. From left table (alias as c)
        FROM countries AS c
          -- 2. Join to right table (alias as l)
          INNER JOIN languages AS l
            -- 3. Match on fields
            ON c.code = l.code
        -- 4. Order by descending country
        ORDER BY country DESC;
      """

# Load the full result, report its (rows, columns) shape, then preview it.
df = pd.read_sql(sql, con=engine)
print(df.shape)
df.head(5)

(914, 4)


Unnamed: 0,country,local_name,language,percent
0,Zimbabwe,Zimbabwe,Shona,
1,Zimbabwe,Zimbabwe,Tonga,
2,Zimbabwe,Zimbabwe,Tswana,
3,Zimbabwe,Zimbabwe,Venda,
4,Zimbabwe,Zimbabwe,Xhosa,


In [5]:
# LEFT JOIN version of the countries/languages query: every countries row is
# kept, and countries without a matching languages row get NULL language
# columns.
sql = """
        /*
        5. Select country name AS country, the country's local name,
        the language name AS language, and
        the percent of the language spoken in the country
        */
        SELECT c.name AS country, local_name, l.name AS language, percent
        -- 1. From left table (alias as c)
        FROM countries AS c
          -- 2. Join to right table (alias as l)
          LEFT JOIN languages AS l
            -- 3. Match on fields
            ON c.code = l.code
        -- 4. Order by descending country
        ORDER BY country DESC;
      """

# Load the full result, report its (rows, columns) shape, then preview it.
df = pd.read_sql(sql, con=engine)
print(df.shape)
df.head(5)

(921, 4)


Unnamed: 0,country,local_name,language,percent
0,Zimbabwe,Zimbabwe,Chibarwe,
1,Zimbabwe,Zimbabwe,Shona,
2,Zimbabwe,Zimbabwe,Ndebele,
3,Zimbabwe,Zimbabwe,English,
4,Zimbabwe,Zimbabwe,Chewa,


## Left join (3)

In [6]:
# Country name, region and GDP per capita for 2010.
# Note: the WHERE clause filters on e.year, a column of the right-hand table,
# so countries with no 2010 economies row do not appear in the result even
# though the join is a LEFT JOIN.
sql = """
        -- 5. Select name, region, and gdp_percapita
        SELECT c.name, c.region, e.gdp_percapita
        -- 1. From countries (alias as c)
        FROM countries AS c
          -- 2. Left join with economies (alias as e)
          LEFT JOIN economies AS e
            -- 3. Match on code fields
            ON c.code = e.code
        -- 4. Focus on 2010
        WHERE e.year = 2010;
              """

pd.read_sql(sql, con=engine).head(5)

Unnamed: 0,name,region,gdp_percapita
0,Afghanistan,Southern and Central Asia,539.667
1,Angola,Central Africa,3599.27
2,Albania,Southern Europe,4098.13
3,United Arab Emirates,Middle East,34628.63
4,Argentina,South America,10412.95


In [7]:
# Average 2010 GDP per capita for each region (one output row per region).
sql = """
        -- Select fields
        SELECT region, AVG(gdp_percapita) AS avg_gdp
        -- From countries (alias as c)
        FROM countries AS c
          -- Left join with economies (alias as e)
          LEFT JOIN economies AS e
            -- Match on code fields
            ON c.code = e.code
        -- Focus on 2010
        WHERE e.year = 2010
        -- Group by region
        GROUP BY region;
              """

# Load the full result, report its (rows, columns) shape, then preview it.
df = pd.read_sql(sql, con=engine)
print(df.shape)
df.head(5)

(23, 2)


Unnamed: 0,region,avg_gdp
0,Southern Africa,5051.597974
1,Caribbean,11413.339454
2,Eastern Africa,1757.348162
3,Southern Europe,22926.410911
4,Eastern Asia,26205.8514


In [8]:
# Average 2010 GDP per capita for each region, highest average first.
sql = """
        -- Select fields
        SELECT region, AVG(gdp_percapita) AS avg_gdp
        -- From countries (alias as c)
        FROM countries as c
          -- Left join with economies (alias as e)
          LEFT JOIN economies as e
            -- Match on code fields
            ON c.code = e.code
        -- Focus on 2010
        WHERE e.year = 2010
        -- Group by region
        GROUP BY region
        -- Order by descending avg_gdp
        ORDER BY avg_gdp DESC;
      """

# Load the full result, report its (rows, columns) shape, then preview it.
df = pd.read_sql(sql, con=engine)
print(df.shape)
df.head(5)

(23, 2)


Unnamed: 0,region,avg_gdp
0,Western Europe,58130.961496
1,Nordic Countries,57073.997656
2,North America,47911.509766
3,Australia and New Zealand,44792.384766
4,British Islands,43588.330078


## Right join

In [9]:
# Right joins are less common than left joins, partly because any right join
# can be rewritten as a left join with the table order swapped.
# Here cities is the right-hand table of the final RIGHT JOIN, so every
# cities row is kept.

sql = """
        SELECT cities.name AS city, urbanarea_pop, countries.name AS country,
               indep_year, languages.name AS language, percent
        FROM languages
          RIGHT JOIN countries
            ON languages.code = countries.code
          RIGHT JOIN cities
            ON cities.country_code = countries.code
        ORDER BY city, language;
      """

# Load the full result, report its (rows, columns) shape, then preview it.
df = pd.read_sql(sql, con=engine)
print(df.shape)
df.head(5)

(1375, 6)


Unnamed: 0,city,urbanarea_pop,country,indep_year,language,percent
0,Abidjan,4765000.0,Cote d'Ivoire,1960.0,French,
1,Abidjan,4765000.0,Cote d'Ivoire,1960.0,Other,
2,Abu Dhabi,1145000.0,United Arab Emirates,1971.0,Arabic,
3,Abu Dhabi,1145000.0,United Arab Emirates,1971.0,English,
4,Abu Dhabi,1145000.0,United Arab Emirates,1971.0,Hindi,


## Full Join

### Comparison between FULL JOIN, LEFT JOIN and INNER JOIN

In [10]:
# FULL JOIN of countries and currencies on `code`, restricted to rows whose
# region is 'North America' or NULL. Rows from currencies that have no
# matching country have a NULL region and therefore pass the filter.
sql = """
        SELECT name AS country, code, region, basic_unit
        -- 3. From countries
        FROM countries
          -- 4. Join to currencies
          FULL JOIN currencies
            -- 5. Match on code
            USING (code)
        -- 1. Where region is North America or null
        WHERE region = 'North America' OR region IS NULL
        -- 2. Order by region
        ORDER BY region;
      """

# Load the full result, report its shape, then preview the first ten rows.
df = pd.read_sql(sql, con=engine)
print(df.shape)
df.head(10)

(17, 4)


Unnamed: 0,country,code,region,basic_unit
0,Canada,CAN,North America,Canadian dollar
1,United States,USA,North America,United States dollar
2,Bermuda,BMU,North America,Bermudian dollar
3,Greenland,GRL,North America,
4,,TMP,,United States dollar
5,,FLK,,Falkland Islands pound
6,,AIA,,East Caribbean dollar
7,,NIU,,New Zealand dollar
8,,ROM,,Romanian leu
9,,SHN,,Saint Helena pound


In [11]:
# LEFT JOIN variant of the countries/currencies comparison: only countries
# rows can appear, with basic_unit NULL where no currency matches.
sql = """
        SELECT name AS country, code, region, basic_unit
        -- 1. From countries
        FROM countries
          -- 2. Join to currencies
          LEFT JOIN currencies
            -- 3. Match on code
            USING (code)
        -- 4. Where region is North America or null
        WHERE region = 'North America' OR region IS NULL
        -- 5. Order by region
        ORDER BY region;
      """

# Load the result, report its (rows, columns) shape, then show all of it.
df = pd.read_sql(sql, con=engine)
print(df.shape)
df

(4, 4)


Unnamed: 0,country,code,region,basic_unit
0,Bermuda,BMU,North America,Bermudian dollar
1,Canada,CAN,North America,Canadian dollar
2,United States,USA,North America,United States dollar
3,Greenland,GRL,North America,


In [12]:
# INNER JOIN variant of the countries/currencies comparison: only codes
# present in both tables are returned.
sql = """
        SELECT name AS country, code, region, basic_unit
        -- 1. From countries
        FROM countries
          -- 2. Join to currencies
          INNER JOIN currencies
            -- 3. Match on code
            USING (code)
        -- 4. Where region is North America or null
        WHERE region = 'North America' OR region IS NULL
        -- 5. Order by region
        ORDER BY region;
      """

# Load the result, report its (rows, columns) shape, then show all of it.
df = pd.read_sql(sql, con=engine)
print(df.shape)
df

(3, 4)


Unnamed: 0,country,code,region,basic_unit
0,Bermuda,BMU,North America,Bermudian dollar
1,Canada,CAN,North America,Canadian dollar
2,United States,USA,North America,United States dollar


## Full join (2)

### Comparison between FULL JOIN, LEFT JOIN and INNER JOIN

In [13]:
# FULL JOIN of languages and countries on `code`, keeping rows whose country
# name matches a LIKE pattern or is NULL. The pattern is not written into the
# SQL text: the %s placeholder is filled from `params` ("V%") at execution.
sql = """
        SELECT countries.name, code, languages.name AS language
        -- 3. From languages
        FROM languages
          -- 4. Join to countries
        FULL JOIN countries
        -- 5. Match on code
        USING (code)
        -- 1. Where countries.name starts with V or is null
        WHERE countries.name LIKE %s OR countries.name IS NULL
        -- 2. Order by ascending countries.name
        ORDER BY countries.name;
      """

# Load the full result, report its (rows, columns) shape, then preview it.
df = pd.read_sql(sql, con=engine, params=("V%",))
print(df.shape)
df.head(5)

(53, 3)


Unnamed: 0,name,code,language
0,Vanuatu,VUT,Tribal Languages
1,Vanuatu,VUT,English
2,Vanuatu,VUT,French
3,Vanuatu,VUT,Other
4,Vanuatu,VUT,Bislama


In [14]:
# LEFT JOIN variant: every languages row is kept, and only those whose
# country name starts with 'V' or is NULL pass the filter.
# The query is wrapped in text() here, presumably so the literal % in 'V%'
# is not mistaken for a driver parameter marker (TODO confirm).
sql = """
        SELECT countries.name, code, languages.name AS language
        FROM languages
        -- 1. Join to countries
        LEFT JOIN countries
        -- 2. Match using code
        USING (code)
        -- 3. Where countries.name starts with V or is null
        WHERE countries.name LIKE 'V%' OR countries.name IS NULL
        ORDER BY countries.name;
      """

# Load the full result, report its (rows, columns) shape, then preview it.
df = pd.read_sql(text(sql), con=engine)
print(df.shape)
df.head(5)

(51, 3)


Unnamed: 0,name,code,language
0,Vanuatu,VUT,English
1,Vanuatu,VUT,Other
2,Vanuatu,VUT,French
3,Vanuatu,VUT,Tribal Languages
4,Vanuatu,VUT,Bislama


In [15]:
# INNER JOIN variant: only languages rows with a matching country are
# returned, so the `IS NULL` branch of the filter can only match a country
# whose stored name is itself NULL.
sql = """
        SELECT countries.name, code, languages.name AS language
        FROM languages
        -- 1. Join to countries
        INNER JOIN countries
        USING (code)
        -- 2. Where countries.name starts with V or is null
        WHERE countries.name LIKE 'V%' OR countries.name IS NULL
        ORDER BY countries.name;
      """

# Load the result, report its (rows, columns) shape, then show all of it.
df = pd.read_sql(text(sql), con=engine)
print(df.shape)
df

(10, 3)


Unnamed: 0,name,code,language
0,Vanuatu,VUT,Tribal Languages
1,Vanuatu,VUT,Bislama
2,Vanuatu,VUT,English
3,Vanuatu,VUT,French
4,Vanuatu,VUT,Other
5,Venezuela,VEN,Spanish
6,Venezuela,VEN,indigenous
7,Vietnam,VNM,Vietnamese
8,Vietnam,VNM,English
9,Vietnam,VNM,Other


## Full join (3)

In [16]:
# Chain two FULL JOINs (countries -> languages -> currencies), all matched on
# `code`, and keep rows whose region matches the pattern 'M%nesia'.
sql = """
            -- 7. Select fields (with aliases)
        SELECT c1.name AS country, region, l.name AS language, basic_unit, frac_unit
            -- 1. From countries (alias as c1)
        FROM countries AS c1
            -- 2. Join with languages (alias as l)
        FULL JOIN languages AS l
            -- 3. Match on code
        USING (code)
            -- 4. Join with currencies (alias as c2)
        FULL JOIN currencies AS c2
            -- 5. Match on code
        USING (code)
            -- 6. Where region like Melanesia and Micronesia
        WHERE region LIKE 'M%nesia'
      """

# Load the full result, report its (rows, columns) shape, then preview it.
df = pd.read_sql(text(sql), con=engine)
print(df.shape)
df.head(5)

(50, 5)


Unnamed: 0,country,region,language,basic_unit,frac_unit
0,Kiribati,Micronesia,English,Australian dollar,Cent
1,Kiribati,Micronesia,Kiribati,Australian dollar,Cent
2,Marshall Islands,Micronesia,Other,United States dollar,Cent
3,Marshall Islands,Micronesia,Marshallese,United States dollar,Cent
4,Nauru,Micronesia,Other,Australian dollar,Cent


# CROSS JOIN

## A table of two cities

In [17]:
# CROSS JOIN: every city whose name starts with 'Hyder' is paired with every
# row of languages, regardless of country.
sql = """
            -- 4. Select fields
        SELECT c.name AS city, l.name AS language
            -- 1. From cities (alias as c)
        FROM cities AS c        
            -- 2. Join to languages (alias as l)
        CROSS JOIN languages AS l
            -- 3. Where c.name like Hyderabad
        WHERE c.name LIKE 'Hyder%';
      """

# Load the full result, report its shape, then preview the first ten rows.
df = pd.read_sql(text(sql), con=engine)
print(df.shape)
df.head(10)

(1910, 2)


Unnamed: 0,city,language
0,Hyderabad (India),Dari
1,Hyderabad,Dari
2,Hyderabad (India),Pashto
3,Hyderabad,Pashto
4,Hyderabad (India),Turkic
5,Hyderabad,Turkic
6,Hyderabad (India),Other
7,Hyderabad,Other
8,Hyderabad (India),Albanian
9,Hyderabad,Albanian


In [18]:
# INNER JOIN counterpart of the CROSS JOIN query: each city whose name starts
# with 'Hyder' is paired only with the languages rows of its own country.
# The column alias is spelled in lowercase (`language`) to match the CROSS
# JOIN cell; PostgreSQL folds the unquoted alias to lowercase anyway, so the
# resulting column name is unchanged.
sql = """
            -- 5. Select fields
        SELECT c.name AS city, l.name AS language
            -- 1. From cities (alias as c)
        FROM cities AS c    
            -- 2. Join to languages (alias as l)
        INNER JOIN languages AS l
            -- 3. Match on country code
        ON c.country_code = l.code
            -- 4. Where c.name like Hyderabad
        WHERE c.name LIKE 'Hyder%';
      """

# Load the result, report its (rows, columns) shape, then show all of it.
df = pd.read_sql(text(sql), engine)
print(df.shape)
df

(25, 2)


Unnamed: 0,city,language
0,Hyderabad (India),Hindi
1,Hyderabad (India),Bengali
2,Hyderabad (India),Telugu
3,Hyderabad (India),Marathi
4,Hyderabad (India),Tamil
5,Hyderabad (India),Urdu
6,Hyderabad (India),Gujarati
7,Hyderabad (India),Kannada
8,Hyderabad (India),Malayalam
9,Hyderabad (India),Oriya


In [19]:
# Count the distinct language names produced by the CROSS JOIN of the
# 'Hyder%' cities with languages; the result is a single row and column.
sql = """
            -- 5. Select fields
        SELECT COUNT(DISTINCT(l.name))
            -- 1. From cities (alias as c)
        FROM cities AS c        
            -- 2. Join to languages (alias as l)
        CROSS JOIN languages AS l
            -- 3. Where c.name like Hyderabad
        WHERE c.name LIKE 'Hyder%';
      """

# Load the result, report its (rows, columns) shape, then show all of it.
df = pd.read_sql(text(sql), con=engine)
print(df.shape)
df

(1, 1)


Unnamed: 0,count
0,396


## Outer Challenge

In [20]:
# The five countries with the lowest life expectancy in 2010: countries is
# joined to populations on country code, filtered to year 2010, sorted by
# life_exp ascending (the ORDER BY default) and limited to five rows.
sql = """
            -- Select fields
        SELECT c.name AS country, c.region, p.life_expectancy AS life_exp
            -- From countries (alias as c)
        FROM countries AS c
            -- Join to populations (alias as p)
        LEFT JOIN populations AS p
            -- Match on country code
        ON p.country_code = c.code
            -- Focus on 2010
        WHERE p.year = 2010
            -- Order by life_exp
        ORDER BY life_exp
        -- Limit to 5 records
        LIMIT 5
      """

# Load the result, report its (rows, columns) shape, then show all of it.
df = pd.read_sql(text(sql), con=engine)
print(df.shape)
df

(5, 3)


Unnamed: 0,country,region,life_exp
0,Lesotho,Southern Africa,47.483414
1,Central African Republic,Central Africa,47.625317
2,Sierra Leone,Western Africa,48.22895
3,Swaziland,Southern Africa,48.345757
4,Zimbabwe,Eastern Africa,49.574657
