In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy.engine import create_engine

In [2]:
# Build the connection URL from the standard libpq environment variables so
# that no credentials are stored in the notebook. Only the password is
# required; the remaining settings default to the values this notebook has
# always used (user "postgres", host 127.0.0.1, port 5432, database "postgres").
db_password = os.environ.get("PGPASSWORD")
if db_password is None:
    raise RuntimeError(
        "PGPASSWORD is not set; export the database password before running this notebook."
    )

engine = create_engine(
    "postgresql://{user}:{password}@{host}:{port}/{database}".format(
        # URL-encode user and password so that characters such as '@' or '/'
        # cannot be misread as URL delimiters.
        user=quote_plus(os.environ.get("PGUSER", "postgres")),
        password=quote_plus(db_password),
        host=os.environ.get("PGHOST", "127.0.0.1"),
        port=os.environ.get("PGPORT", "5432"),
        database=os.environ.get("PGDATABASE", "postgres"),
    )
)

In [3]:
# List the name and schema of every table in the "public" schema, sorted by table name.
sql = """
        Select table_name, table_schema
        FROM information_schema.tables
        WHERE table_schema = 'public'
        ORDER BY table_name
      """

pd.read_sql(sql=sql, con=engine)

Unnamed: 0,table_name,table_schema
0,cities,public
1,countries,public
2,countries_plus,public
3,currencies,public
4,economies,public
5,economies2010,public
6,economies2015,public
7,languages,public
8,populations,public


In [4]:
# Preview the cities table: all columns, at most five rows.
sql = """
        SELECT * 
        FROM cities
        LIMIT 5;
      """

pd.read_sql(sql=sql, con=engine)

Unnamed: 0,name,country_code,city_proper_pop,metroarea_pop,urbanarea_pop
0,Abidjan,CIV,4765000.0,,4765000.0
1,Abu Dhabi,ARE,1145000.0,,1145000.0
2,Abuja,NGA,1235880.0,6000000.0,1235880.0
3,Accra,GHA,2070463.0,4010054.0,2070463.0
4,Addis Ababa,ETH,3103673.0,4567857.0,3103673.0


In [5]:
# Preview the countries table: all columns, at most five rows.
sql = """
        SELECT * 
        FROM countries
        LIMIT 5;
      """
pd.read_sql(sql=sql, con=engine)

Unnamed: 0,code,name,continent,region,surface_area,indep_year,local_name,gov_form,capital,cap_long,cap_lat
0,AFG,Afghanistan,Asia,Southern and Central Asia,652090.0,1919.0,Afganistan/Afqanestan,Islamic Emirate,Kabul,69.1761,34.5228
1,NLD,Netherlands,Europe,Western Europe,41526.0,1581.0,Nederland,Constitutional Monarchy,Amsterdam,4.89095,52.3738
2,ALB,Albania,Europe,Southern Europe,28748.0,1912.0,Shqiperia,Republic,Tirane,19.8172,41.3317
3,DZA,Algeria,Africa,Northern Africa,2381740.0,1962.0,Al-Jazair/Algerie,Republic,Algiers,3.05097,36.7397
4,ASM,American Samoa,Oceania,Polynesia,199.0,,Amerika Samoa,US Territory,Pago Pago,-170.691,-14.2846


## Inner join

In [6]:
# Pair every city with its country's name and region by inner-joining
# cities to countries on the country code, then show the first five rows.
sql = """
            -- 1. Select name fields (with alias) and region
        SELECT countries.name as country, cities.name as cities, countries.region as region
        FROM cities
            -- 2. Inner join to countries
        INNER JOIN countries
            -- 3. Match on the country codes
        ON cities.country_code = countries.code;
      """

pd.read_sql(sql=sql, con=engine).head(n=5)

Unnamed: 0,country,cities,region
0,Cote d'Ivoire,Abidjan,Western Africa
1,United Arab Emirates,Abu Dhabi,Middle East
2,Nigeria,Abuja,Western Africa
3,Ghana,Accra,Western Africa
4,Ethiopia,Addis Ababa,Eastern Africa


## Inner join (2)

In [7]:
# Attach each economies row (year, inflation rate) to its country via an
# inner join on the country code, then show the first five rows.
sql = """
        -- 3. Select fields with aliases
        SELECT c.code AS country_code, c.name, e.year, e.inflation_rate
        FROM countries AS c
          -- 1. Join to economies (alias e)
          INNER JOIN economies AS e
            -- 2. Match on code
            ON e.code = c.code;
      """

pd.read_sql(sql=sql, con=engine).head(n=5)

Unnamed: 0,country_code,name,year,inflation_rate
0,AFG,Afghanistan,2010,2.179
1,AFG,Afghanistan,2015,-1.549
2,AGO,Angola,2010,14.48
3,AGO,Angola,2015,10.287
4,ALB,Albania,2010,3.605


## Inner join (3)

In [8]:
# Chain two inner joins: countries -> populations (on country code) and
# -> economies (on country code AND year), so that each result row carries
# fertility and unemployment figures for the same country and year.
sql = """
         -- 6. Select fields
        SELECT c.code, name, region, e.year, fertility_rate, unemployment_rate
          -- 1. From countries (alias as c)
          FROM countries AS c
          -- 2. Join to populations (as p)
          INNER JOIN populations AS p
            -- 3. Match on country code
            ON c.code = p.country_code
          -- 4. Join to economies (as e)
          INNER JOIN economies AS e
            -- 5. Match on country code and year
            ON c.code = e.code AND e.year = p.year;
      """

pd.read_sql(sql=sql, con=engine).head(n=5)

Unnamed: 0,code,name,region,year,fertility_rate,unemployment_rate
0,AFG,Afghanistan,Southern and Central Asia,2010,5.746,
1,AFG,Afghanistan,Southern and Central Asia,2015,4.653,
2,AGO,Angola,Central Africa,2010,6.416,
3,AGO,Angola,Central Africa,2015,5.996,
4,ALB,Albania,Southern Europe,2010,1.663,14.0


## Inner join with USING

In [9]:
# Join countries to languages with USING (code), which works because the
# join column has the same name in both tables; show the first five rows.
sql = """
         -- 4. Select fields
        SELECT c.name AS country, c.continent, l.name AS language, l.official
          -- 1. From countries (alias as c)
          FROM countries AS c
          -- 2. Join to languages (as l)
          INNER JOIN languages AS l
            -- 3. Match using code
            USING (code);
      """

pd.read_sql(sql=sql, con=engine).head(n=5)

Unnamed: 0,country,continent,language,official
0,Afghanistan,Asia,Dari,True
1,Afghanistan,Asia,Pashto,True
2,Afghanistan,Asia,Turkic,False
3,Afghanistan,Asia,Other,False
4,Albania,Europe,Albanian,True


## Self-join

In [10]:
# Self-join populations to line up each country's row with its row five
# years later, and compute the percentage growth in size between the two.
sql = """
        SELECT p1.country_code,
               p1.size AS size2010, 
               p2.size AS size2015,
               -- 1. calculate growth_perc
               ((p2.size - p1.size)/p1.size * 100.0) AS growth_perc
        -- 2. From populations (alias as p1)
        FROM populations AS p1
          -- 3. Join to itself (alias as p2)
          INNER JOIN populations AS p2
            -- 4. Match on country code
            ON p1.country_code = p2.country_code
                -- 5. and year (with calculation)
                AND p1.year = p2.year - 5;
      """

pd.read_sql(sql=sql, con=engine).head(n=5)

Unnamed: 0,country_code,size2010,size2015,growth_perc
0,ABW,101597.0,103889.0,2.255972
1,AFG,27962208.0,32526562.0,16.323297
2,AGO,21219954.0,25021974.0,17.917192
3,ALB,2913021.0,2889167.0,-0.818875
4,AND,84419.0,70473.0,-16.519977


## CASE WHEN and THEN

In [11]:
# Bucket countries by surface area with CASE: above 2,000,000 is 'large',
# above 350,000 is 'medium', and everything else falls through to 'small'.
sql = """
        SELECT name, continent, code, surface_area,
            -- 1. First case
            CASE WHEN surface_area > 2000000 THEN 'large'
                -- 2. Second case
                WHEN surface_area > 350000 THEN 'medium'
                -- 3. Else clause + end
                ELSE 'small' END
                -- 4. Alias name
                AS geosize_group
        -- 5. From table
        FROM countries;
      """

pd.read_sql(sql=sql, con=engine).head(n=5)

Unnamed: 0,name,continent,code,surface_area,geosize_group
0,Afghanistan,Asia,AFG,652090.0,medium
1,Netherlands,Europe,NLD,41526.0,small
2,Albania,Europe,ALB,28748.0,small
3,Algeria,Africa,DZA,2381740.0,large
4,American Samoa,Oceania,ASM,199.0,small


## Inner challenge

In [12]:
# Classify each country's 2015 population size, store the result in pop_plus,
# then join it to countries_plus to show geosize_group next to popsize_group.
# NOTE: this cell writes to the database -- it drops and re-creates pop_plus.
sql = """
        -- 0. Remove any pop_plus left over from an earlier run. Without this,
        --    SELECT ... INTO fails when the table already exists
        DROP TABLE IF EXISTS pop_plus;

        SELECT country_code, size,
          CASE WHEN size > 50000000
                    THEN 'large'
               WHEN size > 1000000
                    THEN 'medium'
               ELSE 'small' END
               AS popsize_group
        -- 1. Into table
        INTO pop_plus
        FROM populations
        WHERE year = 2015;

        -- 5. Select fields
        SELECT name, continent, geosize_group, popsize_group
        -- 1. From countries_plus (alias as c)
        FROM countries_plus AS c
          -- 2. Join to pop_plus (alias as p)
          INNER JOIN pop_plus AS p
            -- 3. Match on country code
            ON p.country_code = c.code
        -- 4. Order the table
        ORDER BY geosize_group;
      """

# Both statements are sent in one call; the displayed frame is the result of
# the final SELECT.
pd.read_sql(sql,engine).head()

Unnamed: 0,name,continent,geosize_group,popsize_group
0,India,Asia,large,large
1,United States,North America,large,large
2,Saudi Arabia,Asia,large,medium
3,China,Asia,large,large
4,Kazakhstan,Asia,large,medium
