## set up connection

In [4]:
import configparser
import psycopg2
import pandas as pd

In [5]:
config = configparser.ConfigParser()
config.read('dwh.cfg')

['dwh.cfg']

In [6]:
%load_ext sql
conn_string="postgresql://{}:{}@{}:{}/{}".format(config.get("CLUSTER","DB_USER"),
                                               config.get("CLUSTER",'DB_PASSWORD'),
                                               config.get("CLUSTER",'HOST'),
                                               config.get("CLUSTER",'DB_PORT'),
                                               config.get("CLUSTER",'DB_NAME'))
print(conn_string)
%sql $conn_string

postgresql://awsuser:Passw0rd@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev


'Connected: awsuser@dev'

In [7]:
conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
cur = conn.cursor()

# two data quality checks were done
 - check table count
 - check table column names

In [31]:
def quality_checks(conn,cur,table_name):
    """Count checks on fact and dimension table to ensure completeness of data.
    """
    
    #check total count of table
    try:
        query = f"select count(*) from {table_name}"

        cur.execute(query)
        totalcount = cur.fetchone()
        print(f"Total record of {table_name} is {totalcount[0]}" )

    except (Exception, psycopg2.Error) as error:
        print("Error while fetching data from PostgreSQL", error)
        
    #check column names of table
    try:
        query = f'''SELECT column_name FROM information_schema.columns
                 WHERE table_name  = '{table_name}'
                 '''

        cur.execute(query)
        allcolumns = cur.fetchall()
        print(f"Column names of {table_name} is {allcolumns}" )

    except (Exception, psycopg2.Error) as error:
        print("Error while fetching data from PostgreSQL", error)

In [32]:
table_dfs = ['i94','countries','airport','usport','citydemo','cic']
for i in table_dfs:
    # quality check for table
    quality_checks(conn,cur,i)

Total record of i94 is 1000
Column names of i94 is [('i94res',), ('i94cit',), ('i94mon',), ('i94yr',), ('cicid',), ('visatype',), ('fltno',), ('admnum',), ('airline',), ('dtaddto',), ('visapost',), ('dtadfile',), ('i94visa',), ('depdate',), ('i94addr',), ('i94mode',), ('arrdate',), ('i94port',)]
Total record of countries is 289
Column names of countries is [('countrycode',), ('country',)]
Total record of airport is 55075
Column names of airport is [('lon',), ('lat',), ('elevation_ft',), ('citystate',), ('iso_region2',), ('coordinates',), ('local_code',), ('iata_code',), ('gps_code',), ('municipality',), ('iso_region',), ('iso_country',), ('continent',), ('name',), ('type',), ('ident',)]
Total record of usport is 660
Column names of usport is [('citystate',), ('state',), ('portname',), ('port',)]
Total record of citydemo is 596
Column names of citydemo is [('white',), ('native',), ('hispa',), ('black',), ('asian',), ('averagehouseholdsize',), ('foreignborn',), ('numberofveterans',), ('t

# below are for my own purpose

## citydemo

In [4]:
%%sql
select count(*) from citydemo

 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.


count
596


In [5]:
%%sql
select top 10 * from citydemo

 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
10 rows affected.


city,state,medianage,malepopulation,femalepopulation,totalpopulation,numberofveterans,foreignborn,averagehouseholdsize,statecode,asian,black,hispa,native,white,citystate
Silver Spring,Maryland,33.8,40601.0,41862.0,82463.0,1562.0,30908.0,2.6,MD,8841.0,21330.0,25924.0,1084.0,37756.0,silverspringmd
Quincy,Massachusetts,41.0,44129.0,49500.0,93629.0,4147.0,32935.0,2.39,MA,30473.0,3917.0,2566.0,351.0,58723.0,quincyma
Hoover,Alabama,38.5,38040.0,46799.0,84839.0,4819.0,8229.0,2.58,AL,4759.0,18191.0,3430.0,,61869.0,hooveral
Rancho Cucamonga,California,34.5,88127.0,87105.0,175232.0,5821.0,33878.0,3.18,CA,24519.0,24437.0,65823.0,2789.0,111832.0,ranchocucamongaca
Newark,New Jersey,34.6,138040.0,143873.0,281913.0,5829.0,86253.0,2.73,NJ,7349.0,144961.0,100432.0,2268.0,76402.0,newarknj
Peoria,Illinois,33.1,56229.0,62432.0,118661.0,6634.0,7517.0,2.4,IL,6989.0,36333.0,6874.0,1343.0,77074.0,peoriail
Avondale,Arizona,29.1,38712.0,41971.0,80683.0,4815.0,8355.0,3.18,AZ,2828.0,11592.0,34716.0,613.0,62176.0,avondaleaz
West Covina,California,39.8,51629.0,56860.0,108489.0,3800.0,37038.0,3.56,CA,32716.0,3693.0,58907.0,518.0,48046.0,westcovinaca
O'Fallon,Missouri,36.0,41762.0,43270.0,85032.0,5783.0,3269.0,2.77,MO,3447.0,5136.0,2583.0,685.0,77049.0,o'fallonmo
High Point,North Carolina,35.5,51751.0,58077.0,109828.0,5204.0,16315.0,2.65,NC,11060.0,39369.0,11446.0,1181.0,58004.0,highpointnc


In [23]:
%%sql 
SELECT column_name
  FROM information_schema.columns
 WHERE table_name   = 'citydemo'


 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
16 rows affected.


column_name
white
native
hispa
black
asian
averagehouseholdsize
foreignborn
numberofveterans
totalpopulation
femalepopulation


## airport

In [6]:
%%sql
select count(*) from airport

 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.


count
55075


In [7]:
%%sql
select top 10 * from airport

 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
10 rows affected.


ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,iso_region2,lat,lon,citystate
00AS,small_airport,Fulton Airport,1100.0,,US,US-OK,Alex,00AS,,00AS,"-97.8180194, 34.9428028",OK,-97.8180194,34.9428028,alexok
00FL,small_airport,River Oak Airport,35.0,,US,US-FL,Okeechobee,00FL,,00FL,"-80.96920013427734, 27.230899810791016",FL,-80.9692001342773,27.230899810791,okeechobeefl
00IN,heliport,St Mary Medical Center Heliport,634.0,,US,US-IN,Hobart,00IN,,00IN,"-87.2605972290039, 41.51139831542969",IN,-87.2605972290039,41.5113983154297,hobartin
00MI,heliport,Dow Chemical Heliport,588.0,,US,US-MI,Ludington,00MI,,00MI,"-86.41670227050781, 43.94940185546875",MI,-86.4167022705078,43.9494018554688,ludingtonmi
00NY,small_airport,Weiss Airfield,1000.0,,US,US-NY,West Bloomfield,00NY,,00NY,"-77.49970245361328, 42.90010070800781",NY,-77.4997024536133,42.9001007080078,west bloomfieldny
00S,small_airport,Mc Kenzie Bridge State Airport,1620.0,,US,US-OR,Mc Kenzie Bridge,00S,,00S,"-122.088996887, 44.183200836199994",OR,-122.088996887,44.1832008362,mc kenzie bridgeor
00UT,closed,Clear Creek Ranch Airport,6138.0,,US,US-UT,Kanab,,,,"-112.821998, 37.247799",UT,-112.821998,37.247799,kanabut
00WY,heliport,Mountain View Regional Hospital Heliport,5210.0,,US,US-WY,Casper,00WY,,00WY,"-106.224443, 42.840361",WY,-106.224443,42.840361,casperwy
01CA,heliport,Lugo Substation Heliport,3733.0,,US,US-CA,Hesperia,01CA,,01CA,"-117.370058745, 34.368240591699994",CA,-117.370058745,34.3682405917,hesperiaca
01GA,heliport,Medical Center Heliport,319.0,,US,US-GA,Columbus,01GA,,01GA,"-84.9791030883789, 32.47930145263672",GA,-84.9791030883789,32.4793014526367,columbusga


# i94

In [8]:
%%sql
select count(*) from i94

 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.


count
1000


In [9]:
%%sql
select top 10 * from i94

 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
10 rows affected.


cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94visa,dtadfile,visapost,dtaddto,airline,admnum,fltno,visatype
4084316.0,2016.0,4.0,209.0,209.0,HHW,2016-04-22,1.0,HI,2016-04-29,2.0,20160422,,7202016,JL,56582674633.0,00782,WT
4422636.0,2016.0,4.0,582.0,582.0,MCA,2016-04-23,1.0,TX,2016-04-24,2.0,20160423,MTR,10222016,*GA,94361995930.0,XBLNG,B2
1195600.0,2016.0,4.0,148.0,112.0,OGG,2016-04-07,1.0,FL,2016-04-27,2.0,20160407,,7052016,LH,55780468433.0,00464,WT
5291768.0,2016.0,4.0,297.0,297.0,LOS,2016-04-28,1.0,CA,2016-05-07,2.0,20160428,DOH,10272016,QR,94789696030.0,00739,B2
985523.0,2016.0,4.0,111.0,111.0,CHM,2016-04-06,3.0,NY,2016-04-09,2.0,20160406,,7042016,,42322572633.0,LAND,WT
1481650.0,2016.0,4.0,577.0,577.0,ATL,2016-04-08,1.0,GA,2016-06-01,2.0,20160408,,10072016,DL,736852585.0,910,B2
2197173.0,2016.0,4.0,245.0,245.0,SFR,2016-04-12,1.0,CA,2016-06-30,2.0,20160412,,10112016,CX,786312185.0,870,B2
232708.0,2016.0,4.0,113.0,135.0,NYC,2016-04-02,1.0,NY,2016-04-10,2.0,20160402,,6302016,BA,55474485033.0,00117,WT
5227851.0,2016.0,4.0,131.0,131.0,CHI,2016-04-28,1.0,IL,2016-05-01,2.0,20160428,,7262016,LX,59413424733.0,00008,WT
13213.0,2016.0,4.0,116.0,116.0,LOS,2016-04-01,1.0,CA,2016-04-09,2.0,20160401,,6292016,AA,55449792933.0,00109,WT


# countries

In [10]:
%%sql
select count(*) from countries

 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.


count
289


In [11]:
%%sql
select top 10 * from countries

 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
10 rows affected.


countrycode,country
582.0,"MEXICO Air Sea, and Not Reported (I-94, no land arrivals)"
236.0,AFGHANISTAN
101.0,ALBANIA
316.0,ALGERIA
102.0,ANDORRA
324.0,ANGOLA
529.0,ANGUILLA
518.0,ANTIGUA-BARBUDA
687.0,ARGENTINA
151.0,ARMENIA


# usport

In [12]:
%%sql
select count(*) from usport

 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.


count
660


In [13]:
%%sql
select top 10 * from usport

 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
10 rows affected.


port,portname,state,citystate
ALC,ALCAN,AK,alcanak
ANC,ANCHORAGE,AK,anchorageak
BAR,BAKER AAF - BAKER ISLAND,AK,bakeraaf-bakerislandak
DAC,DALTONS CACHE,AK,daltonscacheak
PIZ,DEW STATION PT LAY DEW,AK,dewstationptlaydewak
DTH,DUTCH HARBOR,AK,dutchharborak
EGL,EAGLE,AK,eagleak
FRB,FAIRBANKS,AK,fairbanksak
HOM,HOMER,AK,homerak
HYD,HYDER,AK,hyderak


# cic

In [14]:
%%sql
select count(*) from cic

 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.


count
1000


In [15]:
%%sql
select top 10 * from cic

 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
10 rows affected.


cicid,biryear,occup,visatype
985523.0,1997.0,,WT
2711583.0,1962.0,,WT
4042798.0,1954.0,,WT
2006623.0,1999.0,,WT
216657.0,1962.0,,B2
2572618.0,1978.0,,B2
3108700.0,1983.0,,WT
1840249.0,1948.0,,B2
3293058.0,1974.0,,B1
1186211.0,1970.0,,WT


# should not have raw i94 table

In [16]:
%%sql
select count(*) from raw_i94

 * postgresql://awsuser:***@dwhcluster.cbu6otbv3egu.us-west-2.redshift.amazonaws.com:5439/dev
(psycopg2.ProgrammingError) relation "raw_i94" does not exist
 [SQL: 'select count(*) from raw_i94']
