<a href="https://colab.research.google.com/github/SurajKande/Pipelining/blob/master/simple_ETL_data_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

demonstrating a simple ETL data pipeline. Starting from extracting data from the source, transforming into a desired format, and loading into a SQLite file.

In [0]:
import sqlite3
import pandas as pd

In [0]:
# to connect gdrive to colab for importing dataset
from google.colab import drive
drive.mount('/content/gdrive', force_remount = True)

In [0]:
class Pipeline(object):
    def __init__(self):
        self.population = None
        self.unemployment = None

    def extract(self):
        
        url_popul_est = 'https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/metro/totals/cbsa-est2019-alldata.csv'
        url_unemployment = 'https://www.ers.usda.gov/webdocs/DataFiles/48747/Unemployment.xls?v=318.5'

        self.population = pd.read_csv(url_popul_est, encoding='ISO-8859-1')
        self.unemployment = pd.read_excel(url_unemployment, skiprows=7)

    def transform(self):
        # formatting Population dataset

        # keep the relevant columns only i.e. the columns that contain year-population-estimate and index names
        pop_idx = ['CBSA', 'MDIV', 'STCOU', 'NAME', 'LSAD']
        pop_cols = [c for c in self.population.columns if c.startswith('POPEST')]
        population = self.population[pop_idx + pop_cols].copy()

        # melt, "unpivot" the yearly rate values (from wide format 'columns' to long format 'rows')
        self.population = population.melt(id_vars=pop_idx,
                                          value_vars=pop_cols,
                                          var_name='YEAR',
                                          value_name='POPULATION_EST')
        
        self.population['YEAR'] = self.population['YEAR'].apply(lambda x: x[-4:]) 


        # formatting Unemployment dataset

        # keep the relevant columns only i.e. unemployment-rate-year and names
        unemp_idx = ['FIPStxt', 'Stabr', 'area_name']
        unemp_cols = [c for c in self.unemployment.columns if c.startswith('Unemployment_rate')]
        unemployment = self.unemployment[unemp_idx + unemp_cols].copy()

        # melt, "unpivot" the yearly rate values (from wide format 'columns' to long format 'rows')
        self.unemployment = unemployment.melt(id_vars=unemp_idx,
                                              value_vars=unemp_cols,
                                              var_name='Year',
                                              value_name='Unemployment_rate')
        
        self.unemployment['Year'] = self.unemployment['Year'].apply(lambda x: x[-4:])


    def load(self):
        db = DB()
        self.population.to_sql('population', db.conn, if_exists='append', index=False)
        self.unemployment.to_sql('unemployment', db.conn, if_exists='append', index=False)

In [0]:
population.head()   # before transformation   

Unnamed: 0,CBSA,MDIV,STCOU,NAME,LSAD,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,NPOPCHG2010,NPOPCHG2011,NPOPCHG2012,NPOPCHG2013,NPOPCHG2014,NPOPCHG2015,NPOPCHG2016,NPOPCHG2017,NPOPCHG2018,NPOPCHG2019,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,BIRTHS2016,BIRTHS2017,BIRTHS2018,BIRTHS2019,DEATHS2010,DEATHS2011,DEATHS2012,...,INTERNATIONALMIG2010,INTERNATIONALMIG2011,INTERNATIONALMIG2012,INTERNATIONALMIG2013,INTERNATIONALMIG2014,INTERNATIONALMIG2015,INTERNATIONALMIG2016,INTERNATIONALMIG2017,INTERNATIONALMIG2018,INTERNATIONALMIG2019,DOMESTICMIG2010,DOMESTICMIG2011,DOMESTICMIG2012,DOMESTICMIG2013,DOMESTICMIG2014,DOMESTICMIG2015,DOMESTICMIG2016,DOMESTICMIG2017,DOMESTICMIG2018,DOMESTICMIG2019,NETMIG2010,NETMIG2011,NETMIG2012,NETMIG2013,NETMIG2014,NETMIG2015,NETMIG2016,NETMIG2017,NETMIG2018,NETMIG2019,RESIDUAL2010,RESIDUAL2011,RESIDUAL2012,RESIDUAL2013,RESIDUAL2014,RESIDUAL2015,RESIDUAL2016,RESIDUAL2017,RESIDUAL2018,RESIDUAL2019
0,10180,,,"Abilene, TX",Metropolitan Statistical Area,165252,165252,165585,166634,167442,167473,168342,169688,170017,170429,171150,172060,333,1049,808,31,869,1346,329,412,721,910,539,2290,2358,2392,2381,2411,2384,2405,2360,2361,409,1501,1590,...,84,199,510,358,416,494,384,314,188,220,124,64,-461,-1050,-310,159,-710,-593,-49,90,208,263,49,-692,106,653,-326,-279,139,310,-5,-3,-9,23,-22,-17,-4,-5,-4,-9
1,10180,,48059.0,"Callahan County, TX",County or equivalent,13544,13545,13512,13511,13488,13502,13505,13589,13789,13968,13990,13943,-33,-1,-23,14,3,84,200,179,22,-47,31,120,122,136,139,153,152,162,142,143,60,158,142,...,0,4,5,5,8,8,8,6,5,5,-3,34,-7,51,36,92,231,183,43,-21,-3,38,-2,56,44,100,239,189,48,-16,-1,-1,-1,-2,-2,-1,0,0,0,-1
2,10180,,48253.0,"Jones County, TX",County or equivalent,20202,20192,20238,20270,19870,20044,19850,19966,19971,19827,19866,20083,46,32,-400,174,-194,116,5,-144,39,217,25,155,190,191,180,158,157,164,158,156,20,213,174,...,4,13,11,11,8,2,2,1,1,1,36,74,-441,173,-188,156,34,-101,38,218,40,87,-430,184,-180,158,36,-100,39,219,1,3,14,5,1,1,-1,0,-1,-2
3,10180,,48441.0,"Taylor County, TX",County or equivalent,131506,131515,131835,132853,134084,133927,134987,136133,136257,136634,137294,138034,320,1018,1231,-157,1060,1146,124,377,660,740,483,2015,2046,2065,2062,2100,2075,2079,2060,2062,329,1130,1274,...,80,182,494,342,400,484,374,307,182,214,91,-44,-13,-1274,-158,-89,-975,-675,-130,-107,171,138,481,-932,242,395,-601,-368,52,107,-5,-5,-22,20,-21,-17,-3,-5,-3,-6
4,10420,,,"Akron, OH",Metropolitan Statistical Area,703200,703196,703031,703200,702109,703621,704908,704382,703524,703987,703855,703479,-165,169,-1091,1512,1287,-526,-858,463,-132,-376,1980,7570,7497,7544,7708,7562,7449,7399,7254,7196,1606,6657,7007,...,221,1129,1203,1821,1635,1922,1881,1500,938,836,-727,-1861,-2736,-778,-1092,-2499,-2875,-881,-1044,-1190,-506,-732,-1533,1043,543,-577,-994,619,-106,-354,-33,-12,-48,-87,-115,-48,-17,-27,-18,-22


In [0]:
population.head()  #after transformation

Unnamed: 0,CBSA,MDIV,STCOU,NAME,LSAD,YEAR,POPULATION_EST
0,10180,,,"Abilene, TX",Metropolitan Statistical Area,2010,165585
1,10180,,48059.0,"Callahan County, TX",County or equivalent,2010,13512
2,10180,,48253.0,"Jones County, TX",County or equivalent,2010,20238
3,10180,,48441.0,"Taylor County, TX",County or equivalent,2010,131835
4,10420,,,"Akron, OH",Metropolitan Statistical Area,2010,703031


In [0]:
unemployment.head()  # before transformation

Unnamed: 0,FIPStxt,Stabr,area_name,Rural_urban_continuum_code_2013,Urban_influence_code_2013,Metro_2013,Civilian_labor_force_2000,Employed_2000,Unemployed_2000,Unemployment_rate_2000,Civilian_labor_force_2001,Employed_2001,Unemployed_2001,Unemployment_rate_2001,Civilian_labor_force_2002,Employed_2002,Unemployed_2002,Unemployment_rate_2002,Civilian_labor_force_2003,Employed_2003,Unemployed_2003,Unemployment_rate_2003,Civilian_labor_force_2004,Employed_2004,Unemployed_2004,Unemployment_rate_2004,Civilian_labor_force_2005,Employed_2005,Unemployed_2005,Unemployment_rate_2005,Civilian_labor_force_2006,Employed_2006,Unemployed_2006,Unemployment_rate_2006,Civilian_labor_force_2007,Employed_2007,Unemployed_2007,Unemployment_rate_2007,Civilian_labor_force_2008,Employed_2008,...,Unemployed_2010,Unemployment_rate_2010,Civilian_labor_force_2011,Employed_2011,Unemployed_2011,Unemployment_rate_2011,Civilian_labor_force_2012,Employed_2012,Unemployed_2012,Unemployment_rate_2012,Civilian_labor_force_2013,Employed_2013,Unemployed_2013,Unemployment_rate_2013,Civilian_labor_force_2014,Employed_2014,Unemployed_2014,Unemployment_rate_2014,Civilian_labor_force_2015,Employed_2015,Unemployed_2015,Unemployment_rate_2015,Civilian_labor_force_2016,Employed_2016,Unemployed_2016,Unemployment_rate_2016,Civilian_labor_force_2017,Employed_2017,Unemployed_2017,Unemployment_rate_2017,Civilian_labor_force_2018,Employed_2018,Unemployed_2018,Unemployment_rate_2018,Civilian_labor_force_2019,Employed_2019,Unemployed_2019,Unemployment_rate_2019,Median_Household_Income_2018,Med_HH_Income_Percent_of_State_Total_2018
0,0,US,United States,,,,142601667.0,136904680.0,5696987.0,3.995035,143786778.0,136978129.0,6808649.0,4.73524,144839532.0,136455911.0,8383621.0,5.788213,145660181.0,136944540.0,8715641.0,5.983544,146725005.0,138614038.0,8110967.0,5.528006,148012786.0,140462401.0,7550385.0,5.101171,150223730.0,143262292.0,6961438.0,4.634047,152191286.0,145156247.0,7035039.0,4.622498,153761298.0,144860522.0,...,14862528.0,9.634056,154606324.0,140765822.0,13840502.0,8.952093,155119209.0,142600412.0,12518797.0,8.070436,155485689.0,144018150.0,11467539.0,7.375302,155957310.0,146338323.0,9618987.0,6.167705,156798981.0,148515185.0,8283796.0,5.283067,158626135.0,150902618.0,7723517.0,4.869007,160158564.0,153183461.0,6975103.0,4.355123,161389026.0,155102319.0,6286707.0,3.895375,163100055.0,157115247.0,5984808.0,3.669409,61937.0,
1,1000,AL,Alabama,,,,2133223.0,2035594.0,97629.0,4.6,2115401.0,2006884.0,108517.0,5.1,2106161.0,1981919.0,124242.0,5.9,2120225.0,1992732.0,127493.0,6.0,2136458.0,2014889.0,121569.0,5.7,2146025.0,2049791.0,96234.0,4.5,2167809.0,2080233.0,87576.0,4.0,2175612.0,2089127.0,86485.0,4.0,2176489.0,2053477.0,...,231483.0,10.5,2202670.0,1990413.0,212257.0,9.6,2176337.0,2003290.0,173047.0,8.0,2174000.0,2017043.0,156957.0,7.2,2160842.0,2014290.0,146552.0,6.8,2161481.0,2030005.0,131476.0,6.1,2182558.0,2055251.0,127307.0,5.8,2186869.0,2090373.0,96496.0,4.4,2216627.0,2130845.0,85782.0,3.9,2241747.0,2174483.0,67264.0,3.0,49881.0,100.0
2,1001,AL,"Autauga County, AL",2.0,2.0,1.0,21720.0,20846.0,874.0,4.0,21955.0,21055.0,900.0,4.1,22094.0,21035.0,1059.0,4.8,22604.0,21462.0,1142.0,5.1,23218.0,22103.0,1115.0,4.8,23949.0,23037.0,912.0,3.8,24398.0,23585.0,813.0,3.3,24383.0,23577.0,806.0,3.3,24687.0,23420.0,...,2282.0,8.9,25836.0,23677.0,2159.0,8.4,25740.0,23961.0,1779.0,6.9,25810.0,24205.0,1605.0,6.2,25592.0,24097.0,1495.0,5.8,25652.0,24321.0,1331.0,5.2,26031.0,24709.0,1322.0,5.1,26075.0,25062.0,1013.0,3.9,26196.0,25261.0,935.0,3.6,26172.0,25458.0,714.0,2.7,59338.0,118.959123
3,1003,AL,"Baldwin County, AL",3.0,2.0,1.0,69533.0,66971.0,2562.0,3.7,69161.0,66195.0,2966.0,4.3,69169.0,65691.0,3478.0,5.0,72299.0,68702.0,3597.0,5.0,74772.0,70919.0,3853.0,5.2,76804.0,73743.0,3061.0,4.0,79711.0,77147.0,2564.0,3.2,82659.0,80099.0,2560.0,3.1,83223.0,79372.0,...,8339.0,10.0,85045.0,77418.0,7627.0,9.0,84414.0,78065.0,6349.0,7.5,85280.0,79626.0,5654.0,6.6,86384.0,81083.0,5301.0,6.1,87872.0,83010.0,4862.0,5.5,90895.0,86060.0,4835.0,5.3,92456.0,88711.0,3745.0,4.1,95233.0,91809.0,3424.0,3.6,97328.0,94675.0,2653.0,2.7,57588.0,115.450773
4,1005,AL,"Barbour County, AL",6.0,6.0,0.0,11373.0,10748.0,625.0,5.5,11250.0,10412.0,838.0,7.4,10971.0,10125.0,846.0,7.7,10977.0,10196.0,781.0,7.1,10633.0,9865.0,768.0,7.2,10760.0,10136.0,624.0,5.8,10705.0,10096.0,609.0,5.7,10334.0,9684.0,650.0,6.3,10161.0,9267.0,...,1262.0,12.3,9849.0,8712.0,1137.0,11.5,9362.0,8283.0,1079.0,11.5,9099.0,8168.0,931.0,10.2,8845.0,7913.0,932.0,10.5,8625.0,7860.0,765.0,8.9,8436.0,7736.0,700.0,8.3,8349.0,7863.0,486.0,5.8,8414.0,7987.0,427.0,5.1,8537.0,8213.0,324.0,3.8,34382.0,68.928049


In [0]:
unemployment.head()  # after transformation

Unnamed: 0,FIPStxt,Stabr,area_name,Year,Unemployment_rate
0,0,US,United States,2000,4.0
1,1000,AL,Alabama,2000,4.6
2,1001,AL,"Autauga County, AL",2000,4.0
3,1003,AL,"Baldwin County, AL",2000,3.7
4,1005,AL,"Barbour County, AL",2000,5.5


In [0]:
class DB(object):

    def __init__(self, db_file='/content/gdrive/My Drive/datasets/databasefile/db.sqlite'):
        self.conn = sqlite3.connect(db_file)
        self.cur = self.conn.cursor()
        self.__init_db()

    def __del__(self):
        self.conn.commit()
        self.conn.close()

    def __init_db(self):
        table1 = f"""CREATE TABLE IF NOT EXISTS population1(
              CBSA INTEGER,
              MDIV REAL,
              STCOU INTEGER,
              NAME TEXT,
              LSAD TEXT,
              YEAR INTEGER,
              POPULATION_EST INTEGER
                );"""

        table2 = f"""CREATE TABLE IF NOT EXISTS unemployment1(
            FIPStxt INTEGER,
            Stabr TEXT,
            area_name TEXT,
            Year INTEGER,
            unemployment_rate REAL
            );"""

        self.cur.execute(table1)
        self.cur.execute(table2)



In [5]:
pipeline = Pipeline()
print('Data Pipeline created')
print('\t extracting data from source .... ')
pipeline.extract()
print('\t formatting and transforming data ... ')
pipeline.transform()
print('\t loading into database ... ')
pipeline.load()
print('\nDone. See: result in "db.sqlite"')

Data Pipeline created
	 extracting data from source .... 
	 formatting and transforming data ... 
	 loading into database ... 

Done. See: result in "db.sqlite"


In [0]:
conn = sqlite3.connect('/content/gdrive/My Drive/datasets/databasefile/db.sqlite')        # connecting to sqlite database 

In [0]:
sql1 = """
SELECT 
    NAME, YEAR, POPULATION_EST
FROM
    population
WHERE
    STCOU is NULL;
"""

In [0]:
df1 = pd.read_sql(sql1, conn)

In [9]:
df1    # reading population data into dataframe df1

Unnamed: 0,NAME,YEAR,POPULATION_EST
0,"Abilene, TX",2010,165585
1,"Akron, OH",2010,703031
2,"Albany, GA",2010,154145
3,"Albany-Lebanon, OR",2010,116891
4,"Albany-Schenectady-Troy, NY",2010,871082
...,...,...,...
47845,"Wooster, OH",2019,115710
47846,"Worthington, MN",2019,21629
47847,"Yankton, SD",2019,22814
47848,"Zanesville, OH",2019,86215


In [0]:
sql2 = """
SELECT 
    Stabr, area_name, unemployment_rate
FROM
    unemployment
"""

In [0]:
df2 = pd.read_sql(sql2, conn)

In [12]:
df2              # reading unemployment data into dataframe df2 

Unnamed: 0,Stabr,area_name,Unemployment_rate
0,US,United States,4.0
1,AL,Alabama,4.6
2,AL,"Autauga County, AL",4.0
3,AL,"Baldwin County, AL",3.7
4,AL,"Barbour County, AL",5.5
...,...,...,...
327495,PR,"Vega Baja Municipio, PR",9.6
327496,PR,"Vieques Municipio, PR",6.9
327497,PR,"Villalba Municipio, PR",15.9
327498,PR,"Yabucoa Municipio, PR",13.1


In [0]:
# df2.groupby("Stabr")["Unemployment_rate"].mean().reset_index()

In [0]:
df1["STATE"]=df1["NAME"].str.split(",").str[1].str[-2:]

In [14]:
df1.groupby("STATE")["POPULATION_EST"].mean().reset_index()

Unnamed: 0,STATE,POPULATION_EST
0,AK,135107.7
1,AL,181885.3
2,AR,165308.2
3,AZ,609394.2
4,CA,1437336.0
5,CO,300086.9
6,CT,733138.4
7,DE,282450.0
8,FL,803228.9
9,GA,251882.6


In [0]:
df2["STATE"] = df2["Stabr"]

In [0]:
merged_dataframe = pd.merge(left= df1.groupby("STATE")["POPULATION_EST"].mean().reset_index() , right = df2.groupby("STATE")["Unemployment_rate"].mean().reset_index(), left_on = 'STATE', right_on = 'STATE') 

In [45]:
merged_dataframe.head()

Unnamed: 0,STATE,POPULATION_EST,Unemployment_rate
0,AK,135107.7,9.410345
1,AL,181885.3,7.149853
2,AR,165308.2,6.374145
3,AZ,609394.2,8.218125
4,CA,1437336.0,8.290932


In [0]:
threshold = merged_dataframe['Unemployment_rate'].describe([0.2,0.4,0.75]).values[4:8]

In [0]:
def return_unemployment_zone(x, tr=threshold):
  if ( x <= tr[1]):
    return "low"
  elif ( x > tr[1] and x <= tr[2] ):
    return "medium"
  elif ( x > tr[2] and x <= tr[3]) :
    return "high"
  else:
    return "critical"

In [0]:
merged_dataframe["zone"] = merged_dataframe['Unemployment_rate'].apply(return_unemployment_zone)    

In [49]:
merged_dataframe.head()

Unnamed: 0,STATE,POPULATION_EST,Unemployment_rate,zone
0,AK,135107.7,9.410345,critical
1,AL,181885.3,7.149853,critical
2,AR,165308.2,6.374145,high
3,AZ,609394.2,8.218125,critical
4,CA,1437336.0,8.290932,critical


In [37]:
!pip install http://13.235.50.20:8080/packages/genpactclassification-0.0.3-py3-none-any.whl#md5=b78d8a3653004e5cdaca6c0087326e7e    

 # installing the package from pypiserver hosted in aws

Collecting genpactclassification==0.0.3
  Downloading http://13.235.50.20:8080/packages/genpactclassification-0.0.3-py3-none-any.whl
Installing collected packages: genpactclassification
Successfully installed genpactclassification-0.0.3


In [0]:
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from genpactclassification import decision_tree_classifier as dt

In [0]:
le = preprocessing.LabelEncoder()
le = le.fit_transform(merged_dataframe['zone'])
merged_dataframe['zone'] = le

In [57]:
merged_dataframe['zone'].head()

0    0
1    0
2    1
3    0
4    0
Name: zone, dtype: int64

In [0]:
X = merged_dataframe.loc[:,'POPULATION_EST':'zone'].values
y = merged_dataframe.loc[:,'zone'].values
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

model,train_accuracy,test_accuracy = dt.decision_tree_classifier(X_train,X_test,y_train,y_test)

In [54]:
model

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [55]:
train_accuracy

100.0

In [56]:
test_accuracy

84.62