Collecting Data

In [1]:
import pandas as pd
import requests as r
import datetime
import sql_functions as sf
import psycopg2


In [2]:
schema = 'capstone_anglianwater'
engine = sf.get_engine()

UK Disposable Income per Capita (1995-2022)
Link to data https://www.ons.gov.uk/economy/grossdomesticproductgdp/timeseries/mwb7/ukea

In [3]:
income_columns = ['year', 'value']
income = pd.read_csv('/Users/aristova/neuefische/Tap_vs_Bottle/data/UK_income_capita.csv', 
                     names=income_columns, 
                     skiprows = 8)

In [4]:
income.drop(range(28,143),axis=0 ,inplace =True)

In [5]:
income.head()

Unnamed: 0,year,value
0,1995,17944
1,1996,18499
2,1997,18521
3,1998,21921
4,1999,22295


In [6]:
income.tail()

Unnamed: 0,year,value
23,2018,27260
24,2019,28033
25,2020,23893
26,2021,27240
27,2022,27965


In [7]:
income.insert(1, "month", 1)



In [8]:
income.tail()

Unnamed: 0,year,month,value
23,2018,1,27260
24,2019,1,28033
25,2020,1,23893
26,2021,1,27240
27,2022,1,27965


In [9]:
income['date'] = income.year.astype(str)+'-' + income.month.astype(str) +'-' + '1'
income.date = pd.to_datetime(income.date)

In [10]:
income.drop(['year','month'], axis = 1, inplace=True)

In [11]:
income

Unnamed: 0,value,date
0,17944,1995-01-01
1,18499,1996-01-01
2,18521,1997-01-01
3,21921,1998-01-01
4,22295,1999-01-01
5,23131,2000-01-01
6,23734,2001-01-01
7,24311,2002-01-01
8,25013,2003-01-01
9,25418,2004-01-01


In [12]:
income.reindex(columns=['date','value'])

Unnamed: 0,date,value
0,1995-01-01,17944
1,1996-01-01,18499
2,1997-01-01,18521
3,1998-01-01,21921
4,1999-01-01,22295
5,2000-01-01,23131
6,2001-01-01,23734
7,2002-01-01,24311
8,2003-01-01,25013
9,2004-01-01,25418


In [13]:
income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   value   28 non-null     int64         
 1   date    28 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 576.0 bytes


In [14]:
table_name = 'income_UK'
if engine!=None:
    try:
        income.to_sql(name=table_name, # Name of SQL table variable
                        con=engine, # Engine or connection
                        schema=schema, # your class schema variable
                        if_exists='replace', # Drop the table before inserting new values 
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None
else:
    print('No engine')

The income_UK table was imported successfully.
