Collecting Data

In [57]:
import pandas as pd
import requests as r
import sql_functions as sf
import psycopg2


In [58]:
schema = 'capstone_anglianwater'
engine = sf.get_engine()

### UK bottled water Production (1998-2023)

Output of mineral/bottled water by UK manufacturers in GBP Million.

In [59]:
production_columns = ['period', 'value']
production = pd.read_csv('data/bottled_water_production_UK.csv',
                     names=production_columns, 
                     skiprows = 8)

In [60]:
production.head()

Unnamed: 0,period,value
0,1998,2820.2
1,1999,3187.3
2,2000,3057.8
3,2001,3145.7
4,2002,3240.8


In [61]:
production_new = production[production['period'].str.len() == 8]
production_new['date']=pd.to_datetime(production_new['period'], format='%Y %b', dayfirst=True, errors='coerce')
production_new.loc[:, 'year'] = production_new.loc[:, 'date'].dt.year.astype(str)
production_new.loc[:, 'month'] = production_new.loc[:, 'date'].dt.month.astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  production_new['date']=pd.to_datetime(production_new['period'], format='%Y %b', dayfirst=True, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  production_new.loc[:, 'year'] = production_new.loc[:, 'date'].dt.year.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  production_

In [62]:
uk_bottled_water_production = production_new.iloc[:,[2,1]]

In [63]:
uk_bottled_water_production

Unnamed: 0,date,value
128,1998-01-01,183.0
129,1998-02-01,196.4
130,1998-03-01,243.0
131,1998-04-01,247.7
132,1998-05-01,287.9
...,...,...
434,2023-07-01,647.1
435,2023-08-01,648.4
436,2023-09-01,632.8
437,2023-10-01,598.0


### UK bottled water  price (1996-2023)

Output price inflation of mineral waters and other bottled waters. Base year for calculation is 2015 (=100)

In [64]:
bw_price_columns = ['period', 'value']
bw_price = pd.read_csv('data/bottled_water_production_price_UK.csv', 
                     names=bw_price_columns, 
                     skiprows = 8)

In [65]:
bw_price.head()

Unnamed: 0,period,value
0,2009,95.6
1,2010,94.8
2,2011,94.8
3,2012,104.2
4,2013,107.5


In [66]:
bw_price_new = bw_price[bw_price['period'].str.len() == 8]
bw_price_new['date']=pd.to_datetime(bw_price_new['period'], format='%Y %b', dayfirst=True, errors='coerce')
bw_price_new.loc[:, 'year'] = bw_price_new.loc[:, 'date'].dt.year.astype(str)
bw_price_new.loc[:, 'month'] = bw_price_new.loc[:, 'date'].dt.month.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bw_price_new['date']=pd.to_datetime(bw_price_new['period'], format='%Y %b', dayfirst=True, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bw_price_new.loc[:, 'year'] = bw_price_new.loc[:, 'date'].dt.year.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bw_price_new.loc[:,

In [67]:
uk_bottled_water_inflation_clean = bw_price_new.iloc[:,[2,1]]
uk_bottled_water_inflation_clean

Unnamed: 0,date,value
75,1996-01-01,77.0
76,1996-02-01,80.6
77,1996-03-01,78.6
78,1996-04-01,79.3
79,1996-05-01,79.4
...,...,...
406,2023-08-01,120.6
407,2023-09-01,119.8
408,2023-10-01,119.6
409,2023-11-01,122.8


### Top selling UK bottled water brands

 Mineral contents, number of consumers and prices of most commonly bought brands in the UK.

In [68]:
bottled_water_nutritions = pd.read_csv('data/Bottled_water_references - Sheet1.csv', )

In [69]:
bottled_water_nutritions

Unnamed: 0,gibrand,owner,consumers,type,price_per_liter_gbp,sulfate_so42,bicarbonate_hco,calcium_ca++,chloride_cl-,total_dissolved_solids_at_180c,magnesium_mg2,sodium_na+,silica_sio2,nitrate_no3-,potassium_k+,fluoride_f-,ph,remark
0,Highland Spring Water,Highland Spring,6132550,still,0.6,5.3,150.0,40.5,6.1,170.0,10.1,5.6,,3.1,0.7,,,
1,Evian,Danone,5939000,still,1.0,14.0,360.0,80.0,10.0,345.0,26.0,6.5,15.0,3.8,1.0,,,
2,Buxton Spring,Nestle,5141140,still,0.6,13.0,248.0,55.0,37.0,280.0,19.0,24.0,,0.1,1.0,,7.4,
3,Volvic,Danone,4489750,still,0.7,9.0,74.0,12.0,15.0,130.0,8.0,12.0,32.0,7.3,6.0,,,
4,Tesco,Tesco,3748080,still,0.2,11.0,25.0,11.0,14.0,,3.0,10.0,,,,,,
5,San Pellegrino,Nestle,2049090,sparkling,1.1,401.0,244.0,166.0,49.6,853.0,49.5,30.0,7.3,2.8,2.1,0.5,,
6,Sainsbury's,Sainsbury's,1954960,still,0.2,5.0,160.0,40.0,10.0,220.0,20.0,10.0,,1.0,0.0,,7.2,
7,Asda,Asda,1764320,still,0.2,10.0,166.0,40.0,11.0,228.0,14.0,6.0,,8.0,3.0,,7.8,
8,Harrogate Spa,Danone,1444100,still,2.0,13.0,215.0,57.0,37.0,,19.0,8.0,,1.0,,,7.0,
9,Pure Life,Nestle,1354900,still,0.4,9.7,184.6,59.0,18.1,248.0,10.0,11.9,,,1.2,,6.5,


In [70]:
new_table = 'uk_bottled_water_production'
if engine!=None:
    try:
        uk_bottled_water_production.to_sql(name=new_table, # Name of SQL table variable
                        con=engine, # Engine or connection
                        schema=schema,
                        if_exists='replace', # Drop the table before inserting new values 
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {new_table} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None
else:
    print('No engine')

The uk_bottled_water_production table was imported successfully.


In [71]:
new_table = 'uk_bottled_water_nutrition'
if engine!=None:
    try:
        bottled_water_nutritions.to_sql(name=new_table, # Name of SQL table variable
                        con=engine, # Engine or connection
                        schema=schema,
                        if_exists='replace', # Drop the table before inserting new values 
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {new_table} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None
else:
    print('No engine')

The uk_bottled_water_nutrition table was imported successfully.


In [72]:
new_table = 'uk_bottled_water_inflation'
if engine!=None:
    try:
        uk_bottled_water_inflation_clean.to_sql(name=new_table, # Name of SQL table variable
                        con=engine, # Engine or connection
                        schema=schema,
                        if_exists='replace', # Drop the table before inserting new values 
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {new_table} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None
else:
    print('No engine')

The uk_bottled_water_inflation table was imported successfully.
