# Packages

In [1]:
import pandas as pd
import numpy as np
import math
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None) # show full width of showing cols

# Import file ThresholdGas v3

In [16]:
dfThreshold = pd.read_csv("/Volumes/Extreme SSD/01_GasData/GasPriceThreshold_v3.csv")

In [17]:
dfThreshold = dfThreshold.sort_values(by=['block_number', 'gas_price'])

In [18]:
dfThreshold = dfThreshold.drop_duplicates(subset = ["block_number"],keep = "first")

In [19]:
dfThreshold = dfThreshold.rename(columns={"gas_price": "threshold_gas"})

In [20]:
dfThreshold = dfThreshold.reset_index(drop=True)

# Import gasUseed per block from old GasFeePerBlock_BQ_v1

In [21]:
dfGasUsedBlock = pd.read_csv("/Volumes/Extreme SSD/01_GasData/GasFeePerBlock_BQ_v1.csv")

In [22]:
dfGasUsedBlock = dfGasUsedBlock.sort_values(by=['block_number'])

In [23]:
dfGasUsedBlock

Unnamed: 0,block_number,avgGasPrice,sumGasUsed
600384,10093070,3.440147e+10,9992145
1220736,10093071,3.802547e+10,9988660
482813,10093072,6.624596e+10,9996320
801803,10093073,2.372361e+10,9981147
628366,10093074,4.161312e+10,9977063
...,...,...,...
1085515,12500184,5.358343e+10,14994858
667199,12500185,4.171575e+10,14988469
1780448,12500186,5.026618e+10,14995703
2145583,12500187,4.635642e+10,15003643


# Converting to Gwei and add as new column

In [24]:
dfThreshold['threshold_gas']= (dfThreshold['threshold_gas']/(10**9)).astype(float)

In [25]:
dfThreshold

Unnamed: 0,block_number,threshold_gas
0,10093070,16.5
1,10093071,16.5
2,10093072,6.0
3,10093073,16.0
4,10093074,16.0
...,...,...
2346697,12500184,36.3
2346698,12500185,10.0
2346699,12500186,6.0
2346700,12500187,36.0


# Checks
* Start block: 19.5.2020 (go-live of UniswapV2) = 10,093,070 height
* End block: 24.5.2021 (V3 > V2) = 12,500,188 height


In [26]:
startB = 10093070
endB = 12500188
goalLength = endB-startB
goalLength

2407118

In [27]:
blockCount = dfThreshold['block_number'].max() - dfThreshold['block_number'].min()
blockCount
# 2407118

2407118

In [28]:
blockCount == goalLength

True

# Import Barbon file

In [29]:
dfBarb = pd.read_parquet("/Volumes/Extreme SSD/01_GasData/gas_ts_block.par")

In [30]:
dfBarb['block_number_bar'] = dfBarb.index

In [31]:
dfBarb.sort_values(by = "avg_gas", ascending = False)

Unnamed: 0,tot_gas,avg_gas,median_gas,tx,ts,block_number_bar
2770908,21000,3.626777e+07,3.626777e+07,1,2016-12-08 14:48:21,2770908
10237208,23816361,3.006170e+06,4.000000e+01,169,2020-06-10 09:47:04,10237208
10241999,27248171,2.189859e+06,4.800000e+01,232,2020-06-11 03:30:12,10241999
7393675,15877873,1.666683e+06,1.000000e+01,60,2019-03-18 15:28:14,7393675
10247265,26681052,7.692683e+05,3.200000e+01,143,2020-06-11 23:12:14,10247265
...,...,...,...,...,...,...
12473812,12380000,0.000000e+00,0.000000e+00,1,2021-05-20 21:55:38,12473812
12910993,14970692,0.000000e+00,0.000000e+00,1,2021-07-27 23:31:45,12910993
12308587,14858420,0.000000e+00,0.000000e+00,1,2021-04-25 09:14:54,12308587
12429680,12380000,0.000000e+00,0.000000e+00,1,2021-05-14 01:46:46,12429680


In [32]:
# filter for thesis scope
#dfBarb_scope = dfBarb.filter(items = [*range(startB, endB+1)], axis=0) # Was machen wir hier?
dfBarb_scope = dfBarb[(dfBarb.block_number_bar >= startB) & (dfBarb.block_number_bar <= endB)]

In [33]:
#Check
dfBarb_scope['block_number_bar'].nunique() == len(dfBarb_scope.index) # check

True

In [34]:
# Missing blocks in barbon set: more detail in v1
dfBarb_scope['block_number_bar'].nunique() - goalLength

-58290

In [35]:
dfBarb_scope.sort_values(by = "avg_gas", ascending = False)

Unnamed: 0,tot_gas,avg_gas,median_gas,tx,ts,block_number_bar
10237208,23816361,3.006170e+06,40.0,169,2020-06-10 09:47:04,10237208
10241999,27248171,2.189859e+06,48.0,232,2020-06-11 03:30:12,10241999
10247265,26681052,7.692683e+05,32.0,143,2020-06-11 23:12:14,10247265
10778143,23406601,3.349862e+04,479.0,157,2020-09-01 22:48:53,10778143
10864496,29545928,1.571965e+04,182.0,202,2020-09-15 04:42:15,10864496
...,...,...,...,...,...,...
12414845,12380000,0.000000e+00,0.0,1,2021-05-11 18:51:31,12414845
12283500,14553272,0.000000e+00,0.0,1,2021-04-21 12:13:22,12283500
12414709,12380000,0.000000e+00,0.0,1,2021-05-11 18:23:39,12414709
12283781,14442844,0.000000e+00,0.0,1,2021-04-21 13:20:35,12283781


In [36]:
# Merge Barbin and Threshold files

In [37]:
# merge files
dfMerged = dfBarb_scope.merge(dfThreshold[['block_number','threshold_gas']], left_on = "block_number_bar", right_on='block_number')


In [38]:
dfMerged

Unnamed: 0,tot_gas,avg_gas,median_gas,tx,ts,block_number_bar,block_number,threshold_gas
0,22817345,34.401466,20.000000,92,2020-05-19 00:00:32,10093070,10093070,16.5
1,37810488,38.025468,20.000000,59,2020-05-19 00:00:59,10093071,10093071,16.5
2,19912767,66.245960,31.900000,56,2020-05-19 00:01:04,10093072,10093072,6.0
3,43009981,23.723614,18.000001,93,2020-05-19 00:01:29,10093073,10093073,16.0
4,26796744,41.613122,20.000000,77,2020-05-19 00:01:37,10093074,10093074,16.0
...,...,...,...,...,...,...,...,...
2346697,42583689,53.583429,46.200000,266,2021-05-24 23:58:42,12500184,12500184,36.3
2346698,24883565,41.715752,40.000000,213,2021-05-24 23:59:12,12500185,12500185,10.0
2346699,43760172,50.266184,46.805000,326,2021-05-24 23:59:14,12500186,12500186,6.0
2346700,54665462,46.356423,44.000000,163,2021-05-24 23:59:41,12500187,12500187,36.0


In [39]:
dfMerged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2346702 entries, 0 to 2346701
Data columns (total 8 columns):
 #   Column            Dtype         
---  ------            -----         
 0   tot_gas           int64         
 1   avg_gas           float64       
 2   median_gas        float64       
 3   tx                int64         
 4   ts                datetime64[ns]
 5   block_number_bar  int64         
 6   block_number      int64         
 7   threshold_gas     float64       
dtypes: datetime64[ns](1), float64(3), int64(4)
memory usage: 161.1 MB


# Merge with total gas used per block from BQ

In [41]:
# merge files
dfMerged = dfMerged.merge(dfGasUsedBlock[['block_number','sumGasUsed']], on='block_number', how = 'left')


In [44]:
dfMerged = dfMerged.rename(columns={"sumGasUsed": "total_gasUsed"})

In [45]:
dfMerged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2346702 entries, 0 to 2346701
Data columns (total 9 columns):
 #   Column            Dtype         
---  ------            -----         
 0   tot_gas           int64         
 1   avg_gas           float64       
 2   median_gas        float64       
 3   tx                int64         
 4   ts                datetime64[ns]
 5   block_number_bar  int64         
 6   block_number      int64         
 7   threshold_gas     float64       
 8   total_gasUsed     int64         
dtypes: datetime64[ns](1), float64(3), int64(5)
memory usage: 179.0 MB


In [48]:
dfMerged

Unnamed: 0,tot_gas,avg_gas,median_gas,tx,ts,block_number_bar,block_number,threshold_gas,total_gasUsed
0,22817345,34.401466,20.000000,92,2020-05-19 00:00:32,10093070,10093070,16.5,9992145
1,37810488,38.025468,20.000000,59,2020-05-19 00:00:59,10093071,10093071,16.5,9988660
2,19912767,66.245960,31.900000,56,2020-05-19 00:01:04,10093072,10093072,6.0,9996320
3,43009981,23.723614,18.000001,93,2020-05-19 00:01:29,10093073,10093073,16.0,9981147
4,26796744,41.613122,20.000000,77,2020-05-19 00:01:37,10093074,10093074,16.0,9977063
...,...,...,...,...,...,...,...,...,...
2346697,42583689,53.583429,46.200000,266,2021-05-24 23:58:42,12500184,12500184,36.3,14994858
2346698,24883565,41.715752,40.000000,213,2021-05-24 23:59:12,12500185,12500185,10.0,14988469
2346699,43760172,50.266184,46.805000,326,2021-05-24 23:59:14,12500186,12500186,6.0,14995703
2346700,54665462,46.356423,44.000000,163,2021-05-24 23:59:41,12500187,12500187,36.0,15003643


# Save gasPriceAll-file

In [46]:
dfMerged.to_parquet("/Volumes/Extreme SSD/01_GasData/GasPriceAll.par")

In [49]:
dfMerged.to_csv("/Volumes/Extreme SSD/01_GasData/GasPriceAll.csv")