# Introduction to pandas

The Python package pandas is very useful for reading CSV files, and also many text files that are more or less formatted with one observation per row and one column per feature.

As an example, we are going to look at the list of seismic stations from the Northern California seismic network, available here:

http://ncedc.org/ftp/pub/doc/NC.info/NC.channel.summary.day

In [1]:
url = 'http://ncedc.org/ftp/pub/doc/NC.info/NC.channel.summary.day'

First we import useful packages. The package requests is useful for reading data from a web page.

In [2]:
import numpy as np
import pandas as pd
import io
import pickle
import requests
from datetime import datetime, timedelta
from math import cos, sin, pi, sqrt

The function read_csv is used to open and read your text file. In the case of a well formatted csv file, only the name of the file needs to be entered:

data = pd.read_csv('my_file.csv')

However, many options are available if the file is not well formatted. See more on:

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

In [3]:
# Download the station list. The timeout keeps the cell from hanging forever,
# and raise_for_status() stops us from parsing an HTTP error page as data.
response = requests.get(url, timeout=30)
response.raise_for_status()
s = response.content

# The file is whitespace-delimited: skip its first two lines, read no header
# row, and keep only the first 13 columns. The separator is a raw string so
# that '\s' reaches the regex engine instead of being an invalid string escape.
data = pd.read_csv(
    io.StringIO(s.decode('utf-8')),
    header=None,
    skiprows=2,
    sep=r'\s+',
    usecols=list(range(0, 13)),
)
data.columns = ['station', 'network', 'channel', 'location', 'rate', 'start_time', 'end_time', 'latitude', 'longitude', 'elevation', 'depth', 'dip', 'azimuth']

Let us look at the data. They are now stored into a pandas dataframe.

In [4]:
data.head()

Unnamed: 0,station,network,channel,location,rate,start_time,end_time,latitude,longitude,elevation,depth,dip,azimuth
0,AAR,NC,EHZ,--,100.0,"1984/01/01,00:00:00","1987/05/01,00:00:00",39.27594,-121.02696,911.0,0.0,-90.0,0.0
1,AAR,NC,EHZ,--,100.0,"1987/05/01,00:00:00","2006/01/04,19:19:00",39.27594,-121.02696,911.0,0.0,-90.0,0.0
2,AAR,NC,SHZ,--,20.0,"1994/11/28,00:00:00","2006/01/04,19:19:00",39.27594,-121.02696,911.0,0.0,-90.0,0.0
3,AAS,NC,EHZ,--,100.0,"1984/11/27,18:45:00","1987/05/01,00:00:00",38.43014,-121.10959,31.0,0.0,-90.0,0.0
4,AAS,NC,EHZ,--,100.0,"1987/05/01,00:00:00","3000/01/01,00:00:00",38.43014,-121.10959,31.0,0.0,-90.0,0.0


There are two ways of looking at a particular column:

In [5]:
data.station

0        AAR
1        AAR
2        AAR
3        AAS
4        AAS
        ... 
6135     WMP
6136     WMP
6137     WMP
6138     WMP
6139    WWVB
Name: station, Length: 6140, dtype: object

In [6]:
data['station']

0        AAR
1        AAR
2        AAR
3        AAS
4        AAS
        ... 
6135     WMP
6136     WMP
6137     WMP
6138     WMP
6139    WWVB
Name: station, Length: 6140, dtype: object

If we want to look at a given row or column, and we know its integer position, we can do:

In [7]:
data.iloc[0]

station                       AAR
network                        NC
channel                       EHZ
location                       --
rate                        100.0
start_time    1984/01/01,00:00:00
end_time      1987/05/01,00:00:00
latitude                 39.27594
longitude              -121.02696
elevation                   911.0
depth                         0.0
dip                         -90.0
azimuth                       0.0
Name: 0, dtype: object

In [8]:
data.iloc[:, 0]

0        AAR
1        AAR
2        AAR
3        AAS
4        AAS
        ... 
6135     WMP
6136     WMP
6137     WMP
6138     WMP
6139    WWVB
Name: station, Length: 6140, dtype: object

If we know the name of the column, we can do:

In [9]:
data.loc[:, 'station']

0        AAR
1        AAR
2        AAR
3        AAS
4        AAS
        ... 
6135     WMP
6136     WMP
6137     WMP
6138     WMP
6139    WWVB
Name: station, Length: 6140, dtype: object

We can also access a single value within a column:

In [10]:
data.loc[0, 'station']

'AAR'

We can filter the data with the value taken by a given column:

In [11]:
data.loc[data.station=='KCPB']

Unnamed: 0,station,network,channel,location,rate,start_time,end_time,latitude,longitude,elevation,depth,dip,azimuth
2993,KCPB,NC,BHE,--,50.0,"1999/08/03,00:00:00","2000/06/06,16:00:00",39.68631,-123.58242,1261.0,0.0,0.0,90.0
2994,KCPB,NC,BHE,--,50.0,"2000/06/06,16:00:00","2002/01/24,23:50:00",39.68631,-123.58242,1261.0,0.0,0.0,90.0
2995,KCPB,NC,BHE,--,50.0,"2002/01/24,23:50:00","2002/10/16,23:59:00",39.68631,-123.58242,1261.0,0.0,0.0,90.0
2996,KCPB,NC,BHE,--,20.0,"2002/10/17,00:00:00","2006/01/24,18:00:00",39.68631,-123.58242,1261.0,0.0,0.0,90.0
2997,KCPB,NC,BHN,--,50.0,"1999/08/03,00:00:00","2000/06/06,16:00:00",39.68631,-123.58242,1261.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3066,KCPB,NC,MNE,--,10.0,"2000/06/06,16:00:00","2000/07/12,00:00:00",39.68631,-123.58242,1261.0,0.0,0.0,90.0
3067,KCPB,NC,MNN,--,10.0,"1999/08/03,00:00:00","2000/06/06,16:00:00",39.68631,-123.58242,1261.0,0.0,0.0,0.0
3068,KCPB,NC,MNN,--,10.0,"2000/06/06,16:00:00","2000/07/12,00:00:00",39.68631,-123.58242,1261.0,0.0,0.0,0.0
3069,KCPB,NC,MNZ,--,10.0,"1999/08/03,00:00:00","2000/06/06,16:00:00",39.68631,-123.58242,1261.0,0.0,-90.0,0.0


In [12]:
data.loc[(data.station=='KCPB') | (data.station=='KHBB')]

Unnamed: 0,station,network,channel,location,rate,start_time,end_time,latitude,longitude,elevation,depth,dip,azimuth
2993,KCPB,NC,BHE,--,50.0,"1999/08/03,00:00:00","2000/06/06,16:00:00",39.68631,-123.58242,1261.0,0.0,0.0,90.0
2994,KCPB,NC,BHE,--,50.0,"2000/06/06,16:00:00","2002/01/24,23:50:00",39.68631,-123.58242,1261.0,0.0,0.0,90.0
2995,KCPB,NC,BHE,--,50.0,"2002/01/24,23:50:00","2002/10/16,23:59:00",39.68631,-123.58242,1261.0,0.0,0.0,90.0
2996,KCPB,NC,BHE,--,20.0,"2002/10/17,00:00:00","2006/01/24,18:00:00",39.68631,-123.58242,1261.0,0.0,0.0,90.0
2997,KCPB,NC,BHN,--,50.0,"1999/08/03,00:00:00","2000/06/06,16:00:00",39.68631,-123.58242,1261.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3212,KHBB,NC,LHZ,--,1.0,"2015/10/29,21:18:00","2016/04/28,16:56:00",40.65990,-123.21966,1864.0,0.0,-90.0,0.0
3213,KHBB,NC,LHZ,--,1.0,"2016/04/28,16:56:00","3000/01/01,00:00:00",40.65990,-123.21966,1864.0,0.0,-90.0,0.0
3214,KHBB,NC,LNE,--,1.0,"2015/10/29,21:18:00","2016/04/28,16:56:00",40.65990,-123.21966,1864.0,0.0,0.0,90.0
3215,KHBB,NC,LNN,--,1.0,"2015/10/29,21:18:00","2016/04/28,16:56:00",40.65990,-123.21966,1864.0,0.0,0.0,0.0


In [13]:
data.loc[data.station.isin(['KCPB', 'KHBB'])]

Unnamed: 0,station,network,channel,location,rate,start_time,end_time,latitude,longitude,elevation,depth,dip,azimuth
2993,KCPB,NC,BHE,--,50.0,"1999/08/03,00:00:00","2000/06/06,16:00:00",39.68631,-123.58242,1261.0,0.0,0.0,90.0
2994,KCPB,NC,BHE,--,50.0,"2000/06/06,16:00:00","2002/01/24,23:50:00",39.68631,-123.58242,1261.0,0.0,0.0,90.0
2995,KCPB,NC,BHE,--,50.0,"2002/01/24,23:50:00","2002/10/16,23:59:00",39.68631,-123.58242,1261.0,0.0,0.0,90.0
2996,KCPB,NC,BHE,--,20.0,"2002/10/17,00:00:00","2006/01/24,18:00:00",39.68631,-123.58242,1261.0,0.0,0.0,90.0
2997,KCPB,NC,BHN,--,50.0,"1999/08/03,00:00:00","2000/06/06,16:00:00",39.68631,-123.58242,1261.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3212,KHBB,NC,LHZ,--,1.0,"2015/10/29,21:18:00","2016/04/28,16:56:00",40.65990,-123.21966,1864.0,0.0,-90.0,0.0
3213,KHBB,NC,LHZ,--,1.0,"2016/04/28,16:56:00","3000/01/01,00:00:00",40.65990,-123.21966,1864.0,0.0,-90.0,0.0
3214,KHBB,NC,LNE,--,1.0,"2015/10/29,21:18:00","2016/04/28,16:56:00",40.65990,-123.21966,1864.0,0.0,0.0,90.0
3215,KHBB,NC,LNN,--,1.0,"2015/10/29,21:18:00","2016/04/28,16:56:00",40.65990,-123.21966,1864.0,0.0,0.0,0.0


We can access a brief summary of the data:

In [14]:
data.station.describe()

count     6140
unique     893
top       KCPB
freq        78
Name: station, dtype: object

In [15]:
data.elevation.describe()

count    6140.000000
mean      650.942248
std       681.708404
min     -1388.000000
25%       146.750000
50%       406.000000
75%       918.000000
max      3680.000000
Name: elevation, dtype: float64

We can perform standard operations on the whole data set:

In [16]:
# Average only the numeric columns: the frame also holds string columns
# (station, network, ...), and recent pandas versions raise a TypeError
# instead of silently skipping them.
data.mean(numeric_only=True)

rate          92.544982
latitude      37.930985
longitude   -121.440088
elevation    650.942248
depth         17.581270
dip          -45.894137
azimuth       18.549186
dtype: float64

In the case of a categorical variable, we can get the list of possible values that this variable can take:

In [17]:
data.channel.unique()

array(['EHZ', 'SHZ', 'HHE', 'HHN', 'HHZ', 'HNE', 'HNN', 'HNZ', 'LHE',
       'LHN', 'LHZ', 'ELE', 'ELN', 'ELZ', 'SLE', 'SLN', 'SLZ', 'LCE',
       'LCL', 'LCQ', 'LOG', 'OCF', 'VCO', 'VEA', 'VEC', 'VEP', 'VFP',
       'VKI', 'GAN', 'GNS', 'GPL', 'GST', 'VDT', 'VEI', 'VPB', 'EHE',
       'EHN', 'BNE', 'BNN', 'BN1', 'BN2', 'BN3', 'BV1', 'EP1', 'EP2',
       'EP3', 'HDO', 'HN1', 'HN2', 'HN3', 'HV1', 'SP2', 'SP3', 'BNZ',
       'LDO', 'HJ2', 'HJ3', 'HJZ', 'ACE', 'GEL', 'GLA', 'GLO', 'ATT',
       'SHE', 'SHN', 'BHE', 'BHN', 'BHZ', 'LNE', 'LNN', 'LNZ', 'MHE',
       'MHN', 'MHZ', 'MNE', 'MNN', 'MNZ', 'HH2', 'HH3', 'LH2', 'LH3',
       'SP1', 'DP1', 'DP2', 'DP3', 'HLE', 'HLN', 'HLZ', 'XNE', 'XNN',
       'XNZ', 'EH1'], dtype=object)

and get the number of times that each value is taken (shown here for the station column):

In [18]:
data.station.value_counts()

KCPB     78
KMPB     72
JSGB     69
KHMB     63
CCH1     60
         ..
PL21      1
LSS       1
IRG1A     1
AGI       1
BVLB      1
Name: station, Length: 893, dtype: int64

There are several ways of doing an operation on all rows of a column. The first option is to use the map function.

If you are not familiar with lambda function in Python, look at:

https://realpython.com/python-lambda/

In [19]:
# Mean elevation over all rows, stored once so the lambda below does not
# recompute it for every element.
data_elevation_mean = data.elevation.mean()
# map calls the lambda on each elevation value and returns a new Series;
# the result is only displayed, it is not written back into data.
data.elevation.map(lambda p: p - data_elevation_mean)

0       260.057752
1       260.057752
2       260.057752
3      -619.942248
4      -619.942248
           ...    
6135    427.057752
6136    427.057752
6137    427.057752
6138    427.057752
6139   -649.942248
Name: elevation, Length: 6140, dtype: float64

The second option is to use the apply function:

In [20]:
def remean_elevation(row):
    """Return a copy of ``row`` with the mean elevation subtracted.

    Parameters
    ----------
    row : pandas.Series
        One row of the station table; must contain an ``elevation`` entry.

    Returns
    -------
    pandas.Series
        A new row equal to ``row`` except that ``elevation`` has the
        module-level ``data_elevation_mean`` subtracted from it.
    """
    # Work on a copy: pandas does not support functions passed to apply()
    # that mutate the object they are given.
    shifted = row.copy()
    shifted['elevation'] = row['elevation'] - data_elevation_mean
    return shifted
data.apply(remean_elevation, axis='columns')

Unnamed: 0,station,network,channel,location,rate,start_time,end_time,latitude,longitude,elevation,depth,dip,azimuth
0,AAR,NC,EHZ,--,100.0,"1984/01/01,00:00:00","1987/05/01,00:00:00",39.27594,-121.02696,260.057752,0.0,-90.0,0.0
1,AAR,NC,EHZ,--,100.0,"1987/05/01,00:00:00","2006/01/04,19:19:00",39.27594,-121.02696,260.057752,0.0,-90.0,0.0
2,AAR,NC,SHZ,--,20.0,"1994/11/28,00:00:00","2006/01/04,19:19:00",39.27594,-121.02696,260.057752,0.0,-90.0,0.0
3,AAS,NC,EHZ,--,100.0,"1984/11/27,18:45:00","1987/05/01,00:00:00",38.43014,-121.10959,-619.942248,0.0,-90.0,0.0
4,AAS,NC,EHZ,--,100.0,"1987/05/01,00:00:00","3000/01/01,00:00:00",38.43014,-121.10959,-619.942248,0.0,-90.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6135,WMP,NC,SHN,--,20.0,"1995/07/02,12:00:00","2002/05/08,22:30:00",35.64059,-118.78570,427.057752,0.0,0.0,0.0
6136,WMP,NC,SHZ,--,20.0,"1995/03/02,19:00:00","1995/07/02,12:00:00",35.64059,-118.78570,427.057752,0.0,-90.0,0.0
6137,WMP,NC,SHZ,--,20.0,"1995/07/02,12:00:00","2002/05/08,22:30:00",35.64059,-118.78570,427.057752,0.0,-90.0,0.0
6138,WMP,NC,SHZ,10,20.0,"1995/07/02,12:00:00","1999/05/11,23:59:00",35.64059,-118.78570,427.057752,0.0,-90.0,0.0


We can also carry out simple operations on columns, provided they make sense.

In [21]:
data.network + ' - ' + data.station

0        NC - AAR
1        NC - AAR
2        NC - AAR
3        NC - AAS
4        NC - AAS
          ...    
6135     NC - WMP
6136     NC - WMP
6137     NC - WMP
6138     NC - WMP
6139    NC - WWVB
Length: 6140, dtype: object

A useful feature is to group the rows depending on the value of a categorical variable, and then apply the same operation to all the groups. For instance, I want to know how many times each station appears in the file:

In [22]:
data.groupby('station').station.count()

station
AAR      3
AAS      3
ABJ      4
ABR      3
ADW      3
        ..
VRC      4
VSP      2
VWB      1
WMP     10
WWVB     1
Name: station, Length: 893, dtype: int64

Or I want to know what is the lowest and the highest elevation for each station:

In [23]:
data.groupby('station').elevation.min()

station
AAR      911.0
AAS       31.0
ABJ      434.0
ABR       -1.0
ADW      228.0
         ...  
VRC     1666.0
VSP     1545.0
VWB     1736.0
WMP     1078.0
WWVB       1.0
Name: elevation, Length: 893, dtype: float64

In [24]:
data.groupby('station').elevation.max()

station
AAR      911.0
AAS       31.0
ABJ      434.0
ABR       -1.0
ADW      228.0
         ...  
VRC     1666.0
VSP     1545.0
VWB     1736.0
WMP     1078.0
WWVB       1.0
Name: elevation, Length: 893, dtype: float64

We can have access to the data type of each column:

In [25]:
data.dtypes

station        object
network        object
channel        object
location       object
rate          float64
start_time     object
end_time       object
latitude      float64
longitude     float64
elevation     float64
depth         float64
dip           float64
azimuth       float64
dtype: object

Here, pandas does not recognize the start_time and end_time columns as a datetime format, so we cannot use datetime operations on them. We first need to convert these columns into a datetime format:

In [26]:
# Transform column into datetime format
startdate = pd.to_datetime(data['start_time'], format='%Y/%m/%d,%H:%M:%S')
data['start_time'] = startdate
# Avoid 'OutOfBoundsDatetime' error with year 3000: nanosecond timestamps
# cannot represent dates that far in the future, so the placeholder year is
# swapped for 2025. The pattern is anchored to the start of the string so
# that only the leading year is rewritten, never a '3000' appearing elsewhere.
enddate = data['end_time'].str.replace(r'^3000/', '2025/', regex=True)
enddate = pd.to_datetime(enddate, format='%Y/%m/%d,%H:%M:%S')
data['end_time'] = enddate

We can now look at when each seismic station was installed:

In [27]:
# Earliest start_time per station, using the built-in grouped reduction
# rather than calling a Python lambda once per group.
data.groupby('station').start_time.min()

station
AAR    1984-01-01 00:00:00
AAS    1984-11-27 18:45:00
ABJ    1984-01-01 00:00:00
ABR    1984-01-01 00:00:00
ADW    1984-01-01 00:00:00
               ...        
VRC    1993-09-23 22:20:00
VSP    1993-09-24 22:05:00
VWB    1985-01-01 00:00:00
WMP    1995-03-02 19:00:00
WWVB   1984-01-01 00:00:00
Length: 893, dtype: datetime64[ns]

The agg function allows us to carry out several operations on each group of rows:

In [28]:
data.groupby(['station']).elevation.agg(['min', 'max'])

Unnamed: 0_level_0,min,max
station,Unnamed: 1_level_1,Unnamed: 2_level_1
AAR,911.0,911.0
AAS,31.0,31.0
ABJ,434.0,434.0
ABR,-1.0,-1.0
ADW,228.0,228.0
...,...,...
VRC,1666.0,1666.0
VSP,1545.0,1545.0
VWB,1736.0,1736.0
WMP,1078.0,1078.0


In [29]:
# First start and last end per station. The named reducers 'min'/'max' run
# vectorised and ignore missing timestamps, unlike Python's builtin min/max.
data.groupby(['station']).agg({'start_time': 'min', 'end_time': 'max'})

Unnamed: 0_level_0,start_time,end_time
station,Unnamed: 1_level_1,Unnamed: 2_level_1
AAR,1984-01-01 00:00:00,2006-01-04 19:19:00
AAS,1984-11-27 18:45:00,2025-01-01 00:00:00
ABJ,1984-01-01 00:00:00,2019-06-26 19:17:00
ABR,1984-01-01 00:00:00,1997-08-04 21:02:00
ADW,1984-01-01 00:00:00,2006-04-20 01:08:00
...,...,...
VRC,1993-09-23 22:20:00,2001-07-12 16:50:00
VSP,1993-09-24 22:05:00,2001-07-12 16:50:00
VWB,1985-01-01 00:00:00,1985-03-25 19:00:00
WMP,1995-03-02 19:00:00,2002-05-08 22:30:00


We can also make groups by selecting the values of two categorical variables:

In [30]:
# First start and last end per (station, channel) pair. The named reducers
# 'min'/'max' run vectorised and ignore missing timestamps, unlike Python's
# builtin min/max.
data.groupby(['station', 'channel']).agg({'start_time': 'min', 'end_time': 'max'})

Unnamed: 0_level_0,Unnamed: 1_level_0,start_time,end_time
station,channel,Unnamed: 2_level_1,Unnamed: 3_level_1
AAR,EHZ,1984-01-01 00:00:00,2006-01-04 19:19:00
AAR,SHZ,1994-11-28 00:00:00,2006-01-04 19:19:00
AAS,EHZ,1984-11-27 18:45:00,2025-01-01 00:00:00
AAS,SHZ,1994-11-28 00:00:00,2006-01-24 18:00:00
ABJ,EHZ,1984-01-01 00:00:00,2019-06-26 19:17:00
...,...,...,...
WMP,EHZ,1995-03-02 19:00:00,2002-05-08 22:30:00
WMP,SHE,1995-07-02 12:00:00,2002-05-08 22:30:00
WMP,SHN,1995-07-02 12:00:00,2002-05-08 22:30:00
WMP,SHZ,1995-03-02 19:00:00,2002-05-08 22:30:00


Previously, we just printed the output, but we can also store it in a new variable:

In [31]:
# Keep the per-(station, channel) time span for the cells that follow. The
# named reducers 'min'/'max' run vectorised and ignore missing timestamps,
# unlike Python's builtin min/max.
data_grouped = data.groupby(['station', 'channel']).agg({'start_time': 'min', 'end_time': 'max'})

In [32]:
data_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,start_time,end_time
station,channel,Unnamed: 2_level_1,Unnamed: 3_level_1
AAR,EHZ,1984-01-01 00:00:00,2006-01-04 19:19:00
AAR,SHZ,1994-11-28 00:00:00,2006-01-04 19:19:00
AAS,EHZ,1984-11-27 18:45:00,2025-01-01 00:00:00
AAS,SHZ,1994-11-28 00:00:00,2006-01-24 18:00:00
ABJ,EHZ,1984-01-01 00:00:00,2019-06-26 19:17:00


When we select only some rows, the index is not automatically reset to start at 0. We can do it manually. Many pandas functions also have an option to reset the index, and an option to transform the dataframe in place instead of saving the result in another variable.

In [33]:
data_grouped.reset_index()

Unnamed: 0,station,channel,start_time,end_time
0,AAR,EHZ,1984-01-01 00:00:00,2006-01-04 19:19:00
1,AAR,SHZ,1994-11-28 00:00:00,2006-01-04 19:19:00
2,AAS,EHZ,1984-11-27 18:45:00,2025-01-01 00:00:00
3,AAS,SHZ,1994-11-28 00:00:00,2006-01-24 18:00:00
4,ABJ,EHZ,1984-01-01 00:00:00,2019-06-26 19:17:00
...,...,...,...,...
3720,WMP,EHZ,1995-03-02 19:00:00,2002-05-08 22:30:00
3721,WMP,SHE,1995-07-02 12:00:00,2002-05-08 22:30:00
3722,WMP,SHN,1995-07-02 12:00:00,2002-05-08 22:30:00
3723,WMP,SHZ,1995-03-02 19:00:00,2002-05-08 22:30:00


It is also possible to sort the dataset by value.

In [34]:
data_grouped.sort_values(by='start_time')

Unnamed: 0_level_0,Unnamed: 1_level_0,start_time,end_time
station,channel,Unnamed: 2_level_1,Unnamed: 3_level_1
PG1,HV1,1983-06-27 00:00:00,1993-11-10 18:00:00
AAR,EHZ,1984-01-01 00:00:00,2006-01-04 19:19:00
HQR,EHZ,1984-01-01 00:00:00,1992-10-19 18:45:00
HQR,ELE,1984-01-01 00:00:00,1992-10-19 18:45:00
HQR,ELN,1984-01-01 00:00:00,1992-10-19 18:45:00
...,...,...,...
PBIB,HNE,2020-11-19 00:00:00,2025-01-01 00:00:00
PPBB,HNE,2020-11-19 00:00:00,2025-01-01 00:00:00
KBN,HNE,2021-02-26 00:48:00,2025-01-01 00:00:00
KBN,HNN,2021-02-26 00:48:00,2025-01-01 00:00:00


We can apply the sorting to several columns:

In [35]:
data_grouped.sort_values(by=['start_time', 'end_time'])

Unnamed: 0_level_0,Unnamed: 1_level_0,start_time,end_time
station,channel,Unnamed: 2_level_1,Unnamed: 3_level_1
PG1,HV1,1983-06-27 00:00:00,1993-11-10 18:00:00
PGC,EHZ,1984-01-01 00:00:00,1984-04-12 18:00:00
AFO,EHZ,1984-01-01 00:00:00,1984-04-23 00:00:00
MCH,EHZ,1984-01-01 00:00:00,1984-08-01 19:31:00
MCN,EHZ,1984-01-01 00:00:00,1984-09-22 16:45:00
...,...,...,...
PPBB,VEI,2020-11-19 00:00:00,2025-01-01 00:00:00
PPBB,VPB,2020-11-19 00:00:00,2025-01-01 00:00:00
KBN,HNE,2021-02-26 00:48:00,2025-01-01 00:00:00
KBN,HNN,2021-02-26 00:48:00,2025-01-01 00:00:00


A useful pandas function is the merge function, which allows you to merge two dataframes that have some columns in common, but also have different columns that you may want to compare with each other.

For example, I have two earthquake catalogs. The 2007-2009 was established using data from a temporary experiment, and the 2004-2011 was established using data from a permanent seismic network. I would like to know if some earthquakes are detected by a network, but not by the other.

I will compare the catalogs between 25 September 2007 and 14 May 2009. There is a time delay of 10 s between the detection time of one catalog compared to the other. I will also filter the catalogs to eliminate false detections.

In [36]:
# Time window over which the two catalogs are compared (midnight on both days)
tbegin = datetime(2007, 9, 25)
tend = datetime(2009, 5, 14)

# Time offset between the two catalogs, in seconds
dt = 10.0

# Detection thresholds applied to cc * nchannel, one per catalog
thresh1, thresh2 = 1.4, 1.9

I first read the two catalogs, and apply the filtering:

In [37]:
def load_catalog(path, start, end, shift_seconds=0.0):
    """Load a pickled earthquake catalog and restrict it to a time window.

    Parameters
    ----------
    path : str
        Path to a pickle file holding a dataframe with the columns year,
        month, day, hour, minute, second, cc and nchannel.
    start, end : datetime
        Inclusive bounds on the (shifted) event date.
    shift_seconds : float, optional
        Number of seconds subtracted from every event date before the
        window is applied. Defaults to no shift.

    Returns
    -------
    pandas.DataFrame
        The selected columns cast to numeric types, plus a ``date`` column,
        for the rows whose date lies in [start, end].
    """
    # NOTE(review): pickle.load can execute arbitrary code; only open
    # catalog files that come from a trusted source.
    with open(path, 'rb') as catalog_file:
        catalog = pickle.load(catalog_file)
    column_types = {'year': int, 'month': int, 'day': int, 'hour': int,
                    'minute': int, 'second': float, 'cc': float, 'nchannel': int}
    catalog = catalog[list(column_types)].astype(column_types)
    # to_datetime assembles a timestamp from the year ... second columns
    date = pd.to_datetime(catalog.drop(columns=['cc', 'nchannel']))
    catalog['date'] = date - timedelta(seconds=shift_seconds)
    return catalog[(catalog['date'] >= start) & (catalog['date'] <= end)]


df1 = load_catalog('catalog_2007_2009.pkl', tbegin, tend)
df1_filter = df1.loc[df1['cc'] * df1['nchannel'] >= thresh1]

# The 2004-2011 dates are shifted back by dt seconds before the comparison
df2 = load_catalog('catalog_2004_2011.pkl', tbegin, tend, shift_seconds=dt)
df2_filter = df2.loc[df2['cc'] * df2['nchannel'] >= thresh2]

To make the comparison, I first concatenate the two dataframes into a single dataframe. Then I merge the concatenated dataframe with one of the initial dataframes.

I apply the merge operation on the date column, that is if an earthquake in dataset 1 has the same date as an earthquake in dataset 2, I assume it is the same earthquake. You could also check if several columns have the same value, instead of doing the merge operation on only one column.

The process adds a merge column to the dataset, which indicates whether a row was found only in dataset 1, only in dataset 2, or in both datasets.

In [38]:
# Rows of the filtered 2007-2009 catalog whose date has no exact match in
# the (unfiltered) 2004-2011 catalog: stack both, left-merge against the
# 2004-2011 catalog on date, and keep the rows flagged 'left_only'.
df_all = pd.concat((df2, df1_filter), ignore_index=True)
df_merge = pd.merge(df_all, df2.drop_duplicates(), how='left', on='date', indicator=True)
df_added_1 = df_merge.loc[df_merge['_merge'].eq('left_only')]

# Same procedure with the roles swapped: rows of the filtered 2004-2011
# catalog whose date has no exact match in the (unfiltered) 2007-2009 catalog.
df_all = pd.concat((df1, df2_filter), ignore_index=True)
df_merge = pd.merge(df_all, df1.drop_duplicates(), how='left', on='date', indicator=True)
df_added_2 = df_merge.loc[df_merge['_merge'].eq('left_only')]

In [39]:
df_added_1

Unnamed: 0,year_x,month_x,day_x,hour_x,minute_x,second_x,cc_x,nchannel_x,date,year_y,month_y,day_y,hour_y,minute_y,second_y,cc_y,nchannel_y,_merge
487,2007,10,13,9,57,1.70,0.094065,15,2007-10-13 09:57:01.700,,,,,,,,,left_only
489,2007,10,29,9,2,17.30,0.079188,18,2007-10-29 09:02:17.300,,,,,,,,,left_only
501,2007,12,21,5,12,58.85,0.082917,18,2007-12-21 05:12:58.850,,,,,,,,,left_only
503,2007,12,21,5,13,48.80,0.087369,18,2007-12-21 05:13:48.800,,,,,,,,,left_only
505,2007,12,31,5,52,16.25,0.212977,15,2007-12-31 05:52:16.250,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
673,2009,2,2,12,47,27.65,0.087397,18,2009-02-02 12:47:27.650,,,,,,,,,left_only
679,2009,3,16,3,2,6.65,0.078042,18,2009-03-16 03:02:06.650,,,,,,,,,left_only
680,2009,3,25,4,20,30.35,0.068622,21,2009-03-25 04:20:30.350,,,,,,,,,left_only
683,2009,4,2,1,9,36.75,0.084087,21,2009-04-02 01:09:36.750,,,,,,,,,left_only


In [40]:
df_added_2

Unnamed: 0,year_x,month_x,day_x,hour_x,minute_x,second_x,cc_x,nchannel_x,date,year_y,month_y,day_y,hour_y,minute_y,second_y,cc_y,nchannel_y,_merge
715,2007,10,13,9,50,59.75,0.097495,23,2007-10-13 09:50:49.750,,,,,,,,,left_only
716,2007,10,13,9,54,59.65,0.106908,23,2007-10-13 09:54:49.650,,,,,,,,,left_only
718,2007,10,13,9,56,51.80,0.088420,23,2007-10-13 09:56:41.800,,,,,,,,,left_only
719,2007,10,13,9,57,11.75,0.208753,23,2007-10-13 09:57:01.750,,,,,,,,,left_only
720,2007,10,13,9,57,41.60,0.088150,23,2007-10-13 09:57:31.600,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
930,2009,4,2,3,31,35.80,0.078478,26,2009-04-02 03:31:25.800,,,,,,,,,left_only
933,2009,4,26,21,14,35.35,0.115993,26,2009-04-26 21:14:25.350,,,,,,,,,left_only
936,2009,4,26,21,17,18.35,0.088909,26,2009-04-26 21:17:08.350,,,,,,,,,left_only
937,2009,4,26,21,23,4.65,0.119792,26,2009-04-26 21:22:54.650,,,,,,,,,left_only
