In [1]:
import pandas as pd
import numpy as np

# Control data

The purpose of this workbook is to extract potential controls from both the EPO and the PCT specifications. Doing this in one place ensures that there is no overlap between the EPO and PCT controls, so that a patent cannot appear in the controls twice, which would otherwise bias the results. The final set of potential controls is combined across both the EPO and the PCT and is then used in the creating_EPO/PCT_control workbooks.

### EPO potential citations

In [2]:
#load the EPO individual (Inv_reg) and firm (App_reg) files
#both are pipe-delimited and their first line holds the column names
EPO_ind_data = pd.read_csv(
    "Patents data/202001_EPO_Inv_reg.txt",
    sep = "|",
    header = 0,
)
EPO_firm_data = pd.read_csv(
    "Patents data/202001_EPO_App_reg.txt",
    sep = "|",
    header = 0,
)


In [3]:
#drop the columns (not rows) of the individual data that are not used further down
#only app_nbr and appln_id are referenced by the later cells
EPO_ind_data.drop(
    columns = ["pub_nbr", "person_id", "inv_name", "address", "reg_code",
               "ctry_code", "reg_share", "inv_share"],
    inplace = True,
)

In [4]:
#drop the columns (not rows) of the firm data that are not used further down
#only app_nbr and appln_id are referenced by the later cells
EPO_firm_data.drop(
    columns = ["pub_nbr", "person_id", "app_name", "address", "reg_code",
               "ctry_code", "reg_share", "app_share"],
    inplace = True,
)

In [5]:
#collect every app_nbr that occurs in either EPO table
#passing the values through a set removes the duplicates before turning them back into a list
EPO_all_app_nbr = list(set([*EPO_ind_data["app_nbr"], *EPO_firm_data["app_nbr"]]))

In [6]:
#app_nbr has been captured in EPO_all_app_nbr and is not needed on the frames any more:
#the IPC table only carries appln_id, which is the key used for the merge
EPO_firm_data.drop(columns = ["app_nbr"], inplace = True)
EPO_ind_data.drop(columns = ["app_nbr"], inplace = True)

In [7]:
#check to make sure that no data has been dropped:
#count() reports the number of non-null values in each remaining column
EPO_firm_data.count()

appln_id    3784463
dtype: int64

In [8]:
#every appln_id of the firm data as a plain Python list (duplicates included)
EPO_firm_list = EPO_firm_data["appln_id"].tolist()

In [9]:
#rows of the individual data whose appln_id does not occur anywhere in the firm data
EPO_inv_unique = EPO_ind_data.loc[~EPO_ind_data["appln_id"].isin(EPO_firm_list)]

In [10]:
#output the individual-only rows found above (appln_ids that are missing from the firm data)
EPO_inv_unique

Unnamed: 0,appln_id
84171,95006
181381,153738
217928,169818
217929,169818
217930,169818
304289,212280
357396,233507
357397,233507
357398,233507
360161,234485


In [11]:
#same as for the firm data: every appln_id of the individual data as a plain Python list
EPO_ind_list = EPO_ind_data["appln_id"].tolist()

In [12]:
#rows of the firm data whose appln_id does not occur anywhere in the individual data
#(original note: it appears that the firms have 10,000 extra application ids)
EPO_firm_unique = EPO_firm_data.loc[~EPO_firm_data["appln_id"].isin(EPO_ind_list)]
EPO_firm_unique

Unnamed: 0,appln_id
592,618
2696,2773
2933,3011
3032,3116
3034,3118
3114,3202
3160,3277
3247,3407
3254,3416
3336,3516


In [13]:
#union of the application ids from both tables
#despite its name this is a set, so every appln_id is held exactly once
all_EPO_list = set([*EPO_ind_list, *EPO_firm_list])

In [14]:
#check the size of this set, i.e. the number of distinct application ids across both tables
len(all_EPO_list)

3507391

In [15]:
#turn the set of ids into a one-column dataframe
#the column has no name yet, so pandas labels it 0
all_EPO = pd.DataFrame(data = all_EPO_list)

In [16]:
#give the default column label 0 its real name so that it can be used as the merge key
all_EPO.rename({0: "appln_id"}, axis = "columns", inplace = True)

In [17]:
#inspect the frame of distinct EPO application ids
all_EPO

Unnamed: 0,appln_id
0,16777216
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [18]:
#read in the EPO IPC table (pipe-delimited, first line holds the column names)
#besides appln_id it provides prio_year, app_year and IPC, as the merged result further down shows
EPO_IPC = pd.read_csv(
    "Patents data/202001_EPO_IPC.txt",
    sep = "|",
    header = 0,
)

In [19]:
#using the definition that the first IPC code is the primary code,
#keep the first row recorded for each appln_id
#drop_duplicates does this in a single vectorised pass instead of calling a Python
#function once per group and per column, which is very slow on millions of groups
#the first row is taken as it is, so a missing IPC in that row stays missing
#dropna mirrors groupby, which ignores rows whose key is missing
#set_index + sort_index reproduce the groupby layout: indexed by appln_id in ascending order
IPC_first = (
    EPO_IPC
    .dropna(subset = ["appln_id"])
    .drop_duplicates(subset = ["appln_id"], keep = "first")
    .set_index("appln_id")
    .sort_index()
)

In [21]:
#attach the primary IPC row to every EPO application id
#default inner join: only ids present on both sides are kept
all_EPO_IPC = pd.merge(all_EPO, IPC_first, on = "appln_id")

In [22]:
#count the non-null values per column of the merged result
#a lower figure for IPC than for appln_id means some applications have no IPC code in their first row
all_EPO_IPC.count()

appln_id     3507391
prio_year    3507391
app_year     3507391
IPC          3505684
dtype: int64

### PCT potential citations

In [23]:
#read in all PCT information; every file is pipe-delimited with the column names on its first line
#EPO_PCT links app_nbr to pct_nbr, the other three are the PCT firm, individual and IPC tables
EPO_PCT = pd.read_csv(
    "Patents data/202001_EPO_PCT.txt",
    sep = "|",
    header = 0,
)
PCT_firm = pd.read_csv(
    "Patents data/202001_PCT_App_reg.txt",
    sep = "|",
    header = 0,
)
PCT_ind = pd.read_csv(
    "Patents data/202001_PCT_Inv_reg.txt",
    sep = "|",
    header = 0,
)
PCT_IPC = pd.read_csv(
    "Patents data/202001_PCT_IPC.txt",
    sep = "|",
    header = 0,
)

In [24]:
#check the number of codes that cross both EPO and PCT boundaries
#(non-null count of app_nbr and pct_nbr in the EPO-PCT link table)
EPO_PCT.count()

app_nbr    3549526
pct_nbr    3549526
dtype: int64

In [25]:
#count the non-null values per column of the PCT IPC table
PCT_IPC.count()

pct_nbr      12601935
prio_year    12601935
app_year     12601935
IPC          12599616
dtype: int64

In [26]:
#drop the columns of the PCT firm table that are not used further down
#pct_nbr is the only column referenced by the later cells
PCT_firm.drop(
    columns = ["internat_appln_nr", "appln_id", "app_name", "address", "reg_code",
               "ctry_code", "reg_share", "app_share"],
    inplace = True,
)

In [27]:
#drop the columns of the PCT individual table that are not used further down
#pct_nbr is the only column referenced by the later cells
PCT_ind.drop(
    columns = ["internat_appln_nr", "appln_id", "inv_name", "address", "reg_code",
               "ctry_code", "reg_share", "inv_share"],
    inplace = True,
)

In [28]:
#create a dataframe of every distinct pct_nbr found in the firm or the individual table
#pd.unique keeps the values in order of first appearance, so the row order is the same on every run
#(iterating over a set of strings can change between Python sessions because string hashes are randomised,
#which would make the row order of the exported controls non-reproducible)
#the single column is still labelled 0 at this point
PCT_all = pd.DataFrame(
    pd.unique(pd.concat([PCT_firm["pct_nbr"], PCT_ind["pct_nbr"]], ignore_index = True))
)

In [29]:
#give the unnamed column (labelled 0) its real name, pct_nbr
PCT_all.rename({0: "pct_nbr"}, axis = "columns", inplace = True)

In [30]:
#keep only the EPO-PCT links whose app_nbr was not already collected from the EPO tables above
PCT_EPO_notEPO = EPO_PCT.loc[~EPO_PCT["app_nbr"].isin(EPO_all_app_nbr)]

In [31]:
#we want PCT applications that also progress to the EPO but are not part of the EPO data above
#step 1: the pct_nbr values of the links that survived the filter in the previous cell
PCT_EPO_list = PCT_EPO_notEPO["pct_nbr"].tolist()
#step 2: restrict the full PCT list to exactly those numbers
PCT_all_EPO = PCT_all.loc[PCT_all["pct_nbr"].isin(PCT_EPO_list)]

In [32]:
#for the PCT IPC database we want only the first row recorded for each pct_nbr
#drop_duplicates does this in a single vectorised pass instead of calling a Python
#function once per group and per column, which is very slow on millions of groups
#the first row is taken as it is, so a missing IPC in that row stays missing
#dropna mirrors groupby, which ignores rows whose key is missing
#set_index + sort_index reproduce the groupby layout: indexed by pct_nbr in ascending order
IPC_first_PCT = (
    PCT_IPC
    .dropna(subset = ["pct_nbr"])
    .drop_duplicates(subset = ["pct_nbr"], keep = "first")
    .set_index("pct_nbr")
    .sort_index()
)

In [33]:
#attach the primary IPC row to every retained pct_nbr
#default inner join: only numbers present on both sides are kept
PCT_IPC_all_EPO = pd.merge(PCT_all_EPO, IPC_first_PCT, on = "pct_nbr")

In [34]:
#check the result: one row per retained pct_nbr with its prio_year, app_year and IPC
PCT_IPC_all_EPO

Unnamed: 0,pct_nbr,prio_year,app_year,IPC
0,WO2013091521,2011,2012,C07F009/53
1,WO2019084426,2017,2018,A61M029/00
2,WO2010045609,2008,2009,B65G045/00
3,WO2019098131,2017,2018,G02B005/08
4,WO2004085669,2003,2004,C12Q001/68
5,WO2016184213,2015,2016,H04L009/32
6,WO2001061289,2000,2001,G01G023/37
7,WO2014073190,2012,2013,H01G011/26
8,WO2017203410,2016,2017,H01H009/02
9,WO2005015965,2003,2004,H05K001/09


## Merge the two

Once both sets of potential citations have been identified, they must be merged so that the controls are representative of the complete set of data that is available.

In [35]:
#we then want to combine the EPO potential controls with the PCT potential controls
#this is so that any control choice depends not only on the EPO data but also on the PCT data
#this reduces any potential bias in the results
#the two frames do not share all columns (appln_id exists only on the EPO side, pct_nbr only on the PCT side),
#so the missing cells become NaN and appln_id is shown as a float in the combined frame
#sort = True pins the alphabetical column order explicitly: leaving sort unset raises a FutureWarning
#on older pandas and gives a different column order on newer versions
combined = pd.concat([all_EPO_IPC, PCT_IPC_all_EPO], axis = 0, sort = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [36]:
#check the combined results (EPO rows first, followed by the PCT rows)
combined

Unnamed: 0,IPC,app_year,appln_id,pct_nbr,prio_year
0,A01N025/02,1989,16777216.0,,1988
1,G06K007/00,2000,1.0,,1999
2,C07K014/00,1992,2.0,,1991
3,G01T001/00,2000,3.0,,1999
4,H01L021/60,2000,4.0,,1999
5,H01L021/28,2000,5.0,,1999
6,H04L012/00,2000,6.0,,1999
7,H04L012/28,2000,7.0,,1999
8,B01D053/86,2000,8.0,,2000
9,A62C031/02,2000,9.0,,1999


In [None]:
#write the combined potential controls to a csv that is merged later on with the actual citations
#index = False leaves the row labels out of the file
combined.to_csv(
    path_or_buf = "Patents data/potential_control.csv",
    index = False,
)