In [1]:
import pandas as pd
import numpy as np

# Removing self citations and splitting control and treatment

This workbook works to remove self-citations from the treatment group and split the dataset into treatment and controls for the EPO and patent datasets.

To remove self-citations, both firm and individual Id's are used to identify self-citations. This means that there may be issues in terms of identify self-citations where the id has changed. This is because the id is assigned based on a perfect match. Therefore, if there is an incomplete match then this will not be picked up.

This is therefore likely to bias the results towards finding localisation effects if this issue is large.

Future work would therefore seek to improve this.

The first step is to read in the files that contain the cited, citing and control values.

In [2]:
#read in both the EPO and PCT citations with controls
#These come from creating_EPO_ctrls.ipynb and creating_PCT_ctrls.ipynb
EPO_cit_ctrl = pd.read_csv("Patents data/EPO_cit_wCTRL_single2.csv")
PCT_cit_ctrl = pd.read_csv("Patents data/PCT_cit_wCTRL_single2.csv")

In [3]:
#drop unecessary columns for both datasets
EPO_cit_ctrl.drop(["Unnamed: 0", "Index"], axis = 1, inplace = True)

In [4]:
PCT_cit_ctrl.drop(["Unnamed: 0", "Index"], axis = 1, inplace = True)

## Firstly, EPO citations

In [5]:
#count how many citations there are
EPO_cit_ctrl.count()

Citing_app_nbr      76151
Citing_appln_id     76151
Cited_App_nbr       76151
Cited_Appln_id      76151
prio_year           76151
Citing_IPC          76146
IPC_subclass        76146
Control_IPC         76146
app_year            76151
Control_appln_id    60850
Control_pct_nbr     15301
dtype: int64

In [6]:
#create a list of all cited application numbers
EPO_cit_cited = list(EPO_cit_ctrl["Cited_App_nbr"].unique())

In [7]:
#create a list of all citing application numbers
EPO_cit_citing = list(EPO_cit_ctrl["Citing_app_nbr"].unique())

In [8]:
#for the firm I only want the firm names to be merged with the cited information
#This means we can remove any self citations based on whether they belonged to the same firm
EPO_firm = pd.read_csv("Patents data/202001_EPO_App_Reg.txt", sep = "|", header = 0)

In [9]:
#for this we need all that is needed is the app_nbr and the person_id 
EPO_firm.drop(["address", "reg_code", "ctry_code", "reg_share", "pub_nbr", "app_share", "app_name", "appln_id"], axis = 1, inplace = True)

In [10]:
#the result is a small dataframe which includes app_nbr and person_id
EPO_firm

Unnamed: 0,app_nbr,person_id
0,EP20000103094,1
1,EP20000107845,7
2,EP20000107845,8
3,EP20000202556,20
4,EP20000202556,21
5,EP20000300208,26
6,EP20000310305,26
7,EP20000310786,33
8,EP20000962768,1
9,EP20000970100,41


In [11]:
#we want only firms for which their patent is cited
EPO_firm_cited = EPO_firm[EPO_firm["app_nbr"].isin(EPO_cit_cited)]

In [12]:
#we group the results by application number as multiple firms may have been part of a single patent
#the person_id's are put into a list so that all ids for the firms are in a single column
#The solution for this comes from: https://stackoverflow.com/questions/35024023/pandas-groupby-result-into-multiple-columns
EPO_firm_cited_grouped = EPO_firm_cited.groupby("app_nbr").agg({'person_id':lambda x:x.tolist()})
#the index is then reset
EPO_firm_cited_grouped.reset_index(inplace= True)
#the person_id column is then renamed to the cited_firm id column
#this shows the ids of each of the firms that were involved in that patent
EPO_firm_cited_grouped.rename(columns = {"person_id":"Cited_firm_id"}, inplace = True)

In [13]:
#we then want to do the same with the citing applications
#so we get only firms which their application number is the same as in citing column
EPO_firm_citing = EPO_firm[EPO_firm["app_nbr"].isin(EPO_cit_citing)]

In [14]:
#the same solution is used as above
EPO_firm_citing_grouped = EPO_firm_citing.groupby("app_nbr").agg({'person_id':lambda x:x.tolist()})
EPO_firm_citing_grouped.reset_index(inplace= True)
#this is renamed as the citing firm id so that it is not confused with the cited firm id
EPO_firm_citing_grouped.rename(columns = {"person_id": "citing_firm_id"}, inplace = True)

In [15]:
#the resulting dataframes are merged
#first of all the cited group are merged so taht we have firm ids attached to each cited publication
EPO_cit_ctrl = EPO_cit_ctrl.merge(EPO_firm_cited_grouped, left_on = "Cited_App_nbr", right_on = "app_nbr")
#then we merge with the citing firm ids so that we know which citing ids are linked to the citing document
EPO_cit_ctrl = EPO_cit_ctrl.merge(EPO_firm_citing_grouped, left_on = "Citing_app_nbr", right_on = "app_nbr")

In [16]:
#these two redundant columns are then removed
EPO_cit_ctrl.drop(["app_nbr_x", "app_nbr_y"], axis=1, inplace = True)

In [17]:
#the results are then checked
EPO_cit_ctrl

Unnamed: 0,Citing_app_nbr,Citing_appln_id,Cited_App_nbr,Cited_Appln_id,prio_year,Citing_IPC,IPC_subclass,Control_IPC,app_year,Control_appln_id,Control_pct_nbr,Cited_firm_id,citing_firm_id
0,EP19780101388,16428374,EP19780300148,16429355.0,1977,A61K031/43,A61K,A61K031/70,1978,16428163.0,,[28827],[1386114]
1,EP19790300237,16436952,EP19780300148,16429355.0,1978,A61K031/43,A61K,A61K,1979,16431752.0,,[28827],[28827]
2,EP19790301944,16438624,EP19780300148,16429355.0,1978,A61K031/41,A61K,A61K031/57,1979,16430931.0,,[28827],[2236098]
3,EP19790302384,16439056,EP19780300148,16429355.0,1978,A61K031/43,A61K,A61K031/44,1979,16430771.0,,[28827],[11539]
4,EP19790302384,16439056,EP19790300096,16436815.0,1978,A61K031/43,A61K,A61K039/29,1979,16440703.0,,[28827],[11539]
5,EP19820305125,16503864,EP19780300148,16429355.0,1981,A61K031/395,A61K,A61K031/545,1982,16489267.0,,[28827],[209914]
6,EP19880810407,16740869,EP19780300148,16429355.0,1987,A61K031/43,A61K,A61K031/445,1988,16718530.0,,[28827],[1621]
7,EP19790300234,16436949,EP19780300203,16429410.0,1978,A01N043/40,A01N,A01N035/06,1979,16438411.0,,[12885],[12885]
8,EP19790300251,16436966,EP19780300203,16429410.0,1978,A01N043/40,A01N,A01N043/653,1979,16434589.0,,[12885],[12885]
9,EP19790102118,16432764,EP19780300203,16429410.0,1978,A01N043/40,A01N,A01N025/10,1979,16437241.0,,[12885],[1621]


In [18]:
#tried this but did not work
#EPO_cit_ctrl_dropped = EPO_cit_ctrl.drop(EPO_cit_ctrl[EPO_cit_ctrl["citing_firm_id"] == EPO_cit_ctrl["Cited_firm_id"]].index)

In [19]:
#using the solution from the link below:
#https://stackoverflow.com/questions/62082699/is-there-a-way-to-check-in-a-dataframe-whether-a-value-in-a-list-in-one-column
#this iterates over each row and each column
#it checks to see whether a cited firm's id appears in the citing firm id
#if it does then we need to drop it as this is classed as a self citation

to_drop = []

for a in EPO_cit_ctrl.index:
    for val in EPO_cit_ctrl["citing_firm_id"][a]:
        if val in EPO_cit_ctrl["Cited_firm_id"][a]:
            to_drop.append(a)
            
#check the length that subsequently needs to be dropped
#so 6791 citations need to be removed
len(to_drop)

6791

In [20]:
#then the results are dropped
EPO_cit_ctrl_firmselfdrop = EPO_cit_ctrl.drop(to_drop)

The same methodology is applied to the individual data so that there is no self-citations by individuals outside of firms

In [21]:
#the EPO individual data is read in
EPO_ind = pd.read_csv("Patents data/202001_EPO_Inv_reg.txt", sep = "|", header = 0)

In [22]:
#columns that are not relevant are dropped
EPO_ind.drop(["appln_id", "pub_nbr", "inv_name", "address", "reg_code", "ctry_code",
              "reg_share", "inv_share"], axis = 1, inplace = True)

In [23]:
#reduce the ind dataframe to only those that are cited
EPO_ind_cited = EPO_ind[EPO_ind["app_nbr"].isin(EPO_cit_cited)]

In [24]:
#groupby app_nbr, with the person_id aggregated into a list
EPO_ind_cited_grouped = EPO_ind_cited.groupby("app_nbr").agg({'person_id':lambda x:x.tolist()})
#reset the index 
EPO_ind_cited_grouped.reset_index(inplace= True)
#rename the column
EPO_ind_cited_grouped.rename(columns = {"person_id":"Cited_ind_id"}, inplace = True)

In [25]:
#do the same for the citing column
EPO_ind_citing = EPO_ind[EPO_ind["app_nbr"].isin(EPO_cit_citing)]

In [26]:
EPO_ind_citing_grouped = EPO_ind_citing.groupby("app_nbr").agg({'person_id':lambda x:x.tolist()})
EPO_ind_citing_grouped.reset_index(inplace= True)
EPO_ind_citing_grouped.rename(columns = {"person_id": "citing_ind_id"}, inplace = True)

In [27]:
#merge the previously created dataframe with the new groups
EPO_cit_ctrl_firmselfdrop = EPO_cit_ctrl_firmselfdrop.merge(EPO_ind_cited_grouped, left_on = "Cited_App_nbr", right_on = "app_nbr")
EPO_cit_ctrl_firmselfdrop = EPO_cit_ctrl_firmselfdrop.merge(EPO_ind_citing_grouped, left_on = "Citing_app_nbr", right_on = "app_nbr")

In [28]:
#drop the unecessary columns
EPO_cit_ctrl_firmselfdrop.drop(["app_nbr_x", "app_nbr_y"], axis=1, inplace = True)

In [29]:
#check the result to make sure it makes sense
EPO_cit_ctrl_firmselfdrop

Unnamed: 0,Citing_app_nbr,Citing_appln_id,Cited_App_nbr,Cited_Appln_id,prio_year,Citing_IPC,IPC_subclass,Control_IPC,app_year,Control_appln_id,Control_pct_nbr,Cited_firm_id,citing_firm_id,Cited_ind_id,citing_ind_id
0,EP19780101388,16428374,EP19780300148,16429355.0,1977,A61K031/43,A61K,A61K031/70,1978,16428163.0,,[28827],[1386114],"[2218534, 2218535, 2218536, 2218537]","[2216665, 2216666]"
1,EP19790301944,16438624,EP19780300148,16429355.0,1978,A61K031/41,A61K,A61K031/57,1979,16430931.0,,[28827],[2236098],"[2218534, 2218535, 2218536, 2218537]",[2236179]
2,EP19790302384,16439056,EP19780300148,16429355.0,1978,A61K031/43,A61K,A61K031/44,1979,16430771.0,,[28827],[11539],"[2218534, 2218535, 2218536, 2218537]",[2237044]
3,EP19790302384,16439056,EP19790300096,16436815.0,1978,A61K031/43,A61K,A61K039/29,1979,16440703.0,,[28827],[11539],[2218537],[2237044]
4,EP19820305125,16503864,EP19780300148,16429355.0,1981,A61K031/395,A61K,A61K031/545,1982,16489267.0,,[28827],[209914],"[2218534, 2218535, 2218536, 2218537]","[2341206, 2347075, 2347076]"
5,EP19880810407,16740869,EP19780300148,16429355.0,1987,A61K031/43,A61K,A61K031/445,1988,16718530.0,,[28827],[1621],"[2218534, 2218535, 2218536, 2218537]","[2403985, 2521651, 2750251]"
6,EP20080163922,201033,EP19790300096,16436815.0,2003,C12Q001/68,C12Q,C12Q001/68,2004,147447.0,,[28827],[223388],[2218537],"[298714, 298715, 298716, 298717, 298718, 29871..."
7,EP19790102118,16432764,EP19780300203,16429410.0,1978,A01N043/40,A01N,A01N025/10,1979,16437241.0,,[12885],[1621],[2218661],"[1719, 2214349, 2225067, 2225068]"
8,EP19790102118,16432764,EP19780300853,16430044.0,1978,A01N043/40,A01N,A01N035/04,1979,16441051.0,,[12885],[1621],[2219864],"[1719, 2214349, 2225067, 2225068]"
9,EP19790103431,16434023,EP19780300203,16429410.0,1978,C07D 213/26,C07D,C07D,1979,16437691.0,,[12885],[2227427],[2218661],"[2227428, 2227429, 2227430, 2227431, 2227432, ..."


In [30]:
#reapted the previous loop to find out which patents are self-citations and remove
to_drop2 = []

for a in EPO_cit_ctrl_firmselfdrop.index:
    for val in EPO_cit_ctrl_firmselfdrop["citing_ind_id"][a]:
        if val in EPO_cit_ctrl_firmselfdrop["Cited_ind_id"][a]:
            to_drop2.append(a)
            
len(to_drop2)

390

In [31]:
#drop the resulting outputs
EPO_cit_ctrl_no_self = EPO_cit_ctrl_firmselfdrop.drop(to_drop2)

In [32]:
#check the resulting output
EPO_cit_ctrl_no_self

Unnamed: 0,Citing_app_nbr,Citing_appln_id,Cited_App_nbr,Cited_Appln_id,prio_year,Citing_IPC,IPC_subclass,Control_IPC,app_year,Control_appln_id,Control_pct_nbr,Cited_firm_id,citing_firm_id,Cited_ind_id,citing_ind_id
0,EP19780101388,16428374,EP19780300148,16429355.0,1977,A61K031/43,A61K,A61K031/70,1978,16428163.0,,[28827],[1386114],"[2218534, 2218535, 2218536, 2218537]","[2216665, 2216666]"
1,EP19790301944,16438624,EP19780300148,16429355.0,1978,A61K031/41,A61K,A61K031/57,1979,16430931.0,,[28827],[2236098],"[2218534, 2218535, 2218536, 2218537]",[2236179]
2,EP19790302384,16439056,EP19780300148,16429355.0,1978,A61K031/43,A61K,A61K031/44,1979,16430771.0,,[28827],[11539],"[2218534, 2218535, 2218536, 2218537]",[2237044]
3,EP19790302384,16439056,EP19790300096,16436815.0,1978,A61K031/43,A61K,A61K039/29,1979,16440703.0,,[28827],[11539],[2218537],[2237044]
4,EP19820305125,16503864,EP19780300148,16429355.0,1981,A61K031/395,A61K,A61K031/545,1982,16489267.0,,[28827],[209914],"[2218534, 2218535, 2218536, 2218537]","[2341206, 2347075, 2347076]"
5,EP19880810407,16740869,EP19780300148,16429355.0,1987,A61K031/43,A61K,A61K031/445,1988,16718530.0,,[28827],[1621],"[2218534, 2218535, 2218536, 2218537]","[2403985, 2521651, 2750251]"
6,EP20080163922,201033,EP19790300096,16436815.0,2003,C12Q001/68,C12Q,C12Q001/68,2004,147447.0,,[28827],[223388],[2218537],"[298714, 298715, 298716, 298717, 298718, 29871..."
7,EP19790102118,16432764,EP19780300203,16429410.0,1978,A01N043/40,A01N,A01N025/10,1979,16437241.0,,[12885],[1621],[2218661],"[1719, 2214349, 2225067, 2225068]"
8,EP19790102118,16432764,EP19780300853,16430044.0,1978,A01N043/40,A01N,A01N035/04,1979,16441051.0,,[12885],[1621],[2219864],"[1719, 2214349, 2225067, 2225068]"
9,EP19790103431,16434023,EP19780300203,16429410.0,1978,C07D 213/26,C07D,C07D,1979,16437691.0,,[12885],[2227427],[2218661],"[2227428, 2227429, 2227430, 2227431, 2227432, ..."


In [33]:
#we want to make sure that citing/cited/ctrl pairs are kept together i.e. their pairs can be identified
#The way to do this si create an index for each of the pairs as done below
EPO_cit_ctrl_no_self.reset_index(level=0, inplace = True)

#from this can produce a control dataframe and a actual result dataframe
EPO_cit_noself_ctrl = EPO_cit_ctrl_no_self.drop(["Citing_app_nbr", "Citing_appln_id", "Citing_IPC", "app_year"], axis =1)
#push this to a csv
EPO_cit_noself_ctrl.to_csv("Patents data/EPO_cited&ctrl_noself.csv", index=False)
#get the actual citation csv
EPO_cit_noself = EPO_cit_ctrl_no_self.drop(["Control_IPC", "app_year", "Control_appln_id", "Control_pct_nbr"], axis = 1)
EPO_cit_noself.to_csv("Patents data/EPO_cited&citing_noself.csv", index = False)

# Now the PCT data

In [34]:
#create a list of both the citing and citing patents application number (which will change to the PCT number)
PCT_cit_cited = list(PCT_cit_ctrl["Cited_App_nbr"].unique())
PCT_cit_citing = list(PCT_cit_ctrl["Citing_app_nbr"].unique())

In [35]:
#read in both the firm and the individual data so that both can be used in the join to check for self citations
PCT_firm = pd.read_csv("Patents data/202001_PCT_App_reg.txt", sep = "|", header = 0) 
PCT_ind = pd.read_csv("Patents data/202001_PCT_Inv_reg.txt", sep = "|", header = 0)

In [36]:
#Also read in the PCT citation data that contains IPC codes
PCT_citation = pd.read_csv("Patents data/PCT_cit_wIPC_code.csv")

In [37]:
#this is to check which the length of unique values in reference to those that are also in the EPO database
PCT_citing_EPO = list(PCT_citation["Citing_app_nbr_EPO"].unique())
PCT_cited_EPO = list(PCT_citation["Cited_App_nbr_EPO"].unique())

In [38]:
#the length of the citing EPO numbers is 63,000
len(PCT_citing_EPO)

63673

In [39]:
#then limit the EPO firm data to those who have their app number in line with those in the EPO related data
EPO_PCT_firm_citing = EPO_firm[EPO_firm["app_nbr"].isin(PCT_citing_EPO)]
EPO_PCT_firm_cited = EPO_firm[EPO_firm["app_nbr"].isin(PCT_cited_EPO)]

In [40]:
#From this it appears that there is missing around 30,000 applications at the unique level in the EPO database
len(EPO_PCT_firm_citing["app_nbr"].unique())

34127

In [41]:
EPO_PCT_ind_citing = EPO_ind[EPO_ind["app_nbr"].isin(PCT_citing_EPO)]
EPO_PCT_ind_cited = EPO_ind[EPO_ind["app_nbr"].isin(PCT_cited_EPO)]

In [42]:
#a similar finding is also seen in the unque aspect 
#this will be dealt with later but it must be noted that there is a discrepancy
#where 30,000 applications that the EPO_PCT data suggests should be linked as was not linked
len(EPO_PCT_ind_citing["app_nbr"].unique())

34116

In [43]:
#This reads in the EPO_PCT data base that links the PCT applications with the EPO applications
EPO_PCT = pd.read_csv("Patents data/202001_EPO_PCT.txt", sep = "|", header = 0)

In [44]:
#create a list of the app numbers from the database
EPO_PCT_list = list(EPO_PCT["app_nbr"])
#take out the information that is part of this and see how many are in the PCT_citation file
PCT_in_EPO_citing = PCT_citation[PCT_citation["Citing_app_nbr_EPO"].isin(EPO_PCT_list)]

In [45]:
#this confirms the length found previously, further suggesting the difference in length
len(PCT_in_EPO_citing["Citing_app_nbr_EPO"].unique())

63673

In [46]:
#going back to the PCT data
#from the firm we can drop irrelevant information
#this does not contain a person or firm id there ap_name has to be used
#this coud create issues further down the line ******
PCT_firm.drop(["internat_appln_nr", "appln_id", "address", "reg_code", "ctry_code",
              "reg_share", "app_share"], axis = 1, inplace = True)

In [47]:
#inspect the result
PCT_firm

Unnamed: 0,pct_nbr,app_name
0,WO1978000001,"SAMANTA, Mrinmay"
1,WO1978000003,"ZELLER PLASTIK Koehn, Gräbner & Co."
2,WO1978000004,ADVANCED CHEMICAL EQUIPMENT LIMITED
3,WO1978000005,"MOSBACH, Klaus H."
4,WO1978000006,BATTELLE MEMORIAL INSTITUTE
5,WO1978000007,"PFEIFFER, William M."
6,WO1978000008,Guschky & Tönnesmann GmbH & Co. KG
7,WO1978000009,"INTERNATIONAL WATER SAVING SYSTEMS, INC."
8,WO1978000011,Garching Instrumente Gesellschaft zur industri...
9,WO1978000012,RHONE-POULENC TEXTILE


In [48]:
#perform the same procedure as before for both the cited and citing firm information
PCT_firm_cited = PCT_firm[PCT_firm["pct_nbr"].isin(PCT_cit_cited)]
PCT_firm_citing = PCT_firm[PCT_firm["pct_nbr"].isin(PCT_cit_citing)]

In [49]:
#group them into lists so they can be checked against each other
PCT_firm_cited_grouped = PCT_firm_cited.groupby("pct_nbr").agg({'app_name':lambda x:x.tolist()})
PCT_firm_cited_grouped.reset_index(inplace= True)
PCT_firm_cited_grouped.rename(columns = {"app_name":"Cited_firm_id"}, inplace = True)

In [50]:
#same with the citing information
PCT_firm_citing_grouped = PCT_firm_citing.groupby("pct_nbr").agg({'app_name':lambda x:x.tolist()})
PCT_firm_citing_grouped.reset_index(inplace= True)
PCT_firm_citing_grouped.rename(columns = {"app_name": "citing_firm_id"}, inplace = True)

In [51]:
#merge this with the PCT control file
PCT_cit_ctrl = PCT_cit_ctrl.merge(PCT_firm_cited_grouped, left_on = "Cited_App_nbr", right_on = "pct_nbr")
PCT_cit_ctrl = PCT_cit_ctrl.merge(PCT_firm_citing_grouped, left_on = "Citing_app_nbr", right_on = "pct_nbr")

In [52]:
#drop unecessary information
PCT_cit_ctrl.drop(["pct_nbr_x", "pct_nbr_y"], axis=1, inplace = True)

In [53]:
#check that no results have been lost
PCT_cit_ctrl.count()

Citing_app_nbr      79320
Citing_appln_id     79320
Cited_App_nbr       79320
Cited_Appln_id      79320
prio_year           79320
Citing_IPC          79320
IPC_subclass        79320
Control_IPC         79300
app_year            79300
Control_appln_id    49751
Control_pct_nbr     29549
Cited_firm_id       79320
citing_firm_id      79320
dtype: int64

In [54]:
#iterate over the lists between them to check which needs to be dropped
to_drop3 = []

for a in PCT_cit_ctrl.index:
    for val in PCT_cit_ctrl["citing_firm_id"][a]:
        if val in PCT_cit_ctrl["Cited_firm_id"][a]:
            to_drop3.append(a)
            
len(to_drop3)
#the resulting length indicates that there are no self-citations in this dataset between firms 
#this is interesting given the 6,000 cases previously 

42

In [55]:
#no files were actually dropped so the same file can be used
PCT_cit_ctrl_firmselfdrop = PCT_cit_ctrl.drop(to_drop3)

In [56]:
#perform the same procedure for individuals
PCT_ind.drop(["internat_appln_nr", "appln_id", "address", "reg_code", "ctry_code", 
              "reg_share", "inv_share"], axis =1, inplace = True)

In [57]:
PCT_ind_cited = PCT_ind[PCT_ind["pct_nbr"].isin(PCT_cit_cited)]
PCT_ind_cited_grouped = PCT_ind_cited.groupby("pct_nbr").agg({'inv_name':lambda x:x.tolist()})
PCT_ind_cited_grouped.reset_index(inplace= True)
PCT_ind_cited_grouped.rename(columns = {"inv_name":"Cited_ind_id"}, inplace = True)

In [58]:
PCT_ind_citing = PCT_ind[PCT_ind["pct_nbr"].isin(PCT_cit_citing)]
PCT_ind_citing_grouped = PCT_ind_citing.groupby("pct_nbr").agg({'inv_name':lambda x:x.tolist()})
PCT_ind_citing_grouped.reset_index(inplace= True)
PCT_ind_citing_grouped.rename(columns = {"inv_name":"Citing_ind_id"}, inplace = True)

In [59]:
PCT_cit_ctrl_firmselfdrop = PCT_cit_ctrl_firmselfdrop.merge(PCT_ind_cited_grouped, left_on = "Cited_App_nbr", right_on = "pct_nbr")
PCT_cit_ctrl_firmselfdrop = PCT_cit_ctrl_firmselfdrop.merge(PCT_ind_citing_grouped, left_on = "Citing_app_nbr", right_on = "pct_nbr")

In [60]:
PCT_cit_ctrl_firmselfdrop

Unnamed: 0,Citing_app_nbr,Citing_appln_id,Cited_App_nbr,Cited_Appln_id,prio_year,Citing_IPC,IPC_subclass,Control_IPC,app_year,Control_appln_id,Control_pct_nbr,Cited_firm_id,citing_firm_id,pct_nbr_x,Cited_ind_id,pct_nbr_y,Citing_ind_id
0,WO1979000092,15648948,WO1979000002,11473341.0,1977,C12L011/00,C12L,,,,,[NATIONAL RESEARCH DEVELOPMENT CORPORATION],[BIER-DRIVE AG],WO1979000002,"[MARTIN, Archer, John, Porter, HAMPSON, Frank]",WO1979000092,"[MÖCKESCH, Erich]"
1,WO1983000236,15649723,WO1979000002,11473341.0,1981,G04B037/05,G04B,G04B019/06,1981.0,16469777.0,,[NATIONAL RESEARCH DEVELOPMENT CORPORATION],[FIRMA H. FINGER],WO1979000002,"[MARTIN, Archer, John, Porter, HAMPSON, Frank]",WO1983000236,"[FINGER, Hansjörg]"
2,WO1993005833,47214466,WO1979000002,46912661.0,1991,A61M005/32,A61M,A61M025/00,1991.0,16964845.0,,[NATIONAL RESEARCH DEVELOPMENT CORPORATION],"[JESGAMABE, S.L.]",WO1979000002,"[MARTIN, Archer, John, Porter, HAMPSON, Frank]",WO1993005833,"[GALIANA SABATER, Amando]"
3,WO1994011576,47250451,WO1979000002,46912661.0,1992,D21H017/45,D21H,D21H027/40,1993.0,,WO1994004435,[NATIONAL RESEARCH DEVELOPMENT CORPORATION],[Solutia Inc.],WO1979000002,"[MARTIN, Archer, John, Porter, HAMPSON, Frank]",WO1994011576,"[LEE, Kang, In]"
4,WO1997005434,47345574,WO1979000002,22670158.0,1995,F25B041/00,F25B,F25B045/00,1995.0,17121856.0,,[NATIONAL RESEARCH DEVELOPMENT CORPORATION],"[Katai, Lajos Jr., Katai, Lajos, Katai, Zoltan]",WO1979000002,"[MARTIN, Archer, John, Porter, HAMPSON, Frank]",WO1997005434,"[Katai, Lajos, Katai, Zoltan, Katai, Lajos Jr.]"
5,WO1998007448,47388255,WO1979000002,22670158.0,1996,A61K008/19,A61K,A61K009/16,1997.0,,WO1998006439,[NATIONAL RESEARCH DEVELOPMENT CORPORATION],[ADA FOUNDATION],WO1979000002,"[MARTIN, Archer, John, Porter, HAMPSON, Frank]",WO1998007448,"[VOGEL, Gerald, L., TAKAGI, Shozo, CHOW, Laure..."
6,WO1980000202,43452175,WO1980000019,43451779.0,1978,G07F011/62,G07F,G07F005/24,1979.0,16440913.0,,[PORTER CHADBURN LIMITED],"[UMC INDUSTRIES, INC.]",WO1980000019,"[PORTER, Harold]",WO1980000202,"[ALBRECHT, Paul, N., SCHULLER, James, T., CANI..."
7,WO1988001113,15653284,WO1980000019,43451779.0,1986,H03L007/14,H03L,H03L007/18,1987.0,16661533.0,,[PORTER CHADBURN LIMITED],[Hughes Aircraft Company],WO1980000019,"[PORTER, Harold]",WO1988001113,"[LIND, Harold, V.]"
8,WO1990003927,47132716,WO1980000019,6509723.0,1988,G01N033/72,G01N,G01N021/89,1989.0,16772569.0,,[PORTER CHADBURN LIMITED],[PACIFIC BIOTECH INC.],WO1980000019,"[PORTER, Harold]",WO1990003927,"[FAN, Eugene, CHEN, Fon-Chiu, Mia, MILNER, Mic..."
9,WO1992000151,47179753,WO1980000019,22700929.0,1990,G02C013/00,G02C,G02C005/16,1991.0,16870139.0,,[PORTER CHADBURN LIMITED],[POLYMER TECHNOLOGY CORPORATION],WO1980000019,"[PORTER, Harold]",WO1992000151,"[SMITH, Francis, X., WROBEL, Stanley, J., RAHE..."


In [61]:
PCT_cit_ctrl_firmselfdrop.drop(["pct_nbr_x", "pct_nbr_y"], axis=1, inplace = True)

In [62]:
#checl the output
#PCT_cit_ctrl_firmselfdrop.drop(["Cited_ind_id_x"], axis = 1, inplace = True)
#PCT_cit_ctrl_firmselfdrop.rename(columns = {"Cited_ind_id_y": "Cited_ind_id"}, inplace = True)


In [63]:
#no individuals should be dropped so check again with more information
#potentially use the lower function
to_drop4 = []

for a in PCT_cit_ctrl_firmselfdrop.index:
    for val in PCT_cit_ctrl_firmselfdrop["Citing_ind_id"][a]:
        if val in PCT_cit_ctrl_firmselfdrop["Cited_ind_id"][a]:
            to_drop4.append(a)
            
len(to_drop4)

0

In [64]:
PCT_cit_ctrl_firmselfdrop.count()

Citing_app_nbr      78258
Citing_appln_id     78258
Cited_App_nbr       78258
Cited_Appln_id      78258
prio_year           78258
Citing_IPC          78258
IPC_subclass        78258
Control_IPC         78238
app_year            78238
Control_appln_id    49158
Control_pct_nbr     29080
Cited_firm_id       78258
citing_firm_id      78258
Cited_ind_id        78258
Citing_ind_id       78258
dtype: int64

In [65]:
#we want to make sure that citing/cited/ctrl pairs are kept together i.e. their pairs can be identified
#The way to do this si create an index for each of the pairs as done below
PCT_cit_ctrl_firmselfdrop.reset_index(level=0, inplace= True)

#from this can produce a control dataframe and a actual result dataframe
PCT_cit_noself_ctrl = PCT_cit_ctrl_firmselfdrop.drop(["Citing_app_nbr", "Citing_appln_id", "Citing_IPC", "app_year"], axis =1)
#push this to a csv
PCT_cit_noself_ctrl.to_csv("Patents data/PCT_cited&ctrl_noself.csv", index=False)
#get the actual citation csv
PCT_cit_noself = PCT_cit_ctrl_firmselfdrop.drop(["Control_IPC", "app_year", "Control_appln_id", "Control_pct_nbr"], axis = 1)
PCT_cit_noself.to_csv("Patents data/PCT_cited&citing_noself.csv", index = False)

# PCT firm EPO

The same analysis is performed but only limiting to those with a record in the EPO. The results are the same as previously and are not outputed. But the results are there if needed to be.

These results were not used.

In [66]:
PCT_citation.drop(["Unnamed: 0", "Citn_origin", "Citn_lag_year", "Citn_lag_month", "ISA"], axis = 1, inplace = True)

In [67]:
PCT_EPO_list = set(list(EPO_PCT_ind_citing["app_nbr"].unique())+list(EPO_PCT_firm_citing["app_nbr"].unique()))

In [68]:
PCT_citation_EPO = PCT_citation[PCT_citation["Citing_app_nbr_EPO"].isin(PCT_EPO_list)]

In [69]:
PCT_citation_EPO

Unnamed: 0,Citing_app_nbr,Citing_appln_id,Cited_pub_nbr,Cited_App_nbr,Cited_Appln_id,Citing_app_nbr_EPO,Cited_App_nbr_EPO,prio_year,app_year,IPC
0,WO1979000092,15648948,WO1979000985,WO1979000002,11473341.0,EP19780900082,EP19780900017,1977,1978,C12L011/00
1,WO1983000236,15649723,WO1979000985,WO1979000002,11473341.0,EP19810902481,EP19780900017,1981,1981,G04B037/05
2,WO1993005833,47214466,WO1979000482,WO1979000002,46912661.0,EP19920920347,EP19780900017,1991,1992,A61M005/32
3,WO1994011576,47250451,WO1979000482,WO1979000002,46912661.0,EP19940900357,EP19780900017,1992,1993,D21H017/45
4,WO1997005434,47345574,WO1979000483,WO1979000002,22670158.0,EP19950927905,EP19780900017,1995,1995,F25B041/00
5,WO1998007448,47388255,WO1979000483,WO1979000002,22670158.0,EP19970938396,EP19780900017,1996,1997,A61K008/19
6,WO1980000202,43452175,WO1980001507,WO1980000019,43451779.0,EP19790900851,EP19790900551,1978,1979,G07F011/62
11,WO1997000162,11703772,WO1980002748,WO1980000019,42065867.0,EP19960917675,EP19790900551,1995,1996,B29C045/66
12,WO2013050076,405391824,WO1980001666,WO1980000019,6509723.0,EP20110767686,EP19790900551,2011,2011,H04L005/00
13,WO1982000011,41873509,WO1982000372,WO1981000831,46950439.0,EP19810901814,EP19800901658,1980,1981,A21C001/14


In [70]:
PCT_EPO_citing = list(PCT_citation_EPO["Citing_app_nbr_EPO"].unique())
PCT_EPO_cited = list(PCT_citation_EPO["Cited_App_nbr_EPO"].unique())

In [71]:
print(len(PCT_EPO_citing))
print(len(PCT_EPO_cited))

34130
12102


In [72]:
PCT_firm_cited_EPO = EPO_firm[EPO_firm["app_nbr"].isin(PCT_EPO_cited)]
PCT_firm_cited_EPO_grouped = PCT_firm_cited_EPO.groupby("app_nbr").agg({'person_id':lambda x:x.tolist()})
PCT_firm_cited_EPO_grouped.reset_index(inplace= True)
PCT_firm_cited_EPO_grouped.rename(columns = {"person_id":"Cited_firm_id"}, inplace = True)

In [73]:
PCT_firm_cited_EPO_grouped

Unnamed: 0,app_nbr,Cited_firm_id
0,EP19780900003,[2220784]
1,EP19780900017,[419432]
2,EP19780900063,[2220884]
3,EP19780900109,[419432]
4,EP19780900122,[2220989]
5,EP19780900128,[419432]
6,EP19780900141,[2221027]
7,EP19780900162,[2221054]
8,EP19780900318,[2221263]
9,EP19790900009,[244363]


In [74]:
PCT_firm_citing_EPO = EPO_firm[EPO_firm["app_nbr"].isin(PCT_EPO_citing)]
CT_firm_citing_EPO_grouped = PCT_firm_citing_EPO.groupby("app_nbr").agg({'person_id':lambda x:x.tolist()})
CT_firm_citing_EPO_grouped.reset_index(inplace= True)
CT_firm_citing_EPO_grouped.rename(columns = {"person_id": "citing_firm_id"}, inplace = True)

In [75]:
PCT_citation_EPO = PCT_citation_EPO.merge(PCT_firm_cited_EPO_grouped, left_on = "Cited_App_nbr_EPO", right_on = "app_nbr")
PCT_citation_EPO = PCT_citation_EPO.merge(CT_firm_citing_EPO_grouped, left_on = "Citing_app_nbr_EPO", right_on = "app_nbr")

In [76]:
PCT_citation_EPO

Unnamed: 0,Citing_app_nbr,Citing_appln_id,Cited_pub_nbr,Cited_App_nbr,Cited_Appln_id,Citing_app_nbr_EPO,Cited_App_nbr_EPO,prio_year,app_year,IPC,app_nbr_x,Cited_firm_id,app_nbr_y,citing_firm_id
0,WO1979000092,15648948,WO1979000985,WO1979000002,11473341.0,EP19780900082,EP19780900017,1977,1978,C12L011/00,EP19780900017,[419432],EP19780900082,[2220915]
1,WO1983000236,15649723,WO1979000985,WO1979000002,11473341.0,EP19810902481,EP19780900017,1981,1981,G04B037/05,EP19780900017,[419432],EP19810902481,[2316256]
2,WO1993005833,47214466,WO1979000482,WO1979000002,46912661.0,EP19920920347,EP19780900017,1991,1992,A61M005/32,EP19780900017,[419432],EP19920920347,[3197943]
3,WO1994011576,47250451,WO1979000482,WO1979000002,46912661.0,EP19940900357,EP19780900017,1992,1993,D21H017/45,EP19780900017,[419432],EP19940900357,[91919]
4,WO1997005434,47345574,WO1979000483,WO1979000002,22670158.0,EP19950927905,EP19780900017,1995,1995,F25B041/00,EP19780900017,[419432],EP19950927905,"[3523663, 3523664, 3523665]"
5,WO1998007448,47388255,WO1979000483,WO1979000002,22670158.0,EP19970938396,EP19780900017,1996,1997,A61K008/19,EP19780900017,[419432],EP19970938396,[230873]
6,WO1980000202,43452175,WO1980001507,WO1980000019,43451779.0,EP19790900851,EP19790900551,1978,1979,G07F011/62,EP19790900551,[2234368],EP19790900851,[2242674]
7,WO1997000162,11703772,WO1980002748,WO1980000019,42065867.0,EP19960917675,EP19790900551,1995,1996,B29C045/66,EP19790900551,[2234368],EP19960917675,[1412121]
8,WO2013050076,405391824,WO1980001666,WO1980000019,6509723.0,EP20110767686,EP19790900551,2011,2011,H04L005/00,EP19790900551,[2234368],EP20110767686,[44835045]
9,WO1982000011,41873509,WO1982000372,WO1981000831,46950439.0,EP19810901814,EP19800901658,1980,1981,A21C001/14,EP19800901658,[2275983],EP19810901814,[2230924]


In [77]:
PCT_citation_EPO.drop(["app_nbr_x", "app_nbr_y"], axis=1, inplace = True)

In [78]:
to_drop5 = []

for a in PCT_citation_EPO.index:
    for val in PCT_citation_EPO["citing_firm_id"][a]:
        if val in PCT_citation_EPO["Cited_firm_id"][a]:
            to_drop5.append(a)
            
len(to_drop5)

30

In [79]:
PCT_citation_EPO_firmselfdrop = PCT_citation_EPO.drop(to_drop5)

### now with ind data

In [80]:
PCT_ind_cited_EPO = EPO_ind[EPO_ind["app_nbr"].isin(PCT_EPO_cited)]
PCT_ind_cited_EPO_grouped = PCT_ind_cited_EPO.groupby("app_nbr").agg({'person_id':lambda x:x.tolist()})
PCT_ind_cited_EPO_grouped.reset_index(inplace= True)
PCT_ind_cited_EPO_grouped.rename(columns = {"person_id":"Cited_ind_id"}, inplace = True)

In [81]:
PCT_ind_citing_EPO = EPO_ind[EPO_ind["app_nbr"].isin(PCT_EPO_citing)]
PCT_ind_citing_EPO_grouped = PCT_ind_citing_EPO.groupby("app_nbr").agg({'person_id':lambda x:x.tolist()})
PCT_ind_citing_EPO_grouped.reset_index(inplace= True)
PCT_ind_citing_EPO_grouped.rename(columns = {"person_id": "citing_ind_id"}, inplace = True)

In [82]:
PCT_citation_EPO_firmselfdrop = PCT_citation_EPO_firmselfdrop.merge(PCT_ind_cited_EPO_grouped, left_on = "Cited_App_nbr_EPO", right_on = "app_nbr")
PCT_citation_EPO_firmselfdrop = PCT_citation_EPO_firmselfdrop.merge(PCT_ind_citing_EPO_grouped, left_on = "Citing_app_nbr_EPO", right_on = "app_nbr")

In [83]:
PCT_citation_EPO_firmselfdrop.drop(["app_nbr_x", "app_nbr_y"], axis=1, inplace = True)

In [84]:
PCT_citation_EPO_firmselfdrop

Unnamed: 0,Citing_app_nbr,Citing_appln_id,Cited_pub_nbr,Cited_App_nbr,Cited_Appln_id,Citing_app_nbr_EPO,Cited_App_nbr_EPO,prio_year,app_year,IPC,Cited_firm_id,citing_firm_id,Cited_ind_id,citing_ind_id
0,WO1979000092,15648948,WO1979000985,WO1979000002,11473341.0,EP19780900082,EP19780900017,1977,1978,C12L011/00,[419432],[2220915],"[2220815, 2220816]",[2220916]
1,WO1983000236,15649723,WO1979000985,WO1979000002,11473341.0,EP19810902481,EP19780900017,1981,1981,G04B037/05,[419432],[2316256],"[2220815, 2220816]",[2316257]
2,WO1993005833,47214466,WO1979000482,WO1979000002,46912661.0,EP19920920347,EP19780900017,1991,1992,A61M005/32,[419432],[3197943],"[2220815, 2220816]",[3197944]
3,WO1994011576,47250451,WO1979000482,WO1979000002,46912661.0,EP19940900357,EP19780900017,1992,1993,D21H017/45,[419432],[91919],"[2220815, 2220816]",[3295902]
4,WO1997005434,47345574,WO1979000483,WO1979000002,22670158.0,EP19950927905,EP19780900017,1995,1995,F25B041/00,[419432],"[3523663, 3523664, 3523665]","[2220815, 2220816]","[3523663, 3523664, 3523665]"
5,WO1998007448,47388255,WO1979000483,WO1979000002,22670158.0,EP19970938396,EP19780900017,1996,1997,A61K008/19,[419432],[230873],"[2220815, 2220816]","[2971868, 3305295, 3766193]"
6,WO1980000202,43452175,WO1980001507,WO1980000019,43451779.0,EP19790900851,EP19790900551,1978,1979,G07F011/62,[2234368],[2242674],[2234369],"[2242675, 2242676, 2242677]"
7,WO1997000162,11703772,WO1980002748,WO1980000019,42065867.0,EP19960917675,EP19790900551,1995,1996,B29C045/66,[2234368],[1412121],[2234369],"[3630856, 49249409]"
8,WO2013050076,405391824,WO1980001666,WO1980000019,6509723.0,EP20110767686,EP19790900551,2011,2011,H04L005/00,[2234368],[44835045],[2234369],"[47882231, 49187691, 49473864]"
9,WO1982000011,41873509,WO1982000372,WO1981000831,46950439.0,EP19810901814,EP19800901658,1980,1981,A21C001/14,[2275983],[2230924],[2275983],[2315298]


In [85]:
to_drop6 = []

for a in PCT_citation_EPO_firmselfdrop.index:
    for val in PCT_citation_EPO_firmselfdrop["citing_ind_id"][a]:
        if val in PCT_citation_EPO_firmselfdrop["Cited_ind_id"][a]:
            to_drop6.append(a)
            
len(to_drop6)

0

In [86]:
PCT_citation_EPO_firmselfdrop = PCT_citation_EPO_firmselfdrop.drop(to_drop6)

In [87]:
PCT_citation_EPO_firmselfdrop.to_csv("Patents data/PCT_cited&citing_noself_EPO.csv")