In [1]:
import pandas as pd
import numpy as np

# Creating PCT controls

This workbook creates controls for the PCT dataset. Because the EPO and PCT datasets will be combined for the analysis, controls for the PCT citations are not restricted solely to patents whose initial application was made under the PCT. Thus, controls can come from both the EPO and the PCT datasets.

These datasets come from the results found in: Control_data.ipynb and Exploring_citation_data.ipynb

Given that creating these controls results in a dataset containing over 150,000,000 rows, EPO and PCT controls are created separately. Even so, this workbook is computationally expensive in terms of memory usage; my own computer is able to complete it thanks to its 16 GB of RAM.

In [2]:
#load the PCT citation data that already carries the IPC codes
#the IPC codes are needed further down as one of the merge keys
PCT_cit = pd.read_csv(filepath_or_buffer="Patents data/PCT_cit_wIPC_code.csv")

In [3]:
#record the non-missing count of every column now, so the number of rows
#retained by the later steps can be compared against it
PCT_cit.notna().sum()

Unnamed: 0            79350
Citing_app_nbr        79350
Citing_appln_id       79350
Cited_pub_nbr         79350
Cited_App_nbr         79350
Cited_Appln_id        79350
Citn_origin           79350
Citn_lag_year         79350
Citn_lag_month        79350
ISA                   79350
Citing_app_nbr_EPO    79350
Cited_App_nbr_EPO     79350
prio_year             79350
app_year              79350
IPC                   79323
dtype: int64

In [4]:
#collect every distinct PCT number that appears as a citing or a cited application
#this list is meant for removing those patents from the potential controls so there are no duplications
PCT_list = list({*PCT_cit["Citing_app_nbr"], *PCT_cit["Cited_App_nbr"]})

In [5]:
#count the distinct values of the unnamed column (missing values included)
#if this equals the row count then every row has its own identifier
PCT_cit["Unnamed: 0"].nunique(dropna=False)

79350

In [6]:
#discard the columns that are not needed for building the controls
#so that the subsequent processes work on a smaller frame
unneeded_cols = [
    "app_year",
    "ISA",
    "Citn_lag_month",
    "Citn_lag_year",
    "Citn_origin",
    "Citing_app_nbr_EPO",
    "Cited_App_nbr_EPO",
    "Cited_pub_nbr",
]
PCT_cit = PCT_cit.drop(labels=unneeded_cols, axis="columns")

In [7]:
#give the unnamed column a proper name: it becomes the "Index" column
#that identifies each citation row
PCT_cit = PCT_cit.rename(columns={"Unnamed: 0": "Index"})

In [8]:
#remove every row that still has a missing value in any of the remaining columns
PCT_cit = PCT_cit.dropna(axis=0, how="any")

In [9]:
#as in the EPO controls notebook, only the IPC subclass is wanted,
#which is the first 4 characters of the IPC string
PCT_cit["IPC_subclass"] = PCT_cit["IPC"].str.slice(stop=4)

In [10]:
#load the pool of potential control patents
#NOTE(review): the recorded run printed a warning for this cell (presumably a
#mixed-dtype warning from the csv parser) - TODO confirm and pin dtypes if so
Pot_ctrl = pd.read_csv(filepath_or_buffer="Patents data/potential_control.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
#remove any results such that there will be no duplication
#NOTE(review): the filter below is commented out, so PCT_list is never applied
#and Pot_ctrl keeps any rows whose pct_nbr also appears in the citation data
#TODO confirm whether leaving this disabled is intentional
#Pot_ctrl = Pot_ctrl[~Pot_ctrl["pct_nbr"].isin(PCT_list)]

In [12]:
#the merge key is the IPC subclass, i.e. the first four characters of the IPC string
Pot_ctrl["IPC_subclass"] = Pot_ctrl["IPC"].str.slice(0, 4)

In [13]:
#attach every potential control that shares the IPC subclass and the priority year
#a left merge is used so citations without any matching control are still kept
PCT_Cit_ctrl = pd.merge(
    left=PCT_cit,
    right=Pot_ctrl,
    how="left",
    on=["IPC_subclass", "prio_year"],
)

In [14]:
#drop candidate rows where the control is the citing application itself
#rows with a missing pct_nbr compare as not-equal, so they are kept
#NOTE(review): a citation whose only candidate row is itself would lose all of
#its rows here - TODO confirm this cannot happen
is_not_self_match = PCT_Cit_ctrl["Citing_app_nbr"].ne(PCT_Cit_ctrl["pct_nbr"])
PCT_Cit_ctrl = PCT_Cit_ctrl.loc[is_not_self_match]

In [15]:
#display the merged citation/control frame after the self-match filter
PCT_Cit_ctrl

Unnamed: 0,Index,Citing_app_nbr,Citing_appln_id,Cited_App_nbr,Cited_Appln_id,prio_year,IPC_x,IPC_subclass,IPC_y,app_year,appln_id,pct_nbr
0,0,WO1979000092,15648948,WO1979000002,11473341.0,1977,C12L011/00,C12L,,,,
1,1,WO1983000236,15649723,WO1979000002,11473341.0,1981,G04B037/05,G04B,G04B027/02,1981.0,16465253.0,
2,1,WO1983000236,15649723,WO1979000002,11473341.0,1981,G04B037/05,G04B,G04B019/06,1981.0,16469777.0,
3,1,WO1983000236,15649723,WO1979000002,11473341.0,1981,G04B037/05,G04B,G04B011/00,1981.0,16478369.0,
4,1,WO1983000236,15649723,WO1979000002,11473341.0,1981,G04B037/05,G04B,G04B037/05,1981.0,16485260.0,
5,1,WO1983000236,15649723,WO1979000002,11473341.0,1981,G04B037/05,G04B,G04B037/14,1981.0,16485439.0,
6,1,WO1983000236,15649723,WO1979000002,11473341.0,1981,G04B037/05,G04B,G04B037/11,1982.0,16489090.0,
7,1,WO1983000236,15649723,WO1979000002,11473341.0,1981,G04B037/05,G04B,G04B019/14,1982.0,16490505.0,
8,1,WO1983000236,15649723,WO1979000002,11473341.0,1981,G04B037/05,G04B,G04B037/08,1982.0,16493173.0,
9,1,WO1983000236,15649723,WO1979000002,11473341.0,1981,G04B037/05,G04B,G04B037/00,1982.0,16493437.0,


In [16]:
#relabel the merged columns so it is clear which side each one came from
#the labels are assigned directly so the very large frame is not copied
control_column_names = {
    "IPC_x": "Citing_IPC",
    "IPC_y": "Control_IPC",
    "appln_id": "Control_appln_id",
    "pct_nbr": "Control_pct_nbr",
}
PCT_Cit_ctrl.columns = [
    control_column_names.get(label, label) for label in PCT_Cit_ctrl.columns
]

In [17]:
#similar to the EPO control notebook, we want only a single control for each citation
#therefore group by the citation Index and draw one random row from every group
#the draws share a single seeded RandomState: a fresh run of the notebook
#reproduces the same controls, while the generator still advances between
#groups so each citation gets its own independent draw
CONTROL_SAMPLE_SEED = 42
control_rng = np.random.RandomState(CONTROL_SAMPLE_SEED)
PCT_cit_ctrl_single = (
    PCT_Cit_ctrl
    .groupby("Index")
    .apply(lambda x: x.sample(1, random_state=control_rng))
    .reset_index(drop=True)
)

In [18]:
#check the resulting output: the frame holding one sampled row per citation Index
PCT_cit_ctrl_single

Unnamed: 0,Index,Citing_app_nbr,Citing_appln_id,Cited_App_nbr,Cited_Appln_id,prio_year,Citing_IPC,IPC_subclass,Control_IPC,app_year,Control_appln_id,Control_pct_nbr
0,0,WO1979000092,15648948,WO1979000002,11473341.0,1977,C12L011/00,C12L,,,,
1,1,WO1983000236,15649723,WO1979000002,11473341.0,1981,G04B037/05,G04B,G04B019/14,1982.0,16490505.0,
2,2,WO1993005833,47214466,WO1979000002,46912661.0,1991,A61M005/32,A61M,A61M005/32,1991.0,,WO1992020385
3,3,WO1994011576,47250451,WO1979000002,46912661.0,1992,D21H017/45,D21H,D21H017/35,1993.0,17068136.0,
4,4,WO1997005434,47345574,WO1979000002,22670158.0,1995,F25B041/00,F25B,F25B013/00,1996.0,17155009.0,
5,5,WO1998007448,47388255,WO1979000002,22670158.0,1996,A61K008/19,A61K,A61K039/39,1997.0,,WO1998010790
6,6,WO1980000202,43452175,WO1980000019,43451779.0,1978,G07F011/62,G07F,G07F005/24,1979.0,16440913.0,
7,7,WO1988001113,15653284,WO1980000019,43451779.0,1986,H03L007/14,H03L,H03L007/06,1987.0,16680614.0,
8,8,WO1990003927,47132716,WO1980000019,6509723.0,1988,G01N033/72,G01N,G01N035/10,1989.0,,WO1989007980
9,9,WO1992000151,47179753,WO1980000019,22700929.0,1990,G02C013/00,G02C,G02C007/02,1991.0,326350.0,


In [19]:
#save the single-control-per-citation frame as a new csv file
#the default row index is written as well, as before
PCT_cit_ctrl_single.to_csv(path_or_buf="Patents data/PCT_cit_wCTRL_single3.csv")