In [None]:
import pandas as pd
import numpy as np

# Creating EPO controls

This workbook creates controls for the EPO dataset. Because the EPO and PCT datasets will be combined for the analysis, controls for the original EPO dataset are not restricted to patents whose initial application was filed under the EPO. Thus, controls can come from both the EPO and the PCT datasets.

These datasets come from the results found in: Control_data.ipynb and Exploring_citation_data.ipynb

Given that creating these controls results in a dataset containing over 150,000,000 rows, the EPO and PCT controls are created separately. As a result of this size, this workbook is computationally expensive in terms of memory usage. My own computer is able to complete it thanks to having 16 GB of RAM available.

In [None]:
# Load the EPO citation records, i.e. the citations that appear in the EPO information.
EPO_cit = pd.read_csv(filepath_or_buffer="Patents data/EPO_cit_wIPC_code.csv")

In [None]:
# Relabel the unnamed first column as "Index". It is used later as the groupby
# key because it is the only column that differs for every result.
EPO_cit.rename({"Unnamed: 0": "Index"}, axis="columns", inplace=True)

In [None]:
# Keep only the IPC subclass, i.e. the first four characters of the IPC string,
# and store it in its own column.
# Approach for taking the first n characters of a string column adapted from:
# http://www.datasciencemadesimple.com/return-first-n-character-from-left-of-column-in-pandas-python/
EPO_cit["IPC_subclass"] = EPO_cit["IPC"].str.slice(start=0, stop=4)

In [None]:
# Drop the columns that are not needed while building controls, to save space.
# These will be reintroduced at a later point.
EPO_cit.drop(
    columns=[
        "app_year",
        "PCT_Route",
        "Citn_lag_month",
        "Citn_lag_year",
        "Citn_category",
        "Citn_origin",
        "Cit_Total",
    ],
    inplace=True,
)

In [None]:
# Gather every application id that appears as either a citing or a cited
# application, de-duplicated through a set. Intended for making sure the
# control group contains no duplicates later on.
EPO_list = list(
    set([*EPO_cit["Citing_appln_id"], *EPO_cit["Cited_Appln_id"]])
)

In [None]:
# Load the table of potential control applications.
Pot_ctrl = pd.read_csv(filepath_or_buffer="Patents data/potential_control.csv")

In [None]:
#count the non-missing values in each column of the potential controls
#(displayed as the cell output; nothing is stored)
Pot_ctrl.count()

In [None]:
# The aim is for controls not to duplicate applications that already appear in
# the citation data.
# NOTE(review): the filter below is currently disabled, so this cell removes no
# ids from Pot_ctrl and EPO_list is not used here — confirm this is intended.
#Pot_ctrl = Pot_ctrl[~Pot_ctrl["appln_id"].isin(EPO_list)]
# As with the citation data, derive the IPC subclass (the first four characters
# of the IPC string) so that it can be used as a merge key.
Pot_ctrl["IPC_subclass"] = Pot_ctrl["IPC"].str.slice(start=0, stop=4)

In [None]:
# Attach the potential controls to the EPO citation data, matching on IPC
# subclass and priority year. A citation can match many controls, so the
# resulting dataframe is very large.
EPO_Cit_ctrl = pd.merge(
    EPO_cit,
    Pot_ctrl,
    how="left",
    on=["IPC_subclass", "prio_year"],
)


In [None]:
# Discard candidate rows in which the control is the citing application itself.
EPO_Cit_ctrl = EPO_Cit_ctrl.loc[
    EPO_Cit_ctrl["Citing_appln_id"].ne(EPO_Cit_ctrl["appln_id"])
]

In [None]:
#display the merged citation/control frame as the cell output
EPO_Cit_ctrl

In [None]:
# Give the merged columns descriptive names: the _x/_y IPC columns produced by
# the merge become the citing and control IPC, and appln_id becomes the
# control's application id.
EPO_Cit_ctrl.rename(
    {
        "IPC_x": "Citing_IPC",
        "IPC_y": "Control_IPC",
        "appln_id": "Control_appln_id",
    },
    axis="columns",
    inplace=True,
)

In [None]:
# Keep exactly one randomly chosen control per citation. Grouping on "Index"
# retains every individual result, since that column differs for each one.
#
# GroupBy.sample is used instead of groupby(...).apply(lambda x: x.sample(1))
# (the approach from
# https://stackoverflow.com/questions/22472213/python-random-selection-per-group):
#   * it avoids one Python-level function call per group on a very large frame;
#   * it does not depend on apply() passing the grouping column through, so the
#     "Index" column is always present in the output;
#   * it accepts a seed, so the selected controls are reproducible across runs.
# Requires pandas >= 1.1.
CONTROL_SAMPLE_SEED = 42  # arbitrary fixed seed; change it to draw a different set of controls
EPO_cit_ctrl_single = (
    EPO_Cit_ctrl
    .groupby("Index")
    .sample(n=1, random_state=CONTROL_SAMPLE_SEED)
    .reset_index(drop=True)
)


In [None]:
# Relabel the remaining control column so its meaning is clear.
EPO_cit_ctrl_single.rename(
    {"pct_nbr": "Control_pct_nbr"},
    axis="columns",
    inplace=True,
)


In [None]:
#display the single-control frame as the cell output to check the result
#and make sure that no columns have been left behind
EPO_cit_ctrl_single

In [None]:
# Write the single-control result to a new CSV file.
EPO_cit_ctrl_single.to_csv(path_or_buf="Patents data/EPO_cit_wCTRL_single2.csv")

# This notebook keeps many of its results in RAM, so no further operations can
# be performed here; a new notebook is created for the PCT citations and their
# subsequent controls.