In [None]:
from datetime import datetime

import numpy as np
import pandas as pd

# Getting citations 

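The purpose of this notebook is to identify which UK patents have been cited, using the EPO and PCT citation databases. It exports the citation records that refer to those patents, together with a table of the UK patents that received at least one citation, so that the patent data can be merged with the citation data.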

In [None]:
# Load the dataset of patents identified in the UK.
EPO_data = pd.read_csv(filepath_or_buffer="Patents data/UK_patent_teams.csv")

In [None]:
# Load the EPO <-> PCT application-number conversion table
# (pipe-delimited text file, first row is the header).
EPO_PCT = pd.read_csv(
    "Patents data/202001_EPO_PCT.txt",
    delimiter="|",
    header=0,
)

In [None]:
# Distinct application numbers present in the UK patent dataset.
EPO_list = list(pd.unique(EPO_data["app_nbr"]))

In [None]:
# Restrict the EPO_PCT table to rows whose application number appears in the UK dataset.
PCT_joint = EPO_PCT.loc[EPO_PCT["app_nbr"].isin(EPO_list)]

In [None]:
# PCT numbers belonging to the UK applications (one entry per row, duplicates kept).
PCT_joint_list = list(PCT_joint.loc[:, "pct_nbr"])

# Citation data

### EPO citation data

In [None]:
# Load the EPO citations database (pipe-delimited, first row is the header).
EPO_cit = pd.read_csv(
    "Patents data/Citations/202001_EPO_CITATIONS.txt",
    delimiter="|",
    header=0,
)

In [None]:
# Drop citation rows that have no cited publication date.
EPO_cit = EPO_cit.dropna(subset=["Cited_pub_date"])

In [None]:
# Keep only the citations whose cited application number is in the UK list.
EPO_cit_ind_GB = EPO_cit.loc[EPO_cit["Cited_App_nbr"].isin(EPO_list)]

In [None]:
# Convert the citing publication dates from YYYYMMDD numbers into 'YYYY/MM/DD' strings.
# Done as one vectorised pandas operation instead of a per-row strptime loop;
# the result is still a plain list aligned with the rows of EPO_cit_ind_GB.
# A value that does not parse as YYYYMMDD raises, as it did in the loop.
dateschanged = (
    pd.to_datetime(EPO_cit_ind_GB["Citing_pub_date"].astype(str), format="%Y%m%d")
    .dt.strftime("%Y/%m/%d")
    .tolist()
)
    

In [None]:
# Convert the cited publication dates into 'YYYY/MM/DD' strings.
# The values are cast to integers first (the original loop did int(t)), which
# strips a trailing ".0" when the column was read as floats.
# Vectorised, so this cell no longer depends on `datetime` having been imported
# by an earlier cell.
dateschanged2 = (
    pd.to_datetime(
        EPO_cit_ind_GB["Cited_pub_date"].astype("int64").astype(str),
        format="%Y%m%d",
    )
    .dt.strftime("%Y/%m/%d")
    .tolist()
)

In [None]:
# Replace both date columns with their formatted string versions.
# EPO_cit_ind_GB was produced by boolean filtering, so writing columns into it
# directly triggers SettingWithCopyWarning; .assign builds a new frame instead
# (column order is unchanged).
EPO_cit_ind_GB = EPO_cit_ind_GB.assign(
    Citing_pub_date=dateschanged,
    Cited_pub_date=dateschanged2,
)

In [None]:
# Write the UK-related EPO citations to CSV (the index is written as the first column).
EPO_cit_ind_GB.to_csv(path_or_buf="Patents data/EPO_citations_GB.csv")

In [None]:
# Count citation rows per cited publication number.
# After .count() every remaining column holds the number of non-null rows in that group.
EPO_cit_grouped = EPO_cit_ind_GB.groupby("Cited_pub_nbr").count()

In [None]:
# Summarise the per-publication citation counts in one table so that both
# figures are displayed (a cell only shows its last expression, so a separate
# .mean() call above .count() was computed and thrown away):
#   mean  -> average number of citations for a publication that was cited
#   count -> number of cited publications
# Figures noted by the original author: mean 2.54, 30,038 cited patents.
EPO_cit_grouped.agg(["mean", "count"])

In [None]:
# Distinct cited application numbers.
# These must come from the ungrouped citations: EPO_cit_grouped is the result of
# groupby(...).count(), so its "Cited_App_nbr" column holds row counts rather
# than application numbers.
EPO_cited = list(EPO_cit_ind_GB["Cited_App_nbr"].unique())

In [None]:
# Look up the EPO_PCT rows belonging to the cited applications.
PCT_also_cited = EPO_PCT.loc[EPO_PCT["app_nbr"].isin(EPO_cited)]

In [None]:
# PCT numbers of the cited applications, as a plain list.
# Derived directly from EPO_PCT instead of indexing the current value of
# PCT_also_cited: rebinding the same name from DataFrame to list made this cell
# fail when it was run a second time.
PCT_also_cited = list(EPO_PCT.loc[EPO_PCT["app_nbr"].isin(EPO_cited), "pct_nbr"])

### PCT citations

In [None]:
# Load the PCT citations database (pipe-delimited, first row is the header).
PCT_cit = pd.read_csv(
    "Patents data/Citations/202001_PCT_CITATIONS.txt",
    delimiter="|",
    header=0,
)

In [None]:
# Drop citation rows that have no cited publication date.
PCT_cit = PCT_cit.dropna(subset=["Cited_pub_date"])

In [None]:
# Keep only the PCT citations whose cited application number is one of the UK PCT numbers.
PCT_in_EPOGB = PCT_cit.loc[PCT_cit["Cited_App_nbr"].isin(PCT_joint_list)]

In [None]:
# Convert the citing publication dates from YYYYMMDD numbers into 'YYYY/MM/DD' strings.
# Vectorised with pandas, so this cell no longer relies on `datetime` having
# been imported by an earlier cell; the result is a plain list aligned with the
# rows of PCT_in_EPOGB. A value that does not parse as YYYYMMDD raises.
PCT_dateschanged = (
    pd.to_datetime(PCT_in_EPOGB["Citing_pub_date"].astype(str), format="%Y%m%d")
    .dt.strftime("%Y/%m/%d")
    .tolist()
)
    


In [None]:
# Convert the cited publication dates into 'YYYY/MM/DD' strings.
# The values are cast to integers first (the original loop did int(t)), which
# strips a trailing ".0" when the column was read as floats.
PCT_dateschanged2 = (
    pd.to_datetime(
        PCT_in_EPOGB["Cited_pub_date"].astype("int64").astype(str),
        format="%Y%m%d",
    )
    .dt.strftime("%Y/%m/%d")
    .tolist()
)

In [None]:
# Replace both date columns with their formatted string versions.
# PCT_in_EPOGB was produced by boolean filtering, so writing columns into it
# directly triggers SettingWithCopyWarning; .assign builds a new frame instead
# (column order is unchanged).
PCT_in_EPOGB = PCT_in_EPOGB.assign(
    Citing_pub_date=PCT_dateschanged,
    Cited_pub_date=PCT_dateschanged2,
)

In [None]:
# Write the UK-related PCT citations to CSV (the index is written as the first column).
PCT_in_EPOGB.to_csv(path_or_buf="Patents data/PCT_citations_GB.csv")

In [None]:
# Count citation rows per cited application number.
# After .count() every remaining column holds the number of non-null rows in that group.
PCT_cit_grouped = PCT_in_EPOGB.groupby("Cited_App_nbr").count()

In [None]:
# Move the group key out of the index into a regular "Cited_App_nbr" column.
# Guarded and non-inplace so the cell can be re-run safely: an unconditional
# in-place reset_index would add a stray "index" column on the second run.
if "Cited_App_nbr" not in PCT_cit_grouped.columns:
    PCT_cit_grouped = PCT_cit_grouped.reset_index()

In [None]:
# Check for overlap between PCT and EPO citations.
# The match is on "Cited_App_nbr", the group key. After groupby(...).count()
# every other column, including "Cited_pub_nbr", holds row counts and can never
# equal a PCT number. The earlier observation that there is no overlap was made
# with that count column and should be re-checked.
PCT_cit_grouped_also = PCT_cit_grouped[PCT_cit_grouped["Cited_App_nbr"].isin(PCT_also_cited)]

In [None]:
# Summarise the per-application citation counts in one small table instead of
# discarding the mean and dumping the whole frame:
#   mean  -> average number of citations for an application that was cited
#   count -> number of cited applications
# The "Cited_App_nbr" key column is an identifier, not a count, so it is left
# out; recent pandas versions raise when .mean() meets a non-numeric column.
# Figures noted by the original author: mean 3.04, 26,408 cited patents.
PCT_cit_grouped.drop(columns="Cited_App_nbr", errors="ignore").agg(["mean", "count"])

In [None]:
# Number of distinct cited application numbers in the grouped PCT citations.
PCT_cit_grouped["Cited_App_nbr"].nunique(dropna=False)

In [None]:
# Application numbers of the patents cited in the EPO data (one entry per citation row).
EPO_cit_ind_GB_list = list(EPO_cit_ind_GB.loc[:, "Cited_App_nbr"])



In [None]:
# Map the cited PCT numbers back to EPO application numbers through the EPO_PCT table.
PCT_in_EPOGB_list = list(PCT_in_EPOGB.loc[:, "Cited_App_nbr"])
PCT_in_EPOGB_mixed = EPO_PCT.loc[EPO_PCT["pct_nbr"].isin(PCT_in_EPOGB_list)]
PCT_in_EPOGB_mixed_list = list(PCT_in_EPOGB_mixed.loc[:, "app_nbr"])


In [None]:
# Number of distinct EPO application numbers reached through PCT citations.
len(frozenset(PCT_in_EPOGB_mixed_list))

In [None]:
# Combine the application numbers cited via EPO and via PCT (duplicates are kept).
cited_patents = [*EPO_cit_ind_GB_list, *PCT_in_EPOGB_mixed_list]

In [None]:
# Total number of entries in the combined list; it is a plain concatenation, so duplicates are counted.
len(cited_patents)

In [None]:
# Non-null row count for each column of the UK patent dataset, for comparison with the cited subset.
EPO_data.count()

In [None]:
# Rows of the UK patent dataset whose application number received at least one citation.
cited_patents_table = EPO_data.loc[EPO_data["app_nbr"].isin(cited_patents)]

In [None]:
# Number of distinct cited application numbers in the UK dataset.
cited_patents_table["app_nbr"].nunique(dropna=False)

In [None]:
# Write the cited UK patents to CSV (the index is written as the first column).
cited_patents_table.to_csv(path_or_buf="Patents data/UK_Patents_cited.csv")