# Data Exploration Notebook

In [1]:
import pandas as pd
# pd.set_option('max_colwidth', None)

## Train Data

In [2]:
# Load the raw training split from its parquet file into `df`.
df = pd.read_parquet('../data/train.parquet')

In [3]:
# Check column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3030 entries, 0 to 3029
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             3000 non-null   float64
 1   index                  3000 non-null   float64
 2   claim_id               2999 non-null   object 
 3   patent_application_id  2999 non-null   object 
 4   cited_document_id      3000 non-null   object 
 5   text                   2998 non-null   object 
 6   text_b                 3000 non-null   object 
 7   label                  3000 non-null   float64
 8   date                   3000 non-null   float64
 9   DIznQ_0                2999 non-null   float64
dtypes: float64(5), object(5)
memory usage: 236.8+ KB


In [4]:
# Summary statistics (count, mean, std, quantiles) for the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,index,label,date,DIznQ_0
count,3000.0,3000.0,3000.0,3000.0,2999.0
mean,2492561.0,2494046.0,0.583333,20147610.0,0.500294
std,1468050.0,1464272.0,0.493089,14093.13,0.287226
min,76.0,76.0,0.0,20120700.0,0.000915
25%,1225735.0,1229267.0,0.0,20140200.0,0.255147
50%,2482978.0,2486731.0,1.0,20150510.0,0.498018
75%,3769415.0,3765171.0,1.0,20160520.0,0.74869
max,5043290.0,5043290.0,1.0,20170330.0,0.999761


In [5]:
# Clean the raw training frame in one pass. Each step returns a new frame, so
# nothing is assigned onto an intermediate slice.
#
# Indexing: 'index' and 'Unnamed: 0' are assumed to correspond to each other,
# so 'index' is used and 'Unnamed: 0' is the fallback where 'index' is missing.
# The fallback deliberately keeps NaN when BOTH are missing: filling with 0
# first would make the dropna on 'updated_index' a no-op and would give every
# such row the same placeholder index 0 (and would also treat a genuine index
# of 0 as "missing").
#
# Missing values: only rows lacking 'updated_index', 'text', 'text_b' or
# 'label' are dropped. NaNs in the other columns ('claim_id',
# 'patent_application_id', 'cited_document_id', 'date', 'DIznQ_0') are kept on
# purpose, to retain as much training data as possible.
#
# Duplicates: de-duplicated on the text pair only, keeping the first occurrence.
# Useful checks when revisiting that choice:
#   len(df[df.duplicated(keep=False)])
#   len(df[df.duplicated(subset=['text', 'text_b'], keep=False)])
#   len(df[df.duplicated(subset=['text', 'text_b', 'label'], keep=False)])
#   len(df[df.duplicated(subset=['updated_index', 'text', 'text_b', 'label'], keep=False)])
df = (
    df
    # Rename by label rather than by position, so a different column order in
    # the parquet file cannot silently mislabel columns.
    .rename(columns={'Unnamed: 0': 'unnamed_col'})
    .assign(updated_index=lambda d: d['index'].fillna(d['unnamed_col']))
    # Drop 'unnamed_col' and the original 'index'; rearrange the column order.
    .loc[:, [
        'updated_index',
        'text',
        'text_b',
        'label',
        'claim_id',
        'patent_application_id',
        'cited_document_id',
        'date',
        'DIznQ_0',
    ]]
    .dropna(subset=['updated_index', 'text', 'text_b', 'label'])
    .drop_duplicates(subset=['text', 'text_b'], keep='first')
    .assign(
        # Safe to cast only now: the NaNs in these columns were dropped above.
        updated_index=lambda d: d['updated_index'].astype('int64'),
        label=lambda d: d['label'].astype('int64'),
        # Length in characters of the claim text and the cited paragraph.
        text_len=lambda d: d['text'].str.len(),
        text_b_len=lambda d: d['text_b'].str.len(),
    )
    # The original 'index' column is gone, so the name can be reused.
    .rename(columns={'updated_index': 'index'})
)


In [6]:
# Train Validation Split 20% - TODO later in the experiment

In [7]:
# Train dataset overview: sample count, distinct IDs and texts, median/mean
# text lengths in characters, and the number of rows per label.
print(
    f"Number of samples: {len(df)}",
    f"Distinct patent applications: {df['patent_application_id'].nunique()}",
    f"Distinct cited documents: {df['cited_document_id'].nunique()}",
    f"Distinct claim texts: {df['text'].nunique()}",
    f"Distinct cited paragraphs: {df['text_b'].nunique()}",
    f"Median claim length (chars): {df['text_len'].median()}",
    f"Median paragraph length (chars): {df['text_b_len'].median()}",
    f"Mean claim length (chars): {int(df['text_len'].mean())}",
    f"Mean paragraph length (chars): {int(df['text_b_len'].mean())}",
    f"Labels - 0, Non-novelty-destroying: {len(df[df['label'] == 0])}",
    f"Labels - 1, Novelty-destroying: {len(df[df['label'] == 1])}",
    sep="\n",
)

Number of samples: 2912
Distinct patent applications: 2346
Distinct cited documents: 2382
Distinct claim texts: 2854
Distinct cited paragraphs: 2899
Median claim length (chars): 271.0
Median paragraph length (chars): 479.5
Mean claim length (chars): 391
Mean paragraph length (chars): 578
Labels - 0, Non-novelty-destroying: 1214
Labels - 1, Novelty-destroying: 1698


### Truncate and Save Train Data

In [8]:
# Keep only the modelling columns, use 'index' as the row index, and write
# the result out as the cleaned training file.
df_train = df[['index', 'text', 'text_b', 'label']].set_index('index', drop=True)
df_train.to_parquet('../data/train_clean.parquet')

In [9]:
# Read the saved cleaned training file back and display it.
df_train = pd.read_parquet('../data/train_clean.parquet')
df_train

Unnamed: 0_level_0,text,text_b,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2165095,Method for processing a current image of an im...,Figure 14 shows one embodiment of a flow diagr...,0
1840726,Seamless quenched and tempered steel pipe acco...,A tensile test is carried out with respect to ...,0
642170,The apparatus of claim 1 further comprising me...,For example when packet data is expected to be...,1
2820665,THE METHOD FOR CHARACTERISING THE STATE OF USE...,First an IR image of the printed matter P2 is ...,0
2452920,The solar cell module of claim 1 wherein the i...,The plurality of emitter regions 119a are sepa...,1
...,...,...,...
199239,A method according to any of the above wherein...,Then the Sidoped GaN layer is exposed by etchi...,1
636389,The filament winding method according to claim...,The present invention relates to filament wind...,1
3730181,A storage battery evaluating method comprising...,The above results may be said to indicate that...,1
3419428,An electronic device comprising a display unit...,An indicator 201 may be further provided to in...,0


## Test Data

In [10]:
# Load the raw test split. Note: this rebinds `df`, replacing whatever frame
# the name referred to before this cell ran.
df = pd.read_parquet('../data/test.parquet')

In [11]:
# Check column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 808 entries, 0 to 807
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  800 non-null    float64
 1   claim_id               800 non-null    object 
 2   patent_application_id  800 non-null    object 
 3   cited_document_id      800 non-null    object 
 4   text                   800 non-null    object 
 5   text_b                 799 non-null    object 
 6   label                  800 non-null    float64
 7   date                   800 non-null    float64
 8   DIznQ_0                800 non-null    float64
dtypes: float64(4), object(5)
memory usage: 56.9+ KB


In [12]:
# Summary statistics (count, mean, std, quantiles) for the numeric columns.
df.describe()

Unnamed: 0,index,label,date,DIznQ_0
count,800.0,800.0,800.0,800.0
mean,5633061.0,0.45125,20171450.0,0.503976
std,362112.7,0.497929,2436.061,0.281988
min,4992661.0,0.0,20170330.0,0.004344
25%,5337198.0,0.0,20170610.0,0.263315
50%,5622598.0,0.0,20170800.0,0.518379
75%,5957261.0,1.0,20171100.0,0.734159
max,6259093.0,1.0,20180130.0,0.999912


In [13]:
# Clean the raw test frame in one pass. Each step returns a new frame, so no
# column is assigned onto a column-subset slice (which can trigger pandas'
# SettingWithCopyWarning).
#
# Rows missing 'index', 'text', 'text_b' or 'label' are dropped, then exact
# repeats of that four-column combination are dropped.
# NOTE(review): the training cell de-duplicates on ['text', 'text_b'] only -
# confirm that the stricter key used here is intentional.
df = (
    df
    .dropna(subset=['index', 'text', 'text_b', 'label'])
    .drop_duplicates(subset=['index', 'text', 'text_b', 'label'])
    # Rearrange the column order.
    .loc[:, [
        'index',
        'text',
        'text_b',
        'label',
        'claim_id',
        'patent_application_id',
        'cited_document_id',
        'date',
        'DIznQ_0',
    ]]
    .assign(
        # Safe to cast: the NaNs in both columns were dropped above.
        index=lambda d: d['index'].astype('int64'),
        label=lambda d: d['label'].astype('int64'),
        # Length in characters of the claim text and the cited paragraph.
        text_len=lambda d: d['text'].str.len(),
        text_b_len=lambda d: d['text_b'].str.len(),
    )
)

In [14]:
# Display the cleaned test frame, including the added length columns.
df

Unnamed: 0,index,text,text_b,label,claim_id,patent_application_id,cited_document_id,date,DIznQ_0,text_len,text_b_len
0,5770811,The system 200 of any of claims 2 to 5 the hol...,The maximum outer diameter of the stentvalve 1...,0,203152_0,EP3213718A1,EP2520251,20170906.0,,135,494
1,6177765,Method for monitoring tactical simulations acc...,Then at step 406 the processor 705 begins to r...,1,203466_1,EP3264394A1,EP2066128,20180103.0,0.509102,297,406
2,6113378,The collecting bag of claim 1 wherein the one ...,In a further aspect the invention provides a v...,0,217047_0,EP3254647A1,EP985390,20171213.0,0.515626,372,274
3,6082010,A container according to any preceding claim w...,To take tobacco 18 out of the tobacco pouch 23...,0,215557_0,EP3251971A1,EP2845498,20171206.0,0.651643,196,515
4,6025661,The mobile terminal100 of any one of claims 1 ...,In the unselected state a whole stack 401 show...,0,221861_0,EP3246802A1,EP2434380,20171122.0,0.282341,338,687
...,...,...,...,...,...,...,...,...,...,...,...
803,5761450,A development cartridge mounted in a mounting ...,The fixing device 60 includes a heating roller...,1,197249_1,EP3214503A1,EP2037327,20170906.0,0.292397,836,403
804,5188305,The device 100 200 300 400 500 of claim 4 wher...,The embodiments of the present invention may b...,1,174819_1,EP3163955A1,EP2552162,20170503.0,0.244995,426,712
805,5056828,The motor drive system according to claim 1 wh...,When the trigger switch 12 is turned ON the CP...,0,117374_0,EP3150338A1,EP2140983,20170405.0,0.665531,468,555
806,5089201,The machine according to the preceding claim f...,Further characteristics and advantages of the ...,1,123990_1,EP3153028A1,EP2478803,20170412.0,0.121884,180,334


In [15]:
# Test dataset overview (`df` holds the test frame at this point): sample
# count, distinct IDs and texts, median/mean text lengths in characters, and
# the number of rows per label.
print(
    f"Number of samples: {len(df)}",
    f"Distinct patent applications: {df['patent_application_id'].nunique()}",
    f"Distinct cited documents: {df['cited_document_id'].nunique()}",
    f"Distinct claim texts: {df['text'].nunique()}",
    f"Distinct cited paragraphs: {df['text_b'].nunique()}",
    f"Median claim length (chars): {df['text_len'].median()}",
    f"Median paragraph length (chars): {df['text_b_len'].median()}",
    f"Mean claim length (chars): {int(df['text_len'].mean())}",
    f"Mean paragraph length (chars): {int(df['text_b_len'].mean())}",
    f"Labels - 0, Non-novelty-destroying: {len(df[df['label'] == 0])}",
    f"Labels - 1, Novelty-destroying: {len(df[df['label'] == 1])}",
    sep="\n",
)

Number of samples: 768
Distinct patent applications: 597
Distinct cited documents: 614
Distinct claim texts: 749
Distinct cited paragraphs: 766
Median claim length (chars): 289.0
Median paragraph length (chars): 478.5
Mean claim length (chars): 428
Mean paragraph length (chars): 566
Labels - 0, Non-novelty-destroying: 423
Labels - 1, Novelty-destroying: 345


### Truncate and Save Test Data

In [16]:
# Keep only the modelling columns, use 'index' as the row index, and write
# the result out as the cleaned test file.
df_test = df[['index', 'text', 'text_b', 'label']].set_index('index', drop=True)
df_test.to_parquet('../data/test_clean.parquet')

In [17]:
# Display the in-memory test frame that was just written to disk.
df_test

Unnamed: 0_level_0,text,text_b,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5770811,The system 200 of any of claims 2 to 5 the hol...,The maximum outer diameter of the stentvalve 1...,0
6177765,Method for monitoring tactical simulations acc...,Then at step 406 the processor 705 begins to r...,1
6113378,The collecting bag of claim 1 wherein the one ...,In a further aspect the invention provides a v...,0
6082010,A container according to any preceding claim w...,To take tobacco 18 out of the tobacco pouch 23...,0
6025661,The mobile terminal100 of any one of claims 1 ...,In the unselected state a whole stack 401 show...,0
...,...,...,...
5761450,A development cartridge mounted in a mounting ...,The fixing device 60 includes a heating roller...,1
5188305,The device 100 200 300 400 500 of claim 4 wher...,The embodiments of the present invention may b...,1
5056828,The motor drive system according to claim 1 wh...,When the trigger switch 12 is turned ON the CP...,0
5089201,The machine according to the preceding claim f...,Further characteristics and advantages of the ...,1


In [18]:
# Read the saved cleaned test file back and display it.
df_test = pd.read_parquet('../data/test_clean.parquet')
df_test

Unnamed: 0_level_0,text,text_b,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5770811,The system 200 of any of claims 2 to 5 the hol...,The maximum outer diameter of the stentvalve 1...,0
6177765,Method for monitoring tactical simulations acc...,Then at step 406 the processor 705 begins to r...,1
6113378,The collecting bag of claim 1 wherein the one ...,In a further aspect the invention provides a v...,0
6082010,A container according to any preceding claim w...,To take tobacco 18 out of the tobacco pouch 23...,0
6025661,The mobile terminal100 of any one of claims 1 ...,In the unselected state a whole stack 401 show...,0
...,...,...,...
5761450,A development cartridge mounted in a mounting ...,The fixing device 60 includes a heating roller...,1
5188305,The device 100 200 300 400 500 of claim 4 wher...,The embodiments of the present invention may b...,1
5056828,The motor drive system according to claim 1 wh...,When the trigger switch 12 is turned ON the CP...,0
5089201,The machine according to the preceding claim f...,Further characteristics and advantages of the ...,1


## Duplicated index check

In [19]:
# Collect the index values of the train and test sets into a single list
# (train first, then test) for the duplicated-index check.
check_indexes_list = df_train.index.tolist() + df_test.index.tolist()

In [23]:
# A set drops repeated values, so a length mismatch between the list and its
# set means at least one index value occurs more than once.
print(
    'Found duplicated index!'
    if len(set(check_indexes_list)) != len(check_indexes_list)
    else "No duplicated index found."
)

No duplicated index found.


3680