### Downloading and extracting dataset

In [3]:
#download the dataset
!gdown 'https://drive.google.com/uc?id=1va7plI-h6FDSL8-HaAi2WPWvc1WpnKKD'

Downloading...
From (original): https://drive.google.com/uc?id=1va7plI-h6FDSL8-HaAi2WPWvc1WpnKKD
From (redirected): https://drive.google.com/uc?id=1va7plI-h6FDSL8-HaAi2WPWvc1WpnKKD&confirm=t&uuid=bad80d2a-1d0f-4240-90cf-2a09c9cc6478
To: /content/datasets.tar.gz
100% 311M/311M [00:06<00:00, 46.9MB/s]


In [4]:
# unzip the file  (may take around 10 mins)
! tar -xvzf ./datasets.tar.gz

# remove the zip file
! rm datasets.tar.gz

Content_JSONs/Cited_2020_Uncited_2010-2019_Cleaned_Content_22k/CLEANED_CONTENT_DATASET_cited_patents_by_2020_uncited_2010-2019.json
Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TEST.json
Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TRAIN.json
Citation_JSONs/Citation_Train.json


In [5]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
!ln -s /content/drive/MyDrive/Fac/Master/IR/output ./output

ln: failed to create symbolic link './output/output': File exists


## Librairy install

In [7]:
!pip install nltk
! pip install rank-bm25



## import librairy

In [8]:
import pandas as pd
import json
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics.pairwise import linear_kernel
import nltk
from nltk.corpus import stopwords, wordnet

import matplotlib.pyplot as plt

import numpy as np
from tqdm.auto import tqdm

from rank_bm25 import BM25Okapi
import string



In [9]:
#import the neccessary nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')  # For lemmatization
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

### Helper functions

In [10]:
def load_json_data(file_path):
    with open(file_path, "r") as file:
        contents = json.load(file)
    return contents


def create_tfidf_matrix(citing_dataset, nonciting_dataset, vectorizer=TfidfVectorizer()):
    """
    Creates TF-IDF matrix for the given citing and non-citing datasets based on the specified text column.

    Parameters:
    citing_dataset (json)): DataFrame containing citing patents.
    nonciting_dataset (json): DataFrame containing non-citing patents.
    vectorizer (TfidfVectorizer, optional): TfidfVectorizer object for vectorizing text data.
                                             Defaults to TfidfVectorizer().

    Returns:
    tuple: A tuple containing TF-IDF matrices for citing and non-citing patents respectively.
           (tfidf_matrix_citing, tfidf_matrix_nonciting)
    """
    all_text = [patent['text'] for patent in citing_dataset + nonciting_dataset]

    # Vectorizing descriptions
    print("Vectorizing descriptions...")
    tfidf_matrix = vectorizer.fit_transform(tqdm(all_text, desc="TF-IDF"))

    # Since we're interested in similarities between citing and cited patents,
    # we need to split the TF-IDF matrix back into two parts
    split_index = len(citing_dataset)
    tfidf_matrix_citing = tfidf_matrix[:split_index]
    tfidf_matrix_nonciting = tfidf_matrix[split_index:]

    # Size of vocabulary
    print("Size of vocabulary:", len(vectorizer.vocabulary_))

    return tfidf_matrix_citing, tfidf_matrix_nonciting



def get_mapping_dict(mapping_df):
    """
    Creates dictionary of citing ids to non-citing id based on given dataframe (which is based on providedjson)

    Parameters:
    mapping_df (DataFrame): DataFrame containing mapping between citing and cited patents
    Returns:
    dict: dictionary of unique citing patent ids to list of cited patent ids
    """
    mapping_dict = {}

    for _, row in mapping_df.iterrows():
        key = row[0]  # Value from column 0
        value = row[2]  # Value from column 2
        if key in mapping_dict:
            mapping_dict[key].append(value)
        else:
            mapping_dict[key] = [value]

    return mapping_dict

def create_corpus(corpus, text_type):
    """
    Extracts text data from a corpus based on the specified text type.

    Parameters:
    corpus (list): List of dictionaries representing patent documents.
    text_type (str): Type of text to extract ('title', 'abstract', 'claim1', 'claims', 'description', 'fulltext').

    Returns:
    list: List of dictionaries with 'id' and 'text' keys representing each document in the corpus.
    """

    app_ids = [doc['Application_Number'] + doc['Application_Category'] for doc in corpus]

    cnt = 0 # count the number of documents without text
    texts = []  # list of texts
    ids_to_remove = []  # list of ids of documents without text, to remove them from the corpus

    if text_type == 'title':
        for doc in corpus:
            try:
                texts.append(doc['Content']['title'])
            except: # if the document does not have a title
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
        print(f"Number of documents without title: {cnt}")

    elif text_type == 'abstract':
        for doc in corpus:
            try:
                texts.append(doc['Content']['pa01'])
            except: # if the document does not have an abstract
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
        print(f"Number of documents without abstract: {cnt}")

    elif text_type == 'claim1':
        for doc in corpus:
            try:
                texts.append(doc['Content']['c-en-0001'])
            except: # if the document does not have claim 1
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
        print(f"Number of documents without claim 1: {cnt}")

    elif text_type == 'claims':
        # all the values with the key starting with 'c-en-', each element in the final list is a list of claims
        for doc in corpus:
            doc_claims = []
            for key in doc['Content'].keys():
                if key.startswith('c-en-'):
                    doc_claims.append(doc['Content'][key])
            if len(doc_claims) == 0:    # if the document does not have any claims
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
            else:
                doc_text_string = ' '.join(doc_claims)
                texts.append(doc_text_string)
        print(f"Number of documents without claims: {cnt}")

    elif text_type == 'description':
        # all the values with the key starting with 'p'
        for doc in corpus:
            doc_text = []
            for key in doc['Content'].keys():
                if key.startswith('p'):
                    doc_text.append(doc['Content'][key])
            if len(doc_text) == 0:  # if the document does not have any description
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
            else:
                doc_text_string = ' '.join(doc_text)
                texts.append(doc_text_string)
        print(f"Number of documents without description: {cnt}")

    elif text_type == 'fulltext':
        for doc in corpus:
            doc_text = list(doc['Content'].values())
            doc_text_string = ' '.join(doc_text)
            texts.append(doc_text_string)
        if cnt > 0:
            print(f"Number of documents without any text: {cnt}")

    else:
        raise ValueError("Invalid text type")

    if len(ids_to_remove) > 0:
        print(f"Removing {len(ids_to_remove)} documents without required text")
        for id_ in ids_to_remove[::-1]:
            idx = app_ids.index(id_)
            del app_ids[idx]

    # Create a list of dictionaries with app_ids and texts
    corpus_data = [{'id': app_id, 'text': text} for app_id, text in zip(app_ids, texts)]

    return corpus_data


def get_true_and_predicted(citing_to_cited_dict, recommendations_dict):
    """
    Get the true and predicted labels for the metrics calculation.

    Parameters:
    citing_to_cited_dict : dict of str : list of str
        Mapping between citing patents and the list of their cited patents
    recommendations_dict : dict of str : list of str
        Mapping between citing patents and the sorted list of recommended patents

    Returns:
    list of list
        True relevant items for each recommendation list.
    list of list
        Predicted recommended items for each recommendation list.
    int
        Number of patents not in the citation mapping
    """
    # Initialize lists to store true labels and predicted labels
    true_labels = []
    predicted_labels = []
    not_in_citation_mapping = 0

    # Iterate over the items in both dictionaries
    for citing_id in recommendations_dict.keys():
        # Check if the citing_id is present in both dictionaries
        if citing_id in citing_to_cited_dict:
            # If yes, append the recommended items from both dictionaries to the respective lists
            true_labels.append(citing_to_cited_dict[citing_id])
            predicted_labels.append(recommendations_dict[citing_id])
        else:
            not_in_citation_mapping += 1

    return true_labels, predicted_labels, not_in_citation_mapping



def mean_recall_at_k(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Recall@k for a list of recommendations.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.
    k : int
        Number of recommendations to consider.

    Returns:
    float
        Mean Recall@k value.
    """
    recalls_at_k = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate Recall@k for each recommendation list
        true_set = set(true)
        k = min(k, len(pred))
        relevant_count = sum(1 for item in pred[:k] if item in true_set)
        recalls_at_k.append(relevant_count / len(true_set))

    # Calculate the mean Recall@k
    mean_recall = sum(recalls_at_k) / len(recalls_at_k)

    return mean_recall

def mean_inv_ranking(true_labels, predicted_labels):
    """
    Calculate the mean of lists of the mean inverse rank of true relevant items
    in the lists of sorted recommended items.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.

    Returns:
    float
        Mean of lists of the mean inverse rank of true relevant items.
    """
    mean_ranks = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate the inverse rank of true relevant items
        # in the recommendation list
        ranks = []
        for item in true:
            try:
                rank = 1 / (pred.index(item) + 1)
            except ValueError:
                rank = 0  # If item not found, assign 0
            ranks.append(rank)

        # Calculate the mean inverse rank of true relevant items
        # in the recommendation list
        mean_rank = sum(ranks) / len(ranks)
        mean_ranks.append(mean_rank)

    # Calculate the mean of the mean inverse ranks across all recommendation lists
    mean_of_mean_ranks = sum(mean_ranks) / len(mean_ranks)

    return mean_of_mean_ranks


def mean_ranking(true_labels, predicted_labels):
    """
    Calculate the mean of lists of the mean rank of true relevant items
    in the lists of sorted recommended items.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.

    Returns:
    float
        Mean of lists of the mean rank of true relevant items.
    """
    mean_ranks = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate the rank of true relevant items
        # in the recommendation list
        ranks = []
        for item in true:
            try:
                rank = pred.index(item) + 1
            except ValueError:
                rank = len(pred)  # If item not found, assign the length of the list
            ranks.append(rank)

        # Calculate the mean rank of true relevant items
        # in the recommendation list
        mean_rank = sum(ranks) / len(ranks)
        mean_ranks.append(mean_rank)

    # Calculate the mean of the mean ranks across all recommendation lists
    mean_of_mean_ranks = sum(mean_ranks) / len(mean_ranks)

    return mean_of_mean_ranks



def mean_average_precision(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Average Precision for a list of recommendations.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.
    k : int
        Number of recommendations to consider.

    Returns:
    float
        Mean Average Precision value.
    """
    average_precisions = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate Average Precision for each recommendation list
        true_set = set(true)
        precision_at_k = []
        relevant_count = 0
        for i, item in enumerate(pred[:k]):
            if item in true_set:
                relevant_count += 1
                precision_at_k.append(relevant_count / (i + 1))
        average_precision = sum(precision_at_k) / len(true_set)
        average_precisions.append(average_precision)

    # Calculate the mean Average Precision
    mean_average_precision = sum(average_precisions) / len(average_precisions)

    return mean_average_precision

def top_k_ranks(citing, cited, cosine_similarities, k=10):
    # Create a dictionary to store the top k ranks for each citing patent
    top_k_ranks = {}
    for i, content_id in enumerate(citing):
        top_k_ranks[content_id['id']] = [cited[j]['id'] for j in np.argsort(cosine_similarities[i])[::-1][:k]]
    return top_k_ranks


## Loading dataset

In [11]:
json_citing_train = load_json_data("./Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TRAIN.json")
json_citing_test = load_json_data("./Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TEST.json")

json_nonciting = load_json_data("./Content_JSONs/Cited_2020_Uncited_2010-2019_Cleaned_Content_22k/CLEANED_CONTENT_DATASET_cited_patents_by_2020_uncited_2010-2019.json")
json_citing_to_cited = load_json_data("./Citation_JSONs/Citation_Train.json") # Citing ids are unique

citing_dataset_df = pd.DataFrame(json_citing_train)

nonciting_dataset_df = pd.DataFrame(json_nonciting)
mapping_dataset_df = pd.DataFrame(json_citing_to_cited)

* Dataset of the query : **citing_dataset_df**

* Dataset of the DB : **nonciting_dataset_df**

In [45]:
print((nonciting_dataset_df['Application_Number'] == '2221486').unique())


[False  True]


In [52]:
# citing_dataset_df.iloc[0].Content

# Reall  work

In [13]:
# merge all text into one column
citing_dataset_merge_df = pd.DataFrame(citing_dataset_df['Application_Number'])
citing_dataset_merge_df['Application_Category'] = citing_dataset_df['Application_Category']
citing_dataset_merge_df['text'] = citing_dataset_df['Content'].apply(lambda x: ' '.join(x.values()))

In [14]:

stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)
# Combine the stop words and punctuation into a single set
combined_null_words = stop_words.union(punctuation)

def tokenizing(text):
    """
    Cette fonction prend un texte en entrée, le tokenise, supprime les mots commun,
    met tout en minuscules et retire la ponctuation.

    Args:
    text (str): Le texte à tokeniser.

    Returns:
    list: Une liste de tokens nettoyés.
    """
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.lower() not in combined_null_words]
    return tokens

out = tokenizing(citing_dataset_merge_df['text'][0])
print(out)

['device', 'controlling', 'braking', 'trailer', 'device', '1', 'controlling', 'braking', 'trailer', 'comprises', 'one', 'control', 'line', '2', 'connectable', 'source', 'work', 'fluid', 'first', 'pressure', 'braking', 'line', '3', 'connectable', 'service', 'braking', 'system', '4', 'trailer', 'communicating', 'control', 'line', '2', 'one', 'additional', 'line', '5', 'connectable', 'source', 'work', 'fluid', 'second', 'pressure', 'one', 'emergency', 'line', '6', 'connectable', 'additional', 'line', '5', 'connectable', 'emergency', 'and/or', 'parking', 'brake', '7', 'trailer', 'type', 'hydraulically', 'released', 'spring', 'brake', 'one', 'discharge', 'line', '8', 'work', 'fluid', 'communicating', 'collection', 'tank', '9', 'first', 'valve', 'means', 'operable', 'braking', 'position', 'wherein', 'additional', 'line', '5', 'isolated', 'discharge', 'line', '8', 'emergency', 'position', 'wherein', 'additional', 'line', '5', 'communicating', 'discharge', 'line', '8', 'second', 'valve', 'mean

In [28]:
def get_most_significant_word(text, top_k=200) :
    bm25 = BM25Okapi([text])
    score = []
    df = pd.DataFrame(text, columns=['token'])

    for i in range(len(df)):
        query = df['token'][i]
        df.loc[i, 'score'] = bm25.get_scores([query])[0]

    return df.sort_values(by=['score'], ascending=False)[:top_k]

out_2 = get_most_significant_word(out, top_k=6)
print(out_2)

              token     score
2789   deactivating -0.274653
2790  corresponding -0.274653
2848           last -0.274653
2814         remain -0.274653
2846           good -0.274653
1139          vi-vi -0.274653


In [29]:
def do_poss_tagging(text, pos_tags=None) :
  if pos_tags is None :
    nouns = ['NN', 'NNS', 'NNP', 'NNPS']
    verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    adverbes = ['RB', 'RBR', 'RBS']
    pos_tags = nouns + verbs + adverbes #what type to keep

  out = nltk.pos_tag(text)

  out = [word for word, pos in out if pos in pos_tags]

  return out

out_3 = do_poss_tagging(out_2['token'])
print(out_3)

['deactivating', 'corresponding', 'vi-vi']


In [17]:
# citing_dataset_merge_df['text'] = citing_dataset_merge_df['text'].apply(lambda x: get_most_significant_word([x], top_k=200)['token'].tolist())

In [18]:
def get_synonyms_list(word_list):
  def get_synonyms(word):

      # look for the synonyms, discard same word has results
      for syn in wordnet.synsets(word):
          for lemma in syn.lemmas():
              if lemma.name()!= word :
                return lemma.name()
      return None

  for word in word_list:
    synonym = get_synonyms(word)
    if synonym:
      word_list.append(synonym)
  return word_list

# out_4 = get_synonyms_list(out_3[:20])
# print(out_4)


In [19]:
# len(out_4)

In [20]:
save_location = "./output/"
def save_df(df, save_path):
    df.to_json(save_path)

In [21]:
def pipe_saver(df, top_k=500, save=True):

  name = f"prepro_top_{top_k}"
  # create the dir of the the reponse
  if save :
      # Create the directory
      print(save_location+name)
      save_location_path = save_location+name +"/"
      os.makedirs(save_location_path, exist_ok=True)

  # first tokenize
  df['token'] = df['text'].apply(lambda x: tokenizing(x))
  if save :
    save_df(df, save_location_path + "tokenized.json")

  # then get the most significant word
  df.loc['token'] = df['token'].apply(lambda x: get_most_significant_word(x, top_k=top_k)['token'].tolist())
  if save :
    save_df(df, save_location_path + "most_significant_word.json")

  # postaging
  df.loc['token'] = df['token'].apply(lambda x: do_poss_tagging(x))
  if save :
    save_df(df, save_location_path + "postagged.json")

  # get synonyms
  # df['token'] = df['token'].apply(lambda x: get_synonyms_list(x))

  # if save :
  #   save_df(df, save_location_path + "full_pipeline.json")

  return df


In [53]:
# citing_dataset_merge_df[1].text

In [23]:
pipe_saver(citing_dataset_merge_df[:2], top_k=100, save=True)

./output/prepro_top_100


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['token'] = df['text'].apply(lambda x: tokenizing(x))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc['token'] = df['token'].apply(lambda x: get_most_significant_word(x, top_k=top_k)['token'].tolist())


TypeError: 'float' object is not iterable