In [1]:
pip install requests faker

Collecting requests
  Using cached requests-2.32.3-py3-none-any.whl (64 kB)
Collecting faker
  Using cached faker-37.3.0-py3-none-any.whl (1.9 MB)
Collecting urllib3<3,>=1.21.1
  Using cached urllib3-2.4.0-py3-none-any.whl (128 kB)
Collecting certifi>=2017.4.17
  Using cached certifi-2025.4.26-py3-none-any.whl (159 kB)
Collecting idna<4,>=2.5
  Using cached idna-3.10-py3-none-any.whl (70 kB)
Collecting charset-normalizer<4,>=2
  Using cached charset_normalizer-3.4.2-cp310-cp310-macosx_10_9_universal2.whl (201 kB)
Collecting tzdata
  Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: urllib3, tzdata, idna, charset-normalizer, certifi, requests, faker
Successfully installed certifi-2025.4.26 charset-normalizer-3.4.2 faker-37.3.0 idna-3.10 requests-2.32.3 tzdata-2025.2 urllib3-2.4.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Credentials for the DWS API, sent by the cells below as the X-DWS-OCS-Key and
# X-DWS-OCS-AppId request headers. They are read from the environment so that
# real secrets never have to be typed into (and saved with) the notebook; an
# unset variable falls back to the empty string, as the placeholders did.
import os

api_key = os.environ.get("DWS_OCS_KEY", "")
api_id = os.environ.get("DWS_OCS_APP_ID", "")

In [6]:
## GET OUTPUTS
# Request up to `number_of_outputs` records from the OCS_Output endpoint and
# save the response's "RecordList" to outputs.json.

import requests
import json

# Params
number_of_outputs = 1000

params = {"Limit": number_of_outputs}
headers = {
    "Content-Type": "application/json",
    "X-DWS-OCS-Key": api_key,
    "X-DWS-OCS-AppId": api_id,
}
outputs_response = requests.get(
    "https://dws.roche.com/DigitalWorkspace_OCS_API/rest/Sinequa/OCS_Output?",
    params=params,
    headers=headers,
    # requests never times out by default; without this a stalled server
    # would hang the kernel indefinitely.
    timeout=60,
)
# Surface HTTP errors (bad credentials, server failure) with their status code
# instead of failing later while decoding or indexing the body.
outputs_response.raise_for_status()
outputs_data = outputs_response.json()["RecordList"]
print(f"Number of outputs: {len(outputs_data)}")

# Serializing json
outputs_json_object = json.dumps(outputs_data, indent=4)

with open("outputs.json", "w") as outfile:
    outfile.write(outputs_json_object)

Number of outputs: 551


In [7]:
## GET WORKS
# Request up to `number_of_works` records from the OCS_Work endpoint and save
# the response's "RecordList" to works.json.

import requests
import json

# Params
number_of_works = 3000

params = {"Limit": number_of_works}
headers = {
    "Content-Type": "application/json",
    "X-DWS-OCS-Key": api_key,
    "X-DWS-OCS-AppId": api_id,
}
works_response = requests.get(
    "https://dws.roche.com/DigitalWorkspace_OCS_API/rest/Sinequa/OCS_Work?",
    params=params,
    headers=headers,
    # requests never times out by default; without this a stalled server
    # would hang the kernel indefinitely.
    timeout=60,
)
# Surface HTTP errors (bad credentials, server failure) with their status code
# instead of failing later while decoding or indexing the body.
works_response.raise_for_status()
works_data = works_response.json()["RecordList"]
print(f"Number of works: {len(works_data)}")

# Serializing json
works_json_object = json.dumps(works_data, indent=4)

with open("works.json", "w") as outfile:
    outfile.write(works_json_object)

Number of works: 2794


In [8]:
# MERGE WORKS AND OUTPUTS

# Both inputs are expected to be lists of dicts. The merged list keeps every
# work first, followed by every output, and is saved as original_data.json.
combined = [*works_data, *outputs_data]
combined_json_object = json.dumps(combined, indent=4)

with open("original_data.json", "w") as outfile:
    outfile.write(combined_json_object)

In [9]:
### Fetching expert IDs from the Digital Workspace Service (DWS) API
# Collect the distinct "ExpertId" values found in `combined`, request one
# profile per ID and save the profiles that came back to experts.json.
# `headers` is the request-header dict built by the download cells above.
experts = []

# dict.fromkeys de-duplicates in a single pass while keeping first-seen order.
# Records without an "ExpertId" are skipped: the previous mix of .get() for
# the check and [] for the append raised KeyError on such a record.
experts_ids = list(
    dict.fromkeys(
        item["ExpertId"] for item in combined if item.get("ExpertId") is not None
    )
)

print(f"Number of unique expert IDs: {len(experts_ids)}")

for expert_id in experts_ids:
    params = {"UserId": expert_id}
    expert_response = requests.get(
        "https://dws.roche.com/DigitalWorkspace_Home_API/rest/Sinequa/GetExpertProfile_GET?",
        params=params,
        headers=headers,
        # requests never times out by default; bound each of the many calls.
        timeout=60,
    )

    # Decode the body once, and only for successful responses.
    expert_records = (
        expert_response.json()["RecordList"]
        if expert_response.status_code == 200
        else []
    )

    if len(expert_records) > 0:
        experts.append(expert_records[0])
    else:
        print(f"Failed to fetch expert profile for ID: {expert_id}")

print(
    f"Fetched {len(experts)} experts out of processed {len(combined)} outputs and works."
)

experts_json_object = json.dumps(experts, indent=4)

with open("experts.json", "w") as outfile:
    outfile.write(experts_json_object)

Number of unique expert IDs: 114
Failed to fetch expert profile for ID: 1049157
Fetched 113 experts out of processed 3345 outputs and works.


In [10]:
import faker
import json

# Faker instance used further down to generate replacement names and phone
# numbers.
fake = faker.Faker()
# JSON files read by the anonymization loop below.
original_file = "original_data.json"
expert_file = "experts.json"


# Keys that remove_data() deletes from a record when they are present.
# NOTE(review): "Recomendations" presumably mirrors the spelling used in the
# source data - confirm before "correcting" it.
remove_list = [
    "EncodedTitle",
    "ClickURL",
    "externalJobs",
    "Photo",
    "Content",
    "internalProjects",
    "Recomendations",
    "ActivityLinks",
    "Username",
]
# Attributes passed to anonymize_name() for expert profile records.
name_list = ["Title", "FirstName", "LastName"]
# Attributes passed to anonymize_name() for the works/outputs records.
works_list = ["Owner"]
# Attributes that anonymize_contacts() overwrites when they are present.
contact_list = ["Email", "Phone", "Mobile"]


def remove_data(processed_record):
    """Strip every key named in ``remove_list`` from ``processed_record``.

    The record is modified in place and nothing is returned. Keys that the
    record does not contain are ignored.
    """
    for unwanted_key in remove_list:
        # pop() with a default is a no-op when the key is absent.
        processed_record.pop(unwanted_key, None)


def anonymize_name(processed_record, list, name, last_name, full_name):
    """Overwrite the name attributes of ``processed_record`` with fake values.

    The record is modified in place; attributes it does not contain are left
    absent.

    Args:
        processed_record: Dict whose name attributes are replaced.
        list: Attribute names to anonymize. The parameter shadows the
            built-in ``list``; the name is kept so existing callers keep
            working.
        name: Replacement for attributes whose name contains "FirstName".
        last_name: Replacement for attributes whose name contains "LastName".
        full_name: Replacement for every other listed attribute.

    Returns:
        None.
    """
    for attribute in list:
        if attribute not in processed_record:
            continue
        if "FirstName" in attribute:
            replacement = name
        elif "LastName" in attribute:
            replacement = last_name
        else:
            replacement = full_name
        # Assign directly. The previous `value.replace(value, new)` produced
        # the same result for strings but raised AttributeError when the
        # stored value was None or any other non-string.
        processed_record[attribute] = replacement


def anonymize_contacts(processed_record, email, phone, mobile):
    """Overwrite the contact attributes of ``processed_record`` with fake values.

    Only the attributes named in ``contact_list`` are considered. The record
    is modified in place; attributes it does not contain are left absent.

    Args:
        processed_record: Dict whose contact attributes are replaced.
        email: Replacement for attributes whose name contains "Email".
        phone: Replacement for attributes whose name contains "Phone".
        mobile: Replacement for attributes whose name contains "Mobile".

    Returns:
        None.
    """
    for attribute in contact_list:
        if attribute not in processed_record:
            continue
        if "Email" in attribute:
            replacement = email
        elif "Phone" in attribute:
            replacement = phone
        elif "Mobile" in attribute:
            replacement = mobile
        else:
            # Not a recognised contact kind: leave the value untouched.
            continue
        # Assign directly. The previous `value.replace(value, new)` produced
        # the same result for strings but raised AttributeError when the
        # stored value was None (e.g. a null phone number) or a non-string.
        processed_record[attribute] = replacement


# Load the merged works/outputs records and the expert profiles. Context
# managers close the files, which json.load(open(...)) left open.
with open(original_file) as infile:
    data = json.load(infile)
with open(expert_file) as infile:
    expert_data = json.load(infile)


def _new_fake_identity():
    """Generate one self-consistent fake person (names, e-mail, phone numbers)."""
    first_name = fake.first_name()
    last_name = fake.last_name()
    return {
        "name": first_name,
        "last_name": last_name,
        "full_name": f"{first_name} {last_name}",
        "email": f"{first_name.lower()}.{last_name.lower()}@roche.com",
        "phone": fake.phone_number(),
        "mobile": fake.phone_number(),
    }


# One fake identity per expert ID, so an expert profile and all of that
# expert's records receive the same replacement values.
identities_by_expert_id = {}


def _identity_for(expert_id):
    """Return the fake identity for ``expert_id``, creating it on first use."""
    if expert_id is None:
        # No ID to key on: use a fresh identity rather than linking every
        # ID-less record to the same fake person.
        return _new_fake_identity()
    if expert_id not in identities_by_expert_id:
        identities_by_expert_id[expert_id] = _new_fake_identity()
    return identities_by_expert_id[expert_id]


def _anonymize_record(target, name_attributes, identity):
    """Drop the unwanted keys of ``target`` and overwrite its name/contact fields."""
    remove_data(target)
    anonymize_name(
        target,
        name_attributes,
        identity["name"],
        identity["last_name"],
        identity["full_name"],
    )
    anonymize_contacts(
        target, identity["email"], identity["phone"], identity["mobile"]
    )


for expert_record in expert_data:
    _anonymize_record(expert_record, name_list, _identity_for(expert_record.get("Id")))

# Every record is processed, not only those whose expert profile is present in
# experts.json. Previously, records of an expert whose profile could not be
# fetched were written out with their original names, contact details and
# removable fields intact.
for record in data:
    _anonymize_record(record, works_list, _identity_for(record.get("ExpertId")))

works_cleaned_json_object = json.dumps(data, indent=4)
experts_cleaned_json_object = json.dumps(expert_data, indent=4)
with open("works_cleaned.json", "w") as outfile:
    outfile.write(works_cleaned_json_object)
with open("experts_cleaned.json", "w") as outfile:
    outfile.write(experts_cleaned_json_object)