In [1]:
import re
import json
from pymongo import MongoClient

# MongoDB handles: source documents are read from `db`, cleaned copies are
# written to `cleaned_db` (both live on the same local server).
client = MongoClient(host='localhost', port=27017)
db = client['Metadata_FSTT']  # Replace with your database name
cleaned_db = client['Metadata_Cleaned']  # New database for cleaned documents

# Names of the source collections to transform, clean and export.
collections_to_process = [
    "equipes_recherche",
    "espace-entreprise",
    "fstt_actualites",
    "FORMATION-CONTINUE",
    "FORMATION-INITIALE",
    "FORMATION-INITIALE-informations",
    "FORMATION_CONTINUE_informations",
    "espace-etudiant-biblio",
    "espace-etudiant-clubs",
    "faculte_conseilEtab",
    "faculte_contact",
    "faculte_departements",
    "faculte_motdoyen",
    "faculte_presentation",
    "fstt_service",
    "fstt_spider",
    "recherche_struct",
]
# collections_to_process = [
#     "equipe_recherche", "espace_entreprise", "fstt_actualites",
#     "formation_continue", "formation_initiale", "formation_initiale_information",
#     "formation_continue_informations", "espace_etudiant_biblio", "espace_etudiant_clubs",
#     "faculte_conseilEtab", "faculte_contact", "faculte_departements",
#     "faculte_motdoyen", "faculte_presentation", "fstt_service",
#     "fstt_spider", "recherche_struct"
# ]

In [2]:
def _join_items(value, sep=" "):
    """Join a list-valued field into a single string.

    ``None`` (a missing or null field) yields an empty string, and a plain
    string is returned unchanged rather than being joined character by
    character. Any other iterable is joined with ``sep``.
    """
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    return sep.join(value)


def _raw_field(key):
    """Return a content builder that copies ``doc[key]`` verbatim."""
    def build(doc):
        return doc[key]
    return build


def _joined_field(key, sep):
    """Return a content builder that joins the required list field ``key``."""
    def build(doc):
        return _join_items(doc[key], sep)
    return build


def _labelled_fields(*pairs):
    """Return a content builder emitting one ``label: value`` line per pair.

    Each pair is ``(label, key)``; every key is required (KeyError if absent).
    """
    def build(doc):
        return "\n".join(f"{label}: {doc[key]}" for label, key in pairs)
    return build


def _optional_list_fields(*keys):
    """Return a content builder emitting ``key: joined items`` for optional list fields."""
    def build(doc):
        return "\n".join(f"{key}: {_join_items(doc.get(key))}" for key in keys)
    return build


def _research_team_content(doc):
    """Build the multi-line content of a research-team document."""
    # One "\n- nom prenom (email)" entry per member; join() avoids repeated
    # string concatenation and `or []` tolerates a null `membres` field.
    members = "".join(
        f"\n- {m.get('nom', '')} {m.get('prenom', '')} ({m.get('email', '')})"
        for m in (doc.get("membres") or [])
    )
    return "\n".join([
        f"laboratoire: {doc.get('laboratoire', '')}",
        f"directeur_infos: {doc.get('directeur_infos', '')}",
        f"axes_recherche: {_join_items(doc.get('axes_recherche'))}",
        f"projets_recherche_link: {_join_items(doc.get('projets_recherche_link'))}",
        f"these_habil_soutenues_link: {_join_items(doc.get('these_habil_soutenues_link'))}",
        f"prod_scientifique: {_join_items(doc.get('prod_scientifique'))}",
        "membres:" + members,
        # NOTE(review): the label says "other_membre_key" but the field read is
        # 'other_key' -- kept as-is, confirm which name is intended.
        f"other_membre_key: {doc.get('other_key', '')}",
    ])


def _continuing_education_info_content(doc):
    """Build the content of a FORMATION_CONTINUE_informations document."""
    return (
        f"Filiere: {doc['Filiere']}\n"
        f"Objectif: {doc['Objectif']}\n"
        f"Public_concerne: {doc['Public_concerne']}\n"
        f"Debouche: {_join_items(doc['debouche'], ', ')}"
    )


# collection name -> (output "type", key holding the title, content builder)
_COLLECTION_SPECS = {
    "equipes_recherche": ("research_team", "title", _research_team_content),
    "espace-entreprise": (
        "entrepreneurship_space",
        "title",
        _optional_list_fields("qui_sommes_nous", "objectif", "Comment", "activite_service"),
    ),
    "fstt_actualites": ("news", "title", _raw_field("content")),
    "FORMATION-CONTINUE": ("continuing_education", "title", _raw_field("cleaned_content")),
    "FORMATION-INITIALE": ("initial_education", "title", _raw_field("Content")),
    "FORMATION-INITIALE-informations": (
        "initial_education_info",
        "title",
        _labelled_fields(
            ("Formation", "Formation"),
            ("Objectifs", "Objectifs"),
            ("Programme", "Programme"),
        ),
    ),
    "FORMATION_CONTINUE_informations": (
        "continuing_education_info",
        "Formation",
        _continuing_education_info_content,
    ),
    "espace-etudiant-biblio": (
        "student_space_biblio",
        "title_biblio",
        _joined_field("info_biblio", "\n"),
    ),
    "espace-etudiant-clubs": ("student_space_club", "title_club", _raw_field("info_club")),
    "faculte_conseilEtab": (
        "faculty_council",
        "Title",
        _labelled_fields(
            ("Brief", "Brief"),
            ("Responsabilite", "Responsabilite"),
            ("Name", "Name"),
        ),
    ),
    "faculte_contact": (
        "faculty_contact",
        "title",
        _labelled_fields(
            ("Localisation", "localisation"),
            ("Numero Telephone", "numero_telephone"),
            ("Fax", "fax"),
            ("Email", "email"),
        ),
    ),
    "faculte_departements": (
        "faculty_department",
        "title",
        _labelled_fields(("Chef", "chef"), ("Email", "email")),
    ),
    "faculte_motdoyen": ("deans_message", "title", _raw_field("content")),
    "faculte_presentation": ("faculty_presentation", "title", _raw_field("content")),
    "fstt_service": (
        "service",
        "service",
        _labelled_fields(("Brief", "Brief"), ("Content", "content")),
    ),
    "fstt_spider": ("announcement", "title", _raw_field("Content")),
    "recherche_struct": ("research_lab", "title", _raw_field("laboratoire")),
}


def transform_collection(collection_name, documents):
    """Normalise the documents of one source collection into a common schema.

    Args:
        collection_name: Name of the source collection; selects the mapping
            in ``_COLLECTION_SPECS``.
        documents: Iterable of dict-like documents (consumed once).

    Returns:
        A list of dicts with the keys ``id`` (stringified ``_id``), ``type``,
        ``title``, ``content`` and ``metadata`` (``{"url": ...}``). An unknown
        ``collection_name`` yields an empty list without reading ``documents``.

    Raises:
        KeyError: If a document lacks a field its mapping reads with direct
            indexing (the title key, ``url``, or a required content field).
    """
    spec = _COLLECTION_SPECS.get(collection_name)
    if spec is None:
        return []

    doc_type, title_key, build_content = spec
    return [
        {
            "id": str(doc.get("_id", "")),
            "type": doc_type,
            "title": doc[title_key],
            "content": build_content(doc),
            "metadata": {"url": doc["url"]},
        }
        for doc in documents
    ]

In [3]:
def clean_text(text):
    """Normalise a piece of text for storage.

    Characters other than word characters, whitespace and the punctuation
    ``, . ? ! : ; - ' " ( ) @ #`` are removed, then every run of whitespace
    (spaces, tabs, newlines) is collapsed to a single space.

    Args:
        text: A ``str`` or ``bytes`` value. ``bytes`` are decoded as UTF-8,
            with undecodable bytes replaced (and then stripped as special
            characters).

    Returns:
        The cleaned ``str``; an empty string when ``text`` is neither ``str``
        nor ``bytes``.
    """
    if isinstance(text, bytes):
        # The regexes below are str patterns; applying them to bytes raises TypeError.
        text = text.decode('utf-8', errors='replace')
    elif not isinstance(text, str):
        return ''  # Return an empty string if text is not a string or bytes-like object

    # Remove special characters except basic punctuation
    text = re.sub(r'[^\w\s,.?!:;\-\'\"()@#]', '', text)
    # Collapse whitespace last so gaps left by removed characters do not
    # survive as double spaces
    text = re.sub(r'\s+', ' ', text)
    return text
def clean_document(doc):
    """Clean the ``title`` and ``content`` of a document in place.

    Both fields are passed through ``clean_text`` and written back to the same
    dict, which is also returned. A missing field raises ``KeyError``.
    """
    for field in ('title', 'content'):
        doc[field] = clean_text(doc[field])
    return doc

In [4]:
def process_and_store_documents(collection_name):
    """Transform, clean and store every document of one source collection.

    Documents are read from ``db[collection_name]``, normalised with
    ``transform_collection``, cleaned with ``clean_document`` and written to
    ``cleaned_db[collection_name]``.

    The write is an upsert keyed on the normalised ``id`` field, so re-running
    the cell updates the stored documents instead of appending a duplicate
    copy of each one. Duplicates already present from earlier runs are not
    removed.

    Args:
        collection_name: Name of the collection, used for both the source and
            the cleaned database.
    """
    # Local import: ReplaceOne is only needed here and pymongo is already a
    # dependency of this notebook.
    from pymongo import ReplaceOne

    source_documents = db[collection_name].find()

    transformed_documents = transform_collection(collection_name, source_documents)
    print(f"Transformed documents from {collection_name}: {len(transformed_documents)}")

    cleaned_documents = [clean_document(doc) for doc in transformed_documents]
    print(f"Cleaned documents from {collection_name}: {len(cleaned_documents)}")

    if not cleaned_documents:
        print(f"No documents to store from {collection_name}")
        return

    # Store cleaned documents in the new collection; one upsert per document
    # keeps the operation idempotent across notebook re-runs.
    cleaned_collection = cleaned_db[collection_name]
    cleaned_collection.bulk_write(
        [ReplaceOne({"id": doc["id"]}, doc, upsert=True) for doc in cleaned_documents],
        ordered=False,
    )
    print(f"Processed and stored documents from {collection_name}")
# Run the transform -> clean -> store pipeline once for every configured collection.
for collection_name in collections_to_process:
    process_and_store_documents(collection_name)

Transformed documents from equipes_recherche: 20
Cleaned documents from equipes_recherche: 20
Processed and stored documents from equipes_recherche
Transformed documents from espace-entreprise: 1
Cleaned documents from espace-entreprise: 1
Processed and stored documents from espace-entreprise
Transformed documents from fstt_actualites: 568
Cleaned documents from fstt_actualites: 568
Processed and stored documents from fstt_actualites
Transformed documents from FORMATION-CONTINUE: 1
Cleaned documents from FORMATION-CONTINUE: 1
Processed and stored documents from FORMATION-CONTINUE
Transformed documents from FORMATION-INITIALE: 1
Cleaned documents from FORMATION-INITIALE: 1
Processed and stored documents from FORMATION-INITIALE
Transformed documents from FORMATION-INITIALE-informations: 33
Cleaned documents from FORMATION-INITIALE-informations: 33
Processed and stored documents from FORMATION-INITIALE-informations
Transformed documents from FORMATION_CONTINUE_informations: 12
Cleaned doc

In [5]:
import os

# Define the folder path to store JSON files
output_folder = "cleaned_documents"

# Create the output folder if it doesn't exist; exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_folder, exist_ok=True)

def fetch_documents_and_store_in_json(collection_name, output_folder):
    """Export one cleaned collection to a JSON file.

    Reads every document of ``cleaned_db[collection_name]`` and writes them as
    a JSON array to ``<output_folder>/cleaned_<collection_name>.json``
    (UTF-8, non-ASCII characters kept as-is, 4-space indent).

    Args:
        collection_name: Collection to export from the cleaned database.
        output_folder: Directory that receives the JSON file.
    """
    # Keep only the normalised fields; MongoDB's own _id is excluded.
    wanted_fields = {'_id': 0, 'id': 1, 'type': 1, 'title': 1, 'content': 1, 'metadata': 1}
    records = list(cleaned_db[collection_name].find({}, wanted_fields))

    target_path = os.path.join(output_folder, f'cleaned_{collection_name}.json')
    with open(target_path, 'w', encoding='utf-8') as handle:
        json.dump(records, handle, ensure_ascii=False, indent=4)

    print(f"Stored cleaned documents from {collection_name} in {target_path}")

# Store cleaned documents for each collection in a separate JSON file within the output folder.
# The export reads from cleaned_db, so the collections must already have been populated there.
for collection_name in collections_to_process:
    fetch_documents_and_store_in_json(collection_name, output_folder)

Stored cleaned documents from equipes_recherche in cleaned_documents/cleaned_equipes_recherche.json
Stored cleaned documents from espace-entreprise in cleaned_documents/cleaned_espace-entreprise.json
Stored cleaned documents from fstt_actualites in cleaned_documents/cleaned_fstt_actualites.json
Stored cleaned documents from FORMATION-CONTINUE in cleaned_documents/cleaned_FORMATION-CONTINUE.json
Stored cleaned documents from FORMATION-INITIALE in cleaned_documents/cleaned_FORMATION-INITIALE.json
Stored cleaned documents from FORMATION-INITIALE-informations in cleaned_documents/cleaned_FORMATION-INITIALE-informations.json
Stored cleaned documents from FORMATION_CONTINUE_informations in cleaned_documents/cleaned_FORMATION_CONTINUE_informations.json
Stored cleaned documents from espace-etudiant-biblio in cleaned_documents/cleaned_espace-etudiant-biblio.json
Stored cleaned documents from espace-etudiant-clubs in cleaned_documents/cleaned_espace-etudiant-clubs.json
Stored cleaned documents f