In [24]:
from pymongo import MongoClient
from pathlib import Path
import sys
import json
import os
import random

In [33]:
def get_credentials():
    """Load MongoDB connection settings from ``config/client.json``.

    The file is looked up relative to the current working directory and must
    be a JSON object with the keys ``username``, ``pwd``, ``hostname`` and
    ``port``.

    Returns:
        tuple: ``(username, pwd, hostname, port)`` exactly as stored in the file.

    Raises:
        FileNotFoundError: if the config file does not exist.
        KeyError: if one of the four expected keys is missing.
    """
    config_file = Path(os.getcwd(), 'config', 'client.json')
    with config_file.open('r') as handle:
        settings = json.load(handle)
    wanted_fields = ('username', 'pwd', 'hostname', 'port')
    return tuple(settings[field] for field in wanted_fields)

def db_connect():
    """Open a MongoDB client using the credentials from ``get_credentials()``.

    The connection is validated with ``client.server_info()`` before success
    is reported, so "Connected to MongoDB" is only printed once the server has
    actually answered.

    Returns:
        MongoClient: a client whose connection has been validated.

    Raises:
        SystemExit: with status 1 if creating the client or validating the
            connection raises any exception; the error is printed first.
    """
    username, pwd, hostname, port = get_credentials()
    client = None
    try:
        # NOTE(review): credentials are interpolated into the URI as-is. If the
        # config stores raw (not percent-escaped) values containing characters
        # such as '@', ':' or '/', they would need urllib.parse.quote_plus —
        # confirm how config/client.json stores them before changing this.
        client = MongoClient(f'mongodb://{username}:{pwd}@{hostname}:{port}')
        client.server_info()  # Validate connection
        print("Connected to MongoDB")
    except Exception as e:
        print(f"Error connecting to MongoDB: {e}")
        # Release the half-open client before bailing out.
        if client is not None:
            client.close()
        sys.exit(1)
    return client

def get_collection(collection_name, dbname="manifoldDB"):
    """Connect to MongoDB and return a collection handle plus its client.

    Args:
        collection_name: name of the collection to open.
        dbname: name of the database holding the collection. Defaults to
            ``"manifoldDB"``, the value that was previously hard-coded.

    Returns:
        tuple: ``(collection, client)``. The client is returned so the caller
        can close it when finished with the collection.
    """
    client = db_connect()
    db = client[dbname]
    collection = db[collection_name]
    return collection, client

def save_as_json(data, filename):
    """Serialize ``data`` to ``filename`` as JSON indented by 4 spaces.

    Missing parent directories are created. A bare filename with no directory
    component is written into the current working directory.

    Args:
        data: any object ``json.dump`` can serialize.
        filename: destination path of the JSON file.
    """
    parent_dir = os.path.dirname(filename)
    # dirname is '' for a bare filename, and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, 'w') as file:
        json.dump(data, file, indent=4)
    print(f"Data saved to {filename}")

def find_all_texts(data, key='text'):
    """Collect every value stored under ``key`` anywhere inside ``data``.

    Dicts and lists are walked depth-first. When a dict contains ``key``, its
    value is collected first and then every value of that dict (including the
    one just collected) is searched in turn. Anything that is neither a dict
    nor a list contributes nothing.

    Args:
        data: arbitrarily nested structure of dicts and lists.
        key: dictionary key to look for.

    Returns:
        list: the matching values in traversal order.
    """
    def _walk(node):
        if isinstance(node, dict):
            if key in node:
                yield node[key]
            for child in node.values():
                yield from _walk(child)
        elif isinstance(node, list):
            for child in node:
                yield from _walk(child)

    return list(_walk(data))

def extract_all_texts(docs):
    """Reduce each document to its ``id`` and the texts found inside it.

    Args:
        docs: iterable of dict documents, each having an ``'id'`` key.

    Returns:
        list: one ``{'id': ..., 'texts': [...]}`` dict per input document, in
        input order, where ``texts`` is the result of ``find_all_texts`` on
        that document.
    """
    return [
        {'id': record['id'], 'texts': find_all_texts(record)}
        for record in docs
    ]


# Draw a random sample of comment documents, reduce each one to its id plus
# all nested 'text' values, and write the result to disk.
SAMPLE_SIZE = 5000

collection, client = get_collection('comments')
try:
    # Project to _id only so sampling does not pull every full document.
    all_ids = [doc['_id'] for doc in collection.find({}, {'_id': 1})]
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of documents actually available.
    sample_size = min(SAMPLE_SIZE, len(all_ids))
    if sample_size < SAMPLE_SIZE:
        print(f"Only {len(all_ids)} documents available; sampling all of them")
    sampled_ids = random.sample(all_ids, sample_size)
    samples = collection.find({'_id': {'$in': sampled_ids}})
    samples_list = list(samples)
    processed_samples = extract_all_texts(samples_list)
    save_as_json(processed_samples, '../raw/manifold_samples_5k.json')
finally:
    # Close the connection even if sampling, extraction or saving fails.
    client.close()

Connected to MongoDB
Data saved to data/manifold_samples_5k.json
