#### Installed required packages

In [1]:
# pip install pymongo


#### Created pymongo connection string

In [2]:
import os
import pandas as pd
pd.set_option("display.max_column", None)

from pymongo import MongoClient
from bson import json_util
import json

mongo_url = input("Enter the Mongo connection string: ")
client = MongoClient(mongo_url)
print("Python-MongoDB connection created succesfully -\n", client)


Python-MongoDB connection created succesfully -
 MongoClient(host=['3.146.176.180:27017'], document_class=dict, tz_aware=False, connect=True)


#### List of collections present in the database

In [3]:
db = client['talentsync-backend-production']
collection_names = db.list_collection_names()
print(collection_names)


['candidate_questions', 'job_hr_questions', 'candidates', 'auth_keys', 'jobs', 'audit_logs', 'job_hr_settings', 'batches', 'questions', 'candidates_errors', 'users', 'companies', 'messages', 'auth_tokens']


#### Finding out all the documents present in the 'batches' collection

In [4]:
collection = db['batches']
results = collection.find()
print(results)


<pymongo.synchronous.cursor.Cursor object at 0x000002746946A420>


In [5]:
collection = db['batches']
results = collection.find()
for doc in results:
    print(doc, end='\n\n')


{'_id': ObjectId('6822eb6f6342bdaa117239be'), 'uploaded_by': ObjectId('6822e2f56342bdaa117239b8'), 'company_id': ObjectId('6822e3346342bdaa117239ba'), 'batch_id': Binary(b'm\x18]v\xf2>Rv\x867C\x03\xf9I\xb7\xa8', 4), 'batch_name': 'Batch - SIS 1', 'upload_count': 10, 'job_id': ObjectId('6822eaf56342bdaa117239bd'), 'status': 'completed', 'start_time': datetime.datetime(2025, 5, 13, 6, 49, 19, 226000), 'end_time': datetime.datetime(2025, 5, 13, 6, 52, 0, 161000)}

{'_id': ObjectId('6822f3e46342bdaa117239c7'), 'uploaded_by': ObjectId('6822e2f56342bdaa117239b8'), 'company_id': ObjectId('6822e3346342bdaa117239ba'), 'batch_id': Binary(b'\xc9\xe1\x1e\x13W\x95Ud\x89\x00\xdf\xd2\x01\x01Y\xc6', 4), 'batch_name': 'Batch - 2', 'upload_count': 1, 'job_id': ObjectId('6822eaf56342bdaa117239bd'), 'status': 'completed', 'start_time': datetime.datetime(2025, 5, 13, 7, 25, 24, 299000), 'end_time': datetime.datetime(2025, 5, 13, 7, 25, 43, 542000)}

{'_id': ObjectId('6822f5d26342bdaa117239ca'), 'uploaded_b

#### View the very first document of the collection

In [6]:
collection = db['batches']
result = collection.find_one()
print(result)


{'_id': ObjectId('6822eb6f6342bdaa117239be'), 'uploaded_by': ObjectId('6822e2f56342bdaa117239b8'), 'company_id': ObjectId('6822e3346342bdaa117239ba'), 'batch_id': Binary(b'm\x18]v\xf2>Rv\x867C\x03\xf9I\xb7\xa8', 4), 'batch_name': 'Batch - SIS 1', 'upload_count': 10, 'job_id': ObjectId('6822eaf56342bdaa117239bd'), 'status': 'completed', 'start_time': datetime.datetime(2025, 5, 13, 6, 49, 19, 226000), 'end_time': datetime.datetime(2025, 5, 13, 6, 52, 0, 161000)}


In [7]:
collection = db['batches']
result = collection.find_one()
print(json.dumps(result, indent=4, default=json_util.default))


{
    "_id": {
        "$oid": "6822eb6f6342bdaa117239be"
    },
    "uploaded_by": {
        "$oid": "6822e2f56342bdaa117239b8"
    },
    "company_id": {
        "$oid": "6822e3346342bdaa117239ba"
    },
    "batch_id": {
        "$binary": {
            "base64": "bRhddvI+UnaGN0MD+Um3qA==",
            "subType": "04"
        }
    },
    "batch_name": "Batch - SIS 1",
    "upload_count": 10,
    "job_id": {
        "$oid": "6822eaf56342bdaa117239bd"
    },
    "status": "completed",
    "start_time": {
        "$date": "2025-05-13T06:49:19.226Z"
    },
    "end_time": {
        "$date": "2025-05-13T06:52:00.161Z"
    }
}


#### Finding out the number of documents present in the collection 

In [8]:
# print("shape: ", df.shape)


In [9]:
num_rows = collection.count_documents({})
print("shape:", num_rows )


shape: 196


#### Finding out the unique fields present in this collection

In [10]:
# print("columns: ", df.columns)


In [11]:
all_fields = set()
for field in collection.find():
    all_fields.update(field.keys())
    
print("Columns:", list(all_fields))


Columns: ['batch_id', 'company_id', 'end_time', 'upload_count', 'start_time', 'status', 'uploaded_by', '_id', 'job_id', 'type', 'batch_name']


#### Finding out a info about the each field present in the collection


In [12]:
from collections import defaultdict
import pprint


field_info = defaultdict(lambda: {"count": 0, "types": set()})
total_docs = 0

# Iterate over all documents in the collection
for doc in collection.find({}):
    total_docs += 1
    for field, value in doc.items():
        field_info[field]["count"] += 1
        field_info[field]["types"].add(type(value).__name__)

# Display like df.info()
print(f"{'Field':<25} {'Non-Null Count':<15} {'Data Types'}")
print("-" * 60)
for field, info in field_info.items():
    print(f"{field:<25} {info['count']:<15} {', '.join(sorted(info['types']))}")

print(f"\nTotal Documents: {total_docs}")


Field                     Non-Null Count  Data Types
------------------------------------------------------------
_id                       196             ObjectId
uploaded_by               196             ObjectId
company_id                196             ObjectId
batch_id                  196             Binary
batch_name                158             str
upload_count              117             int
job_id                    196             ObjectId
status                    196             str
start_time                196             datetime
end_time                  190             datetime
type                      159             str

Total Documents: 196


#### Find value counts for upload_count

In [13]:
# df['upload_count'].value_counts()


In [14]:
query = [
    {
        "$group":{
            "_id": "$upload_count",
            "count": {"$sum": 1}
        }
    },
    {
        "$sort": {"count": -1} # sort by count in descending order
    }
]

result = collection.aggregate(query)

for doc in result:
    print(f"{doc['_id']}: {doc['count']}")


1: 107
None: 79
180: 4
3: 2
2: 2
10: 1
5: 1


#### Find value counts for upload type

In [15]:
# df['type'].value_counts()


In [16]:
query = [
    {
        "$group":{
            "_id": "$type",
            "count": {"$sum": 1}
        }
    },
    {
        "$sort": {"count": -1}
    }
]

result = collection.aggregate(query)

for doc in result:
    print(f"{doc['_id']}: {doc['count']}")


bulk: 80
single: 79
None: 37


#### Find value counts for upload status

In [17]:
# df['status'].value_counts()


In [18]:
query = [
    {
        "$group":{
            "_id": "$status",
            "count": {"$sum": 1}
        }
    },
    {
        "$sort": {"count": -1}
    }
]

result = collection.aggregate(query)

for doc in result:
    print(f"{doc["_id"]}: {doc['count']}")


completed: 190
processing: 6


#### Inspect 'batch_name','upload_count', 'type', 'status' together

In [19]:
# df_result = df[['batch_name','upload_count', 'type', 'status']].sort_values(by=['status','type', 'upload_count'], ascending=True)
# df_result.sample(5)


In [20]:
query = [
    {
        "$project": {
            "_id": 0,
            "batch_name": 1,
            "upload_count": 1,
            "type": 1,
            "status": 1
        }
    },
    {
        "$sort": {
            "status": 1,
            "type": 1,
            "upload_count": 1
        }
    },
    {
        "$sample": {
            "size": 5  # randomly sample 5 after sorting
        }
    }
]

result = list(collection.aggregate(query))

for doc in result:
    print(doc)


{'batch_name': 'a12', 'type': 'bulk', 'upload_count': 1, 'status': 'completed'}
{'batch_name': 'batch 3', 'upload_count': 1, 'status': 'completed'}
{'batch_name': 'a11', 'type': 'bulk', 'upload_count': 1, 'status': 'completed'}
{'batch_name': '4642', 'type': 'bulk', 'upload_count': 1, 'status': 'completed'}
{'type': 'single', 'status': 'completed', 'batch_name': 'Wp-Careers'}


#### Replace upload_count = "" with 1.0 where batch_name = "Wp-Careers"

In [21]:
# import numpy as np
# df_copy = df.copy()

# df_copy.loc[
#     (df['batch_name'] == "Wp-Careers") & (df['upload_count'].isna()),
#     'upload_count'
# ] = 1.0

# df_copy = df_copy[df_copy['batch_name'] == "Wp-Careers"]
# df_copy[['batch_name','upload_count', 'type', 'status']].sample(5)


In [26]:
query = [
    {
        "$match": {
            "batch_name": "Wp-Careers"
        }
    },
    {
        "$project": {
            "_id": 0,
            "batch_name": 1,
            "upload_count": {
                "$ifNull": ["$upload_count", 1.0]
            },
            "type": 1,
            "status": 1
        }
    },
    {
        "$sort": {
            "status": 1,
            "type": 1,
            "upload_count": 1
        }
    },
    {
        "$sample": {
            "size": 5
        }
    }
]

result = list(collection.aggregate(query))

for doc in result:
    print(doc)


{'type': 'single', 'status': 'completed', 'batch_name': 'Wp-Careers', 'upload_count': 1.0}
{'type': 'single', 'status': 'completed', 'batch_name': 'Wp-Careers', 'upload_count': 1.0}
{'type': 'single', 'status': 'completed', 'batch_name': 'Wp-Careers', 'upload_count': 1.0}
{'type': 'single', 'status': 'completed', 'batch_name': 'Wp-Careers', 'upload_count': 1.0}
{'type': 'single', 'status': 'completed', 'batch_name': 'Wp-Careers', 'upload_count': 1.0}


In [27]:
query = [
    {
        "$match": {
            "batch_name": "Wp-Careers"
        }
    },
    {
        "$project": {
            "_id": 0,
            "batch_name": 1,
            "upload_count": 1,
            "type": 1,
            "status": 1
        }
    },
    {
        "$sort": {
            "status": 1,
            "type": 1,
            "upload_count": 1
        }
    },
    {
        "$sample": {
            "size": 5
        }
    }
]

result = list(collection.aggregate(query))

for doc in result:
    print(doc)


{'type': 'single', 'status': 'completed', 'batch_name': 'Wp-Careers'}
{'type': 'single', 'status': 'completed', 'batch_name': 'Wp-Careers'}
{'type': 'single', 'status': 'completed', 'batch_name': 'Wp-Careers'}
{'type': 'single', 'status': 'completed', 'batch_name': 'Wp-Careers'}
{'type': 'single', 'status': 'completed', 'batch_name': 'Wp-Careers'}
