In [1]:
import os

from elasticsearch import Elasticsearch,helpers
from tqdm import tqdm

# Connection settings are read from the environment so that real credentials
# never have to be saved inside the notebook. The fallbacks reproduce the
# previous hard-coded local-development values, so the cell still works
# unchanged against a default local cluster.
ES_URL = os.environ.get("ES_URL", "https://localhost:9200")
ES_USER = os.environ.get("ES_USER", "elastic")
ES_PASSWORD = os.environ.get("ES_PASSWORD", "elastic")

es = Elasticsearch(
    ES_URL,
    # NOTE(review): certificate verification (and its warning) is switched off,
    # presumably because the local cluster uses a self-signed certificate.
    # Do not reuse these two settings against a non-local cluster.
    verify_certs=False,
    ssl_show_warn=False,
    basic_auth=(ES_USER, ES_PASSWORD),
)

# Connectivity check: every later cell needs a working client, so report the
# failure and then re-raise instead of letting "Run All" continue with a
# client that cannot reach the cluster.
try:
    info = es.info()
except Exception as e:
    print("‚ùå Connection failed:", e)
    raise
else:
    print("‚úÖ Connected to Elasticsearch!")
    print(f"Cluster name: {info['cluster_name']}")
    print(f"Version: {info['version']['number']}")

‚úÖ Connected to Elasticsearch!
Cluster name: elasticsearch
Version: 8.5.1


In [2]:
# Index roles used by the rest of the notebook:
#   source_index    - documents that are copied (only those carrying a
#                     "bertweet_sentiment" field are kept by the reindex cell)
#   reference_index - supplies the "created_utc" value for each document id
#   target_index    - destination index created below
source_index = "trump_comments_scored"
reference_index = "trump_comments"
target_index = "mastodon_comments_scored_fixed"

# Explicit field mappings for the destination index.
target_mappings = {
    "properties": {
        "id": {"type": "keyword"},
        "type": {"type": "keyword"},
        "platform": {"type": "keyword"},
        "content": {"type": "text"},
        "author": {"type": "keyword"},
        "like": {"type": "integer"},
        "post_id": {"type": "keyword"},
        "created_utc": {"type": "date"},
        "text_for_sentiment": {"type": "text"},
        "bertweet_sentiment": {"type": "float"}
    }
}

# Creating an index that already exists raises an error, which would break
# re-running this cell (or Restart & Run All). Only create it when missing.
if es.indices.exists(index=target_index):
    print(f"Index '{target_index}' already exists; skipping creation.")
else:
    create_response = es.indices.create(
        index=target_index,
        body={"mappings": target_mappings}
    )
    print(f"Created index '{target_index}': {create_response}")

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'mastodon_comments_scored_fixed'})

In [3]:
# Build a lookup {document "id" field -> "created_utc"} from every document in
# the reference index, paging through the index with the scroll API.
reference_map = {}
scroll = es.search(
    index=reference_index,
    scroll="2m",
    size=1000,
    # Only the two fields needed for the lookup are fetched.
    body={"_source": ["id", "created_utc"], "query": {"match_all": {}}}
)

scroll_id = scroll["_scroll_id"]
docs = scroll["hits"]["hits"]

total = scroll["hits"]["total"]["value"]
print(f"üîç Total documents to process: {total}")

pbar = tqdm(total=total)

try:
    # An empty page of hits marks the end of the scroll.
    while docs:
        for doc in docs:
            source = doc["_source"]
            if "id" in source and "created_utc" in source:
                reference_map[source["id"]] = source["created_utc"]
            else:
                print(f"‚ö†Ô∏è Missing fields in doc {doc.get('_id')}")
            # Count every document examined, so the bar can reach `total`
            # even when some documents are skipped.
            pbar.update(1)

        scroll = es.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = scroll["_scroll_id"]
        docs = scroll["hits"]["hits"]
finally:
    pbar.close()
    # Release the server-side scroll context instead of leaving it open until
    # its 2-minute timeout. A 404 (context already expired) is ignored so it
    # cannot mask an error raised inside the loop.
    es.options(ignore_status=404).clear_scroll(scroll_id=scroll_id)

print(f"‚úÖ Collected {len(reference_map)} timestamps from reference index.")

# Show the first five entries as a sanity check.
for position, (doc_id, created_utc) in enumerate(reference_map.items(), start=1):
    print(f"{position}. id: {doc_id}, created_utc: {created_utc}")
    if position >= 5:
        break

üîç Total documents to process: 8837


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8837/8837 [00:00<00:00, 23504.42it/s]

‚úÖ Collected 8837 timestamps from reference index.
1. id: mastodon_comment_114446733793395790, created_utc: 2025-05-04T00:29:27+00:00
2. id: mastodon_comment_114446757491485020, created_utc: 2025-05-04T00:35:31.449000+00:00
3. id: mastodon_comment_114444044485668385, created_utc: 2025-05-03T13:05:33+00:00
4. id: mastodon_comment_114444208954877297, created_utc: 2025-05-03T13:47:23+00:00
5. id: mastodon_comment_114444265124364022, created_utc: 2025-05-03T14:01:39+00:00





In [4]:
# Copy documents from the source index into the target index, replacing each
# document's "created_utc" with the value recorded in `reference_map`.
# A document is skipped when it has no "id", has no "bertweet_sentiment",
# or has no (truthy) timestamp in `reference_map`.

# Used both as the scroll page size and as the bulk flush threshold.
batch_size = 1000

scroll = es.search(
    index=source_index,
    scroll="2m",
    size=batch_size,
    body={"query": {"match_all": {}}}
)
scroll_id = scroll["_scroll_id"]
docs = scroll["hits"]["hits"]
total = scroll["hits"]["total"]["value"]
pbar = tqdm(total=total, desc="Reindexing")

actions = []

try:
    # An empty page of hits marks the end of the scroll.
    while docs:
        for doc in docs:
            # Every examined document advances the bar, kept or skipped.
            pbar.update(1)

            source = doc["_source"]
            # .get() avoids a KeyError on documents that lack an "id" field;
            # such documents cannot be matched to a timestamp and are skipped.
            doc_id = source.get("id")
            if doc_id is None or "bertweet_sentiment" not in source:
                continue

            ref_created = reference_map.get(doc_id)
            if not ref_created:
                continue
            source["created_utc"] = ref_created

            actions.append({
                "_index": target_index,
                # Using the document's own id as _id means re-running the
                # cell overwrites documents rather than duplicating them.
                "_id": doc_id,
                "_source": source
            })

            # Flush a full batch to keep the pending list bounded.
            if len(actions) >= batch_size:
                helpers.bulk(es, actions)
                actions = []

        scroll = es.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = scroll["_scroll_id"]
        docs = scroll["hits"]["hits"]

    # Send whatever is left after the last page.
    if actions:
        helpers.bulk(es, actions)
finally:
    pbar.close()
    # Release the server-side scroll context instead of leaving it open until
    # its 2-minute timeout. A 404 (context already expired) is ignored so it
    # cannot mask an error raised inside the loop.
    es.options(ignore_status=404).clear_scroll(scroll_id=scroll_id)

print("üéâ Finished fast reindexing to corrected mastodon_comments index.")

Reindexing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8828/8828 [00:02<00:00, 2969.63it/s]

üéâ Finished fast reindexing to corrected mastodon_comments index.



