In [7]:
import os

from elasticsearch import Elasticsearch, helpers
from tqdm import tqdm

# Connection settings are read from the environment so real credentials do not
# have to be committed with the notebook. The fallbacks reproduce the original
# local development setup, so behaviour is unchanged when nothing is exported.
es = Elasticsearch(
    os.environ.get("ES_URL", "https://localhost:9200"),
    # NOTE(review): TLS verification is disabled, which is only acceptable for a
    # local cluster with a self-signed certificate — do not reuse against a
    # remote host.
    verify_certs=False,
    ssl_show_warn=False,
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "elastic"),
    ),
)

# Connectivity check: only the network call sits inside the `try`, so a
# problem while formatting the response is not misreported as a connection
# failure.
try:
    info = es.info()
except Exception as e:
    print("❌ Connection failed:", e)
else:
    print("✅ Connected to Elasticsearch!")
    print(f"Cluster name: {info['cluster_name']}")
    print(f"Version: {info['version']['number']}")

✅ Connected to Elasticsearch!
Cluster name: elasticsearch
Version: 8.5.1


In [6]:
# Index names used by the cells below:
#   source_index    - scored comments that are read and re-indexed
#   reference_index - index whose `created_utc` values are copied over
#   target_index    - destination index created here
source_index = "reddit_comments_scored"
reference_index = "reddit_comments"
target_index = "reddit_comments_scored_fixed"

# Explicit mapping for the destination index; `created_utc` is declared as a
# date in epoch-seconds format.
target_mappings = {
    "properties": {
        "id": {"type": "keyword"},
        "type": {"type": "keyword"},
        "platform": {"type": "keyword"},
        "content": {"type": "text"},
        "author": {"type": "keyword"},
        "like": {"type": "integer"},
        "post_id": {"type": "keyword"},
        "created_utc": {"type": "date", "format": "epoch_second"},
        "text_for_sentiment": {"type": "text"},
        "bertweet_sentiment": {"type": "float"},
    }
}

# Creating an index that already exists raises an error, which made this cell
# fail on every re-run. Check first so the cell is idempotent.
if es.indices.exists(index=target_index):
    print(f"Index '{target_index}' already exists; skipping creation.")
else:
    create_response = es.indices.create(index=target_index, mappings=target_mappings)
    print(create_response)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'reddit_comments_scored_fixed'})

In [None]:

# Build a lookup of comment id -> created_utc from the reference index.
reference_map = {}
skipped_reference_docs = 0

# helpers.scan yields hits one at a time and does not expose the overall hit
# count, so a count request supplies the progress-bar total up front.
total = es.count(index=reference_index)["count"]
print(f"🔍 Total documents to process: {total}")

# helpers.scan drives the scroll API page by page and, by default, clears the
# server-side scroll context when iteration finishes, so no scroll is left
# open on the cluster.
reference_hits = helpers.scan(
    es,
    index=reference_index,
    query={"_source": ["id", "created_utc"], "query": {"match_all": {}}},
    scroll="2m",
    size=1000,
)

# The context manager closes the progress bar even if the scan raises.
with tqdm(total=total) as pbar:
    for hit in reference_hits:
        pbar.update(1)
        fields = hit["_source"]
        doc_id = fields.get("id")
        created = fields.get("created_utc")
        # A document without an id or timestamp cannot serve as a reference;
        # skip and count it instead of aborting the whole scan on a KeyError.
        if doc_id is None or created is None:
            skipped_reference_docs += 1
            continue
        reference_map[doc_id] = created

print(f"✅ Collected {len(reference_map)} timestamps from reference index.")
if skipped_reference_docs:
    print(f"⚠️ Skipped {skipped_reference_docs} reference documents missing 'id' or 'created_utc'.")


🔍 Total documents to process: 2922479


100%|██████████| 2922479/2922479 [03:15<00:00, 14944.50it/s]

✅ Collected 2922479 timestamps from reference index.





In [8]:
# Copy every scored comment from `source_index` into `target_index`, replacing
# its `created_utc` with the value recorded in `reference_map`.

# With an empty lookup every document would be skipped and the cell would still
# report success, so fail loudly instead.
if not reference_map:
    raise RuntimeError(
        "reference_map is empty - run the cell that collects reference timestamps first."
    )

# helpers.scan does not expose the hit count, so ask for it separately.
total = es.count(index=source_index)["count"]

# helpers.scan drives the scroll API and clears the server-side scroll context
# by default once iteration ends.
source_hits = helpers.scan(
    es,
    index=source_index,
    query={"query": {"match_all": {}}},
    scroll="2m",
    size=1000,
)

actions = []
indexed = 0
skipped_no_sentiment = 0
skipped_no_timestamp = 0

# The context manager closes the progress bar even if scanning or bulk
# indexing raises.
with tqdm(total=total, desc="Reindexing") as pbar:
    for hit in source_hits:
        pbar.update(1)
        source = hit["_source"]

        # Only documents that already carry a sentiment score are copied.
        if "bertweet_sentiment" not in source:
            skipped_no_sentiment += 1
            continue

        # A document without an `id` simply has no reference entry and is
        # counted as skipped rather than raising a KeyError.
        doc_id = source.get("id")
        ref_created = reference_map.get(doc_id)
        # Only integer timestamps are accepted; anything else (including a
        # missing reference entry) causes the document to be skipped.
        if not isinstance(ref_created, int):
            skipped_no_timestamp += 1
            continue

        source["created_utc"] = ref_created
        actions.append(
            {
                "_index": target_index,
                "_id": doc_id,
                "_source": source,
            }
        )

        # Flush in batches of 1000 so the pending list stays bounded.
        if len(actions) >= 1000:
            indexed += helpers.bulk(es, actions)[0]
            actions = []

    # Flush whatever is left after the final page.
    if actions:
        indexed += helpers.bulk(es, actions)[0]
        actions = []

print("🎉 Finished fast reindexing to corrected reddit_comments index.")
print(
    f"Indexed {indexed} documents into '{target_index}'; "
    f"skipped {skipped_no_sentiment} without 'bertweet_sentiment' and "
    f"{skipped_no_timestamp} without an integer reference timestamp."
)

Reindexing: 100%|██████████| 2726394/2726394 [17:43<00:00, 2564.48it/s]

🎉 Finished fast reindexing to corrected reddit_comments index.



