In [None]:
import os

from elasticsearch import Elasticsearch
from tqdm import tqdm

# Connection settings are read from the environment so that real credentials
# never need to be committed with the notebook. The fallbacks are the values
# that were previously hard-coded here, so an unconfigured environment behaves
# exactly as before.
ES_HOST = os.environ.get("ES_HOST", "https://localhost:9200")
ES_USERNAME = os.environ.get("ES_USERNAME", "elastic")
ES_PASSWORD = os.environ.get("ES_PASSWORD", "elastic")

es = Elasticsearch(
    ES_HOST,
    # NOTE(review): TLS certificate verification (and its warning) is switched
    # off; that is only acceptable against a trusted local/dev cluster.
    verify_certs=False,
    ssl_show_warn=False,
    basic_auth=(ES_USERNAME, ES_PASSWORD),
)

# Smoke-test the connection and report which cluster we are talking to.
# Failures are reported but deliberately not re-raised, so the cell itself
# never errors; later cells will fail if the cluster is really unreachable.
try:
    info = es.info()
    print("✅ Connected to Elasticsearch!")
    print(f"Cluster name: {info['cluster_name']}")
    print(f"Version: {info['version']['number']}")
except Exception as e:
    print("❌ Connection failed:", e)

✅ Connected to Elasticsearch!
Cluster name: elasticsearch
Version: 8.5.1


In [None]:
# Index roles used by the reindexing step:
#   source_index    - documents to copy
#   reference_index - index the replacement `created_utc` values are read from
#   target_index    - destination index, created below with an explicit mapping
source_index = "reddit_posts_scored"
reference_index = "reddit_posts"
target_index = "reddit_posts_scored_fixed"

# Explicit mapping for the destination index. `created_utc` is declared as a
# date in epoch-second format so integer timestamps are indexed as dates.
target_mappings = {
    "properties": {
        "id": {"type": "keyword"},
        "type": {"type": "keyword"},
        "platform": {"type": "keyword"},
        "content": {"type": "text"},
        "title": {"type": "text"},
        "author": {"type": "keyword"},
        "like": {"type": "integer"},
        "num_comments": {"type": "integer"},
        "created_utc": {"type": "date", "format": "epoch_second"},
        "text_for_sentiment": {"type": "text"},
        "sentiment_score": {"type": "float"}
    }
}

# Creating an index that already exists raises a 400
# `resource_already_exists_exception`, which made this cell fail on every
# re-run. Check first so the cell is idempotent.
# NOTE: an existing index keeps whatever mapping it was created with; delete
# it manually if the mapping above has changed.
if es.indices.exists(index=target_index):
    print(f"ℹ️ Index '{target_index}' already exists, skipping creation.")
else:
    es.indices.create(index=target_index, body={"mappings": target_mappings})
    print(f"✅ Created index '{target_index}'")


BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [reddit_posts_scored_fixed/1gHDjscIS3G0GvfaZynU5Q] already exists')

In [None]:
BATCH_SIZE = 1000          # documents fetched per scroll page
SCROLL_KEEPALIVE = "2m"    # how long the server keeps the scroll context alive between pages

# Open a scroll cursor over every document in the source index.
scroll = es.search(
    index=source_index,
    scroll=SCROLL_KEEPALIVE,
    size=BATCH_SIZE,
    body={"query": {"match_all": {}}}
)

scroll_id = scroll["_scroll_id"]
total_hits = scroll["hits"]["total"]["value"]
print(f"🔍 Total documents to process: {total_hits}")

docs = scroll["hits"]["hits"]

try:
    while docs:
        for doc in tqdm(docs):
            doc_id = doc["_id"]
            source = doc["_source"]
            reddit_id = source.get("id")

            # Only documents that carry a `bertweet_sentiment` field are copied.
            if "bertweet_sentiment" not in source:
                continue

            # Without an id there is nothing to look up, and sending a null
            # value in the term query below would presumably be rejected by
            # the server and abort the whole run. Skip the document instead.
            if reddit_id is None:
                print(f"⚠️ Document {doc_id} has no 'id' field, skipping.")
                continue

            # Fetch the matching document's `created_utc` from the reference index.
            ref_search = es.search(
                index=reference_index,
                body={
                    "query": {
                        "term": {
                            "id.keyword": reddit_id
                        }
                    },
                    "_source": ["created_utc"]
                },
                size=1
            )

            hits = ref_search["hits"]["hits"]
            if hits:
                ref_created = hits[0]["_source"].get("created_utc")
                # Only integer timestamps are accepted; anything else
                # (missing, string, float) causes the document to be skipped.
                if isinstance(ref_created, int):
                    source["created_utc"] = ref_created
                else:
                    print(f"⚠️ Invalid timestamp for {reddit_id}, skipping.")
                    continue
            else:
                print(f"⚠️ Not found in reference index: {reddit_id}")
                continue

            # Reuse the source `_id`, so re-running overwrites rather than duplicates.
            es.index(index=target_index, id=doc_id, body=source)

        # Advance to the next page; an empty page ends the loop.
        scroll = es.scroll(scroll_id=scroll_id, scroll=SCROLL_KEEPALIVE)
        scroll_id = scroll["_scroll_id"]
        docs = scroll["hits"]["hits"]
finally:
    # Release the server-side scroll context instead of leaving it to expire.
    # Cleanup is best-effort so that it never masks an error raised above.
    try:
        es.clear_scroll(scroll_id=scroll_id)
    except Exception as e:
        print("⚠️ Could not clear scroll context:", e)

print("🎉 Finished reindexing to corrected index.")

🔍 Total documents to process: 10838


100%|██████████| 1000/1000 [00:29<00:00, 33.41it/s]
100%|██████████| 1000/1000 [00:31<00:00, 31.87it/s]
100%|██████████| 1000/1000 [00:30<00:00, 32.92it/s]
100%|██████████| 1000/1000 [00:30<00:00, 32.36it/s]
100%|██████████| 1000/1000 [00:28<00:00, 34.79it/s]
100%|██████████| 1000/1000 [00:02<00:00, 348.58it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2141043.39it/s]
100%|██████████| 1000/1000 [00:00<00:00, 3146514.63it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2730666.67it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2746760.97it/s]
100%|██████████| 838/838 [00:09<00:00, 91.74it/s]  

🎉 Finished reindexing to corrected index.



