In [None]:
import json
import re
from sentence_transformers import SentenceTransformer
# If your file is a single JSON object or an array of objects:
with open("competitor_raw.json", "r", encoding="utf-8") as f:
    data = json.load(f)   # instead of json.loads(line)

# If it's just one object, wrap it in a list
if isinstance(data, dict):
    data = [data]

print(f"Total posts loaded: {len(data)}")
print("Example raw post:")
print(json.dumps(data[0], indent=2))

Total posts loaded: 1
Example raw post:
{
  "dataset_info": {
    "name": "Tata Harrier Safari Competitors Social Media Dataset",
    "description": "Social media sentiment analysis of Tata Harrier and Safari competitor vehicles",
    "total_documents": 1800,
    "vehicles_covered": [
      "Mahindra XUV700",
      "Hyundai Creta",
      "Kia Seltos",
      "Hyundai Alcazar",
      "Toyota Innova",
      "MG Hector",
      "Maruti Grand Vitara"
    ],
    "platforms": [
      "reddit",
      "youtube"
    ],
    "created_at": "2025-10-03T17:36:37.853646+00:00",
    "schema_version": "1.0"
  },
  "documents": [
    {
      "platform": "reddit",
      "username": "Late-Lettuce-6356",
      "timestamp": "2024-06-30T15:14:04Z",
      "vehicle_model": "Mahindra XUV700",
      "competitor_models": [
        "Tata Harrier",
        "Tata Safari",
        "Hyundai Alcazar",
        "MG Hector"
      ],
      "content": "Virtus 1.0 Topline AT vs XUV700 AX5 P. I am planning to buy my first car a

In [3]:
def clean_text(text):
    if not text:
        return ""
    text = re.sub(r"http\S+|www\S+", "", text)   # remove URLs
    text = re.sub(r"@\w+", "", text)             # remove mentions
    text = re.sub(r"#", "", text)                # remove hashtags
    text = re.sub(r"[^\w\s,.!?]", "", text)      # remove emojis/symbols
    text = re.sub(r"\s+", " ", text).strip()     # normalize spaces
    return text.lower()
def preprocess_post(post):
    flat = {
        "platform": post.get("platform"),
        "username": post.get("username"),
        "timestamp": post.get("timestamp"),
        "vehicle_model": post.get("vehicle_model"),
        "competitor_models": post.get("competitor_models", []),
        "content": post.get("content"),
        "clean_content": clean_text(post.get("content", "")),
        "sentiment_label": post.get("sentiment", {}).get("label"),
        "sentiment_score": post.get("sentiment", {}).get("score"),
        "country": post.get("location", {}).get("country"),
        "city": post.get("location", {}).get("city"),
        "intent": post.get("intent"),
        "keywords": post.get("keywords", []),
        "source_id": post.get("source_id")
    }
    return flat

In [4]:

data = [preprocess_post(p) for p in data]

print("\nExample flattened + cleaned post:")
print(json.dumps(data[0], indent=2))


# --- 4. Embeddings ---
model = SentenceTransformer('all-MiniLM-L6-v2')

for post in data:
    post["embedding"] = model.encode(post["clean_content"]).tolist()
    post["embedding_model"] = "all-MiniLM-L6-v2"

print("\nEmbeddings added for first post:", len(data[0]["embedding"]))


# --- 5. Save to new JSONL ---
with open("updated_data.jsonl", "w", encoding="utf-8") as f:
    for post in data:
        f.write(json.dumps(post, ensure_ascii=False) + "\n")

print("\n✅ Enriched posts saved to updated_data.jsonl")


Example flattened + cleaned post:
{
  "platform": null,
  "username": null,
  "timestamp": null,
  "vehicle_model": null,
  "competitor_models": [],
  "content": null,
  "clean_content": "",
  "sentiment_label": null,
  "sentiment_score": null,
  "country": null,
  "city": null,
  "intent": null,
  "keywords": [],
  "source_id": null
}

Embeddings added for first post: 384

✅ Enriched posts saved to updated_data.jsonl
