In [1]:
import json
import difflib
from datetime import datetime
import os

# Input and output JSON file names. Both are relative paths, so they resolve
# against the current working directory; the prints below show the absolute
# locations that will actually be used.
INPUT_FILE = "scraped_jobs_from_api.json"
OUTPUT_FILE = "removed_duplicate_scraped_jobs_from_api.json"

print("Input path:", os.path.abspath(INPUT_FILE))
print("Output will be saved to:", os.path.abspath(OUTPUT_FILE))


Input path: C:\Users\singh\scraped_jobs_from_api.json
Output will be saved to: C:\Users\singh\removed_duplicate_scraped_jobs_from_api.json


In [2]:
def normalize_company(name):
    """Return a canonical key used to group jobs by company.

    The key is the company name with surrounding whitespace removed and
    letters lower-cased, so "  Acme " and "ACME" land in the same group.

    Args:
        name: Company name. Usually a string; other truthy values are
            converted with ``str()`` instead of raising.

    Returns:
        The normalized name, or ``"unknown"`` when the name is missing,
        empty, or consists only of whitespace.
    """
    if not name:
        return "unknown"

    normalized = str(name).strip().lower()
    # A whitespace-only name carries no information; group it with the
    # missing names rather than under an empty-string key.
    return normalized or "unknown"


def is_similar(desc1, desc2, threshold=0.85):
    """Check whether two descriptions are similar using SequenceMatcher.

    Args:
        desc1: First description. Converted with ``str()`` before comparing.
        desc2: Second description. Converted with ``str()`` before comparing.
        threshold: Similarity cut-off; the pair is similar only when the
            SequenceMatcher ratio is strictly greater than this value.

    Returns:
        True when ``SequenceMatcher.ratio()`` exceeds ``threshold``.
        False when either description is missing or empty.
    """
    if not desc1 or not desc2:
        return False

    text1, text2 = str(desc1), str(desc2)
    len1, len2 = len(text1), len(text2)
    if len1 == 0 or len2 == 0:
        return False

    # Cheap rejection based on lengths only. ratio() is 2*M / (len1 + len2)
    # and the number of matched characters M can never exceed the shorter
    # length, so this is a true upper bound: a pair rejected here could not
    # have passed the full comparison. (This is the same bound that
    # SequenceMatcher.real_quick_ratio() computes.)
    if 2.0 * min(len1, len2) / (len1 + len2) <= threshold:
        return False

    matcher = difflib.SequenceMatcher(None, text1, text2)
    # quick_ratio() is a cheaper upper bound on ratio(); checking it first
    # skips the expensive alignment for pairs that cannot reach the threshold.
    return matcher.quick_ratio() > threshold and matcher.ratio() > threshold


def parse_date(date_str):
    """Parse an ISO-8601 date string into a naive datetime usable as a sort key.

    Timestamps that carry a UTC offset are converted to UTC and returned
    without tzinfo, so every value this function returns can be compared
    with every other one (including the ``datetime.min`` fallback). Mixing
    offset-aware and offset-naive datetimes in a sort raises ``TypeError``.

    Args:
        date_str: ISO-8601 text such as ``"2024-01-31"`` or
            ``"2024-01-31T10:15:00+05:30"``. A trailing ``Z`` is accepted
            as UTC.

    Returns:
        The parsed datetime (naive; UTC when the input had an offset), or
        ``datetime.min`` when the value is missing or cannot be parsed.
    """
    if not date_str:
        return datetime.min
    try:
        text = date_str.strip()
        # datetime.fromisoformat() only accepts a trailing "Z" on
        # Python 3.11+; rewrite it as an explicit offset so older
        # interpreters parse it too.
        if text[-1:] in ("Z", "z"):
            text = text[:-1] + "+00:00"

        parsed = datetime.fromisoformat(text)

        offset = parsed.utcoffset()
        if offset is not None:
            # Shift to UTC, then drop tzinfo so the result is comparable
            # with naive values.
            parsed = (parsed - offset).replace(tzinfo=None)
        return parsed
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Non-string input, malformed text, or a value that cannot be
        # shifted to UTC: sort it as the oldest possible date.
        return datetime.min


def deduplicate_list(job_list, threshold=None):
    """Remove near-duplicate jobs that share a company and a similar description.

    Jobs are grouped by normalized company name. Within each group the jobs
    are ordered newest-first by creation date, and a job is dropped when its
    description is similar to that of a job already kept. The newest posting
    of each duplicate cluster is therefore the one that survives.

    Args:
        job_list: Iterable of job dicts. The keys read are ``"Company"``,
            ``"Description"`` and ``"Created_At"`` (falling back to
            ``"created_at"``).
        threshold: Optional similarity cut-off forwarded to ``is_similar``.
            ``None`` (the default) uses ``is_similar``'s own default.

    Returns:
        A ``(unique_jobs, duplicates_removed)`` tuple. ``unique_jobs`` is
        ordered by company (in order of first appearance) and newest-first
        within each company, not in the input order.
    """
    def created_at(job):
        # Try both key spellings; a missing date sorts as the oldest.
        return parse_date(job.get("Created_At") or job.get("created_at"))

    # Only pass the threshold through when the caller supplied one, so the
    # default stays defined in a single place (is_similar).
    similarity_kwargs = {} if threshold is None else {"threshold": threshold}

    # Group by normalized company name; dicts keep insertion order.
    jobs_by_company = {}
    for job in job_list:
        company_key = normalize_company(job.get("Company"))
        jobs_by_company.setdefault(company_key, []).append(job)

    unique_jobs = []
    duplicates_removed = 0

    for jobs in jobs_by_company.values():
        # Newest first, so the copy that is kept is the latest one. The sort
        # is stable, so jobs with equal dates keep their input order.
        jobs.sort(key=created_at, reverse=True)

        kept_jobs = []
        for job in jobs:
            description = job.get("Description")
            is_duplicate = any(
                is_similar(description, kept.get("Description"), **similarity_kwargs)
                for kept in kept_jobs
            )
            if is_duplicate:
                duplicates_removed += 1
            else:
                kept_jobs.append(job)

        unique_jobs.extend(kept_jobs)

    return unique_jobs, duplicates_removed


In [3]:
# Load the scraped jobs. The loop below assumes the JSON root is an object
# mapping each category name to a list of job records.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# Deduplicated jobs per category, plus running totals for the final summary.
cleaned_data = {}
total_removed = 0
total_before = 0
total_after = 0

# Each category is deduplicated on its own: deduplicate_list is called once
# per category, so jobs are never compared across categories.
for category, jobs in data.items():
    print(f"Processing category: {category} ({len(jobs)} jobs)...")
    unique_list, removed_count = deduplicate_list(jobs)

    cleaned_data[category] = unique_list
    total_removed += removed_count
    total_before += len(jobs)
    total_after += len(unique_list)

    print(f"  - Removed {removed_count} duplicates. Valid jobs: {len(unique_list)}")

# Save the cleaned JSON to OUTPUT_FILE. ensure_ascii=False writes non-ASCII
# characters as-is (UTF-8) instead of \uXXXX escapes.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, indent=4, ensure_ascii=False)

# Summary across all categories.
print("\n===============================")
print(f"Total jobs before: {total_before}")
print(f"Total jobs after : {total_after}")
print(f"Total duplicates removed: {total_removed}")
print("Cleaned data saved to:", os.path.abspath(OUTPUT_FILE))


Processing category: Industrial Training Job Portal (1000 jobs)...
  - Removed 28 duplicates. Valid jobs: 972
Processing category: Fresher Jobs (1000 jobs)...
  - Removed 6 duplicates. Valid jobs: 994
Processing category: Semi Qualified Jobs (1000 jobs)...
  - Removed 9 duplicates. Valid jobs: 991
Processing category: Articleship Jobs (1000 jobs)...
  - Removed 4 duplicates. Valid jobs: 996

Total jobs before: 4000
Total jobs after : 3953
Total duplicates removed: 47
Cleaned data saved to: C:\Users\singh\removed_duplicate_scraped_jobs_from_api.json
