In [1]:
import pandas as pd

# Read the previously cleaned USA postings from disk
df_usa = pd.read_csv('data/processed/jobs_cleaned.csv')

usa_row_count = df_usa.shape[0]
print(f"Original USA data: {usa_row_count} rows")

# ===== ADD COUNTRY COLUMN =====
# Tag every row with the constant country label 'USA'.
df_usa = df_usa.assign(country='USA')

# ===== ADD JOB CATEGORY =====
import re

# Ordered (category, keywords) rules. The first rule with a matching keyword
# wins, so more specific categories must stay above more general ones.
_USA_JOB_CATEGORY_KEYWORDS = (
    ('Data Scientist/ML', ('data scientist', 'machine learning', 'ml engineer')),
    ('Data Analyst', ('data analyst', 'analytics', 'business intelligence')),
    ('Data Engineer', ('data engineer', 'etl', 'big data')),
    ('Full Stack Developer', ('full stack', 'fullstack')),
    ('Frontend Developer', ('frontend', 'front-end')),
    ('Backend Developer', ('backend', 'back-end')),
    ('Software Engineer', ('software engineer', 'software developer', 'sde')),
    ('DevOps Engineer', ('devops', 'site reliability')),
    ('Cloud Engineer', ('cloud', 'aws', 'azure')),
    ('QA/Test Engineer', ('qa', 'quality', 'test')),
    ('Product Manager', ('product manager',)),
    ('Business Analyst', ('business analyst',)),
)

# Short keywords that a plain substring test would also find inside unrelated
# words. Each is anchored at letter boundaries just enough to reject the false
# hit while still accepting the usual spellings of the real term.
_ANCHORED_KEYWORD_PATTERNS = {
    'ml engineer': r'(?<![a-z])ml engineer',  # not "html engineer"
    'etl': r'(?<![a-z])etl(?![a-z])',         # not "metlife"
    'sde': r'(?<![a-z])sde',                  # not "dresden"; keeps "sde-2"
    'aws': r'(?<![a-z])aws(?![a-z])',         # not "laws" / "lawson"
    'qa': r'qa(?![a-z])',                     # not "qatar"; keeps "sqa"
    'test': r'(?<![a-z])test',                # not "contest"; keeps "tester"
}


def _keyword_regex(keyword):
    """Return the regex source used to look for *keyword* in a title."""
    return _ANCHORED_KEYWORD_PATTERNS.get(keyword, re.escape(keyword))


# One compiled alternation per category, built once at definition time.
_USA_JOB_CATEGORY_PATTERNS = tuple(
    (category, re.compile('|'.join(_keyword_regex(kw) for kw in keywords)))
    for category, keywords in _USA_JOB_CATEGORY_KEYWORDS
)


def categorize_usa_job(title):
    """Categorize a USA job title into an IT domain.

    The title is converted with ``str()`` and lower-cased, then checked
    against the ordered keyword rules; the first matching rule decides the
    category. Ambiguous abbreviations (``qa``, ``etl``, ``sde``, ``aws``,
    ``test``, ``ml engineer``) only match at letter boundaries so that they
    are not picked up inside unrelated words.

    Args:
        title: The raw job title. Non-string values (e.g. NaN) are
            stringified and therefore fall through to the default.

    Returns:
        The category label, or ``'Other IT Role'`` when no rule matches.
    """
    title_lower = str(title).lower()

    for category, pattern in _USA_JOB_CATEGORY_PATTERNS:
        if pattern.search(title_lower):
            return category
    return 'Other IT Role'

# Derive a category for every row from its raw job title.
raw_titles = df_usa['Job Title']
df_usa['job_category'] = raw_titles.apply(categorize_usa_job)

# Show how many postings landed in each category.
category_counts = df_usa['job_category'].value_counts()
print("\nUSA Job Categories:")
print(category_counts)

# ===== RENAME COLUMNS TO MATCH INDIA FORMAT =====
india_style_names = {
    'Job Title': 'job_title',
    # State is used as the location for USA rows (instead of city).
    'state': 'location',
    # Salary stays in USD; this rename does not rescale the values.
    # NOTE(review): the source name suggests thousands of USD - confirm units.
    'avg_salary_k': 'avg_salary_usd',
}
df_usa = df_usa.rename(columns=india_style_names)

# ===== SELECT COLUMNS TO MATCH INDIA =====
# Fixed columns first, followed by every skill indicator column present.
usa_columns = [
    'job_title',
    'company',
    'location',
    'country',
    'job_category',
    'avg_salary_usd',
    'seniority',
]
usa_columns.extend(col for col in df_usa.columns if col.startswith('skill_'))

df_usa_final = df_usa.loc[:, usa_columns].copy()

# ===== SAVE =====
# Write the selected columns without the index column.
df_usa_final.to_csv('data/processed/usa_jobs_cleaned.csv', index=False)

final_row_count = df_usa_final.shape[0]
print(f"\n✓ USA data updated: {final_row_count} rows")
print("✓ Saved to: data/processed/usa_jobs_cleaned.csv")

Original USA data: 2242 rows

USA Job Categories:
job_category
Data Analyst         1705
Other IT Role         398
QA/Test Engineer       49
Business Analyst       48
Data Engineer          25
Data Scientist/ML      15
Cloud Engineer          2
Name: count, dtype: int64

✓ USA data updated: 2242 rows
✓ Saved to: data/processed/usa_jobs_cleaned.csv
