In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Load cleaned data
df = pd.read_csv('data/processed/india_jobs_cleaned.csv')

print(f"Analyzing {len(df)} Indian data science jobs...\n")

# Create output folder
os.makedirs('outputs/plots/india', exist_ok=True)

# ========== VIZ 1: JOB CATEGORY DISTRIBUTION ==========
print("Creating Visualization 1: Job Categories...")

job_cats = df['job_category'].value_counts().head(8)

fig, ax = plt.subplots(figsize=(12, 8))
colors = sns.color_palette('Set2', len(job_cats))
bars = ax.barh(job_cats.index, job_cats.values, color=colors)

ax.set_title('Top 8 Job Categories in Indian Data Science Market', 
             fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Number of Job Postings', fontsize=12)
ax.invert_yaxis()

for i, (bar, count) in enumerate(zip(bars, job_cats.values)):
    pct = (count / len(df)) * 100
    ax.text(count + 5, i, f'{count} ({pct:.1f}%)', 
            va='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig('outputs/plots/india/01_job_categories.png', dpi=300, bbox_inches='tight')
print("‚úì Saved: 01_job_categories.png")
plt.close()

# ========== VIZ 2: SALARY DISTRIBUTION ==========
print("Creating Visualization 2: Salary Analysis...")

salary_data = df['avg_salary_lpa'].dropna()

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Histogram
axes[0, 0].hist(salary_data, bins=30, edgecolor='black', alpha=0.7, color='#FF6B35')
median_sal = salary_data.median()
axes[0, 0].axvline(median_sal, color='red', linestyle='--', linewidth=2,
                   label=f'Median: ‚Çπ{median_sal:.1f}L')
axes[0, 0].set_title('Salary Distribution', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Salary (‚Çπ LPA)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Box plot by seniority
df_with_sal = df[df['avg_salary_lpa'].notna()]
sns.boxplot(data=df_with_sal, x='seniority', y='avg_salary_lpa', 
            ax=axes[0, 1], palette='Set2', order=['Junior', 'Mid-Level', 'Senior'])
axes[0, 1].set_title('Salary by Seniority Level', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Seniority Level')
axes[0, 1].set_ylabel('Salary (‚Çπ LPA)')
axes[0, 1].grid(True, alpha=0.3, axis='y')

# Salary by experience
exp_salary = df_with_sal.groupby('min_experience_clean')['avg_salary_lpa'].median().reset_index()
exp_salary = exp_salary[exp_salary['min_experience_clean'] <= 15]  # Focus on 0-15 years
axes[1, 0].plot(exp_salary['min_experience_clean'], exp_salary['avg_salary_lpa'], 
                marker='o', linewidth=2, markersize=8, color='#4ECDC4')
axes[1, 0].set_title('Median Salary by Years of Experience', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Years of Experience')
axes[1, 0].set_ylabel('Median Salary (‚Çπ LPA)')
axes[1, 0].grid(True, alpha=0.3)

# Salary by job category
cat_salary = df_with_sal.groupby('job_category')['avg_salary_lpa'].median().sort_values(ascending=False).head(8)
axes[1, 1].barh(cat_salary.index, cat_salary.values, color='#95E1D3')
axes[1, 1].set_title('Median Salary by Job Category', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Median Salary (‚Çπ LPA)')
axes[1, 1].invert_yaxis()
for i, v in enumerate(cat_salary.values):
    axes[1, 1].text(v + 0.5, i, f'‚Çπ{v:.1f}L', va='center', fontsize=9)

plt.tight_layout()
plt.savefig('outputs/plots/india/02_salary_analysis.png', dpi=300, bbox_inches='tight')
print("‚úì Saved: 02_salary_analysis.png")
plt.close()

# ========== VIZ 3: TOP COMPANIES ==========
print("Creating Visualization 3: Top Hiring Companies...")

top_companies = df['company'].value_counts().head(15)

fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.barh(top_companies.index, top_companies.values, color='#A569BD', alpha=0.8)
ax.set_title('Top 15 Companies Hiring Data Professionals in India', 
             fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Number of Job Postings', fontsize=12)
ax.invert_yaxis()

for i, v in enumerate(top_companies.values):
    ax.text(v + 1, i, str(v), va='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig('outputs/plots/india/03_top_companies.png', dpi=300, bbox_inches='tight')
print("‚úì Saved: 03_top_companies.png")
plt.close()

# ========== VIZ 4: SKILLS DEMAND ==========
print("Creating Visualization 4: Skills Analysis...")

skill_cols = [col for col in df.columns if col.startswith('skill_')]
skill_counts = {}
for col in skill_cols:
    skill_name = col.replace('skill_', '').replace('_', ' ').title()
    count = df[col].sum()
    if count > 0:
        skill_counts[skill_name] = count

top_skills = dict(sorted(skill_counts.items(), key=lambda x: x[1], reverse=True)[:12])

fig, ax = plt.subplots(figsize=(12, 8))
skills = list(top_skills.keys())
counts = list(top_skills.values())
percentages = [(c/len(df))*100 for c in counts]

bars = ax.barh(skills, counts, color='#E74C3C', alpha=0.8)
ax.set_title('Top 12 Skills in Indian Data Science Jobs', 
             fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Number of Job Postings', fontsize=12)
ax.invert_yaxis()

for i, (bar, count, pct) in enumerate(zip(bars, counts, percentages)):
    ax.text(count + 3, i, f'{count} ({pct:.1f}%)', 
            va='center', fontsize=10)

plt.tight_layout()
plt.savefig('outputs/plots/india/04_top_skills.png', dpi=300, bbox_inches='tight')
print("‚úì Saved: 04_top_skills.png")
plt.close()

# ========== VIZ 5: SENIORITY VS SALARY ==========
print("Creating Visualization 5: Seniority Analysis...")

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Seniority distribution
seniority_dist = df['seniority'].value_counts()
colors_pie = ['#3498DB', '#2ECC71', '#E74C3C']
axes[0].pie(seniority_dist.values, labels=seniority_dist.index, autopct='%1.1f%%',
            startangle=90, colors=colors_pie, textprops={'fontsize': 11, 'fontweight': 'bold'})
axes[0].set_title('Distribution of Job Levels', fontsize=14, fontweight='bold')

# Average salary by seniority
sen_salary = df_with_sal.groupby('seniority')['avg_salary_lpa'].agg(['median', 'count']).reset_index()
sen_salary = sen_salary.sort_values('median')

bars = axes[1].barh(sen_salary['seniority'], sen_salary['median'], color='#F39C12', alpha=0.8)
axes[1].set_title('Median Salary by Seniority Level', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Median Salary (‚Çπ LPA)', fontsize=12)

for i, row in sen_salary.iterrows():
    axes[1].text(row['median'] + 0.5, i, 
                f"‚Çπ{row['median']:.1f}L ({int(row['count'])} jobs)", 
                va='center', fontsize=10)

plt.tight_layout()
plt.savefig('outputs/plots/india/05_seniority_analysis.png', dpi=300, bbox_inches='tight')
print("‚úì Saved: 05_seniority_analysis.png")
plt.close()

# ========== PRINT KEY INSIGHTS ==========
print("\n" + "="*70)
print("KEY INSIGHTS FROM INDIA DATA SCIENCE JOB MARKET")
print("="*70)

print(f"\nüìä OVERALL STATISTICS:")
print(f"   Total Jobs Analyzed: {len(df)}")
print(f"   Unique Companies: {df['company'].nunique()}")
print(f"   Job Categories: {df['job_category'].nunique()}")

print(f"\nüí∞ SALARY INSIGHTS:")
print(f"   Overall Median Salary: ‚Çπ{df['avg_salary_lpa'].median():.1f} LPA")
print(f"   Overall Mean Salary: ‚Çπ{df['avg_salary_lpa'].mean():.1f} LPA")
print(f"   Salary Range: ‚Çπ{df['avg_salary_lpa'].min():.1f}L - ‚Çπ{df['avg_salary_lpa'].max():.1f}L")

print(f"\n   By Seniority Level:")
for level in ['Junior', 'Mid-Level', 'Senior']:
    level_data = df[df['seniority'] == level]['avg_salary_lpa']
    if len(level_data) > 0:
        print(f"   {level:12s}: ‚Çπ{level_data.median():.1f}L median ({len(level_data)} jobs)")

print(f"\n   By Job Category (Top 5):")
for cat in cat_salary.head(5).index:
    sal = cat_salary[cat]
    count = len(df[(df['job_category'] == cat) & (df['avg_salary_lpa'].notna())])
    print(f"   {cat:20s}: ‚Çπ{sal:.1f}L median ({count} jobs)")

print(f"\nüéØ JOB CATEGORIES (Top 5):")
for i, (cat, count) in enumerate(job_cats.head(5).items(), 1):
    pct = (count/len(df))*100
    print(f"   {i}. {cat:25s} - {count:4d} jobs ({pct:.1f}%)")

print(f"\nüõ†Ô∏è TOP 10 SKILLS:")
for i, (skill, count) in enumerate(list(top_skills.items())[:10], 1):
    pct = (count/len(df))*100
    print(f"   {i:2d}. {skill:25s} - {count:4d} jobs ({pct:.1f}%)")

print(f"\nüè¢ TOP 10 HIRING COMPANIES:")
for i, (company, count) in enumerate(top_companies.head(10).items(), 1):
    pct = (count/len(df))*100
    print(f"   {i:2d}. {company:40s} - {count:3d} jobs ({pct:.1f}%)")

print(f"\nüìà EXPERIENCE INSIGHTS:")
exp_dist = df['min_experience_clean'].value_counts().sort_index()
print(f"   0-2 years: {exp_dist[exp_dist.index <= 2].sum()} jobs")
print(f"   3-5 years: {exp_dist[(exp_dist.index >= 3) & (exp_dist.index <= 5)].sum()} jobs")
print(f"   6+ years: {exp_dist[exp_dist.index >= 6].sum()} jobs")

# Salary growth
junior_sal = df[df['seniority'] == 'Junior']['avg_salary_lpa'].median()
senior_sal = df[df['seniority'] == 'Senior']['avg_salary_lpa'].median()
if pd.notna(junior_sal) and pd.notna(senior_sal):
    growth = ((senior_sal - junior_sal) / junior_sal) * 100
    print(f"\nüí° CAREER GROWTH:")
    print(f"   Junior to Senior salary growth: +{growth:.0f}%")
    print(f"   Absolute increase: ‚Çπ{senior_sal - junior_sal:.1f}L")

print("\n" + "="*70)
print("‚úì ALL VISUALIZATIONS CREATED SUCCESSFULLY!")
print("="*70)
print("\nNext: Create Interactive Dashboard")

Analyzing 1602 Indian data science jobs...

Creating Visualization 1: Job Categories...
‚úì Saved: 01_job_categories.png
Creating Visualization 2: Salary Analysis...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_with_sal, x='seniority', y='avg_salary_lpa',


‚úì Saved: 02_salary_analysis.png
Creating Visualization 3: Top Hiring Companies...
‚úì Saved: 03_top_companies.png
Creating Visualization 4: Skills Analysis...
‚úì Saved: 04_top_skills.png
Creating Visualization 5: Seniority Analysis...
‚úì Saved: 05_seniority_analysis.png

KEY INSIGHTS FROM INDIA DATA SCIENCE JOB MARKET

üìä OVERALL STATISTICS:
   Total Jobs Analyzed: 1602
   Unique Companies: 642
   Job Categories: 4

üí∞ SALARY INSIGHTS:
   Overall Median Salary: ‚Çπ11.9 LPA
   Overall Mean Salary: ‚Çπ13.2 LPA
   Salary Range: ‚Çπ1.4L - ‚Çπ82.0L

   By Seniority Level:
   Mid-Level   : ‚Çπ9.6L median (860 jobs)
   Senior      : ‚Çπ15.0L median (742 jobs)

   By Job Category (Top 5):
   Other Data Role     : ‚Çπ24.2L median (50 jobs)
   Data Scientist      : ‚Çπ16.6L median (373 jobs)
   Data Engineer       : ‚Çπ13.5L median (430 jobs)
   Data Analyst        : ‚Çπ8.6L median (749 jobs)

üéØ JOB CATEGORIES (Top 5):
   1. Data Analyst              -  749 jobs (46.8%)
   2. Data Eng