# Job Market Analyzer - Analysis Demo

This notebook demonstrates interactive analysis and visualization of job market data.

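**Prerequisites:** Run the full pipeline first (`make run` or `run_all.sh`). This notebook reads `../data/jobs_with_skills.csv` and, when present, `skill_network.graphml`, `trends.csv`, and `trends_growth.csv` from `../exports/`.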

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot defaults applied to every figure in this notebook:
# seaborn's white grid theme and a 12x6 inch default canvas.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries loaded successfully!")

## 1. Load Data

In [None]:
# Read the processed postings table and show a quick preview:
# the row count, the first ten column names, and the first rows.
jobs_path = '../data/jobs_with_skills.csv'
jobs_df = pd.read_csv(jobs_path)

column_preview = jobs_df.columns[:10].tolist()
print(f"Loaded {jobs_df.shape[0]} job postings")
print(f"Columns: {column_preview}...")
jobs_df.head()

## 2. Skill Distribution Analysis

In [None]:
# Histogram of the `num_skills` column with a dashed line at its mean.
skills_per_job = jobs_df['num_skills']
mean_skills = skills_per_job.mean()  # reused for the marker, legend and printout

plt.figure(figsize=(10, 5))
plt.hist(skills_per_job, bins=20, edgecolor='black', alpha=0.7)
plt.axvline(mean_skills, color='red', linestyle='--', label=f"Mean: {mean_skills:.1f}")
plt.xlabel('Number of Skills per Job')
plt.ylabel('Frequency')
plt.title('Distribution of Skills per Job Posting')
plt.legend()
plt.show()

print(f"Average skills per job: {mean_skills:.2f}")
print(f"Median skills per job: {skills_per_job.median():.0f}")

In [None]:
# Tally every skill mentioned in `extracted_skills` (a ';'-separated string
# per posting) and chart the 20 with the highest counts.
skill_counts = {}
for raw_value in jobs_df['extracted_skills']:
    # Skip missing (NaN) and empty entries.
    if pd.isna(raw_value) or not raw_value:
        continue
    for token in str(raw_value).split(';'):
        name = token.strip()
        if name:
            skill_counts[name] = skill_counts.get(name, 0) + 1

skill_table = pd.DataFrame(list(skill_counts.items()), columns=['Skill', 'Count'])
top_skills = skill_table.sort_values('Count', ascending=False).head(20)

plt.figure(figsize=(12, 6))
plt.barh(top_skills['Skill'], top_skills['Count'], color='steelblue')
plt.gca().invert_yaxis()  # largest bar at the top
plt.xlabel('Number of Job Postings')
plt.title('Top 20 Most Demanded Skills')
plt.tight_layout()
plt.show()

print("\nTop 10 Skills:")
print(top_skills.head(10).to_string(index=False))

## 3. Geographic Analysis

In [None]:
# Count postings per `location_normalized` value and chart the ten largest.
location_counts = jobs_df['location_normalized'].value_counts().head(10)

plt.figure(figsize=(10, 6))
ax = location_counts.plot.bar(color='coral')  # draws on the figure created above
ax.set_xlabel('City')
ax.set_ylabel('Number of Jobs')
ax.set_title('Top 10 Cities by Job Postings')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print("\nTop 5 Cities:")
print(location_counts.head())

In [None]:
# Jobs by tier and region: two side-by-side pie charts.
# Both columns are optional. When one is missing, its panel is blanked and
# labelled instead of being left as an empty framed axes.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

if 'city_tier' in jobs_df.columns:
    tier_counts = jobs_df['city_tier'].value_counts()
    axes[0].pie(tier_counts.values, labels=tier_counts.index, autopct='%1.1f%%', startangle=90)
    axes[0].set_title('Jobs by City Tier')
else:
    axes[0].axis('off')
    axes[0].text(0.5, 0.5, "'city_tier' column not available", ha='center', va='center')

if 'region' in jobs_df.columns:
    region_counts = jobs_df['region'].value_counts()
    axes[1].pie(region_counts.values, labels=region_counts.index, autopct='%1.1f%%', startangle=90)
    axes[1].set_title('Jobs by Region')
else:
    axes[1].axis('off')
    axes[1].text(0.5, 0.5, "'region' column not available", ha='center', va='center')

plt.tight_layout()
plt.show()

## 4. Skill Co-occurrence Network

In [None]:
# Load the skill co-occurrence graph (if available) and draw the subgraph
# induced by its 15 highest-degree nodes.
network_path = Path('../exports/skill_network.graphml')

# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every Restart & Run All.
LAYOUT_SEED = 42

if network_path.exists():
    G = nx.read_graphml(network_path)

    # Rank nodes by degree in the full graph and keep the top 15
    degrees = dict(G.degree())
    top_nodes = sorted(degrees, key=degrees.get, reverse=True)[:15]
    subgraph = G.subgraph(top_nodes)

    # Visualize
    plt.figure(figsize=(14, 10))
    pos = nx.spring_layout(subgraph, k=2, iterations=50, seed=LAYOUT_SEED)

    # Node sizes scale with degree in the full graph, not the subgraph
    node_sizes = [degrees[node] * 100 for node in subgraph.nodes()]

    nx.draw_networkx_nodes(subgraph, pos, node_size=node_sizes, node_color='lightblue', alpha=0.7)
    nx.draw_networkx_edges(subgraph, pos, alpha=0.3, width=2)
    nx.draw_networkx_labels(subgraph, pos, font_size=10, font_weight='bold')

    plt.title('Skill Co-occurrence Network (Top 15 Skills)', fontsize=16)
    plt.axis('off')
    plt.tight_layout()
    plt.show()

    print(f"Network has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
else:
    print("Network file not found. Run skill_cooccurrence.py first.")

## 5. Trend Analysis

In [None]:
# Plot per-period counts for the five skills with the largest summed `count`
# in the trends export.
trends_path = Path('../exports/trends.csv')

if not trends_path.exists():
    print("Trends file not found. Run trend_analysis.py first.")
else:
    trends_df = pd.read_csv(trends_path)

    # Sum counts per skill, order descending, keep the first five
    skill_totals = (
        trends_df.groupby('skill')['count']
        .sum()
        .sort_values(ascending=False)
        .head(5)
    )
    top_5_skills = skill_totals.index.tolist()

    plt.figure(figsize=(14, 6))

    # One line per skill, with its rows ordered by period
    for skill in top_5_skills:
        is_skill = trends_df['skill'] == skill
        skill_data = trends_df.loc[is_skill].sort_values('period')
        plt.plot(skill_data['period'], skill_data['count'], marker='o', label=skill, linewidth=2)

    plt.title('Top 5 Skills Trend Over Time')
    plt.xlabel('Time Period')
    plt.ylabel('Mentions')
    plt.legend()
    plt.xticks(rotation=45, ha='right')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
# Load growth metrics and chart the ten skills with the highest positive
# growth rate.
growth_path = Path('../exports/trends_growth.csv')

if growth_path.exists():
    growth_df = pd.read_csv(growth_path)

    # Emerging skills: sort explicitly so "top 10" does not depend on the row
    # order of the CSV. A stable sort keeps the file's order among ties.
    emerging = (
        growth_df[growth_df['growth_rate'] > 0]
        .sort_values('growth_rate', ascending=False, kind='stable')
        .head(10)
    )

    if emerging.empty:
        print("No skills with positive growth found.")
    else:
        plt.figure(figsize=(10, 6))
        # growth_rate is multiplied by 100 to display it as a percentage
        plt.barh(emerging['skill'], emerging['growth_rate'] * 100, color='green', alpha=0.7)
        plt.xlabel('Growth Rate (%)')
        plt.title('Top 10 Emerging Skills (Highest Growth)')
        plt.gca().invert_yaxis()  # highest growth at the top
        plt.tight_layout()
        plt.show()

        print("\nTop Emerging Skills:")
        print(emerging[['skill', 'growth_rate', 'total_mentions']].head())
else:
    print("Growth metrics not found.")

## 6. Salary Analysis

In [None]:
# Salary distribution: histogram of the min/max midpoint, plus a box plot of
# that midpoint for the five locations with the most salaried postings.
# Values are divided by 100000 for display in Lakhs.
if 'salary_min' in jobs_df.columns and 'salary_max' in jobs_df.columns:
    has_both_bounds = jobs_df['salary_min'].notna() & jobs_df['salary_max'].notna()
    # Take a copy so adding `avg_salary` below is not a chained assignment on
    # a slice of jobs_df (which triggers SettingWithCopyWarning).
    valid_salaries = jobs_df[has_both_bounds].copy()

    if len(valid_salaries) > 0:
        valid_salaries['avg_salary'] = (valid_salaries['salary_min'] + valid_salaries['salary_max']) / 2

        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Histogram
        axes[0].hist(valid_salaries['avg_salary'] / 100000, bins=20, edgecolor='black', alpha=0.7)
        axes[0].set_xlabel('Average Salary (Lakhs)')
        axes[0].set_ylabel('Frequency')
        axes[0].set_title('Salary Distribution')

        # Box plot by location
        top_locations = valid_salaries['location_normalized'].value_counts().head(5).index
        location_salary_data = [
            valid_salaries.loc[valid_salaries['location_normalized'] == loc, 'avg_salary'] / 100000
            for loc in top_locations
        ]

        # Label the boxes through the tick labels rather than boxplot's
        # `labels=` keyword, which newer matplotlib releases deprecate.
        axes[1].boxplot(location_salary_data)
        axes[1].set_xticklabels(top_locations)
        axes[1].set_ylabel('Average Salary (Lakhs)')
        axes[1].set_title('Salary by Top 5 Cities')
        axes[1].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

        print(f"\nMedian Salary: ₹{valid_salaries['avg_salary'].median() / 100000:.2f} Lakhs")
        print(f"Mean Salary: ₹{valid_salaries['avg_salary'].mean() / 100000:.2f} Lakhs")
    else:
        # Columns exist but no row has both bounds filled in
        print("Salary data not available")
else:
    print("Salary data not available")

## 7. Experience Requirements

In [None]:
# Experience distribution: one histogram bin per whole year of required
# experience, followed by the median and mean.
if 'experience_years' in jobs_df.columns:
    valid_exp = jobs_df[jobs_df['experience_years'].notna()]

    if valid_exp.empty:
        print("Experience data not available")
    else:
        # Extend the bins to the data's maximum (never fewer than 0-10) so
        # postings above 10 years are not silently dropped from the chart
        # while still counted in the statistics printed below.
        max_years = max(10, int(np.ceil(valid_exp['experience_years'].max())))
        # Edges 0..max_years+1 give every whole year, including the largest,
        # its own bin.
        bin_edges = range(0, max_years + 2)

        plt.figure(figsize=(10, 5))
        plt.hist(valid_exp['experience_years'], bins=bin_edges, edgecolor='black', alpha=0.7, color='purple')
        plt.xlabel('Years of Experience Required')
        plt.ylabel('Number of Jobs')
        plt.title('Distribution of Experience Requirements')
        plt.xticks(bin_edges)
        plt.tight_layout()
        plt.show()

        print(f"\nMedian Experience: {valid_exp['experience_years'].median():.0f} years")
        print(f"Mean Experience: {valid_exp['experience_years'].mean():.1f} years")

## 8. Summary Statistics

In [None]:
# Text summary of the dataset. Uses `skill_counts` and `top_skills` built in
# the "Top 20 most demanded skills" cell, so that cell must run first.
print("=" * 60)
print("JOB MARKET ANALYSIS SUMMARY")
print("=" * 60)
print(f"\nTotal Job Postings: {len(jobs_df)}")
print(f"Unique Skills: {len(skill_counts)}")
print(f"Unique Locations: {jobs_df['location_normalized'].nunique()}")

# Parse dates before taking min/max: comparing the raw strings is
# lexicographic and only correct for ISO-formatted dates. Unparseable
# values become NaT and are ignored.
if 'posted_date' in jobs_df.columns:
    posted_dates = pd.to_datetime(jobs_df['posted_date'], errors='coerce').dropna()
else:
    posted_dates = pd.Series([], dtype='datetime64[ns]')

if posted_dates.empty:
    print("Date Range: not available")
else:
    print(f"Date Range: {posted_dates.min().date()} to {posted_dates.max().date()}")

if 'title_normalized' in jobs_df.columns:
    print("\nTop 5 Job Titles:")
    for title, count in jobs_df['title_normalized'].value_counts().head(5).items():
        print(f"  {title}: {count}")

print("\nTop 5 Skills:")
for row in top_skills.head(5).itertuples(index=False):
    print(f"  {row.Skill}: {row.Count}")

print("\n" + "=" * 60)

## Next Steps

1. **Tableau**: Import CSV files from `exports/tableau_ready/` for interactive dashboards
2. **Deep Dive**: Explore specific skills, locations, or time periods in detail
3. **Custom Analysis**: Add your own visualizations and insights
4. **Export**: Save figures for reports using `plt.savefig()`

For more information, see the project README.