In [None]:
# 02-eda-visualizations.ipynb
# Purpose: Visual data exploration of cleaned LinkedIn job dataset

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set global style: apply seaborn's "whitegrid" theme so every figure
# created afterwards shares the same look.
sns.set(style="whitegrid")

# Load cleaned dataset. The path is relative, so it is resolved against the
# current working directory. The plots in this file read the COMPANY, TITLE,
# LOCATION, ONSITE REMOTE and POSTED DATE columns from this frame.
df = pd.read_csv("cleaned_jobs.csv")

# Ensure visuals folder exists before any savefig() call writes into it;
# exist_ok=True keeps re-runs from failing when the folder is already there.
os.makedirs("visuals", exist_ok=True)

# --- Top-10 bar charts (companies, job titles, locations) ---
def _plot_top_counts(counts, title, xlabel, ylabel, palette, out_path):
    """Draw a horizontal bar chart of value counts, save it, and display it.

    Args:
        counts: Series of counts indexed by category label, e.g. the result
            of ``Series.value_counts().head(10)``. Labels are placed on the
            y-axis and counts on the x-axis.
        title: Title shown above the chart.
        xlabel: Label for the count (x) axis.
        ylabel: Label for the category (y) axis.
        palette: Name of the seaborn palette used to colour the bars.
        out_path: Path of the image file the figure is written to.
    """
    plt.figure(figsize=(10, 6))
    # NOTE(review): seaborn 0.13 deprecates passing `palette` without `hue`
    # and emits a FutureWarning. Once the environment is confirmed to run
    # seaborn >= 0.13, pass `hue=counts.index, legend=False` here as well.
    sns.barplot(y=counts.index, x=counts.values, palette=palette)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    # Save before show(): depending on the backend, the current figure may
    # no longer be available to savefig() once it has been shown.
    plt.savefig(out_path)
    plt.show()


# --- Top 10 Companies Hiring ---
top_companies = df['COMPANY'].value_counts().head(10)
_plot_top_counts(
    top_companies,
    title='Top 10 Companies Hiring',
    xlabel='Number of Job Listings',
    ylabel='Company',
    palette='Blues_d',
    out_path='visuals/top_companies.png',
)

# --- Top 10 Job Titles ---
top_titles = df['TITLE'].value_counts().head(10)
_plot_top_counts(
    top_titles,
    title='Top 10 Job Titles',
    xlabel='Number of Listings',
    ylabel='Job Title',
    palette='Greens_d',
    out_path='visuals/top_titles.png',
)

# --- Job Count by Location ---
top_locations = df['LOCATION'].value_counts().head(10)
_plot_top_counts(
    top_locations,
    title='Top 10 Job Locations',
    xlabel='Number of Listings',
    ylabel='Location',
    palette='Oranges_d',
    out_path='visuals/top_locations.png',
)

# --- Remote vs Onsite Breakdown ---
# Number of listings per distinct value of the 'ONSITE REMOTE' column;
# value_counts() leaves out missing values by default.
remote_count = df['ONSITE REMOTE'].value_counts()

# Matplotlib's pie cycles through `colors`, so a two-entry list would give a
# third category the same colour as the first wedge it sits next to. The
# first two entries are the original colours, so a two-category chart looks
# exactly as before; the extra entries are only used when the column holds
# more categories.
wedge_colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#c2c2f0', '#ffb3e6']

plt.figure(figsize=(6,6))
remote_count.plot(kind='pie', autopct='%1.1f%%', colors=wedge_colors)
plt.title('Remote vs Onsite Jobs')
# Blank out the y-label, which pandas would otherwise fill with the series name.
plt.ylabel('')
plt.tight_layout()
plt.savefig('visuals/remote_vs_onsite.png')
plt.show()

# --- Job Posting Trend Over Time ---
# Parse the posting dates in place; entries that cannot be parsed become NaT
# (errors='coerce') instead of raising.
df['POSTED DATE'] = pd.to_datetime(df['POSTED DATE'], errors='coerce')

# Count postings per calendar day, then order the counts chronologically.
posting_days = df['POSTED DATE'].dt.date
daily_counts = posting_days.value_counts()
posting_trend = daily_counts.sort_index()

plt.figure(figsize=(12, 6))
# The plot is drawn on the figure created just above; the returned Axes is
# used to set the title and axis labels.
trend_ax = posting_trend.plot.line(marker='o', color='purple')
trend_ax.set_title('Job Posting Trend Over Time')
trend_ax.set_xlabel('Date')
trend_ax.set_ylabel('Number of Postings')
# Tilt the date labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('visuals/posting_trend.png')
plt.show()
