# EDA and Preprocessing for Job Market Analysis

This notebook: 
- Loads `data/job_listings.csv`
- Explores shape, dtypes, and missing values
- Cleans the data (drops duplicate rows, imputes missing values, parses salaries into numbers, and normalizes job-description text)
- Visualizes distributions and correlations
- Saves cleaned dataset to `data/job_listings_clean.csv`

In [None]:
# Imports
import os, re, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Paths
# The project root is derived from the kernel's working directory:
#   - cwd is named 'notebooks' -> root is the grandparent of the cwd
#   - any other cwd            -> root is the parent of the cwd
# NOTE(review): in both cases this climbs one level above the cwd itself;
# confirm that this is where the `data/` folder actually lives.
_cwd_abs = os.path.abspath(os.getcwd())
_cwd_parent = os.path.dirname(_cwd_abs)
if os.path.basename(os.getcwd()) == 'notebooks':
    BASE_DIR = os.path.dirname(_cwd_parent)
else:
    BASE_DIR = _cwd_parent

# Raw input and cleaned output both live under <BASE_DIR>/data.
DATA_DIR = os.path.join(BASE_DIR, 'data')
RAW_PATH = os.path.join(DATA_DIR, 'job_listings.csv')
CLEAN_PATH = os.path.join(DATA_DIR, 'job_listings_clean.csv')
RAW_PATH, CLEAN_PATH

In [None]:
# Load data
# Fail early with an actionable message when the raw CSV is missing,
# otherwise read it into the working DataFrame.
if os.path.exists(RAW_PATH):
    df = pd.read_csv(RAW_PATH)
else:
    raise FileNotFoundError(f'Please place job_listings.csv under {DATA_DIR}')
df.head()

In [None]:
# Basic info
# Overview of the raw frame: dimensions, per-column dtypes, and the number
# of missing entries in each column.
print('Shape:', df.shape)
column_dtypes = df.dtypes
display(column_dtypes)

print('Missing values per column:')
missing_per_column = df.isna().sum()
display(missing_per_column)

In [None]:
# Drop duplicates
# Remove fully identical rows and report how many were discarded.
before = len(df)
deduplicated = df.drop_duplicates().copy()
n_dropped = before - len(deduplicated)
df = deduplicated
print('Dropped duplicates:', n_dropped)

# Standardize column names (strip surrounding whitespace from each header)
df.columns = [name.strip() for name in df.columns]
df.head(2)

In [None]:
# Salary parsing utilities
def parse_salary(value):
    """Convert a raw salary entry into a single annualized float.

    Parameters
    ----------
    value : Any
        A number, or free text such as ``'$50k - $70k'``, ``'$25/hr'`` or
        ``'4,000 per month'``.

    Returns
    -------
    float
        The annualized amount, or ``np.nan`` when the value is missing,
        contains no digits, or cannot be parsed.

    Notes
    -----
    * Numeric inputs (including numpy scalars) are returned as-is, as floats.
    * When the text contains two or more numbers (e.g. a range) their mean
      is used.
    * A ``k`` directly after a number (optionally separated by whitespace
      and not followed by another letter) multiplies that number by 1000.
      The suffix is parsed per number instead of replacing every letter
      "k" in the string, which previously turned text such as "Unknown"
      into 0.0, halved "$1,000 per week" to 500, and left "1.5k" as 1.5.
    * Hourly amounts are multiplied by 2080 (40 x 52), monthly by 12 and
      daily by 260 (5 x 52); anything else is taken as already annual.
    """
    if pd.isna(value):
        return np.nan
    try:
        if isinstance(value, (int, float, np.number)):
            return float(value)

        s = str(value).strip().lower()

        # Detect the pay period before any characters are stripped.
        is_hourly = any(u in s for u in ['/hr', 'per hour', 'hourly', ' hr', 'hour'])
        is_monthly = any(u in s for u in ['/mo', 'per month', 'monthly', ' month'])
        is_daily = any(u in s for u in ['/day', 'per day', 'daily'])

        # Drop thousands separators and currency markers only.
        s = s.replace(',', '').replace('$', '').replace('usd', '')

        # Each match is (number, 'k' or ''); the lookahead keeps the "k" of
        # an ordinary word (e.g. "50 kids") from being read as a suffix.
        nums = [
            float(number) * (1000 if suffix else 1)
            for number, suffix in re.findall(r'(\d+\.?\d*)\s*(k(?![a-z]))?', s)
        ]
        if not nums:
            return np.nan

        amount = np.mean(nums) if len(nums) >= 2 else nums[0]
        if is_hourly:
            amount *= 2080
        elif is_monthly:
            amount *= 12
        elif is_daily:
            amount *= 260
        return float(amount)
    except Exception:
        # Deliberate best-effort: an unparseable entry becomes a missing value
        # rather than aborting the whole column conversion.
        return np.nan

def clean_text(text):
    """Normalize free text to lowercase ASCII letters separated by single spaces.

    Missing values yield an empty string. Otherwise the input is stringified,
    HTML-like tags are replaced by spaces, the text is lowercased, every
    character that is not ``a-z`` or whitespace is replaced by a space, and
    runs of whitespace are collapsed and trimmed.
    """
    if pd.isna(text):
        return ''
    without_tags = re.sub(r'<[^>]+>', ' ', str(text))
    letters_only = re.sub(r'[^a-z\s]', ' ', without_tags.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()

In [None]:
# Apply cleaning
def _count_words(cleaned):
    """Number of whitespace-separated tokens in a cleaned description (0 for non-strings)."""
    return len(cleaned.split()) if isinstance(cleaned, str) else 0

# Salary text -> numeric value (only when the column exists).
if 'salary' in df.columns:
    df['salary'] = df['salary'].apply(parse_salary)

# Cleaned description plus its word count; the count defaults to 0 when the
# dataset has no description column.
if 'jobDescription' not in df.columns:
    df['desc_length'] = 0
else:
    df['jobDescription_clean'] = df['jobDescription'].apply(clean_text)
    df['desc_length'] = df['jobDescription_clean'].apply(_count_words)

# Missing values: numeric -> mean, categorical -> mode
for col in df.columns:
    column = df[col]
    if pd.api.types.is_numeric_dtype(column):
        # Cast to float first so the mean can be stored in the column.
        as_float = column.astype(float)
        df[col] = as_float.fillna(as_float.mean())
        continue
    # A column with no non-missing values has no mode; use a sentinel instead.
    modes = column.mode()
    fill_value = 'Unknown' if modes.empty else modes.iloc[0]
    df[col] = column.fillna(fill_value)

df.head(3)

In [None]:
# Visualizations
def _show_placeholder(ax, message):
    """Write a centered notice on ``ax`` when there is nothing to plot."""
    ax.text(0.5, 0.5, message, ha='center')

# Salary Distribution
fig, ax = plt.subplots(figsize=(7, 4))
if 'salary' not in df.columns:
    _show_placeholder(ax, 'No salary column found')
else:
    sns.histplot(df['salary'], bins=40, kde=True, color='#4472C4', ax=ax)
    ax.set_title('Salary Distribution')
plt.show()

# Rating Distribution
fig, ax = plt.subplots(figsize=(7, 4))
if 'rating' not in df.columns:
    _show_placeholder(ax, 'No rating column found')
else:
    sns.histplot(df['rating'], bins=30, kde=True, color='#70AD47', ax=ax)
    ax.set_title('Rating Distribution')
plt.show()

# Job Type Count (bars ordered from most to least frequent)
fig, ax = plt.subplots(figsize=(7, 4))
if 'jobType' not in df.columns:
    _show_placeholder(ax, 'No jobType column found')
else:
    job_type_order = df['jobType'].value_counts().index
    sns.countplot(y=df['jobType'], order=job_type_order, color='#ED7D31', ax=ax)
    ax.set_title('Job Type Count')
plt.show()

# Correlation Heatmap (numeric columns only)
fig, ax = plt.subplots(figsize=(8, 6))
num_df = df.select_dtypes(include=[np.number])
if num_df.empty:
    _show_placeholder(ax, 'No numeric columns for correlation')
else:
    sns.heatmap(num_df.corr(), annot=False, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Save cleaned data
# Write the cleaned frame next to the raw file, omitting the index so the
# CSV contains only the data columns.
df.to_csv(path_or_buf=CLEAN_PATH, index=False)
print('Saved cleaned data to:', CLEAN_PATH)