In [14]:
pip install pandas psycopg2-binary matplotlib wordcloud


Note: you may need to restart the kernel to use updated packages.


In [15]:
"""
job_analysis.py

- Connects to PostgreSQL or reads CSV
- Produces:
    * Word cloud of top skills/terms
    * Top countries by job listings (bar chart)
    * Min-Max salary per job_title (CSV)
    * Top companies by average salary (bar chart + CSV)
    * Salary distribution by country (boxplot)
- Output files saved in ./output/
"""

import os
import re
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud


In [16]:
# ---------- CONFIG ----------
USE_CSV = False                # Set True to load from CSV instead of DB
CSV_PATH = "sample_job_data.csv"   # used if USE_CSV=True

# PostgreSQL config (edit)
PG_CONFIG = {
#    "host": "localhost",
#    "port": 5432,
#    "dbname": "testdb",
#    "user": "********",
#    "password": "********"
     "host": "jde08-ip-p2-angbj1976-c47c.c.aivencloud.com",
     "port": 15241,
     "dbname": "Interim_Project_DB",
     "user": "********",
     "password": "********"
}

OUTPUT_DIR = "./output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [17]:
# Keyword list for skill detection (optional; used to weight common tech terms)
SKILL_KEYWORDS = [
    "python","sql","java","scala","spark","hadoop","aws","gcp","google cloud",
    "azure","kubernetes", "docker","airflow","etl","data warehouse","redshift",
    "snowflake","bigquery","dbt","kafka","nosql","mongodb","cassandra","postgresql",
    "mysql","pandas","numpy",    "tensorflow","pytorch", "powerbi","tableau","hive",
    "flink","beam","linux","bash", "s3","git","ci/cd","jenkins","databricks","kubeflow"
]
SKILL_KEYWORDS = [k.lower() for k in SKILL_KEYWORDS]

In [19]:
# ---------- LOAD DATA ----------
if USE_CSV:
    df_jobs = pd.read_csv(CSV_PATH)
    # If company table also provided as CSV, load similarly (optional)
    dim_companies = None
else:
    import psycopg2
    import sqlalchemy
    from sqlalchemy import create_engine

    conn = None
    try:
        conn = psycopg2.connect(
            host=PG_CONFIG["host"],
            port=PG_CONFIG["port"],
            dbname=PG_CONFIG["dbname"],
            user=PG_CONFIG["user"],
            password=PG_CONFIG["password"]
        )

        # Pull fact_jobs and dim_companies in one session (keeps connection open while reading)
        df_jobs = pd.read_sql("SELECT * FROM fact_jobs;", conn)
        dim_companies = pd.read_sql("SELECT company_id, company_name, country_code FROM dim_companies;", conn)

    except Exception as e:
        raise RuntimeError(f"DB read error: {e}")
    finally:
        # Close connection after we have loaded the required tables
        if conn is not None:
            conn.close()


  df_jobs = pd.read_sql("SELECT * FROM fact_jobs;", conn)
  dim_companies = pd.read_sql("SELECT company_id, company_name, country_code FROM dim_companies;", conn)


In [20]:
# ---------- PREP: standardize column names (safe matching) ----------
def find_col(df, names):
    for n in names:
        if n in df.columns:
            return n
    # case-insensitive match
    cols_lower = {c.lower(): c for c in df.columns}
    for n in names:
        if n.lower() in cols_lower:
            return cols_lower[n.lower()]
    return None

# ensure job_title, description columns
job_title_col = find_col(df_jobs, ["job_title", "title"])
desc_col = find_col(df_jobs, ["description", "job_description", "summary"])

# salary columns detection
salary_min_col = find_col(df_jobs, ["salary_min", "min_salary", "salary_from", "salary_low"])
salary_max_col = find_col(df_jobs, ["salary_max", "max_salary", "salary_to", "salary_high"])

# company_id
company_id_col = find_col(df_jobs, ["company_id"])

# Force salary columns numeric if present
if salary_min_col:
    df_jobs[salary_min_col] = pd.to_numeric(df_jobs[salary_min_col], errors='coerce')
if salary_max_col:
    df_jobs[salary_max_col] = pd.to_numeric(df_jobs[salary_max_col], errors='coerce')

# Always create _row_avg_salary so later code won't fail
def compute_row_avg(row):
    a = row.get(salary_min_col) if salary_min_col in row.index else None
    b = row.get(salary_max_col) if salary_max_col in row.index else None
    if pd.notna(a) and pd.notna(b):
        return (a + b) / 2.0
    if pd.notna(a):
        return a
    if pd.notna(b):
        return b
    return None

df_jobs["_row_avg_salary"] = df_jobs.apply(compute_row_avg, axis=1)

In [21]:
# ---------- MERGE company info (to get company_name and country_code) ----------
if company_id_col and 'dim_companies' in globals() and dim_companies is not None:
    # Note: if dim_companies not available in CSV flow you can provide it similarly
    df = df_jobs.merge(dim_companies, left_on=company_id_col, right_on='company_id', how='left', suffixes=("","_comp"))
else:
    # No dim_companies available: create placeholders
    df = df_jobs.copy()
    if 'company_name' not in df.columns:
        df['company_name'] = df.get(company_id_col).astype(str) if company_id_col else "UNKNOWN"
    if 'country_code' not in df.columns:
        df['country_code'] = None

# If country_code is null, mark as UNKNOWN
df['country_code'] = df['country_code'].fillna("UNKNOWN")


In [28]:
# ---------- 1) Word cloud: core skills from job_title + description ----------
# Combine text
text_columns = []
if job_title_col:
    text_columns.append(job_title_col)
if desc_col:
    text_columns.append(desc_col)
if not text_columns:
    # fallback to all string/object columns
    text_columns = [c for c in df.columns if df[c].dtype == object]
df["_combined_text"] = df[text_columns].fillna("").agg(" ".join, axis=1)

# Build token frequency weighted toward SKILL_KEYWORDS
token_counter = Counter()
for txt in df["_combined_text"].astype(str).str.lower():
    # count keyword occurrences
    for kw in SKILL_KEYWORDS:
        if kw in txt:
            token_counter[kw] += txt.count(kw) * 5  # weight keywords a bit higher

    # fallback tokenization for other words
    tokens = re.findall(r"[a-zA-Z\+\#\-]{2,}", txt)
    for t in tokens:
        if t in SKILL_KEYWORDS:
            continue
        token_counter[t] += 1

# reduce extremely common stopwords
stopwords = set([
    "the","and","to","of","a","in","for","with","on","is","as","are","be",
    "by","an","or","from",
    "experience","years","year","work","ability","use","using","knowledge",
    "skills","role","responsibilities",
    "required","preferred","must","will","team","based","data","engineer",
    "engineers","job","jobs"
])
for s in stopwords:
    if s in token_counter:
        del token_counter[s]

# create wordcloud from the most common tokens
if token_counter:
    wordcloud_input = {k: v for k, v in token_counter.most_common(200)}
    wc = WordCloud(width=1200, height=600, 
                   background_color="white").generate_from_frequencies(wordcloud_input)
    wc_path = os.path.join(OUTPUT_DIR, "wordcloud_skills.png")
    plt.figure(figsize=(12,6))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout()
    plt.savefig(wc_path)
    plt.close()
    print("Saved word cloud to:", wc_path)
else:
    print("No textual data found to create word cloud.")

Saved word cloud to: ./output\wordcloud_skills.png


In [23]:
# ---------- 2) Which country are most DATA engineers coming from? ----------
country_counts = df['country_code'].value_counts().reset_index()
country_counts.columns = ['country_code', 'count']
country_counts.to_csv(os.path.join(OUTPUT_DIR, "country_counts.csv"), index=False)

# bar chart (top 20)
top_country_counts = country_counts.head(20)
plt.figure(figsize=(10,6))
plt.bar(range(len(top_country_counts)), top_country_counts['count'])
plt.xticks(range(len(top_country_counts)), top_country_counts['country_code'], rotation=45, ha='right')
plt.title("Top countries by job listings")
plt.tight_layout()
country_plot_path = os.path.join(OUTPUT_DIR, "country_counts_bar.png")
plt.savefig(country_plot_path)
plt.close()
print("Saved country bar chart to:", country_plot_path)

Saved country bar chart to: ./output\country_counts_bar.png


In [24]:
# ---------- 3) Minâ€“Max Salary of each job title ----------
# We'll compute min of salary_min and max of salary_max per job_title
job_title_col_safe = job_title_col if job_title_col else 'job_title'
if job_title_col_safe not in df.columns:
    # create fallback
    df[job_title_col_safe] = "UNKNOWN"

agg_minmax = df.groupby(job_title_col_safe).agg(
    min_salary_min = pd.NamedAgg(column=salary_min_col if salary_min_col in df.columns else salary_min_col, aggfunc=lambda s: pd.to_numeric(s, errors='coerce').min()),
    max_salary_max = pd.NamedAgg(column=salary_max_col if salary_max_col in df.columns else salary_max_col, aggfunc=lambda s: pd.to_numeric(s, errors='coerce').max()),
    count = pd.NamedAgg(column=job_title_col_safe, aggfunc='count')
).reset_index().sort_values(['min_salary_min','max_salary_max'], ascending=[True, False])

# Clean column names if NaN
agg_minmax['min_salary_min'] = agg_minmax['min_salary_min'].replace({pd.NA: None})
agg_minmax['max_salary_max'] = agg_minmax['max_salary_max'].replace({pd.NA: None})

agg_minmax.to_csv(os.path.join(OUTPUT_DIR, "minmax_salary_by_job_title.csv"), index=False)
print("Saved Min-Max salary per job title to:", os.path.join(OUTPUT_DIR, "minmax_salary_by_job_title.csv"))


Saved Min-Max salary per job title to: ./output\minmax_salary_by_job_title.csv


In [25]:
# ---------- 4) Which companies offer the highest average salaries? ----------
# Use company_name where available, else company_id
company_name_col = 'company_name' if 'company_name' in df.columns else company_id_col
company_avg = df.groupby(company_name_col)['_row_avg_salary'].mean().reset_index().rename(columns={'_row_avg_salary':'avg_salary'})
company_avg = company_avg.sort_values('avg_salary', ascending=False).reset_index(drop=True)
company_avg.head(50).to_csv(os.path.join(OUTPUT_DIR, "top_companies_by_avg_salary.csv"), index=False)

# plot top 20 companies
top_companies = company_avg.head(20)
plt.figure(figsize=(12,6))
plt.bar(range(len(top_companies)), top_companies['avg_salary'])
plt.xticks(range(len(top_companies)), top_companies[company_name_col].astype(str), rotation=45, ha='right')
plt.title("Top companies by average salary (row average)")
plt.tight_layout()
companies_plot_path = os.path.join(OUTPUT_DIR, "top_companies_avg_salary.png")
plt.savefig(companies_plot_path)
plt.close()
print("Saved companies average salary chart to:", companies_plot_path)


Saved companies average salary chart to: ./output\top_companies_avg_salary.png


In [31]:
# ---------- 5) How do job salaries vary across countries? (boxplot) ----------
# Take top N countries by count to avoid overcrowding
top_n = 10
top_countries_list = country_counts.head(top_n)['country_code'].tolist()
plot_df = df[df['country_code'].isin(top_countries_list) & pd.notna(df['_row_avg_salary'])]

if not plot_df.empty:
    data_to_plot = [plot_df.loc[plot_df['country_code']==c, '_row_avg_salary'].values for c in top_countries_list]
    plt.figure(figsize=(12,6))
    plt.boxplot(data_to_plot, labels=top_countries_list)
    plt.title("Salary distribution (row avg) by country - top countries")
    plt.ylabel("Salary")
    plt.xlabel("Country")
    plt.tight_layout()
    boxplot_path = os.path.join(OUTPUT_DIR, "salary_by_country_boxplot.png")
    plt.savefig(boxplot_path)
    plt.close()
    print("Saved salary-by-country boxplot to:", boxplot_path)
else:
    print("Not enough salary data by country to draw boxplot.")

  plt.boxplot(data_to_plot, labels=top_countries_list)


Saved salary-by-country boxplot to: ./output\salary_by_country_boxplot.png


In [26]:
# ---------- SUMMARY ----------
print("Outputs written to:", os.path.abspath(OUTPUT_DIR))
print("Files:", os.listdir(OUTPUT_DIR))

Outputs written to: C:\Users\User\TP DE\output
Files: ['country_counts.csv', 'country_counts_bar.png', 'est_travel_times_all_20251202T234439Z.csv', 'est_travel_times_all_20251202T234439Z.json', 'est_travel_times_all_20251202T234506Z.csv', 'est_travel_times_all_20251202T234506Z.json', 'est_travel_times_woodlands_20251202T234439Z.csv', 'est_travel_times_woodlands_20251202T234439Z.json', 'est_travel_times_woodlands_20251202T234506Z.csv', 'est_travel_times_woodlands_20251202T234506Z.json', 'minmax_salary_by_job_title.csv', 'salary_by_country_boxplot.png', 'top_companies_avg_salary.png', 'top_companies_by_avg_salary.csv', 'wordcloud_skills.png']
