In [None]:
import numpy as np
import pandas as pd
from deep_translator import GoogleTranslator
import matplotlib.pyplot as plt

In [None]:
# Location of the raw CSV export, relative to the notebook's working directory.
RAW_DATA_PATH = "../data/python/2026-02-19_18-28-09/raw_data.csv"
df = pd.read_csv(RAW_DATA_PATH)

# Data cleaning

In [None]:
# Preview the first rows of the frame as loaded from the CSV.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Remove the columns that the visible cleaning and plotting steps never read.
# The result is rebound instead of mutated in place, and `errors="ignore"`
# makes the cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=["url", "company_url", "description"], errors="ignore")

In [None]:
# Preview the frame again after the column drop.
df.head()

In [None]:
# Group 1 is the first run of digits in the salary text; group 2 is the next
# run of digits, if there is one. The result has two columns (0 and 1); a group
# that did not match is NaN.
SALARY_NUMBERS_PATTERN = r"(\d+)\D*(\d+)?"
salary_nums = (
    df["salary"]
    .str.extract(SALARY_NUMBERS_PATTERN, expand=True)
    .astype("float64")
)

In [None]:
# Flag salaries that start with "від" or "до"; the next cell uses these masks
# to keep only a lower bound ("від") or only an upper bound ("до").
# `na=False` makes rows with a missing salary evaluate to False instead of NaN,
# so both masks are strictly boolean, which `np.select` requires of its
# condition arrays.
salary_from = df["salary"].str.startswith("від", na=False)
salary_to = df["salary"].str.startswith("до", na=False)

In [None]:
# The two numeric captures taken from the salary text.
first_number = salary_nums[0]
second_number = salary_nums[1]

# Conditions are evaluated in order: "до ..." rows first, then "від ..." rows;
# everything else falls through to `default`.
salary_conditions = [salary_to, salary_from]

# "до X"  -> no lower bound;  "від X" -> X is the lower bound;  otherwise the
# first number is the lower bound.
df["lower_salary"] = np.select(
    salary_conditions,
    [np.nan, first_number],
    default=first_number,
)

# "до X"  -> X is the upper bound;  "від X" -> no upper bound;  otherwise the
# second number is the upper bound.
df["upper_salary"] = np.select(
    salary_conditions,
    [first_number, np.nan],
    default=second_number,
)

In [None]:
# Rescale suspiciously small salary values: anything <= 10 is multiplied by
# 1000 first, then anything still <= 100 is multiplied by 100.
# NOTE(review): presumably this corrects salaries written in abbreviated form;
# the thresholds and factors are magic numbers - confirm against the raw data.
for threshold, factor in ((10, 1000), (100, 100)):
    for salary_column in ("lower_salary", "upper_salary"):
        df.loc[df[salary_column] <= threshold, salary_column] *= factor

In [None]:
# Ukrainian month names in calendar order; each is mapped to its zero-padded
# month number ("01" .. "12") so that dates can be parsed numerically.
_month_names_in_order = [
    "січня",
    "лютого",
    "березня",
    "квітня",
    "травня",
    "червня",
    "липня",
    "серпня",
    "вересня",
    "жовтня",
    "листопада",
    "грудня",
]
ukr_months = {
    month_name: f"{month_number:02d}"
    for month_number, month_name in enumerate(_month_names_in_order, start=1)
}

In [None]:
# Substitute each Ukrainian month name with its number, then parse the
# resulting "day month year" text into datetimes.
df["date"] = pd.to_datetime(
    df["date"].replace(ukr_months, regex=True),
    format="%d %m %Y",
)

In [None]:
# One indicator column per technology, named "tech_<technology>".
# NOTE(review): the separator is a bare comma; if the raw values contain spaces
# after commas the tokens keep that leading space - confirm against the data.
tech_dummies = (
    df["technologies"]
    .str.get_dummies(sep=",")
    .add_prefix("tech_")
)
df = pd.concat(objs=[df, tech_dummies], axis="columns")

In [None]:
# "salary" has been expanded into lower_salary / upper_salary and
# "technologies" into tech_* indicator columns, so the raw text columns are
# removed. Rebinding with `errors="ignore"` keeps this cell re-runnable: a
# second execution no longer raises KeyError because the columns are gone.
df = df.drop(columns=["salary", "technologies"], errors="ignore")

In [None]:
# Translate every distinct location string once (Ukrainian -> English) and map
# the translations back onto the column. Missing locations are skipped here and
# stay missing after the mapping.
translator = GoogleTranslator(source="uk", target="en")
unique_locations = df["location"].dropna().unique()
translations = list(map(translator.translate, unique_locations))
translations_map = dict(zip(unique_locations, translations))
df["location"] = df["location"].map(translations_map)

In [None]:
# Rows without a location get the placeholder "unknown".
df["location"] = df["location"].fillna(value="unknown")

In [None]:
# One indicator column per location, named "loc_<location>"; a row listing
# several locations separated by ", " sets several indicators.
location_dummies = (
    df["location"]
    .str.get_dummies(sep=", ")
    .add_prefix("loc_")
)
df = pd.concat(objs=[df, location_dummies], axis="columns")

In [None]:
# Number of missing values in each column.
nan_count = df.isnull().sum()
nan_count

In [None]:
# Preview the frame after the cleaning steps above.
df.head()

In [None]:
# List the distinct upper-salary values as a sanity check.
df["upper_salary"].unique()

# Data plotting

In [None]:
# Vacancies per required-experience value, ordered by experience.
experience_counts = df["experience_years"].value_counts(dropna=True).sort_index()

# Text labels make the bars evenly spaced categories instead of a numeric axis.
x_labels = [str(round(years, 2)) for years in experience_counts.index]
y_values = experience_counts.values

fig, ax = plt.subplots(figsize=(12, 8))

ax.bar(x_labels, y_values, color="lightgreen", edgecolor="black", zorder=2)
ax.grid(axis="y", linestyle="--", alpha=0.7)

ax.set_title("Count of Vacancies With Certain Years of Experience")
ax.set_ylabel("Number of Vacancies")
ax.set_xlabel("Years of Experience")

plt.show()

In [None]:
# Vacancies per publishing date, in chronological order.
date_counts = df["date"].value_counts(dropna=True).sort_index()

# Label each bar with the calendar date only (no time component).
x_labels = [str(timestamp.date()) for timestamp in date_counts.index]
y_values = date_counts.values

fig, ax = plt.subplots(figsize=(12, 8))

ax.bar(x_labels, y_values, color="lightgreen", edgecolor="black", zorder=2)
ax.grid(axis="y", linestyle="--", alpha=0.7)

ax.set_title("Count of Vacancies by Publishing Date")
ax.set_ylabel("Number of Vacancies")
ax.set_xlabel("Date of Publishing")
# Tilt the date labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=55)

plt.show()

In [None]:
# Vacancies per weekday of publication.
day_names_count = df["date"].dt.day_name().value_counts()

# Put the weekdays in calendar order. `fill_value=0` gives weekdays with no
# postings an explicit count of 0; without it they become NaN and the whole
# series is converted to floats.
days_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
day_names_count = day_names_count.reindex(days_order, fill_value=0)

plt.figure(figsize=(12, 8))

plt.bar(day_names_count.index, day_names_count, color="lightgreen", edgecolor="black", zorder=2)
plt.grid(axis="y", linestyle="--", alpha=0.7)

plt.title("Count of Vacancies By Publishing Day")
plt.ylabel("Number of Vacancies")
plt.xlabel("Day of Publishing")

plt.show()

In [None]:
# All columns whose name contains "loc_" (the location indicator columns).
locations = df.filter(like="loc_")
# Row count of the frame, i.e. the number of vacancies.
total_vacancies = len(locations)
# Column sums count the vacancies per location; keep the ten largest.
top_10_locations = locations.sum().nlargest(n=10)


def custom_pct(values):
    """Build an ``autopct`` callback that labels a wedge with share and count.

    Parameters
    ----------
    values : iterable of numbers
        The wedge sizes given to the pie chart. The iterable is summed exactly
        once, when this function is called, so a one-shot iterator works too.

    Returns
    -------
    callable
        A function that takes a percentage ``pct`` and returns the text
        ``"<pct with two decimals>% (<count>)"``, where ``count`` is ``pct``
        percent of the total, rounded to the nearest integer.
    """
    # The total is the same for every wedge, so compute it once here instead
    # of re-summing `values` each time a label is formatted.
    total = sum(values)

    def format_string(pct):
        # int() guarantees a plain integer in the label even when round()
        # is applied to a NumPy scalar.
        count = int(round(pct * total / 100))
        return f"{pct:.2f}% ({count})"

    return format_string


fig, ax = plt.subplots(figsize=(10, 8))

# Remove the "loc_" marker from the column names to get readable wedge labels.
location_names = top_10_locations.index.str.replace("loc_", "")

# Each wedge shows its share of the plotted total plus the absolute count.
ax.pie(
    top_10_locations,
    labels=location_names,
    autopct=custom_pct(top_10_locations),
    pctdistance=0.8,
)

ax.set_title(f"Top 10 Work Locations out of {total_vacancies} Vacancies")

plt.show()

In [None]:
# Frequency of each distinct lower-salary value.
df["lower_salary"].value_counts()

In [None]:
# Bin edges shared by both salary bounds; the last bin is open-ended.
bins = [0, 500, 1000, 1500, 2000, 3000, 4000, 5000, 7000, np.inf]

# One label per interval between consecutive edges.
labels = [
    "0-500", "500-1000", "1000-1500", "1500-2000",
    "2000-3000", "3000-4000", "4000-5000", "5000-7000", "7000+"
]

# Lower bounds use left-closed intervals [a, b); upper bounds use right-closed
# intervals (a, b]. A value sitting exactly on an edge therefore goes to the
# bin above it for lower_salary and to the bin below it for upper_salary.
for source_column, target_column, right_closed in (
    ("lower_salary", "lower_salary_range", False),
    ("upper_salary", "upper_salary_range", True),
):
    df[target_column] = pd.cut(
        df[source_column],
        bins=bins,
        labels=labels,
        right=right_closed,
    )

In [None]:
# Vacancies per salary range, kept in category order rather than sorted by count.
lower_counts = df["lower_salary_range"].value_counts(sort=False)
upper_counts = df["upper_salary_range"].value_counts(sort=False)

bar_width = 0.4
x_indexes = np.arange(len(lower_counts.index))

fig, ax = plt.subplots(figsize=(12, 8))

# Two bars per range, shifted half a bar width to either side of the tick.
for offset, counts, bar_color, bar_label in (
    (-bar_width / 2, lower_counts, "lightblue", "Lower Salary"),
    (bar_width / 2, upper_counts, "lightgreen", "Upper Salary"),
):
    ax.bar(
        x_indexes + offset,
        counts.values,
        width=bar_width,
        color=bar_color,
        edgecolor="black",
        label=bar_label,
        zorder=2,
    )

ax.set_title("Salary Ranges: Lower vs. Upper")
ax.set_ylabel("Number of Vacancies")
ax.set_xlabel("Salary ($)")

ax.set_xticks(x_indexes)
ax.set_xticklabels(lower_counts.index, rotation=45)

ax.grid(axis="y", linestyle="--", alpha=0.7)
ax.legend()

plt.show()

In [None]:
# All columns whose name contains "tech_" (the technology indicator columns).
technologies = df.filter(like="tech_")
# Row count of the frame; not used by the plot below.
total_technologies = len(technologies)
# Column sums count the vacancies mentioning each technology; keep the top 20.
top_20_technologies = technologies.sum().nlargest(n=20)

fig, ax = plt.subplots(figsize=(12, 8))

# Remove the "tech_" marker from the column names to get readable bar labels.
technology_names = top_20_technologies.index.str.replace("tech_", "")

ax.bar(
    technology_names,
    top_20_technologies,
    color="lightgreen",
    edgecolor="black",
    zorder=2,
)
ax.grid(axis="y", linestyle="--", alpha=0.7)

ax.set_title("Top 20 Technologies by Mentions")
ax.set_ylabel("Number of Mentions")
ax.set_xlabel("Technology")
# Tilt the technology names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=35)

plt.show()