In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random


Start DataScience

Day 1: Основи

In [None]:
# Location of the Excel export that the next cell loads with pd.read_excel.
# NOTE(review): hard-coded absolute Windows path — not portable across machines;
# consider a project-relative path or a configurable data directory.
data_path = r"C:\Users\Afina\projects\parsers\Job-parser-for-work.ua\data\IT_KYIV_306.xlsx"

In [None]:
# Read the spreadsheet at data_path into the working DataFrame used by the cells below.
df = pd.read_excel(data_path)

In [None]:
# Summary statistics of the loaded table (displayed as the cell output).
df.describe()

In [None]:
# Print the column overview (dtypes and non-null counts) to spot missing data early.
df.info()

In [None]:
# Independent copy of the vacancies whose "ai score" is at least 15.
top_jobs = df.loc[df["ai score"].ge(15)].copy()
top_jobs.head()

In [None]:
# Rows that belong to the "IT; 10–50 співробітників" class AND score above 13.
data_science_mask = df["class"].eq("IT; 10–50 співробітників") & df["ai score"].gt(13)
data_science = df.loc[data_science_mask]
data_science.head()

In [None]:
# Display the filtered vacancies ordered from the lowest to the highest "ai score"
# (ascending is the sort_values default).
data_science.sort_values("ai score")

Day 2: Маніпуляція з даними

In [None]:
# Keep only vacancies where both salary bounds are present, then add their midpoint.
avg_salare_mask = df[["min salary", "max salary"]].notna().all(axis=1)
avg_salare = df.loc[avg_salare_mask].copy()
avg_salare["avg salary"] = avg_salare["min salary"].add(avg_salare["max salary"]).div(2)
avg_salare.head()

In [None]:
# Boolean flag per vacancy: does the skills text mention "python" (case-insensitive)?
# Rows with missing skills are treated as "no match" (na=False).
is_python = df["skills"].str.contains("python", case=False, na=False)
print(is_python.sum())

# Mean AI score over the Python-related vacancies only.
python_df = df.loc[is_python]
avg_ai_score_puthon = python_df["ai score"].mean()
print(f"avg_ai_score_puthon: {avg_ai_score_puthon}")
python_df.head()

In [None]:
# New frame = df plus the Python flag and the salary midpoint.
# Missing salaries are deliberately NOT zero-filled, so "avg salary" stays NaN
# whenever either bound is missing.
python_job = df.assign(
    is_python=is_python,
    **{"avg salary": (df["min salary"] + df["max salary"]) / 2},
)
python_job.tail()

In [None]:
# Mean AI score and number of vacancies for every (company class, is_python) pair.
# The result is indexed by a (class, is_python) MultiIndex.
comparison = python_job.groupby(["class", "is_python"])["ai score"].agg(["mean", "count"])

# Wide report: one row per class, one "mean" column per is_python value.
# reset_index() yields a temporary frame in which the group keys are ordinary
# columns, so pivot_table can address them by name; `comparison` itself is left
# untouched (still MultiIndexed) for the cells that use it later.
pivot_report = comparison.reset_index().pivot_table(
    values=["mean"],
    columns=["is_python"],
    index=["class"],
    aggfunc="mean"
)
pivot_report.style.background_gradient(cmap='Greens')

Очищення чисел від букв

In [None]:
# Fill the gaps column by column; fillna returns a new frame, so df is not modified.
# NOTE(review): zero-filling missing salaries pulls the per-group means below
# towards 0 — confirm this is the intended treatment.
clear_salary = df.fillna({
    "min salary": 0,
    "max salary": 0,
    "skills": "Not specified",
    "ai score": 0,
})

# Mean max/min salary per company class (rows) and AI score (columns);
# class/score combinations with no vacancies are shown as 0.
pivot_report = clear_salary.pivot_table(
    index="class",
    columns="ai score",
    values=["max salary", "min salary"],
    aggfunc="mean",
).fillna(0)
pivot_report.style.background_gradient(cmap='Greens', low=0.1)

Day 3: Очищення та робота з реальними даними

In [None]:
# Copy of df in which "position" is converted to numbers;
# values that cannot be parsed become NaN (errors="coerce").
position_to_null = df.assign(position=pd.to_numeric(df["position"], errors="coerce"))


In [None]:
# Share of all counted vacancies (in percent) that falls into each (class, is_python) group.
# NOTE: this adds a column to `comparison` in place; the bar plot further down reads it.
comparison["popularity_share"] = comparison["count"] / comparison["count"].sum() * 100

# Show the groups ordered by the Python flag, Python vacancies (True) first.
comparison.sort_values(by="is_python", ascending=False)


Day 4

In [None]:
# Horizontal bars: market share of each company class, split by the is_python flag.
plt.figure(figsize=(12,6))
sns.barplot(data=comparison, x="popularity_share", y="class", hue="is_python")
plt.title('Частка вакансій за класом компанії та наявністю Python')
plt.xlabel('Частка ринку (%)')
plt.ylabel('Тип компанії')

# plt.show must be *called*; a bare `plt.show` only echoes the function object.
plt.show()

In [None]:
# Horizontal bars: mean AI score per company class, split by the is_python flag.
plt.figure(figsize=(12, 5))
sns.barplot(
    data=comparison,
    y="class",
    x="mean",
    hue="is_python",
    palette="magma",
)
plt.gca().set_title('Mean вакансій за класом компанії та наявністю Python')
plt.gca().set_xlabel('Mean')
plt.gca().set_ylabel('Тип компанії')
plt.show()

In [None]:
# Distribution of AI score for Python vs non-Python vacancies.
# hue duplicates x only to colour the two boxes; the redundant legend is hidden.
plt.figure(figsize=(12, 5))
sns.boxplot(
    data=python_job,
    x="is_python",
    y="ai score",
    hue="is_python",
    palette="Set2",
    legend=False,
)
plt.gca().set_title('Розкид AI Score: Python vs Інші')
plt.gca().grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Average salary per company class, coloured by whether the vacancy mentions Python.
# NOTE(review): the previous scatterplot call did not parse (`y="class"1`). The
# mapping below is reconstructed from the y-axis label (average salary) and the
# legend title (Python flag) — confirm that x="class" is the intended axis.
plt.figure(figsize=(15,7))
sns.scatterplot(data=python_job, x="class", y="avg salary", hue="is_python")
plt.xlabel('Тип компанії')
plt.ylabel('Середня зарплата')
plt.legend(title='Знає Python?')
plt.show()

Day 5

Практика 1

In [None]:
# Synthetic coffee-shop sales log: 100 hourly rows with a random drink, a price
# given as text ("$" prefix, decimal comma) and a quantity from 1 to 4.
# A seeded local generator makes the sample identical on every Restart & Run All.
rng = np.random.default_rng(42)
data = {
    "date": pd.date_range("01/01/2026", periods=100, freq="h"),
    "item": rng.choice(['Latte', 'Cappuccino', 'Espresso'], 100),
    "price": rng.choice(['$5,0', '$4,5', '$3,0'], 100),
    "quantity": rng.integers(1, 5, 100),  # upper bound is exclusive
}
# NOTE: this rebinds `df`, which the earlier cells used for the vacancies table.
df = pd.DataFrame(data)

Завдання 1: очищення даних

In [None]:
# Turn price text such as "$4,5" into the number 4.5:
#   * astype(str) keeps the cell re-runnable once the column is already numeric;
#   * "$" is stripped as a literal character (regex=False), not as a regex anchor;
#   * the decimal comma becomes a decimal point — deleting it would turn "$4,5" into 45.
# Anything that still fails to parse is coerced to NaN and then set to 0.
df["price"] = pd.to_numeric(
    df["price"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .str.replace(",", ".", regex=False),
    errors="coerce",
).fillna(0)

In [None]:
# Revenue of each row: unit price times number of drinks sold.
df["total_sales"] = df["price"].mul(df["quantity"])
df.head()

Завдання 2: знайти найприбутковіший напій

In [None]:
# Total revenue per drink, ranked from the best seller down.
drink_profit = df.groupby("item")["total_sales"].sum()
best = drink_profit.sort_values(ascending=False)

# Report the top drink together with its revenue.
print(f"{best.idxmax()} : {best.max()}")

In [None]:
# Time series of 5-hour buckets: mean of every numeric column within each bucket.
ts = (
    df.set_index("date")
    .resample("5h")
    .mean(numeric_only=True)
)
ts.head()

In [None]:
# Line chart of the mean total_sales per 5-hour bucket (one marker per bucket).
sns.set_theme(style="darkgrid")
plt.figure(figsize=(12, 6))
sns.lineplot(
    data=ts,
    x=ts.index,
    y="total_sales",
    marker="o",
)
plt.gca().set_title("Середній виторг кожні 5 годин")
plt.gca().set_ylabel("Виторг ($)")
plt.gca().set_xlabel("Дата та час")
plt.show()

Day 6: об'єднання таблиць

In [None]:
# Synthetic daily sales: 100 rows with a random drink, a "$"-prefixed price string
# and a quantity from 1 to 4. The seeded local generator keeps the sample
# reproducible across kernel restarts.
rng = np.random.default_rng(42)
data = {
    "date": pd.date_range("2026-01-01", periods=100, freq="D"),
    "item": rng.choice(["late", "cappuccino", "espresso"], 100),
    "price": rng.choice(["$5.0", "$6.0", "$7.0"], 100),
    "quantity": rng.integers(1, 5, 100),  # upper bound is exclusive
}

# Discount rate per drink; drinks that are not listed get no discount after the merge.
discount = {
    "item": ["late", "espresso"],
    "discount": [0.15, 0.05]
}

df_data = pd.DataFrame(data)
# Strip the literal "$" (regex=False: "$" must not be read as a regex anchor),
# then parse; unparseable prices become 0.
df_data["price"] = pd.to_numeric(
    df_data["price"].str.replace("$", "", regex=False), errors="coerce"
).fillna(0)
df_data["total"] = df_data["price"] * df_data["quantity"]

df_discount = pd.DataFrame(data=discount)
df_discount.head()

In [None]:
# Left join keeps every sale; drinks without a discount row get NaN, which is
# then replaced by a 0 discount.
df_merge = df_data.merge(df_discount, how="left", on="item")
df_merge["discount"] = df_merge["discount"].fillna(0)
df_merge.head()

In [None]:
# Per-row price after discount and the matching discounted revenue.
df_merge["discounted_price"] = df_merge["price"].mul(1 - df_merge["discount"])
df_merge["discounted_total"] = df_merge["discounted_price"].mul(df_merge["quantity"])

# Revenue per drink before and after discounts, with "item" as a regular column.
result = (
    df_merge
    .groupby("item")[["total", "discounted_total"]]
    .sum()
    .reset_index()
)
result.head()