# Section 3 Descriptive statistics

In [58]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [5]:
# Read the salary workbook into a DataFrame and preview its first five rows.
salaries = pd.read_excel(io="salaries_data.xlsx")
salaries.head(n=5)


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


## 6. Categorical variables

### Task 1: Top 5 Most Common Job Titles
Goal: Show how job roles are distributed.

Chart: Bar chart.

In [17]:
# Count the rows recorded for every job title, most frequent title first.
grouped_by_job_title = (
    salaries
    .groupby("job_title")
    .size()
    .sort_values(ascending=False)
)

# Keep the five largest groups as a two-column table
# (job_title, employees_count) ready for plotting.
five_most_common_jobs = grouped_by_job_title.head(5).reset_index(name="employees_count")

five_most_common_jobs


Unnamed: 0,job_title,employees_count
0,Data Engineer,1040
1,Data Scientist,840
2,Data Analyst,612
3,Machine Learning Engineer,289
4,Analytics Engineer,103


In [24]:
# Bar chart of the five most frequent job titles and their row counts.
fig = px.bar(
    five_most_common_jobs,
    x="job_title",
    y="employees_count",
    title="Top 5 Most Common Job Titles",
    # Readable axis titles shown in place of the raw column names
    # (the y-axis label previously contained the typo "Emplyees").
    labels={"job_title": "Job Title", "employees_count": "The Number of Employees"},
    template="ggplot2",
)

fig.show()

#### Conclusion

The bar chart shows the five most common jobs in our sample: Data Engineer, Data Scientist, Data Analyst, Machine Learning Engineer, and Analytics Engineer. Among them, Data Engineer is the most common role with 1,040 employees, while Analytics Engineer is the least common in this top group, with 103 employees.

### Task 2: Pie Chart of Employment Types
Goal: Show the distribution of contract types.

Chart: Pie chart.

In [26]:
# Number of rows per employment type, returned as a table whose count
# column is called "frequency".
grouped_by_employment_type = (
    salaries
    .groupby("employment_type")
    .size()
    .reset_index(name="frequency")
)
grouped_by_employment_type

Unnamed: 0,employment_type,frequency
0,CT,10
1,FL,10
2,FT,3718
3,PT,17


In [28]:
# Total number of rows across all employment types (denominator of the share).
total_frequency = grouped_by_employment_type["frequency"].sum()

# Percentage share of each employment type, rounded to two decimals.
# Whole-column arithmetic replaces the element-wise ``apply``; ``assign``
# builds a new frame instead of mutating the existing one, so re-running
# the cell always produces the same table.
grouped_by_employment_type = grouped_by_employment_type.assign(
    relative_frequency=(
        grouped_by_employment_type["frequency"] / total_frequency * 100
    ).round(2)
)
grouped_by_employment_type

Unnamed: 0,employment_type,frequency,relative_frequency
0,CT,10,0.27
1,FL,10,0.27
2,FT,3718,99.01
3,PT,17,0.45


In [30]:
# Display names shown in place of the raw column names.
contract_type_labels = {
    "employment_type": "Contract Type",
    "relative_frequency": "Part-to-whole Relationship",
}

# Pie chart with one slice per employment type, sized by its percentage share.
fig = px.pie(
    data_frame=grouped_by_employment_type,
    names="employment_type",
    values="relative_frequency",
    title="The distribution of contract types",
    labels=contract_type_labels,
    template="ggplot2",
)

fig.show()

#### Conclusion

The pie chart reveals that the vast majority of employees (99%) work under full-time (FT) contracts. Other contract types represent only a very small and almost evenly distributed share, making their impact on the overall workforce negligible.

### Task 3: Pareto Chart of Total Salary by Job Title
Goal: See which job titles contribute most to overall salary.

Chart: Bar chart + cumulative line.

In [None]:
# Grand total of the USD salary column across every row of the dataset.
overal_salary = salaries.loc[:, "salary_in_usd"].sum()
overal_salary

np.int64(516576814)

In [56]:
# Total USD salary per job title (Series named "total_salary").
grouped_by_job_title = (
    salaries
    .groupby("job_title")["salary_in_usd"]
    .sum()
    .rename("total_salary")
)

# Largest totals first, extended with the running total and the running
# total's share of the overall salary, rounded to a whole percent.
sorted_jobs_by_salary = (
    grouped_by_job_title
    .sort_values(ascending=False)
    .reset_index()
    .assign(
        comulative_salary=lambda jobs: jobs["total_salary"].cumsum(),
        comulative_salary_percentage=lambda jobs: (
            jobs["comulative_salary"] / overal_salary * 100
        ).round(0),
    )
)

# The ten job titles with the largest salary totals.
sample = sorted_jobs_by_salary.head(10)

In [None]:
# Pareto chart: bars show the total salary per job title, the line shows the
# cumulative share of the overall salary.

# 1. Create a figure with two Y axes. `specs` is a 2-D list with one dict per
#    subplot cell; `secondary_y: True` enables a second y-axis for that cell.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add the bar chart on the primary Y axis.
fig.add_trace(
    go.Bar(
        name="Total Salary by Job Title",
        x=sample["job_title"],
        y=sample["total_salary"],
    ),
    secondary_y=False,
)

# Add the cumulative line on the secondary Y axis.
fig.add_trace(
    go.Scatter(
        name="Cumulative Salary Percentage",
        x=sample["job_title"],
        # The column key keeps its original spelling; only the labels shown
        # on the chart were corrected.
        y=sample["comulative_salary_percentage"],
        mode="lines+markers",  # line + points
        line=dict(color="crimson"),
    ),
    secondary_y=True,
)

# Layout
fig.update_layout(
    width=1000,  # width in pixels
    height=600,
    title="Pareto Diagram: Total Salary vs Cumulative %",
    xaxis_title="Job Title",
    template="ggplot2",
)

# Y-axis titles (spelling now matches the "Cumulative %" used in the chart title).
fig.update_yaxes(title_text="Total Salary", secondary_y=False)
fig.update_yaxes(title_text="Cumulative %", secondary_y=True)

fig.show()

#### Conclusion


The Pareto diagram shows that the total salaries of the first two job titles are significantly higher than the others, largely due to the higher number of specialists in these roles. After the first four job titles, the differences in total salaries become less drastic. The cumulative line indicates that the top two jobs account for 52% of the total salary amount, while the top four jobs contribute 73% of all salaries combined.