In [69]:
import json
import os
import re
import time
from urllib.parse import quote

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sqlalchemy import create_engine

In [70]:
# Path of the JSON configuration file opened by load_config().
# "~" is expanded to the current user's home directory.
CONFIG_PATH = os.path.expanduser('~/pet-projects/jupyter-notebooks/config.json')

In [71]:

def load_config():
    """Read the notebook configuration from the JSON file at CONFIG_PATH.

    Returns:
        The parsed JSON document, exactly as produced by ``json.load``.

    Raises:
        Exception: if the file does not exist or is not valid JSON. The
            original ``FileNotFoundError`` / ``json.JSONDecodeError`` is
            chained as ``__cause__`` so the path and parse position are
            still visible in the traceback.
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the platform's
        # locale encoding, which differs between machines.
        with open(CONFIG_PATH, 'r', encoding='utf-8') as config_file:
            return json.load(config_file)
    except FileNotFoundError as err:
        raise Exception("config.json file not found") from err
    except json.JSONDecodeError as err:
        raise Exception("Error parsing config.json") from err

In [72]:
config = load_config()
# Connect over the secure native protocol on port 9440.
# The password is percent-encoded before being placed in the URL so that
# characters such as "@", ":", "/" or "%" cannot break the URL structure;
# SQLAlchemy decodes the credential again when it parses the URL.
clickhouse_password = quote(str(config['clickhouse_password']), safe='')
clickhouse_url = f"clickhouse+native://default:{clickhouse_password}@{config['clickhouse_host']}:9440/default?secure=True"
# Both timeouts are forwarded to the driver through connect_args.
engine = create_engine(clickhouse_url, connect_args={'connect_timeout': 10, 'send_receive_timeout': 10})

In [None]:
# Cache settings
cache_file = "vacancies_cache.csv"
cache_duration = 86400  # 24 hours, in seconds

# The cache is used only when the file exists and was last modified
# less than cache_duration seconds ago.
cache_is_fresh = (
    os.path.exists(cache_file)
    and (time.time() - os.path.getmtime(cache_file)) < cache_duration
)

if not cache_is_fresh:
    # Pull the whole table from ClickHouse and write it to the cache file
    query = "SELECT * FROM vacancies_hh_ru"
    df = pd.read_sql(query, engine)
    df.to_csv(cache_file, index=False)
    print("Данные выгружены из ClickHouse и сохранены в кеш")
else:
    # Reuse the cached CSV
    df = pd.read_csv(cache_file)
    print("Данные загружены из кеша")

df.info()

In [None]:
# NOTE(review): chosen_search_text is assigned in a cell further down this
# notebook (as far as this file shows), so that cell must be run before this one.

# Keep only the rows whose search_text is one of the chosen queries
search_mask = df['search_text'].isin(chosen_search_text)
filtered_df = df[search_mask]

# Number of vacancies per city, largest first
area_counts = filtered_df['area'].value_counts()
cities_df = area_counts.reset_index()
cities_df.columns = ['city', 'count']
cities_df = cities_df.sort_values('count', ascending=False)

# Only the 15 largest cities are charted; copy so a column can be added later
top_cities = cities_df.head(15).copy()

# Horizontal bars coloured by count, with the count shown as a text label
fig = px.bar(
    top_cities,
    x='count',
    y='city',
    orientation='h',
    text='count',
    color='count',
    color_continuous_scale='Viridis',
    title=f'Top 15 Cities by Number of Vacancies, {", ".join(chosen_search_text)}',
    labels={'count': 'Number of Vacancies', 'city': 'City'},
)

# Place the value labels outside the bars
fig.update_traces(texttemplate='%{text}', textposition='outside')

# Figure size, axis ordering/titles and text sizing
fig.update_layout(
    height=600,
    width=900,
    yaxis=dict(categoryorder='total ascending'),
    xaxis_title="Number of Vacancies",
    yaxis_title="City",
    font={'size': 12},
    uniformtext_minsize=8,   # smallest font size allowed for the labels
    uniformtext_mode='hide',  # labels that do not fit are hidden
)

fig.show()

# Share of each city relative to the sum over the charted (top-15) cities, in percent
top15_total = top_cities['count'].sum()
top_cities['percentage'] = (top_cities['count'] / top15_total * 100).round(2)
print("Percentage distribution of vacancies by city (top-15):")
print(top_cities[['city', 'count', 'percentage']])

In [None]:
# Show the distinct values of the search_text column
unique_search_texts = df['search_text'].unique()
print(f"Unique search texts: {unique_search_texts}")

In [115]:
# Search queries to analyse; the chart cells filter df by this list.
# NOTE(review): the first cell that reads chosen_search_text appears before this
# assignment in the file, so on a fresh kernel this cell has to be run first —
# consider moving it above the first chart cell.
chosen_search_text = ['CPO', 'Chief Product Officer']

In [None]:
# Distribution of professional roles for the chosen search queries
TOP_ROLES_COUNT = 15
PLOT_HEIGHT = 900
PLOT_WIDTH = 1200
FONT_SIZE = 12

# Keep only the rows whose search_text is one of the chosen queries
filtered_df = df[df['search_text'].isin(chosen_search_text)]

# Flatten the comma-separated role strings of all matching rows into one list
all_roles = []
for roles in filtered_df['professional_role_names'].dropna():
    # Anything that is not a non-empty string is skipped
    if not isinstance(roles, str) or not roles:
        continue
    all_roles.extend(part.strip() for part in roles.split(','))

# Occurrences per role, then the TOP_ROLES_COUNT most frequent ones
roles_count = pd.Series(all_roles).value_counts().reset_index()
roles_count.columns = ['professional_role_names', 'count']
roles_sorted = roles_count.sort_values('count', ascending=False)
top_roles = roles_sorted.head(TOP_ROLES_COUNT)

# One horizontal bar trace for all chosen queries combined
role_bars = go.Bar(
    x=top_roles['count'],
    y=top_roles['professional_role_names'],
    orientation='h',
    text=top_roles['count'],
    marker={'color': top_roles['count'], 'colorscale': 'Viridis'},
)
fig = go.Figure()
fig.add_trace(role_bars)

# Figure size, titles, fonts and a reversed-autorange y axis
fig.update_layout(
    height=PLOT_HEIGHT,
    width=PLOT_WIDTH,
    title=f"Distribution of professional roles for: {', '.join(chosen_search_text)}",
    xaxis_title="Number of Vacancies",
    yaxis_title="Professional Role",
    font={'size': FONT_SIZE},
    yaxis={
        'autorange': "reversed",
        'tickfont': {'size': FONT_SIZE},
        'automargin': True,
    },
    bargap=0.1,
)

# Put the count labels outside the bars
fig.update_traces(textposition='outside')

# Render the chart; the image-export button is configured for a PNG at scale 3
fig.show(config=dict(
    scrollZoom=False,
    toImageButtonOptions=dict(
        format='png',
        filename='roles_by_search_chart',
        scale=3,
    ),
))

In [None]:
# Top professional roles for the chosen search queries
TOP_ROLES_COUNT = 15
PLOT_HEIGHT = 600
PLOT_WIDTH = 900
FONT_SIZE = 12

# Keep only the rows whose search_text is one of the chosen queries
search_mask = df['search_text'].isin(chosen_search_text)
filtered_df = df[search_mask]

# Flatten the comma-separated role strings into a single list of trimmed names
all_roles = []
for roles in filtered_df['professional_role_names'].dropna():
    # Only non-empty strings are split
    if isinstance(roles, str) and roles:
        all_roles.extend(map(str.strip, roles.split(',')))

# Count each role and order the table by frequency, largest first
role_counts = pd.DataFrame(all_roles, columns=['role']).value_counts()
roles_df = role_counts.reset_index()
roles_df.columns = ['role', 'count']
roles_df = roles_df.sort_values('count', ascending=False)

# The TOP_ROLES_COUNT most frequent roles
top_roles = roles_df.head(TOP_ROLES_COUNT)

# Horizontal bars coloured by count, with the count shown as a text label
fig = px.bar(
    top_roles,
    x='count',
    y='role',
    orientation='h',
    text='count',
    color='count',
    color_continuous_scale='Viridis',
    title=f'Top {TOP_ROLES_COUNT} Professional Roles for: {", ".join(chosen_search_text)}',
    labels={'count': 'Number of Vacancies', 'role': 'Professional Role'},
)

# Figure size, axis ordering/titles and font size
fig.update_layout(
    height=PLOT_HEIGHT,
    width=PLOT_WIDTH,
    yaxis=dict(categoryorder='total ascending'),
    xaxis_title="Number of Vacancies",
    yaxis_title="Professional Role",
    font={'size': FONT_SIZE},
)

fig.show()

In [None]:
# Top companies by number of vacancies: chart settings
TOP_COMPANIES_COUNT = 15
PLOT_HEIGHT = 600
PLOT_WIDTH = 1000
FONT_SIZE = 12
MAX_COMPANY_NAME_LENGTH = 50

# Keep only the rows whose search_text is one of the chosen queries
search_mask = df['search_text'].isin(chosen_search_text)
filtered_df = df[search_mask]

# Number of vacancies per company, largest first
company_counts = filtered_df['company_name'].value_counts()
companies_df = company_counts.reset_index()
companies_df.columns = ['company', 'count']
companies_df = companies_df.sort_values('count', ascending=False)

# Format long company names
def format_company_name(name, max_length=None):
    """Shorten or wrap a company name so it fits on a chart axis label.

    Names of at most ``max_length`` characters are returned unchanged. Longer
    names are broken into two lines with ``<br>`` at the space closest to the
    middle of the string. A long name without any space is cut to
    ``max_length`` characters, the last three being ``...``.

    Args:
        name: Company name. Non-string values (e.g. NaN/None) are returned as-is.
        max_length: Length threshold; defaults to MAX_COMPANY_NAME_LENGTH.

    Returns:
        The formatted label (or ``name`` itself when no formatting is needed).
    """
    if max_length is None:
        max_length = MAX_COMPANY_NAME_LENGTH
    if not isinstance(name, str) or len(name) <= max_length:
        return name

    middle = len(name) // 2
    space_before = name.rfind(' ', 0, middle)  # last space strictly before the middle, or -1
    space_after = name.find(' ', middle)       # first space at/after the middle, or -1

    if space_before == -1 and space_after == -1:
        # Nothing to break on: truncate with an ellipsis
        return name[:max_length - 3] + '...'

    if space_before == -1:
        split_point = space_after
    elif space_after == -1:
        split_point = space_before
    else:
        # Choose whichever space is closer to the middle so the two lines are
        # as balanced as possible; on a tie keep the earlier one.
        if middle - space_before <= space_after - middle:
            split_point = space_before
        else:
            split_point = space_after

    # The space itself is replaced by the line break
    return name[:split_point] + '<br>' + name[split_point + 1:]

# Add the display label for every company, then keep the first TOP_COMPANIES_COUNT rows
formatted_names = companies_df['company'].apply(format_company_name)
companies_df['formatted_company'] = formatted_names
top_companies = companies_df.head(TOP_COMPANIES_COUNT)

# Horizontal bars coloured by count, with the count shown as a text label
fig = px.bar(
    top_companies,
    x='count',
    y='formatted_company',
    orientation='h',
    text='count',
    color='count',
    color_continuous_scale='Viridis',
    title=f'Top {TOP_COMPANIES_COUNT} Companies by Number of Vacancies for: {", ".join(chosen_search_text)}',
    labels={'count': 'Number of Vacancies', 'formatted_company': 'Company'},
)

# Place the value labels outside the bars
fig.update_traces(texttemplate='%{text}', textposition='outside')

# Figure size, axis ordering/titles and text sizing
fig.update_layout(
    height=PLOT_HEIGHT,
    width=PLOT_WIDTH,
    yaxis=dict(categoryorder='total ascending'),
    xaxis_title="Number of Vacancies",
    yaxis_title="Company",
    font={'size': FONT_SIZE},
    uniformtext_minsize=8,
    uniformtext_mode='hide',
)

# Render the chart; the image-export button uses scale 3 and a fixed file name
fig.show(config=dict(
    toImageButtonOptions=dict(scale=3, filename="top_companies_vacancies"),
))

In [None]:
# Constants for plot layout
PLOT_HEIGHT = 900
PLOT_WIDTH = 1200
FONT_SIZE = 12

# Filter data by selected search terms; copy so the new column does not touch df
filtered_df = df[df['search_text'].isin(chosen_search_text)].copy()

# Turn the comma-separated key_skills string into a list of trimmed skill names.
# Blank tokens (from an empty string or a trailing comma) are dropped so they are
# not counted as a skill; missing values become an empty list.
filtered_df['skill_list'] = filtered_df['key_skills'].apply(
    lambda x: [skill.strip() for skill in x.split(',') if skill.strip()] if pd.notnull(x) else []
)
# One row per (vacancy, skill)
df_exploded = filtered_df.explode('skill_list')

# Skill frequency per professional_role_names value.
# NOTE(review): the role column is grouped as a whole string here, whereas the
# role-distribution cells split it on commas — confirm this is intended.
role_skills = df_exploded.groupby(['professional_role_names', 'skill_list']).size().reset_index(name='frequency')

# Unique roles for the dropdown. Missing roles are removed first: sorted() cannot
# compare NaN with strings, and groupby drops NaN keys by default, so such a
# role would have no data to show anyway.
roles = sorted(df_exploded['professional_role_names'].dropna().unique())

# Aggregate over all roles for the "all roles" option: top-20 skills overall
all_roles_data = df_exploded.groupby('skill_list').size().reset_index(name='frequency')
all_roles_data = all_roles_data.sort_values('frequency', ascending=False).head(20)

# Create plot
fig = go.Figure()

# Trace 0: all roles combined, visible initially
fig.add_trace(go.Bar(
    x=all_roles_data['frequency'],
    y=all_roles_data['skill_list'],
    visible=True,
    name="Все роли",
    orientation='h',
    text=all_roles_data['frequency'],
    marker=dict(color=all_roles_data['frequency'], colorscale='Viridis')
))

# Traces 1..len(roles): one per role, hidden until selected from the dropdown
for role in roles:
    # Rows of this role, reduced to its 20 most frequent skills
    role_data = role_skills[role_skills['professional_role_names'] == role]
    top_skills = role_data.sort_values('frequency', ascending=False).head(20)

    fig.add_trace(go.Bar(
        x=top_skills['frequency'],
        y=top_skills['skill_list'],
        visible=False,
        name=role,
        orientation='h',
        text=top_skills['frequency'],
        marker=dict(color=top_skills['frequency'], colorscale='Viridis')
    ))

# Create buttons for role selection
buttons = []
# Button for the combined "all roles" trace (trace 0)
visibility = [True] + [False] * len(roles)
buttons.append(dict(
    label="Все роли",
    method="update",
    args=[{"visible": visibility},
          {"title": f"Top-20 Skills Across All Roles (Search: {', '.join(chosen_search_text)})"}]
))

# One button per role
for i, role in enumerate(roles):
    visibility = [False] * (len(roles) + 1)  # +1 for the combined trace
    visibility[i + 1] = True  # role traces start at index 1, after the combined trace

    buttons.append(dict(
        label=role,
        method="update",
        args=[{"visible": visibility},
              {"title": f"Top-20 Skills for Role: {role} (Search: {', '.join(chosen_search_text)})"}]
    ))

# Configure layout: dropdown in the top-right corner, sizes, titles, fonts
fig.update_layout(
    updatemenus=[dict(
        active=0,
        buttons=buttons,
        x=1.0,
        y=1.15,
        xanchor='right',
        yanchor='top',
        direction="down"
    )],
    height=PLOT_HEIGHT,
    width=PLOT_WIDTH,
    title=f"Top-20 Skills Across All Roles (Search: {', '.join(chosen_search_text)})",
    xaxis_title="Frequency",
    yaxis_title="Skill",
    font=dict(size=FONT_SIZE),
    yaxis=dict(
        autorange="reversed",
        tickfont=dict(size=FONT_SIZE),
        automargin=True
    ),
    bargap=0.1
)

# Display the frequency labels outside the bars
fig.update_traces(textposition='outside')

# Show chart; the image-export button is configured for a PNG at scale 3
fig.show(config={
    'scrollZoom': False,
    'toImageButtonOptions': {
        'format': 'png',
        'filename': 'skills_frequency_chart',
        'scale': 3
    }
})

In [None]:
# Filter dataframe by chosen search text.
# NOTE(review): unlike the other cells, this one matches the terms against
# df['name'] (case-insensitive substring) rather than df['search_text'] —
# confirm this is intended.
filtered_df = df
if chosen_search_text:
    # Escape every term so regex metacharacters in it (e.g. "+", ".", "(")
    # are matched literally instead of raising or matching too much.
    title_pattern = '|'.join(re.escape(text) for text in chosen_search_text)
    filtered_df = df[df['name'].str.contains(title_pattern, case=False, na=False)]

# Per currency: smallest positive salary_from and largest positive salary_to.
# A group with no positive value yields 0 for that column.
salary_range = filtered_df.groupby('salary_currency').agg({
    'salary_from': lambda x: x[x > 0].min() if len(x[x > 0]) > 0 else 0,  # Min non-zero salary_from
    'salary_to': lambda x: x[x > 0].max() if len(x[x > 0]) > 0 else 0     # Max non-zero salary_to
}).reset_index()

# Build long-format rows (one "Min" and one "Max" row per currency) for plotting
plot_data = []
for _, row in salary_range.iterrows():
    currency = row['salary_currency']
    # Fall back to the other bound when one of them is missing (0)
    min_salary = row['salary_from'] if row['salary_from'] > 0 else row['salary_to']
    max_salary = row['salary_to'] if row['salary_to'] > 0 else row['salary_from']

    # Add both min and max as separate rows for the same currency
    if min_salary > 0:
        plot_data.append({'salary_currency': currency, 'value': min_salary, 'type': 'Min'})
    if max_salary > 0:
        plot_data.append({'salary_currency': currency, 'value': max_salary, 'type': 'Max'})

# Name the columns explicitly so the frame has them even when plot_data is empty
plot_df = pd.DataFrame(plot_data, columns=['salary_currency', 'value', 'type'])

if plot_df.empty:
    # Nothing to draw: no matching vacancy has a positive salary bound
    print("No salary data available for the chosen search texts")
else:
    # Create grouped bar chart showing min and max salaries
    fig = px.bar(plot_df,
                 x='salary_currency',
                 y='value',
                 color='type',
                 barmode='group',
                 title=f'Salary Range by Currency (Min and Max) (Search: {", ".join(chosen_search_text)})',
                 labels={'salary_currency': 'Currency',
                         'value': 'Salary Value',
                         'type': 'Salary Type'},
                 text_auto='.0f',  # Text labels with no decimal places
                 color_discrete_map={'Min': 'lightblue', 'Max': 'darkblue'})

    # Update layout
    fig.update_layout(
        height=500,
        yaxis_type="log",  # Logarithmic y axis
    )

    # Put the value labels inside the bars
    fig.update_traces(textposition='inside')

    # Show the plot; the image-export button is configured for a PNG at scale 3
    fig.show(config={
        'scrollZoom': False,
        'toImageButtonOptions': {
            'format': 'png',
            'filename': 'salary_range_by_currency',
            'scale': 3
        }
    })