In [47]:
import pandas as pd
import time
import os
import re
import json
import requests
from datetime import datetime, timedelta
from tqdm import tqdm 
from concurrent.futures import ThreadPoolExecutor, as_completed
import plotly.express as px
import plotly.graph_objects as go

In [48]:
# Root folder of the notebook project (the trailing slash is part of the value).
BASE_DIR = os.path.expanduser('~/pet-projects/jupyter-notebooks/')
# JSON file read by load_config().
CONFIG_PATH = os.path.join(BASE_DIR, 'config.json')
# JSON file used by save_token() / load_token() to cache the access token.
TOKEN_PATH = os.path.join(BASE_DIR, 'token.json')

In [49]:

def load_config():
    """Load configuration from config.json.

    Returns:
        The parsed JSON content of the file at CONFIG_PATH.

    Raises:
        Exception: if the file does not exist or is not valid JSON. The
            original error is attached as the cause so the traceback still
            shows the path / parse position.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(CONFIG_PATH, 'r', encoding='utf-8') as config_file:
            return json.load(config_file)
    except FileNotFoundError as err:
        raise Exception("config.json file not found") from err
    except json.JSONDecodeError as err:
        raise Exception("Error parsing config.json") from err

def save_token(token_data):
    """Save token to file.

    Writes a copy of ``token_data`` with a ``saved_at`` ISO timestamp (current
    local time) to TOKEN_PATH as JSON. The caller's dict is left untouched.

    Args:
        token_data: mapping with the token fields to persist.
    """
    # Copy first so the timestamp is not injected into the caller's object.
    record = dict(token_data)
    record['saved_at'] = datetime.now().isoformat()
    with open(TOKEN_PATH, 'w', encoding='utf-8') as token_file:
        json.dump(record, token_file)

def load_token():
    """Load existing token.

    Returns:
        The cached ``access_token`` string when the file at TOKEN_PATH exists,
        is well-formed and was saved less than one day ago; otherwise None.
    """
    try:
        with open(TOKEN_PATH, 'r', encoding='utf-8') as token_file:
            token_data = json.load(token_file)
        saved_at = datetime.fromisoformat(token_data['saved_at'])
        # Check if token has expired (we store for 1 day)
        if datetime.now() - saved_at < timedelta(days=1):
            return token_data['access_token']
    except (FileNotFoundError, json.JSONDecodeError, KeyError, ValueError, TypeError):
        # Missing file, broken JSON, missing keys, an unparsable timestamp or a
        # timestamp that cannot be compared with now(): treat all of these as
        # "no usable cached token" so the caller requests a fresh one.
        pass
    return None

def get_access_token(client_id, client_secret, timeout=30):
    """Get access token from HH.ru API.

    A token cached by save_token() is reused when load_token() returns one;
    otherwise a new token is requested with the client-credentials grant and
    cached.

    Args:
        client_id: OAuth client id.
        client_secret: OAuth client secret.
        timeout: seconds to wait for the token endpoint before giving up.

    Returns:
        The access token string.

    Raises:
        Exception: on a non-200 response or when the response carries no
            ``access_token`` field.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    # First try to load existing token
    existing_token = load_token()
    if existing_token:
        return existing_token

    # If token not found or expired, request new one
    token_url = 'https://hh.ru/oauth/token'
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    data = {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret
    }

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.post(token_url, headers=headers, data=data, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Error getting token: {response.status_code}, {response.text}")

    access_token = response.json().get('access_token')
    if not access_token:
        raise Exception("Error getting token: response contains no access_token")

    # save_token() stamps the record with 'saved_at' itself.
    save_token({'access_token': access_token})
    return access_token


def create_api_client():
    """Create API client with loaded credentials.

    Reads ``client_id`` and ``client_secret`` from the configuration and
    obtains an access token. The ``HH-User-Agent`` header can be overridden
    with an optional ``user_agent`` entry in the configuration; without it the
    previous placeholder value is sent.

    Returns:
        dict with ``headers`` (authorization and user-agent headers) and
        ``base_url`` for building request URLs.

    Raises:
        Exception: if the configuration cannot be loaded, credentials are
            missing, or the token cannot be obtained.
    """
    config = load_config()

    client_id = config.get('client_id')
    client_secret = config.get('client_secret')

    if not client_id or not client_secret:
        raise Exception("client_id or client_secret missing in config.json")

    access_token = get_access_token(client_id, client_secret)

    # Optional override; falls back to the original placeholder when absent/empty.
    user_agent = config.get('user_agent') or 'Your_App_Name (your@email.com)'

    return {
        'headers': {
            'Authorization': f'Bearer {access_token}',
            'HH-User-Agent': user_agent
        },
        'base_url': 'https://api.hh.ru'
    }

def get_vacancy_details(api_client, vacancy_id, timeout=30):
    """Get detailed information about a specific vacancy.

    Args:
        api_client: dict with ``base_url`` and ``headers`` as returned by
            create_api_client().
        vacancy_id: identifier appended to the ``/vacancies/`` endpoint.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: if the response status is not 200.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    # A timeout keeps one stalled request from blocking a worker indefinitely.
    response = requests.get(
        f"{api_client['base_url']}/vacancies/{vacancy_id}",
        headers=api_client['headers'],
        timeout=timeout
    )

    if response.status_code != 200:
        raise Exception(f"Error getting vacancy details: {response.status_code}, {response.text}")

    return response.json()

def get_all_vacancies(api_client, search_text, max_pages=None, timeout=30):
    """Get all vacancies with pagination.

    Pages are requested one by one until a page comes back empty, the page
    count reported by the response is reached, ``max_pages`` is reached, or a
    request fails with a non-200 status (the vacancies collected so far are
    still returned in that case).

    Side effect: the raw JSON of the first three pages is written to BASE_DIR
    as ``vacancies_raw_page_<page>_<timestamp>.json``.

    Args:
        api_client: dict with ``base_url`` and ``headers`` as returned by
            create_api_client().
        search_text: query text; matched against the vacancy name only.
        max_pages: optional upper bound on the number of pages to fetch.
        timeout: seconds to wait for each request before giving up.

    Returns:
        List of vacancy dicts taken from the ``items`` field of each page.
    """
    all_vacancies = []
    page = 0
    per_page = 100  # Maximum number of vacancies per page

    while True:
        response = requests.get(
            f"{api_client['base_url']}/vacancies",
            headers=api_client['headers'],
            params={
                'text': search_text,
                'per_page': per_page,
                # Optional filters, disabled by default:
                # 'area': 1,  # Moscow (can be removed or changed)
                # 'only_with_salary': True,  # only with specified salary
                'search_field': ['name'],  # Search in the vacancy name only
                'order_by': 'publication_time',  # Sort by publication date
                'page': page
            },
            timeout=timeout
        )

        if response.status_code != 200:
            print(f"Error getting page {page}: {response.status_code}")
            break

        data = response.json()
        vacancies = data.get('items') or []
        if not vacancies:
            break

        # Save first 3 pages of raw JSON response
        if page < 3:
            filename = f"vacancies_raw_page_{page}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            filepath = os.path.join(BASE_DIR, filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            print(f"Saved raw JSON for page {page} to {filename}")

        all_vacancies.extend(vacancies)
        print(f"Loaded page {page}, received {len(vacancies)} vacancies")

        page += 1
        if max_pages and page >= max_pages:
            break

        # Stop at the last page instead of requesting one more and hitting an
        # error. NOTE(review): assumes the response's 'pages' field is the
        # total page count for this query — confirm against the API docs.
        total_pages = data.get('pages')
        if total_pages is not None and page >= total_pages:
            break

    return all_vacancies

def _flatten_vacancy(vacancy):
    """Flatten one raw vacancy dict into a single-level record."""
    # Nested sections may be missing or null; substitute empty containers.
    salary = vacancy.get('salary') or {}
    area = vacancy.get('area') or {}
    snippet = vacancy.get('snippet') or {}
    address = vacancy.get('address') or {}
    employer = vacancy.get('employer') or {}
    experience = vacancy.get('experience') or {}
    work_format_items = vacancy.get('work_format') or []
    roles = vacancy.get('professional_roles') or []

    # A vacancy can list several work formats; join their names.
    work_format_label = ', '.join([item.get('name', '') for item in work_format_items])

    # Professional roles are kept as two parallel comma-joined strings.
    role_id_list = [str(role.get('id')) for role in roles]
    role_name_list = [role.get('name', '') for role in roles]

    return {
        'id': vacancy.get('id'),
        'name': vacancy.get('name'),
        'url': vacancy.get('alternate_url'),
        'salary_from': salary.get('from'),
        'salary_to': salary.get('to'),
        'salary_currency': salary.get('currency'),
        'company_name': employer.get('name'),
        'company_id': employer.get('id'),
        'area': area.get('name'),
        'address': address.get('raw', ''),
        'created_at': vacancy.get('created_at'),
        'requirement': snippet.get('requirement'),
        'responsibility': snippet.get('responsibility'),
        'experience': experience.get('name', ''),
        'work_format': work_format_label,
        'internship': vacancy.get('internship', False),
        'premium': vacancy.get('premium', False),
        'professional_role_ids': ','.join(role_id_list),
        'professional_role_names': ','.join(role_name_list),
    }


def create_vacancies_dataframe(vacancies):
    """Build a flat DataFrame from a list of raw vacancy dicts.

    Records that cannot be flattened are reported with a printed message and
    skipped. Missing salary bounds become 0, a missing currency becomes 'RUR',
    and missing requirement/responsibility snippets become empty strings.

    Raises:
        Exception: if no record could be processed (including empty input).
    """
    rows = []
    for vacancy in vacancies:
        try:
            rows.append(_flatten_vacancy(vacancy))
        except Exception as e:
            print(f"Error processing vacancy {vacancy.get('id', 'Unknown ID')}: {str(e)}")

    if len(rows) == 0:
        raise Exception("Failed to process any vacancies")

    df = pd.DataFrame(rows)

    # Replace missing values column by column, in a fixed order.
    fill_defaults = (
        ('salary_from', 0),
        ('salary_to', 0),
        ('salary_currency', 'RUR'),
        ('requirement', ''),
        ('responsibility', ''),
    )
    for column, default in fill_defaults:
        df[column] = df[column].fillna(default)

    return df

In [None]:
# Queries to run; the results of all of them are combined into one DataFrame.
search_queries = ['CPO', 'Chief Product Owner']

try:
    api_client = create_api_client()

    # One DataFrame per query. `search_text` keeps the last query after the
    # loop, as before.
    query_frames = []
    for search_text in search_queries:
        # max_pages=5000 is effectively "no limit".
        vacancies = get_all_vacancies(api_client, search_text, max_pages=5000)
        query_frames.append(create_vacancies_dataframe(vacancies))

    # A vacancy can match several queries: keep a single row per id so that
    # counts and the later merge on 'id' are not inflated.
    df = (
        pd.concat(query_frames, ignore_index=True)
        .drop_duplicates(subset='id', ignore_index=True)
    )

    print(f"Всего получено вакансий: {len(df)}")

except Exception as e:
    print(f"Произошла ошибка: {str(e)}")
    # Re-raise: continuing would either hit a NameError on `df` or silently
    # reuse a stale `df` left over from an earlier run.
    raise

df.info()

In [None]:
def get_vacancy_details_with_retry(api_client, vacancy_id, max_retries=3):
    """Fetch the details of one vacancy, retrying on any error.

    Up to ``max_retries`` attempts are made with a one-second pause between
    them. When the final attempt fails a message is printed and None is
    returned; None is also returned when ``max_retries`` allows no attempt.
    """
    final_attempt = max_retries - 1
    attempt = 0
    while attempt < max_retries:
        try:
            return get_vacancy_details(api_client, vacancy_id)
        except Exception as error:
            if attempt == final_attempt:
                print(f"Failed to get details for vacancy {vacancy_id} after {max_retries} attempts: {error}")
                return None
            # Small delay before the next attempt
            time.sleep(1)
        attempt += 1

def process_vacancy_details(vacancy_id):
    """Worker used for parallel fetching: return the extra fields of one vacancy.

    Uses the notebook-level ``api_client``. Returns a dict with ``id``,
    ``description`` and ``key_skills`` (list of skill names), or None when the
    details could not be fetched or processed (a message is printed on error).
    """
    try:
        details = get_vacancy_details_with_retry(api_client, vacancy_id)
        if not details:
            return None

        record = {'id': vacancy_id}  # Keep the vacancy ID for the later merge
        record['description'] = details.get('description', '')
        record['key_skills'] = [skill.get('name') for skill in details.get('key_skills', [])]
        return record
    except Exception as error:
        print(f"Error processing vacancy {vacancy_id}: {error}")
        return None

print("Getting detailed information about vacancies...")

# Unique vacancy IDs only: a repeated id would be fetched twice and would
# multiply rows in the merge below.
vacancy_ids = df['id'].drop_duplicates().tolist()
total_vacancies = len(vacancy_ids)

# Fetch details in parallel; the progress bar is closed even if a worker raises.
details_list = []
with ThreadPoolExecutor(max_workers=10) as executor:
    future_to_id = {executor.submit(process_vacancy_details, vid): vid for vid in vacancy_ids}

    with tqdm(total=total_vacancies, desc="Processing vacancies") as progress_bar:
        for future in as_completed(future_to_id):
            result = future.result()
            if result:
                details_list.append(result)
            progress_bar.update(1)

# Explicit columns keep the merge working when no details were retrieved.
details_df = pd.DataFrame(details_list, columns=['id', 'description', 'key_skills'])

# Merge with main DataFrame using vacancy ID. Columns left by a previous run of
# this cell are dropped first so re-running does not create *_x / *_y columns.
df = (
    df.drop(columns=['description', 'key_skills'], errors='ignore')
    .merge(details_df, on='id', how='left')
)

# Fill empty values; skill lists are stored as one comma-separated string.
df['description'] = df['description'].fillna('')
df['key_skills'] = df['key_skills'].fillna('').apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

print(f"Details retrieved for {len(details_list)} out of {total_vacancies} vacancies")
print("\nUpdated DataFrame structure:")
df.info()

In [61]:
# Debug switch: set to True to dump the full detail JSON of a single vacancy
# into BASE_DIR.
debug_one_vacancy = False
if debug_one_vacancy:
    # Named explicitly so the builtin id() is not shadowed in the notebook namespace.
    debug_vacancy_id = 117803714
    details = get_vacancy_details_with_retry(api_client, debug_vacancy_id)
    if details is None:
        # The retry helper returns None on failure; do not write a 'null' file.
        print(f"No details available for vacancy {debug_vacancy_id}")
    else:
        filename = f"vacancy_detail_{debug_vacancy_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        filepath = os.path.join(BASE_DIR, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(details, f, ensure_ascii=False, indent=2)

In [None]:
# Preview the first 100 rows of the vacancies DataFrame
df.head(100)

In [None]:
# Number of vacancies per company, largest first
(
    df.groupby('company_name')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

In [64]:
# Build the file name from the current search text and a timestamp
query_slug = search_text.lower().replace(' ', '_')
export_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"vacancies_{query_slug}_{export_stamp}.csv"
filepath = os.path.join(BASE_DIR, filename)

# Save the DataFrame
df.to_csv(filepath, encoding='utf-8', index=False)

In [None]:
# Distribution of professional roles across the collected vacancies
TOP_ROLES_COUNT = 15
PLOT_HEIGHT = 600
PLOT_WIDTH = 900
FONT_SIZE = 12

# 'professional_role_names' holds comma-joined role names; flatten them into
# one list of individual names.
# NOTE(review): a role name that itself contains a comma is split into
# several pieces here — check against the actual role names.
all_roles = [
    piece.strip()
    for joined_names in df['professional_role_names'].dropna()
    if isinstance(joined_names, str) and joined_names
    for piece in joined_names.split(',')
]

# Count occurrences of each role, most frequent first
role_frame = pd.DataFrame(all_roles, columns=['role'])
roles_df = role_frame.value_counts().reset_index()
roles_df.columns = ['role', 'count']
roles_df = roles_df.sort_values('count', ascending=False)

# Keep only the most frequent roles for the chart
top_roles = roles_df.head(TOP_ROLES_COUNT)

fig = px.bar(
    top_roles,
    x='count',
    y='role',
    orientation='h',
    title=f'Top {TOP_ROLES_COUNT} Professional Roles',
    labels={'count': 'Number of Vacancies', 'role': 'Professional Role'},
    color='count',
    color_continuous_scale='Viridis',
    text='count',
)

fig.update_layout(
    height=PLOT_HEIGHT,
    width=PLOT_WIDTH,
    yaxis=dict(categoryorder='total ascending'),
    xaxis_title="Number of Vacancies",
    yaxis_title="Professional Role",
    font={'size': FONT_SIZE},
)

fig.show()

In [None]:
# Vacancy count per city (the 'area' column), most frequent first
cities_df = df['area'].value_counts().reset_index()
cities_df.columns = ['city', 'count']
cities_df = cities_df.sort_values('count', ascending=False)

# Only the 15 largest cities are shown; copy so a column can be added below
top_cities = cities_df.head(15).copy()

# Horizontal bars, coloured and labelled by count
fig = px.bar(
    top_cities,
    x='count',
    y='city',
    orientation='h',
    title='Top 15 Cities by Number of Vacancies',
    labels={'count': 'Number of Vacancies', 'city': 'City'},
    color='count',
    color_continuous_scale='Viridis',
    text='count',
)

# Value labels are drawn outside the bars
fig.update_traces(texttemplate='%{text}', textposition='outside')

# Figure size, ordering, axis titles and label-size limits
fig.update_layout(
    height=600,
    width=900,
    yaxis=dict(categoryorder='total ascending'),
    xaxis_title="Number of Vacancies",
    yaxis_title="City",
    font={'size': 12},
    uniformtext_minsize=8,   # smallest allowed label font
    uniformtext_mode='hide',  # drop labels that do not fit
)

fig.show()

# Share of each city within the displayed top-15 (not within all vacancies)
city_share = top_cities['count'] / top_cities['count'].sum()
top_cities['percentage'] = (city_share * 100).round(2)
print("Percentage distribution of vacancies by city (top-15):")
print(top_cities[['city', 'count', 'percentage']])

In [None]:
# Vacancy count per company, most frequent first
companies_df = df['company_name'].value_counts().reset_index()
companies_df.columns = ['company', 'count']
companies_df = companies_df.sort_values('count', ascending=False)

# Only the 15 largest companies are shown; copy so a column can be added below
top_companies = companies_df.head(15).copy()

# Horizontal bars, coloured and labelled by count
fig = px.bar(
    top_companies,
    x='count',
    y='company',
    orientation='h',
    title='Top 15 Companies by Number of Vacancies',
    labels={'count': 'Number of Vacancies', 'company': 'Company'},
    color='count',
    color_continuous_scale='Viridis',
    text='count',
)

# Value labels are drawn outside the bars
fig.update_traces(texttemplate='%{text}', textposition='outside')

# Figure size, ordering, axis titles and label-size limits
fig.update_layout(
    height=600,
    width=1000,
    yaxis=dict(categoryorder='total ascending'),
    xaxis_title="Number of Vacancies",
    yaxis_title="Company",
    font={'size': 12},
    uniformtext_minsize=8,   # smallest allowed label font
    uniformtext_mode='hide',  # drop labels that do not fit
)

fig.show()

# Share of each company within the displayed top-15 (not within all vacancies)
company_share = top_companies['count'] / top_companies['count'].sum()
top_companies['percentage'] = (company_share * 100).round(2)
print("Percentage distribution of vacancies by company (top-15):")
print(top_companies[['company', 'count', 'percentage']])

In [None]:
# Work on a copy so the main DataFrame keeps its original 'created_at' values
filtered_df = df.copy()

# Convert created_at to datetime and calculate vacancy age in days.
# Everything is normalised to UTC so rows with different UTC offsets still
# produce a single datetime column; the age is unaffected by the time zone.
filtered_df['created_at'] = pd.to_datetime(filtered_df['created_at'], utc=True)
now_utc = pd.Timestamp.now(tz='UTC')
filtered_df['vacancy_age_days'] = (now_utc - filtered_df['created_at']).dt.days

# The frame is not filtered by role, so the title describes all vacancies
# rather than naming the role of whichever row happens to come first.
fig = px.histogram(filtered_df,
                  x='vacancy_age_days',
                  nbins=20,
                  title='Distribution of Vacancy Ages (All Collected Vacancies)',
                  labels={'vacancy_age_days': 'Vacancy Age (Days)',
                         'count': 'Number of Vacancies'},
                  color_discrete_sequence=['#636EFA'])

# Update layout
fig.update_layout(
    height=600,
    xaxis_title="Vacancy Age (Days)",
    yaxis_title="Number of Vacancies",
    bargap=0.1
)

fig.show()

# Print some basic statistics
print("\nSummary Statistics:")
print(f"Average vacancy age: {filtered_df['vacancy_age_days'].mean():.1f} days")
print(f"Median vacancy age: {filtered_df['vacancy_age_days'].median():.1f} days")
print(f"Newest vacancy: {filtered_df['vacancy_age_days'].min():.0f} days old")
print(f"Oldest vacancy: {filtered_df['vacancy_age_days'].max():.0f} days old")

In [None]:
def _to_skill_list(value):
    """Return the individual skills stored in one 'key_skills' cell."""
    if isinstance(value, list):
        return [skill for skill in value if skill]
    if isinstance(value, str):
        # The column is stored as skills joined with ', '; undo that join and
        # drop the empty entry produced by vacancies without skills.
        return [skill.strip() for skill in value.split(', ') if skill.strip()]
    return []


# Explode key_skills to get individual skills. The column holds one joined
# string per vacancy, so it has to be split back into a list first —
# exploding the string itself would count whole skill lists, not skills.
skills_df = (
    df.assign(key_skills=df['key_skills'].apply(_to_skill_list))
    .explode('key_skills')
)

# Count frequency of each skill (vacancies without skills are ignored)
skill_counts = skills_df['key_skills'].value_counts().reset_index()
skill_counts.columns = ['skill', 'count']

# Create bar chart of top 20 skills
fig = px.bar(skill_counts.head(20),
             x='skill',
             y='count',
             title='Top 20 Most Common Skills',
             labels={'skill': 'Skill',
                    'count': 'Number of Vacancies'},
             color='count',
             color_continuous_scale='Viridis')

# Update layout for better readability
fig.update_layout(
    xaxis_tickangle=45,
    height=600,
    showlegend=False
)

fig.show()

In [None]:
def _smallest_positive(values):
    """Smallest value above zero in the group, or 0 when there is none."""
    positive = values[values > 0]
    return positive.min() if len(positive) > 0 else 0


def _largest_positive(values):
    """Largest value above zero in the group, or 0 when there is none."""
    positive = values[values > 0]
    return positive.max() if len(positive) > 0 else 0


# Per currency: lowest non-zero 'salary_from' and highest non-zero 'salary_to'
salary_range = (
    df.groupby('salary_currency')
    .agg({'salary_from': _smallest_positive, 'salary_to': _largest_positive})
    .reset_index()
)

# Long-format rows for plotting: one 'Min' and one 'Max' row per currency
plot_data = []
for currency, lowest_from, highest_to in zip(
    salary_range['salary_currency'],
    salary_range['salary_from'],
    salary_range['salary_to'],
):
    # When one bound is absent (0), fall back to the other one
    min_salary = lowest_from if lowest_from > 0 else highest_to
    max_salary = highest_to if highest_to > 0 else lowest_from

    if min_salary > 0:
        plot_data.append({'salary_currency': currency, 'value': min_salary, 'type': 'Min'})
    if max_salary > 0:
        plot_data.append({'salary_currency': currency, 'value': max_salary, 'type': 'Max'})

plot_df = pd.DataFrame(plot_data)

# Grouped bars: min and max salary side by side for every currency
fig = px.bar(
    plot_df,
    x='salary_currency',
    y='value',
    color='type',
    barmode='group',
    title='Salary Range by Currency (Min and Max)',
    labels={'salary_currency': 'Currency',
            'value': 'Salary Value',
            'type': 'Salary Type'},
    text_auto='.0f',  # value labels without decimals
    color_discrete_map={'Min': 'lightblue', 'Max': 'darkblue'},
)

# Logarithmic y axis so currencies of very different magnitude stay readable
fig.update_layout(height=500, yaxis_type="log")

# Draw the value labels inside the bars
fig.update_traces(textposition='inside')

fig.show()
