In [15]:
import pandas as pd
import time
import os
import re
import json
import requests
from datetime import datetime, timedelta
from tqdm import tqdm 
from concurrent.futures import ThreadPoolExecutor, as_completed
import plotly.express as px
import plotly.graph_objects as go
from clickhouse_driver import Client

In [16]:
# Every file this notebook reads or writes lives under one project directory
# inside the user's home directory.
BASE_DIR = os.path.expanduser('~/pet-projects/jupyter-notebooks/')
# BASE_DIR keeps its trailing slash, so plain concatenation yields full paths.
CONFIG_PATH = BASE_DIR + 'config.json'
TOKEN_PATH = BASE_DIR + 'token.json'
DATA_DIR = BASE_DIR + 'data/hh_api_data'

In [17]:

def load_config():
    """Read CONFIG_PATH and return its parsed JSON content.

    Raises:
        Exception: if the file does not exist or does not contain valid JSON.
    """
    try:
        with open(CONFIG_PATH, 'r') as handle:
            raw_text = handle.read()
        return json.loads(raw_text)
    except FileNotFoundError:
        raise Exception("config.json file not found")
    except json.JSONDecodeError:
        raise Exception("Error parsing config.json")

def save_token(token_data):
    """Stamp ``token_data`` with the current local time and store it as JSON.

    The ``saved_at`` key is written into the dict that was passed in, so the
    caller's object is modified; the result is then dumped to TOKEN_PATH.
    """
    token_data.update(saved_at=datetime.now().isoformat())
    with open(TOKEN_PATH, mode='w') as out:
        json.dump(token_data, out)

def load_token():
    """Return the cached access token if it was saved less than a day ago.

    The cache is best-effort: a missing file, invalid JSON, missing keys or a
    malformed ``saved_at`` value all count as "no usable token".

    Returns:
        The ``access_token`` value stored in TOKEN_PATH, or ``None`` when the
        cache is absent, unreadable as a token record, or older than one day.
    """
    try:
        with open(TOKEN_PATH, 'r') as token_file:
            token_data = json.load(token_file)
        saved_at = datetime.fromisoformat(token_data['saved_at'])
        # Cached tokens are trusted for one day after they were saved.
        is_fresh = datetime.now() - saved_at < timedelta(days=1)
        access_token = token_data['access_token']
    except (FileNotFoundError, json.JSONDecodeError, KeyError, TypeError, ValueError):
        # TypeError/ValueError cover a corrupted record (e.g. a non-ISO or
        # non-string timestamp, or JSON that is not an object).
        return None
    return access_token if is_fresh else None

def get_access_token(client_id, client_secret, timeout=30):
    """Return an access token for the HH.ru API, reusing the cached one if possible.

    Args:
        client_id: OAuth client id of the application.
        client_secret: OAuth client secret of the application.
        timeout: Seconds to wait for the token endpoint before giving up.

    Returns:
        The access token string.

    Raises:
        Exception: if the token endpoint answers with a non-200 status.
        requests.RequestException: on network failure or timeout.
    """
    # A still-fresh token on disk saves a round trip to the OAuth endpoint.
    existing_token = load_token()
    if existing_token:
        return existing_token

    token_url = 'https://hh.ru/oauth/token'
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    data = {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
    }

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.post(token_url, headers=headers, data=data, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Error getting token: {response.status_code}, {response.text}")

    access_token = response.json()['access_token']
    save_token({
        'access_token': access_token,
        'saved_at': datetime.now().isoformat(),
    })
    return access_token


def create_api_client():
    """Build the request settings used by every HH.ru API call in this notebook.

    Reads ``client_id``, ``client_secret`` and ``user_email`` from the config
    file and obtains an access token.

    Returns:
        A dict with ``headers`` (authorization and user-agent headers) and
        ``base_url``.

    Raises:
        Exception: if ``client_id`` or ``client_secret`` is missing or empty.
    """
    config = load_config()
    client_id, client_secret = config.get('client_id'), config.get('client_secret')
    user_email = config.get('user_email')

    if not (client_id and client_secret):
        raise Exception("client_id or client_secret missing in config.json")

    token = get_access_token(client_id, client_secret)
    request_headers = {
        'Authorization': f'Bearer {token}',
        'HH-User-Agent': f'Your_App_Name ({user_email})'
    }
    return {'headers': request_headers, 'base_url': 'https://api.hh.ru'}

def get_vacancy_details(api_client, vacancy_id, timeout=30):
    """Fetch the full record of one vacancy.

    Args:
        api_client: Dict with ``base_url`` and ``headers`` as returned by
            ``create_api_client``.
        vacancy_id: Identifier of the vacancy to fetch.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: if the API answers with a non-200 status.
        requests.RequestException: on network failure or timeout.
    """
    # The timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(
        f"{api_client['base_url']}/vacancies/{vacancy_id}",
        headers=api_client['headers'],
        timeout=timeout,
    )

    if response.status_code != 200:
        raise Exception(f"Error getting vacancy details: {response.status_code}, {response.text}")

    return response.json()

def get_employer_details(api_client, employer_id, timeout=30):
    """Fetch the full record of one employer.

    Args:
        api_client: Dict with ``base_url`` and ``headers`` as returned by
            ``create_api_client``.
        employer_id: Identifier of the employer to fetch.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: if the API answers with a non-200 status.
        requests.RequestException: on network failure or timeout.
    """
    # The timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(
        f"{api_client['base_url']}/employers/{employer_id}",
        headers=api_client['headers'],
        timeout=timeout,
    )

    if response.status_code != 200:
        raise Exception(f"Error getting employer details: {response.status_code}, {response.text}")

    return response.json()

def get_all_vacancies_single(api_client, search_text, max_pages=None, timeout=30):
    """Collect vacancies for one search query, page by page.

    Each returned vacancy dict gets an extra ``search_text`` key holding the
    query that found it. As a debugging aid the raw JSON of the first three
    pages is written to DATA_DIR and the first page is printed.

    Paging stops when a page is empty, when ``max_pages`` pages were read,
    when the page count reported by the API is reached, or on the first
    failed request (the vacancies collected so far are still returned).

    Args:
        api_client: Dict with ``base_url`` and ``headers`` as returned by
            ``create_api_client``.
        search_text: Query text sent as the ``text`` parameter.
        max_pages: Optional cap on the number of pages to read.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        List of vacancy dicts from all pages that were read.
    """
    all_vacancies = []
    page = 0
    per_page = 100  # Maximum number of vacancies per page

    while True:
        try:
            response = requests.get(
                f"{api_client['base_url']}/vacancies",
                headers=api_client['headers'],
                params={
                    'text': search_text,
                    'per_page': per_page,
                    # 'area': 1,  # Moscow (can be removed or changed)
                    # 'only_with_salary': True,  # Optional: only with specified salary
                    'search_field': ['name'],  # Match the query against the vacancy name only
                    'order_by': 'publication_time',  # Sort by publication date
                    'page': page,
                },
                timeout=timeout,
            )
        except requests.RequestException as exc:
            # Treat a network failure like a bad status: keep what we have.
            print(f"Error getting page {page}: {exc}")
            break

        if response.status_code != 200:
            print(f"Error getting page {page}: {response.status_code}")
            break

        data = response.json()
        vacancies = data['items']
        if not vacancies:
            break

        # Tag before saving so the raw dump also records the originating query.
        for vacancy in vacancies:
            vacancy['search_text'] = search_text

        # Save first 3 pages of raw JSON response
        if page < 3:
            filename = f"vacancies_raw_page_{page}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            os.makedirs(DATA_DIR, exist_ok=True)
            filepath = os.path.join(DATA_DIR, filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            print(f"Saved raw JSON for page {page} to {filename}")

        if page == 0:
            print(data)

        all_vacancies.extend(vacancies)
        print(f"Loaded page {page}, received {len(vacancies)} vacancies")

        page += 1
        if max_pages and page >= max_pages:
            break

        # The search response reports the total number of pages; stopping here
        # avoids one extra request that would only come back as an error.
        total_pages = data.get('pages')
        if total_pages is not None and page >= total_pages:
            break

    return all_vacancies

def get_all_vacancies(api_client, search_text, max_pages=None):
    """Collect vacancies for one query or for several '|'-separated queries.

    When ``search_text`` contains '|', it is split into keywords and each
    keyword is searched in turn. Keywords are stripped of surrounding
    whitespace; blank keywords and repeats of an earlier keyword are skipped
    so that no search is issued twice or with empty text.

    Args:
        api_client: Dict with ``base_url`` and ``headers`` as returned by
            ``create_api_client``.
        search_text: A single query, or several queries joined with '|'.
        max_pages: Optional cap on pages read per keyword.

    Returns:
        List of vacancy dicts from all searches, in keyword order. The same
        vacancy can appear more than once if several keywords match it.
    """
    if '|' not in search_text:
        return get_all_vacancies_single(api_client, search_text, max_pages)

    stripped = (kw.strip() for kw in search_text.split('|'))
    # dict.fromkeys drops repeated keywords while keeping first-seen order.
    keywords = list(dict.fromkeys(kw for kw in stripped if kw))

    all_vacancies = []
    for kw in keywords:
        print(f"Starting search for: {kw}")
        all_vacancies.extend(get_all_vacancies_single(api_client, kw, max_pages))
    return all_vacancies

def create_vacancies_dataframe(vacancies):
    """Flatten a list of vacancy dicts into a DataFrame with one row per vacancy.

    Nested sections (salary, area, snippet, address, employer, experience)
    may be missing or ``None``; their fields then come out empty. A vacancy
    that cannot be processed is reported on stdout and skipped.

    Args:
        vacancies: Iterable of vacancy dicts.

    Returns:
        DataFrame in which missing salaries are 0, a missing currency is
        'RUR', and missing requirement/responsibility texts are ''.

    Raises:
        Exception: if no vacancy could be processed.
    """
    rows = []

    for vacancy in vacancies:
        try:
            # ``or`` also replaces an explicit None with an empty container.
            salary = vacancy.get('salary') or {}
            area = vacancy.get('area') or {}
            snippet = vacancy.get('snippet') or {}
            address = vacancy.get('address') or {}
            employer = vacancy.get('employer') or {}
            experience = vacancy.get('experience') or {}
            work_format_entries = vacancy.get('work_format') or []
            roles = vacancy.get('professional_roles') or []

            rows.append({
                'id': vacancy.get('id'),
                'name': vacancy.get('name'),
                'url': vacancy.get('alternate_url'),
                'salary_from': salary.get('from'),
                'salary_to': salary.get('to'),
                'salary_currency': salary.get('currency'),
                'company_name': employer.get('name'),
                'company_id': employer.get('id'),
                'area': area.get('name'),
                'address': address.get('raw', ''),
                'created_at': vacancy.get('created_at'),
                'published_at': vacancy.get('published_at'),
                'requirement': snippet.get('requirement'),
                'responsibility': snippet.get('responsibility'),
                'experience': experience.get('name', ''),
                # A vacancy can list several work formats; join them into one string.
                'work_format': ', '.join(entry.get('name', '') for entry in work_format_entries),
                'internship': vacancy.get('internship', False),
                'premium': vacancy.get('premium', False),
                'professional_role_ids': ','.join(str(role.get('id')) for role in roles),
                'professional_role_names': ','.join(role.get('name', '') for role in roles),
                'search_text': vacancy.get('search_text', ''),
            })
        except Exception as e:
            print(f"Error processing vacancy {vacancy.get('id', 'Unknown ID')}: {str(e)}")
            continue

    if not rows:
        raise Exception("Failed to process any vacancies")

    df = pd.DataFrame(rows)

    # Replace missing values with per-column defaults.
    column_defaults = {
        'salary_from': 0,
        'salary_to': 0,
        'salary_currency': 'RUR',
        'requirement': '',
        'responsibility': '',
    }
    for column, default in column_defaults.items():
        df[column] = df[column].fillna(default)

    return df

In [None]:
# Search hh.ru for every keyword, flatten the results into `df` and drop
# vacancies that were returned by more than one keyword.
try:
    api_client = create_api_client()
    search_text_en = 'Product designer|Продуктовый дизайнер|TeamLead|Product manager|Product owner|CPO|Chief Product Officer|Golang Developer|Golang разработчик|PHP developer|Php разработчик|Product manager|Тестировщик|QA'
    vacancies = get_all_vacancies(api_client, search_text_en, max_pages=5000)
    df = create_vacancies_dataframe(vacancies)

    # Count records before removing duplicates
    initial_count = len(df)

    # Remove duplicates by id field, keeping the first occurrence
    df = df.drop_duplicates(subset=['id'])

    # Count records after removing duplicates
    final_count = len(df)
    duplicates_removed = initial_count - final_count
    print(f"Количество удаленных дубликатов: {duplicates_removed}")

    # Reset DataFrame index
    df = df.reset_index(drop=True)

    print(f"Всего получено вакансий: {len(df)}")

except Exception as e:
    print(f"Произошла ошибка: {str(e)}")
else:
    # Only summarise `df` when it was actually built; after a failure it is
    # either undefined or left over from an earlier run.
    df.info()

In [None]:
def get_vacancy_details_with_retry(api_client, vacancy_id, max_retries=3):
    """Fetch one vacancy's details, retrying on any error.

    Waits one second between attempts. After the last failed attempt the
    error is printed and ``None`` is returned instead of raising.

    Args:
        api_client: Dict with ``base_url`` and ``headers``.
        vacancy_id: Identifier of the vacancy to fetch.
        max_retries: Total number of attempts.

    Returns:
        The vacancy details, or ``None`` if every attempt failed.
    """
    final_attempt = max_retries - 1
    for attempt_no in range(max_retries):
        try:
            details = get_vacancy_details(api_client, vacancy_id)
        except Exception as e:
            if attempt_no != final_attempt:
                time.sleep(1)  # Small delay before retry
                continue
            print(f"Failed to get details for vacancy {vacancy_id} after {max_retries} attempts: {e}")
            return None
        else:
            return details

def process_vacancy_details(vacancy_id):
    """Fetch one vacancy and keep only its description and key skill names.

    Uses the module-level ``api_client``. Intended as the unit of work for
    the thread pool, so it never raises: failures are printed and reported
    as ``None``.

    Returns:
        Dict with ``id``, ``description`` and ``key_skills`` (list of skill
        names), or ``None`` if the details could not be obtained.
    """
    try:
        details = get_vacancy_details_with_retry(api_client, vacancy_id)
        if not details:
            return None
        description = details.get('description', '')
        skill_names = [skill.get('name') for skill in details.get('key_skills', [])]
        # The id is carried along so the result can be merged back by vacancy.
        return {'id': vacancy_id, 'description': description, 'key_skills': skill_names}
    except Exception as e:
        print(f"Error processing vacancy {vacancy_id}: {e}")
        return None

# Fetch description and key skills for every vacancy in `df` using a thread
# pool, then merge the results back into `df` by vacancy id.
print("Getting detailed information about vacancies...")

# Get list of all vacancy IDs
vacancy_ids = df['id'].tolist()
total_vacancies = len(vacancy_ids)
completed_vacancies = 0
details_list = []

# Create progress bar
progress_bar = tqdm(total=total_vacancies, desc="Processing vacancies")
try:
    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Start getting details for all vacancies
        future_to_id = {executor.submit(process_vacancy_details, vid): vid for vid in vacancy_ids}

        # Collect results in completion order
        for future in as_completed(future_to_id):
            result = future.result()
            if result:
                details_list.append(result)
            # Update progress bar
            completed_vacancies += 1
            progress_bar.update(1)
finally:
    # Close the bar even if a worker raised, so the display is not left open.
    progress_bar.close()

# Naming the columns keeps 'id' present even when no details were retrieved;
# otherwise the merge below would fail on an empty DataFrame.
details_df = pd.DataFrame(details_list, columns=['id', 'description', 'key_skills'])

# Drop detail columns left by an earlier run of this cell so the merge does
# not produce '_x'/'_y' suffixed duplicates.
df = df.drop(columns=['description', 'key_skills'], errors='ignore')

# Merge with main DataFrame using vacancy ID
df = df.merge(details_df, on='id', how='left')
df.info()
# Fill empty values; skill lists become one comma-separated string
df['description'] = df['description'].fillna('')
df['key_skills'] = df['key_skills'].fillna('').apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

print(f"Details retrieved for {len(details_list)} out of {total_vacancies} vacancies")
print("\nUpdated DataFrame structure:")
df.info()

In [None]:
# Show which search queries are represented in the collected vacancies.
unique_queries = df['search_text'].unique()
print(f"unique search_text: {unique_queries}")

In [21]:
# Debug helper: dump the full detail record of one hard-coded vacancy to a
# timestamped JSON file in DATA_DIR.
debug_one_vacancy = True
if debug_one_vacancy:
    # Named explicitly so the builtin `id` is not shadowed for later cells.
    debug_vacancy_id = 103720577
    details = get_vacancy_details_with_retry(api_client, debug_vacancy_id)
    # The retry helper returns None after reporting a failure; skip the dump
    # then instead of writing a file that only contains `null`.
    if details is not None:
        filename = f"vacancy_detail_{debug_vacancy_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        filepath = os.path.join(DATA_DIR, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(details, f, ensure_ascii=False, indent=2)

In [22]:
# Debug helper: dump the full detail record of one hard-coded employer to a
# timestamped JSON file in DATA_DIR.
debug_employer_details = True
if debug_employer_details:
    employer_id = 3529
    employer_details = get_employer_details(api_client, employer_id)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"employer_detail_{employer_id}_{timestamp}.json"
    filepath = os.path.join(DATA_DIR, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(employer_details, f, ensure_ascii=False, indent=2)

In [None]:
# Preview the first 100 rows of the vacancies DataFrame.
df.head(100)

In [None]:
# Count vacancies per company name, largest counts first.
df.groupby('company_name').size().reset_index(name='count').sort_values('count', ascending=False)

In [None]:
# Print a summary of the DataFrame before it is exported.
df.info()

In [26]:
# Build a timestamped file name inside DATA_DIR.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"vacancies_all_{timestamp}.csv"
filepath = os.path.join(DATA_DIR, filename)

# Write the DataFrame as UTF-8 CSV without the index column.
df.to_csv(filepath, index=False, encoding='utf-8')

In [27]:
def insert_bulk_dataframe(client, df, table_name):
    """Insert every row of ``df`` into ``table_name`` with a single query.

    The DataFrame's column names are used as the target column list, and the
    rows are passed to ``client.execute`` as a list of plain tuples.

    Args:
        client: Object exposing ``execute(query, rows)``.
        df: DataFrame whose columns match the target table's column names.
        table_name: Name of the table to insert into. It is placed into the
            query text as-is, so it must come from trusted code.
    """
    rows = [*df.itertuples(index=False, name=None)]
    column_list = ", ".join(df.columns)
    query = "INSERT INTO {} ({}) VALUES".format(table_name, column_list)
    client.execute(query, rows)

# Reuse the notebook's config file for the ClickHouse connection settings.
config = load_config()

# Host and password are required config keys; the database name falls back to
# 'default'. The connection is made with secure=True on port 9440.
clickhouse_settings = {
    'host': config['clickhouse_host'],
    'port': 9440,
    'user': 'default',
    'password': config['clickhouse_password'],
    'database': config.get('clickhouse_database', 'default'),
    'secure': True,
}
client = Client(**clickhouse_settings)

In [None]:
def prepare_dataframe_for_clickhouse(df):
    """Return a copy of ``df`` with column types coerced for the ClickHouse insert.

    Conversions applied:
      * ``id`` and ``company_id``: missing -> 0, then integer.
      * ``professional_role_ids``: the first id of the comma-separated list as
        an integer; an empty, missing or non-numeric value becomes 0.
      * ``salary_from`` / ``salary_to``: numeric, unparsable values -> NaN.
      * ``created_at`` / ``published_at``: datetime.
      * remaining object columns: missing -> ''.
      * boolean columns: converted to their string form.

    NOTE(review): the target table schema is not visible here; the integer
    type for ``professional_role_ids`` is kept from the previous behaviour.

    Args:
        df: Vacancies DataFrame; it is not modified.

    Returns:
        The converted copy.
    """
    df_prepared = df.copy()

    # Fill missing values before casting, since NaN cannot be cast to int.
    df_prepared['id'] = df_prepared['id'].fillna(0).astype(int)
    df_prepared['company_id'] = df_prepared['company_id'].fillna(0).astype(int)

    # The column holds a comma-joined list of ids ('' when there are none), so
    # a direct integer cast fails on '' and on multi-role values like '96,10'.
    # Keep the first id and fall back to 0 when nothing numeric is there.
    first_role_id = (
        df_prepared['professional_role_ids']
        .astype(str)
        .str.split(',')
        .str[0]
        .str.strip()
    )
    df_prepared['professional_role_ids'] = (
        pd.to_numeric(first_role_id, errors='coerce').fillna(0).astype(int)
    )

    # Numeric salary columns
    df_prepared['salary_from'] = pd.to_numeric(df_prepared['salary_from'], errors='coerce')
    df_prepared['salary_to'] = pd.to_numeric(df_prepared['salary_to'], errors='coerce')

    # Timestamps as datetime values
    df_prepared['created_at'] = pd.to_datetime(df_prepared['created_at'])
    df_prepared['published_at'] = pd.to_datetime(df_prepared['published_at'])

    # Replace missing values in the remaining object (string) columns with ''
    for column in df_prepared.select_dtypes(include=['object']).columns:
        df_prepared[column] = df_prepared[column].fillna('')

    # Booleans are stored as their string form
    bool_columns = df_prepared.select_dtypes(include=['bool']).columns
    for column in bool_columns:
        df_prepared[column] = df_prepared[column].astype(str)

    return df_prepared

# Coerce the column types, then bulk-insert the result into ClickHouse.
target_table = 'vacancies_hh_ru'
df_prepared = prepare_dataframe_for_clickhouse(df)
insert_bulk_dataframe(client, df_prepared, target_table)
print("Data successfully inserted into ClickHouse.")