# Data Collection

All source functions are in this file: [link](../src/jobs_tools/jobs_helpers.py)

## Import Libraries

In [None]:
from dotenv import load_dotenv
import pandas as pd
import json
import sys
import os

# Put the sibling `src` directory at the front of the import path (once),
# so the `jobs_tools` package imported below can be resolved.
src_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "src"))
if src_path not in sys.path:
    sys.path[:0] = [src_path]

%load_ext autoreload
%autoreload 2

from jobs_tools.jobs_helpers import process_json_to_csv, collect_jobs_data


In [None]:
# Read variables from a .env file into the environment, then pick up the API key
# (api_key is None when API_KEY is not set).
load_dotenv()
api_key = os.environ.get("API_KEY")

# Load the location/domain table and discard the column marked as not needed.
path = r"../data/csv/location_domain_table.csv"
df = (
    pd.read_csv(path)
    .copy()
    .drop(columns=["gl(not needed)"])
)
df.head()

Unnamed: 0,location,google_domain,Region,EU member,Schengen Agreement
0,Austria,google.at,Europe,True,True
1,Belgium,google.be,Europe,True,True
2,Bulgaria,google.bg,Europe,True,True
3,Canada,google.ca,Northern America,False,False
4,Croatia,google.hr,Europe,True,True


## Data Collection

In [None]:
# Run the collection helper imported from jobs_tools.jobs_helpers for the
# "Android developer" search, using the location table `df` and the API key
# loaded above.
# NOTE(review): the keyword is spelled `quarry`; presumably this matches the
# parameter name declared in jobs_helpers — confirm before renaming it.
collect_jobs_data(
    quarry="Android developer",       # Job search query
    location="all",                   # Collect data for all countries
    domain="local",                   # Use local domains
    number_of_queries="all",          # "all": presumably no per-country query limit — confirm in jobs_helpers
    api_key=api_key,                  # Specify your actual API key
    data_frame=df,                    # DataFrame containing country data
    save_path=r"../data/jobs_data/data/local_domain/Android",  # Folder to save JSON files
    number_of_errors=2,               # Stop after 2 empty results
    report=True                       # Display a report
)


All data collection logs: [link](../data/data_collection_logs.txt)

## Make markdown file with the test results

In [None]:
# Root folder of the local-domain test run; the two platform folders and the
# generated report all live directly under it.
base_path = r"../data/jobs_data/test_data/local_domain"
android_path, ios_path, markdown_path = (
    os.path.join(base_path, leaf) for leaf in ("Android", "iOS", "results.md")
)

# Check if the file is "successful" or "empty"
def is_valid_file(file_path):
    """Tell whether a saved JSON response counts as a successful result.

    A file is treated as empty (returns False) only when BOTH markers are
    present: ``search_information.jobs_results_state`` equals "Fully empty"
    and the top-level ``error`` equals the "no results" message. Any file
    that cannot be opened, parsed, or inspected is reported on stdout and
    also returns False. Every other file returns True.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        results_state = payload.get("search_information", {}).get("jobs_results_state")
        error_message = payload.get("error")
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return False

    reported_empty = (
        results_state == "Fully empty"
        and error_message == "Google hasn't returned any results for this query."
    )
    return not reported_empty

# Walk both platform folders and record, per country, whether each platform's
# JSON file is valid ("+") or not ("-"). A platform with no file for a country
# keeps the default "-".
results = {}

for folder, platform in [(ios_path, "iOS"), (android_path, "Android")]:
    for file_name in os.listdir(folder):
        if not file_name.endswith(".json"):
            continue

        # The country name is the file name without the ".json" part
        country = file_name.replace(".json", "").strip()
        status = "+" if is_valid_file(os.path.join(folder, file_name)) else "-"

        results.setdefault(country, {"iOS": "-", "Android": "-"})[platform] = status

# Build the whole report in memory (legend, separator, then one checklist line
# per country sorted by name) and write it to the Markdown file in one call.
report_lines = ["[] - iOS, [] - Android\n", "-----------------------\n"]
report_lines.extend(
    f"- [{marks['iOS']}],[{marks['Android']}] {name}\n"
    for name, marks in sorted(results.items())
)

with open(markdown_path, "w", encoding="utf-8") as md_file:
    md_file.writelines(report_lines)

print(f"Markdown file created: {markdown_path}")

Используются оба домена, чтобы максимизировать шанс сбора данных по стране и увеличить шанс сбора уникальных вакансий, доступных только на определенном домене.

При проходе по всем странам со сбором по одной вакансии для iOS- и Android-разработчиков были получены следующие результаты:

`.com`:
- Все страны (100%) вернули данные как по iOS, так и по Android.

`Локальный домен`:<br>
Большинство стран также показали успешные результаты для iOS и Android вакансий.<br>
Исключения:
- Исландия: отсутствуют вакансии для iOS.
- Румыния: отсутствуют вакансии для Android.

[Results for ".com" domain](../data/jobs_data/test_data/dotcom_domain/results.md) <br>
[Results for "local" domain](../data/jobs_data/test_data/local_domain/results.md)

## Make a CSV from JSON files

In [None]:
# Input folders with the collected JSON files: each domain type paired with
# each platform, in the order dotcom/Android, dotcom/iOS, local/Android, local/iOS.
data_dirs = [
    f"../data/jobs_data/data/{domain_dir}/{platform_dir}"
    for domain_dir in ("dotcom_domain", "local_domain")
    for platform_dir in ("Android", "iOS")
]

# Run the conversion helper imported from jobs_tools.jobs_helpers over all
# input folders, writing the combined result to ../data/csv/jobs_data.csv.
# The location table `df` is passed as the second argument.
# NOTE(review): how `df` is used inside process_json_to_csv is not visible
# here — confirm in jobs_helpers. The returned value is kept in `report`.
report = process_json_to_csv(data_dirs, df, output_file=r"../data/csv/jobs_data.csv")


Processing completed:
- Total files: 665
- Successfully processed: 660
- Empty files: 5
- Corrupted files: 0

