# Data Cleaning

In [31]:
import pandas as pd

raw_df = pd.read_csv('../Data Collection/jobs_data.csv')

df = raw_df.copy()
df.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query
0,Austria,Europe,True,True,default,google.com,Android Developer,Agentur LOOP New Media GmbH,Anywhere,"Indeed, Jooble, Jobs By Workable, Trabajo.org ...","LOOP is a digital-first lead agency, exploring...",True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciIsIm...,2025-01-13 12:08:48 UTC,Android developer
1,Austria,Europe,True,True,default,google.com,Android Kotlin Developer*,BAWAG Group,"Vienna, Austria","Karriere.at, XING, Indeed, LinkedIn, Jooble, J...",Android Kotlin Developer*\r\n\r\nBAWAG Group i...,,"€44,476.88 a year",Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIEtvdGxpbiBEZXZlbG...,2025-01-13 12:08:48 UTC,Android developer
2,Austria,Europe,True,True,default,google.com,"Software Engineer, Android",Bitpanda,"Vienna, Austria",Startup Jobs,Who we are\r\n\r\nWe simplify wealth creation....,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlciwgQW...,2025-01-13 12:08:48 UTC,Android developer
3,Austria,Europe,True,True,default,google.com,Android Application Development,Eastern Techno Solutions Pty,Austria,"Trabajo.org - Stellenangebote, Arbeit",Android Developer Job Description:We are looki...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIEFwcGxpY2F0aW9uIE...,2025-01-13 12:08:48 UTC,Android developer
4,Austria,Europe,True,True,default,google.com,Android & iOS Developer,ventopay,"Hagenberg, Austria","Ventopay, Informatikjobs.at, XING, Expertini",Was sind deine Aufgaben?\r\n• Du gestaltest at...,,,Part-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIFx1MDAyNiBpT1MgRG...,2025-01-13 12:08:48 UTC,Android developer


## Step 1

In [32]:
def remove_exact_duplicates(df):
    """Удаляет полные дубликаты строк и выводит количество удалённых записей."""
    initial_count = len(df)
    df_cleaned = df.drop_duplicates(keep="first")
    removed_count = initial_count - len(df_cleaned)
    
    print(f"Шаг 1: Удаление абсолютных дубликатов")
    print(f"- Исходное количество строк: {initial_count}")
    print(f"- Удалено дубликатов: {removed_count}")
    print(f"- Оставшиеся строки: {len(df_cleaned)}\n")
    
    return df_cleaned

df = remove_exact_duplicates(df)

Шаг 1: Удаление абсолютных дубликатов
- Исходное количество строк: 5834
- Удалено дубликатов: 0
- Оставшиеся строки: 5834



## Step 2

In [33]:
def remove_job_id_duplicates(df):
    """Удаляет дубликаты Job ID в рамках одной страны, оставляя приоритетно домен 'google.com', затем последнюю по дате."""
    initial_count = len(df)
    
    # Добавляем столбец с приоритетом домена (0 - google.com, 1 - другие локальные домены)
    df["Domain Priority"] = df["Google Domain Type"].apply(lambda x: 0 if x == "default" else 1)
    
    # Сортировка: сначала google.com (приоритет 0), затем по дате (последняя — самая свежая)
    df_cleaned = df.sort_values(by=["Domain Priority", "Search Date"], ascending=[True, False])
    df_cleaned = df_cleaned.drop_duplicates(subset=["Job ID", "Location"], keep="first")

    df_cleaned = df_cleaned.sort_values(by="Location", ascending=True).reset_index(drop=True)

    removed_count = initial_count - len(df_cleaned)
    
    print(f"Шаг 2: Удаление дубликатов Job ID в рамках одной страны")
    print(f"- Исходное количество строк: {initial_count}")
    print(f"- Удалено дубликатов: {removed_count}")
    print(f"- Оставшиеся строки: {len(df_cleaned)}\n")
    
    return df_cleaned.drop(columns=["Domain Priority"])  # Удаляем временный столбец



clean_df = remove_job_id_duplicates(df)
clean_df.head()

Шаг 2: Удаление дубликатов Job ID в рамках одной страны
- Исходное количество строк: 5834
- Удалено дубликатов: 2971
- Оставшиеся строки: 2863



Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer


## Domain analysis

In [38]:
def analyze_domain_effectiveness(df):
    """Анализирует эффективность домена google.com vs local для каждой страны."""
    domain_counts = df.groupby(["Location", "Google Domain Type"]).size().unstack(fill_value=0)
    
    # Переименовываем столбцы
    domain_counts = domain_counts.rename(columns={"default": "Google.com Jobs", "local": "Local Domain Jobs"})
    
    # Добавляем метрики
    domain_counts["Total Jobs"] = domain_counts.sum(axis=1)
    domain_counts["Google.com Share"] = domain_counts["Google.com Jobs"] / domain_counts["Total Jobs"]
    domain_counts["Local Domain Share"] = domain_counts["Local Domain Jobs"] / domain_counts["Total Jobs"]
    domain_counts["Google vs Local Diff"] = domain_counts["Google.com Share"] - domain_counts["Local Domain Share"]
    
    # Подсчёт уникальных вакансий для каждого домена
    unique_jobs = df.groupby("Job ID")["Google Domain Type"].nunique().reset_index()
    unique_jobs = unique_jobs[unique_jobs["Google Domain Type"] == 1]
    unique_counts = df[df["Job ID"].isin(unique_jobs["Job ID"])].groupby(["Location", "Google Domain Type"]).size().unstack(fill_value=0)
    unique_counts = unique_counts.rename(columns={"default": "Unique Google.com Jobs", "local": "Unique Local Domain Jobs"})
    
    # Объединяем с основными данными
    domain_counts = domain_counts.join(unique_counts, how="left").fillna(0)
    
    print("Анализ эффективности доменов:")
    print(domain_counts.sort_values(by="Google.com Share", ascending=False).head(10))
    
    return domain_counts

domain_table = analyze_domain_effectiveness(df)
domain_table.head()

Анализ эффективности доменов:
Google Domain Type  Google.com Jobs  Local Domain Jobs  Total Jobs  \
Location                                                             
Austria                          88                 83         171   
Lithuania                        44                 42          86   
Denmark                          34                 33          67   
Croatia                          39                 38          77   
Portugal                        129                126         255   
Bulgaria                         44                 43          87   
Netherlands                     100                 98         198   
France                           89                 88         177   
Mexico                          164                163         327   
Slovenia                         16                 16          32   

Google Domain Type  Google.com Share  Local Domain Share  \
Location                                                   
Austria  

Google Domain Type,Google.com Jobs,Local Domain Jobs,Total Jobs,Google.com Share,Local Domain Share,Google vs Local Diff,Unique Google.com Jobs,Unique Local Domain Jobs
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Austria,88,83,171,0.51462,0.48538,0.02924,5.0,1.0
Belgium,57,57,114,0.5,0.5,0.0,0.0,0.0
Bulgaria,44,43,87,0.505747,0.494253,0.011494,5.0,6.0
Canada,220,220,440,0.5,0.5,0.0,15.0,15.0
Croatia,39,38,77,0.506494,0.493506,0.012987,1.0,0.0


In [39]:
clean_df.to_csv('./jobs_data_clean.csv')