In [1]:
import pandas as pd
import numpy as np

print("Pandas version:", pd.__version__)
print("NumPy version:", np.__version__)

Pandas version: 2.2.2
NumPy version: 1.26.4


In [2]:
df = pd.read_csv("D:/job_app_model/cleaned_job_data1.csv")
df.head()

Unnamed: 0,current_role,job_location,key_success_factors,beneficial_actions,advice_for_others,current_job_status,living_in_denmark,target_location,education_level,field_of_study,...,tool_matlab,tool_adobe,tool_r,tool_etc…,tool_typescript,tool_vs code,tool_azure,tool_figma,tool_react,tool_javascript
0,Business Analyst,Copenhagen,"Relevant skills/tech stack, Strong CV/Resume a...",Network,Build your network,,,,Master's,MBA,...,0,0,0,0,0,0,0,0,0,0
1,,,,,,Actively applying for Jobs,Yes,Copenhagen,Master's,MBA,...,0,0,0,0,0,0,0,0,0,0
2,,,,,,Actively applying for Jobs,Yes,Copenhagen,PhD,Mathematics,...,0,0,0,0,0,0,0,0,0,0
3,,,,,,Not actively applying but intrested,Yes,Copenhagen,Master's,Structural Engineering & Natural Disaster Mana...,...,0,0,0,0,0,0,0,0,0,0
4,Quality control,Copenhagen,Relevant skills/tech stack,LinkedIn,Talk to the recruiter before applying for a jo...,,,,Bachelor's,Business administration,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df_cleaned = df.copy()
df_cleaned["role"] = df_cleaned["target_roles"].combine_first(df_cleaned["current_role"])
df_cleaned.head()

Unnamed: 0,current_role,job_location,key_success_factors,beneficial_actions,advice_for_others,current_job_status,living_in_denmark,target_location,education_level,field_of_study,...,tool_adobe,tool_r,tool_etc…,tool_typescript,tool_vs code,tool_azure,tool_figma,tool_react,tool_javascript,role
0,Business Analyst,Copenhagen,"Relevant skills/tech stack, Strong CV/Resume a...",Network,Build your network,,,,Master's,MBA,...,0,0,0,0,0,0,0,0,0,Business Analyst
1,,,,,,Actively applying for Jobs,Yes,Copenhagen,Master's,MBA,...,0,0,0,0,0,0,0,0,0,Others
2,,,,,,Actively applying for Jobs,Yes,Copenhagen,PhD,Mathematics,...,0,0,0,0,0,0,0,0,0,Data Analyst
3,,,,,,Not actively applying but intrested,Yes,Copenhagen,Master's,Structural Engineering & Natural Disaster Mana...,...,0,0,0,0,0,0,0,0,0,Data Analyst
4,Quality control,Copenhagen,Relevant skills/tech stack,LinkedIn,Talk to the recruiter before applying for a jo...,,,,Bachelor's,Business administration,...,0,0,0,0,0,0,0,0,0,Quality control


In [4]:
# Create the target variable: if current_role is not empty, consider it as having found a job
df_cleaned["found_job"] = df_cleaned["current_role"].apply(lambda x: 0 if pd.isna(x) else 1)

df_cleaned[["current_role", "found_job"]].head(10)

Unnamed: 0,current_role,found_job
0,Business Analyst,1
1,,0
2,,0
3,,0
4,Quality control,1
5,Junior Clinical data manager,1
6,,0
7,,0
8,,0
9,,0


In [5]:
df_cleaned.shape

(56, 107)

In [6]:
df.shape

(56, 105)

In [7]:
df_cleaned["location"] = df_cleaned["target_location"].combine_first(df_cleaned["job_location"])
df_cleaned[["job_location", "target_location", "location"]].head(10)

Unnamed: 0,job_location,target_location,location
0,Copenhagen,,Copenhagen
1,,Copenhagen,Copenhagen
2,,Copenhagen,Copenhagen
3,,Copenhagen,Copenhagen
4,Copenhagen,,Copenhagen
5,Copenhagen,,Copenhagen
6,,Copenhagen,Copenhagen
7,,Copenhagen,Copenhagen
8,,Copenhagen,Copenhagen
9,,Copenhagen,Copenhagen


In [8]:
columns_to_drop = [
    "job_location", "target_location", "working_model",
    "key_success_factors", "beneficial_actions", "advice_for_others",
    "biggest_challenge", "applications_last_3_months", "experience_years",
    "current_role", "target_roles"  # Already merged, can be deleted
]
missing_columns = [col for col in columns_to_drop if col not in df_cleaned.columns]

if missing_columns:
    print("these are not found：")
    print(missing_columns)
else:
    print("all columns can be found, do not worry just delete them")

all columns can be found, do not worry just delete them


In [9]:
df_cleaned.drop(columns=columns_to_drop, axis=1, inplace=True)

In [10]:
df_cleaned.drop(columns=["current_job_status"], inplace=True)

In [11]:
df_cleaned["danish_proficiency"] = df["danish_proficiency"]
danish_level_map = {
    "Not at all": 0,
    "Basic only": 1,
    "Yes, Conversational": 2,
    "Intermediate": 3,
    "Fluent": 4,
    "Native": 5  #  Not present in the data, but reserved for encoding
}
df_cleaned["danish_proficiency_encoded"] = df_cleaned["danish_proficiency"].map(danish_level_map)
df_cleaned[["danish_proficiency_encoded"]].head(10)


Unnamed: 0,danish_proficiency_encoded
0,1
1,1
2,1
3,1
4,3
5,2
6,0
7,2
8,2
9,0


In [12]:
df_cleaned["prior_dk_experience"] = df["prior_dk_experience"]
df_cleaned["living_in_denmark"] = df["living_in_denmark"]
df_cleaned.loc[
    (df_cleaned["found_job"] == 1) & (df_cleaned["living_in_denmark"].isna()),
    "living_in_denmark"
] = "Yes"
df_cleaned[["living_in_denmark"]].head(5)


Unnamed: 0,living_in_denmark
0,Yes
1,Yes
2,Yes
3,Yes
4,Yes


In [13]:
df_cleaned["living_in_denmark_encoded"] = (
    df_cleaned["living_in_denmark"]
    .astype(str).str.strip().str.lower()
    .map({"yes": 1, "no": 0})
)
df_cleaned["prior_dk_experience_encoded"] = (
    df_cleaned["prior_dk_experience"]
    .astype(str).str.strip().str.lower()
    .map({"yes": 1, "no": 0})
)

df_cleaned[["living_in_denmark_encoded","prior_dk_experience_encoded"]].head(5)

Unnamed: 0,living_in_denmark_encoded,prior_dk_experience_encoded
0,1,1
1,1,1
2,1,0
3,1,0
4,1,1


In [14]:
df_cleaned["role"].nunique
df_cleaned["role"].dropna().unique()


array(['Business Analyst', 'Others', 'Data Analyst', 'Quality control',
       'Junior Clinical data manager', 'Data Scientist',
       'Software Developer', 'Domain Architect', 'QA Engineer',
       'Cleaning assistance',
       'Team Lead and Occupational Health and Safety Representative',
       'Student assistant', 'kitchen helper', 'Logistics', 'Pharmacist ',
       'Frontend Developer', 'Backend Developer', 'UX Designer',
       'UI/UX Designer'], dtype=object)

In [15]:
role_mapping = {
    "Frontend Developer": "it_frontend",
    "Backend Developer": "it_backend",
    "Software Developer": "it_fullstack",
    "UX Designer": "it_design",
    "UI/UX Designer": "it_design",
    "Business Analyst": "it_data",
    "Data Analyst": "it_data",
    "Data Scientist": "it_data",
    "QA Engineer": "it_other",
    "Pharmacist": "tech_other",
    "Quality control": "tech_other",
    "Domain Architect": "tech_other",
    "Junior Clinical data manager": "tech_other",
    "Cleaning assistance": "non_tech",
    "Team Lead and Occupational Health and Safety Representative": "non_tech",
    "kitchen helper": "non_tech",
    "Logistics": "non_tech",
    "Student assistant": "other",
    "Others": "other"
}
def classify_role(role):
    role = str(role).strip()
    return role_mapping.get(role, "other")  # 找不到的默认归为 other



In [16]:
df_cleaned["role_grouped"] = df_cleaned["role"].apply(classify_role)
df_cleaned[["role", "role_grouped"]].head(10)

Unnamed: 0,role,role_grouped
0,Business Analyst,it_data
1,Others,other
2,Data Analyst,it_data
3,Data Analyst,it_data
4,Quality control,tech_other
5,Junior Clinical data manager,tech_other
6,Data Scientist,it_data
7,Others,other
8,Software Developer,it_fullstack
9,Data Analyst,it_data


In [17]:
df_cleaned[df_cleaned["role_grouped"] == "other"]["role"].value_counts()

role
Others               5
Student assistant    1
Name: count, dtype: int64

In [18]:
df_cleaned["field_of_study"].dropna().unique()

array(['MBA', 'Mathematics ',
       'Structural Engineering & Natural Disaster Management ',
       'Business administration', 'Computer Science', 'Biotechnology ',
       'IT', 'CSE', 'Computer science ', 'Data analytics',
       'Sustainable energy', 'Architecture', 'Software Engineering',
       'geology', 'Theology', 'Management ', 'Anthropology',
       'IT and cognition', 'Pharmacy', 'Physics', 'Software Development',
       'Statistics', 'Data Analytics', 'Data Science', 'Mathematics',
       'Graphic Design', 'Service Design', 'Industrial Design',
       'Chemistry', 'Electric Engineering', 'Economics', 'Journalism',
       'Philosophy', 'Fine Arts', 'Aechitecture', 'History',
       'Political Science', 'Sociology'], dtype=object)

In [19]:
import re

# Fuzzy matching keywords (can be used as substrings, e.g., data → datascience)
fuzzy_keywords = {
    "it": ["computer", "informatics", "software", "data", "ai", "cyber", "machine learning"],
    "science": ["biology", "chemistry", "physics", "statistics", "mathematics","geology", "engineering", "math", "mechanical""energy", "architecture", "biotechnology", "electric"],
    "business": ["business", "marketing", "management", "finance", "economics","administration"],
    "art": ["arts", "design", "music", "fine art", "creative"],
    "humanity": ["philosophy", "history", "literature", "linguistics", "culture", "anthropology",
                 "sociology", "journalism", "communication", "political", "theology" "psychology"]
}

# Keywords that require exact whole-word matching (e.g., "it")
strict_keywords = {
    "it": ["it"],
    "ai": ["it"],
    "mba": ["business"],
    "cse": ["it"]
}

def classify_major(text):
    text = str(text).lower()

    # rocess fuzzy matching keywords first
    for category, keywords in fuzzy_keywords.items():
        if any(kw in text for kw in keywords):
            return category

    # then handle strict matching keywords
    for category, keywords in strict_keywords.items():
        for kw in keywords:
            if re.search(rf"\b{re.escape(kw)}\b", text):
                return category

    return "other"


In [20]:
df_cleaned["major_grouped"] = df_cleaned["field_of_study"].apply(classify_major)
df_cleaned[["major_grouped"]].head(20)

Unnamed: 0,major_grouped
0,other
1,other
2,science
3,science
4,business
5,it
6,it
7,science
8,it
9,other


In [21]:
# copy cleaned data
df_encoded = df_cleaned.copy()

In [22]:
one_hot_columns = ["role_grouped", "location", "education_level", "major_grouped", "visa_status"]
df_encoded = pd.get_dummies(df_encoded, columns=one_hot_columns, prefix=one_hot_columns, drop_first=False, dtype=int) 
df_encoded.head()

Unnamed: 0,living_in_denmark,field_of_study,prior_dk_experience,english_proficiency,danish_proficiency,interview_invites,skill_prototyping,skill_python,skill_insurance,skill_quick learner,...,education_level_PhD,major_grouped_art,major_grouped_business,major_grouped_humanity,major_grouped_it,major_grouped_other,major_grouped_science,visa_status_Job Seeker Visa,visa_status_Residence Permit,visa_status_Student Visa
0,Yes,MBA,Yes,4,Basic only,1,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
1,Yes,MBA,Yes,5,Basic only,1,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
2,Yes,Mathematics,No,5,Basic only,0,0,0,0,1,...,1,0,0,0,0,0,1,0,1,0
3,Yes,Structural Engineering & Natural Disaster Mana...,No,4,Basic only,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,Yes,Business administration,Yes,5,Intermediate,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [23]:
columns_to_drop_encoded = [
    "danish_proficiency", "living_in_denmark", "prior_dk_experience", "field_of_study", "danish_proficiency", "role"  # 已合并，可删
]
missing_columns_encoded = [col for col in columns_to_drop_encoded if col not in df_encoded.columns]

if missing_columns_encoded:
    print("these are not found：")
    print(missing_columns_encoded)
else:
    print("all found you can just delete them ")

all found you can just delete them 


In [24]:
df_encoded.drop(columns=columns_to_drop_encoded, inplace=True)


In [64]:
df_encoded = df_encoded.drop("interview_invites", axis=1)

In [66]:
df_encoded.head()

Unnamed: 0,english_proficiency,skill_prototyping,skill_python,skill_insurance,skill_quick learner,skill_grid analysis,skill_api development,skill_java script,skill_sql,skill_vue,...,education_level_PhD,major_grouped_art,major_grouped_business,major_grouped_humanity,major_grouped_it,major_grouped_other,major_grouped_science,visa_status_Job Seeker Visa,visa_status_Residence Permit,visa_status_Student Visa
0,4,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
1,5,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,5,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
3,4,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [68]:
df_encoded.to_csv("encoded_data_new.csv", index=False)

In [96]:
#modeling knn part
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Split X/y, keeping only "found a job" as the target variable
X = df_encoded.drop("found_job", axis=1)
y = df_encoded["found_job"]

# Split the dataset (default: 75% training, 25% testing)）
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=28)

# Build the KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Model accuracy and print the results
y_pred = knn.predict(X_test)
print("accuracy：", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


accuracy： 0.9285714285714286
              precision    recall  f1-score   support

           0       0.86      1.00      0.92         6
           1       1.00      0.88      0.93         8

    accuracy                           0.93        14
   macro avg       0.93      0.94      0.93        14
weighted avg       0.94      0.93      0.93        14



In [98]:
joblib.dump(knn, "knn_model_new.joblib")
print("model is saved as knn_model_new.joblib")

model is saved as knn_model_new.joblib


In [100]:
list(df_encoded.columns)

['english_proficiency',
 'skill_prototyping',
 'skill_python',
 'skill_insurance',
 'skill_quick learner',
 'skill_grid analysis',
 'skill_api development',
 'skill_java script',
 'skill_sql',
 'skill_vue',
 'skill_machine learning',
 'skill_guidewire',
 'skill_java',
 'skill_html',
 'skill_excel',
 'skill_biotechnology microbiology quality control',
 'skill_web development',
 'skill_user research',
 'skill_machine learning and data analysis',
 'skill_statistical methods',
 'skill_data cleaning',
 'skill_git',
 'skill_ux design',
 'skill_solar power simulations',
 'skill_ai enthusiastic',
 'skill_c#',
 'skill_pandas',
 'skill_strong mathematics background',
 'skill_wireframing',
 'skill_cloud computing',
 'skill_photography',
 'skill_data modeling',
 'skill_data visualization',
 'skill_mobile app development',
 'skill_figma',
 'skill_problem solving',
 'skill_electrical engineering',
 'skill_teamwork',
 'skill_dustribution',
 'skill_ui design',
 'skill_time management',
 'skill_adaptab

In [114]:
#prediction part
X_cols = list(X.columns)
fields = ['english_proficiency',
 'skill_prototyping',
 'skill_python',
 'skill_insurance',
 'skill_quick learner',
 'skill_grid analysis',
 'skill_api development',
 'skill_java script',
 'skill_sql',
 'skill_vue',
 'skill_machine learning',
 'skill_guidewire',
 'skill_java',
 'skill_html',
 'skill_excel',
 'skill_biotechnology microbiology quality control',
 'skill_web development',
 'skill_user research',
 'skill_machine learning and data analysis',
 'skill_statistical methods',
 'skill_data cleaning',
 'skill_git',
 'skill_ux design',
 'skill_solar power simulations',
 'skill_ai enthusiastic',
 'skill_c#',
 'skill_pandas',
 'skill_strong mathematics background',
 'skill_wireframing',
 'skill_cloud computing',
 'skill_photography',
 'skill_data modeling',
 'skill_data_visualization',
 'skill_mobile app development',
 'skill_figma',
 'skill_problem solving',
 'skill_electrical engineering',
 'skill_teamwork',
 'skill_dustribution',
 'skill_ui design',
 'skill_time management',
 'skill_adaptability',
 'skill_ml',
 'skill_programming and software development',
 'skill_design thinking',
 'skill_data analysis',
 'skill_numpy',
 'skill_typescript',
 'skill_css',
 'skill_power bi',
 'skill_react',
 'skill_javascript',
 'skill_domain',
 'skill_communication',
 'tool_',
 'tool_python',
 'tool_jupyter',
 'tool_python (basic level)',
 'tool_sql',
 'tool_tableau',
 'tool_postman',
 'tool_java',
 'tool_1. code development and scripting',
 'tool_html',
 'tool_docker',
 'tool_2. web development',
 'tool_nextjs',
 'tool_adobe xd',
 'tool_powerbi',
 'tool_aws',
 'tool_c#',
 'tool_c program',
 'tool_webpack',
 'tool_sql workbench',
 'tool_github',
 'tool_css',
 'tool_matlab',
 'tool_adobe',
 'tool_r',
 'tool_etc…',
 'tool_typescript',
 'tool_vs code',
 'tool_azure',
 'tool_figma',
 'tool_react',
 'tool_javascript',
 'danish_proficiency_encoded',
 'living_in_denmark_encoded',
 'prior_dk_experience_encoded',
 'role_grouped_it_backend',
 'role_grouped_it_data',
 'role_grouped_it_design',
 'role_grouped_it_frontend',
 'role_grouped_it_fullstack',
 'role_grouped_it_other',
 'role_grouped_non_tech',
 'role_grouped_other',
 'role_grouped_tech_other',
 'location_Aarhus',
 'location_Copenhagen',
 'location_Hillerod',
 'location_Odense',
 "education_level_Bachelor's",
 'education_level_High school ',
 "education_level_Master's",
 'education_level_PhD',
 'major_grouped_art',
 'major_grouped_business',
 'major_grouped_humanity',
 'major_grouped_it',
 'major_grouped_other',
 'major_grouped_science',
 'visa_status_Job Seeker Visa',
 'visa_status_Residence Permit',
 'visa_status_Student Visa']

# Initialize to 0
new_user_input = {field: 0 for field in fields}

# user input
new_user_input.update({
    "english_proficiency": 4,
    "danish_proficiency_encoded": 3,
    "prior_dk_experience_encoded": 0,
    "living_in_denmark_encoded": 1,
    "role_grouped_it_data": 1,
    "location_Copenhagen": 1,
    'major_grouped_science': 1,
    "education_level_Master's": 1,
    'skill_python': 1,
    'skill_sql': 1,
    'skill_data analysis':1,
    'skill_machine learning': 1,
    'skill_data cleaning': 1,
    'skill_pandas': 1,
    'skill_numpy': 1,
    'tool_python': 1,
    'tool_jupyter': 1,
    'tool_sql': 1,
    'tool_tableau': 1,
    'tool_github': 1,
    'visa_status_Job Seeker Visa': 1
})

X_cols = list(X.columns)  # Column order used during model training
new_user_df = pd.DataFrame([new_user_input])
new_user_df = new_user_df.reindex(columns=X_cols, fill_value=0)

In [116]:
# Build the DataFrame and automatically fill in missing fields
new_user_df = pd.DataFrame([new_user_input])
new_user_df = new_user_df.reindex(columns=X_cols, fill_value=0)


In [118]:
# Use the model to make predictions
prediction = knn.predict(new_user_df)[0]
probability = knn.predict_proba(new_user_df)[0][1]

print("Model prediction：", "you're likely to find a job" if prediction == 1 else "hard to find a job")
print(f"The probability of finding a job is:：{probability:.0%}")

Model prediction： you're likely to find a job
The probability of finding a job is:：60%
