# Importing Libraries and Loading an Existing Dataset

In [11]:
import pandas as pd
import random
import uuid
import numpy as np

In [12]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/job-recommendation-raw-dataset/jobstreet_all_job_dataset.csv


In [13]:
df_raw = pd.read_csv("/kaggle/input/job-recommendation-raw-dataset/jobstreet_all_job_dataset.csv")

print("Raw dataset size:", len(df_raw))
print(df_raw.columns)

Raw dataset size: 59306
Index(['job_id', 'job_title', 'company', 'descriptions', 'location',
       'category', 'subcategory', 'role', 'type', 'salary', 'listingDate'],
      dtype='object')


# Dataset Formation and Preprocessing

In [14]:
df = df_raw[[
    "job_title",
    "location"
]].copy()

df.rename(columns={
    "job_title": "job_title",
    "location": "city"
}, inplace=True)


In [15]:
print(df.head())
print(df.columns)

                                      job_title             city
0              Procurement Executive (Contract)  Negeri Sembilan
1                  Account Executive/ Assistant         Petaling
2  Data Analyst - Asset Management, SPX Express   Klang District
3                              Service Engineer         Petaling
4                          Purchasing Executive      Hulu Langat
Index(['job_title', 'city'], dtype='object')


In [16]:
RETAIL_KEYWORDS = [
    "sales", "cashier", "billing", "account",
    "store", "shop", "assistant", "counter", "serving"
]

def is_retail_job(text):
    text = str(text).lower()
    return any(keyword in text for keyword in RETAIL_KEYWORDS)

df = df[df["job_title"].apply(is_retail_job)]

print("After retail filtering:", len(df))

After retail filtering: 16992


In [17]:
def normalize_category(title):
    title = title.lower()
    if "account" in title:
        return "accounting"
    elif "cashier" in title or "billing" in title:
        return "billing"
    elif "sales" in title:
        return "sales"
    elif "serve" in title or "counter" in title:
        return "serving"
    else:
        return "retail assistant"

df["job_category"] = df["job_title"].apply(normalize_category)

In [18]:
skills_by_category = {
    "billing": [
        "billing", "cashier handling", "POS operation",
        "basic arithmetic", "customer service"
    ],
    "sales": [
        "sales", "communication skills",
        "customer interaction", "stock handling"
    ],
    "serving": [
        "serving customers", "order taking",
        "cleanliness", "food hygiene", "patience"
    ],
    "accounting": [
        "basic accounting", "data entry",
        "record maintenance", "excel basics"
    ],
    "retail assistant": [
        "store assistance", "shelf arrangement",
        "inventory support", "multitasking"
    ]
}

skill_count_by_category = {
    "billing": (4, 5),
    "sales": (4, 5),
    "serving": (3, 4),
    "accounting": (4, 5),
    "retail assistant": (3, 4)
}

def select_skills(category):
    if category not in skills_by_category:
        category = "retail assistant"
    pool = skills_by_category[category]
    k = random.randint(*skill_count_by_category[category])
    return ", ".join(random.sample(pool, k))


In [19]:
shop_type_pool = ["bakery", "supermarket", "clothing store", "retail shop", "petrol pump", "parts supplier"]
salary_pool = [300, 350, 400, 450]
shift_pool = ["evening", "holiday", "evening+holiday"]
working_days_pool = ["weekdays", "weekends", "weekdays+weekends"]
season_pool = ["festival", "summer", "exam_break", "christmas", "onam"]
openings_pool = [1, 2, 3, 5, 7, 10, 12]
distance_pool = [2, 3, 5, 8, 10]
retailer_verified_pool = [True, False]
job_status_pool = ["approved", "not approved"] 
is_blocked_pool = [True, False]
retailer_verified_probabilities = [0.9, 0.1]
job_status_probabilities = [0.85, 0.15] 
is_blocked_probabilities = [0.05, 0.95]

In [20]:
def assign_context(row):
    row["job_id"] = str(uuid.uuid4())

    # Basic fields
    row["shop_type"] = random.choice(shop_type_pool)
    row["salary_per_day"] = random.choice(salary_pool)
    row["salary_type"] = "daily"
    row["number_of_openings"] = random.choice(openings_pool)
    row["max_distance_km"] = random.choice(distance_pool)

    #Safety fields
    row["retailer_verified"] = random.choices(retailer_verified_pool, retailer_verified_probabilities, k=1 ) [0]
    row["job_status"] = random.choices(job_status_pool, job_status_probabilities, k=1 ) [0]
    row["is_blocked"] = random.choices(is_blocked_pool, is_blocked_probabilities, k=1 ) [0]
    
    # Shift assignment
    row["shift_type"] = random.choice(shift_pool)

    # Holiday logic (separate from season)
    if row["shift_type"] in ["holiday", "evening+holiday"]:
        row["working_days"] = random.choice(
            ["weekends", "weekdays+holidays"]
        )
    else:
        row["working_days"] = "weekdays"

    # Seasonal logic (independent)
    row["seasonal_job"] = random.choice([True, False])

    if row["seasonal_job"]:
        row["season_name"] = random.choice(
            ["festival", "summer", "exam_break", "christmas", "onam"]
        )
    else:
        row["season_name"] = "none"

    row["location_area"] = "local area"

    return row


In [21]:
def select_skills(category):
    category = category.lower()

    # fallback safety
    if category not in skills_by_category:
        category = "retail assistant"

    skill_pool = skills_by_category[category]
    min_skills, max_skills = skill_count_by_category[category]

    # ensure k does not exceed pool size
    max_allowed = min(max_skills, len(skill_pool))
    min_allowed = min(min_skills, max_allowed)

    k = random.randint(min_allowed, max_allowed)

    selected = random.sample(skill_pool, k)
    return ", ".join(selected)


In [22]:
df["required_skills"] = df["job_category"].apply(select_skills)

In [23]:
VARIANTS_PER_JOB = 4
rows = []

for _, row in df.iterrows():
    for _ in range(VARIANTS_PER_JOB):
        rows.append(assign_context(row.copy()))

df_final = pd.DataFrame(rows)

print("Final dataset size:", len(df_final))

Final dataset size: 67968


In [24]:
df_final["retailer_verified"].value_counts(normalize=True) * 100
df_final["job_status"].value_counts(normalize=True) * 100
df_final["is_blocked"].value_counts(normalize=True) * 100


is_blocked
False    94.922611
True      5.077389
Name: proportion, dtype: float64

In [25]:
text_cols = ["job_title", "required_skills"]

for col in text_cols:
    df_final[col] = df_final[col].fillna("").str.lower()

In [26]:
df_final["combined_text"] = (
    df_final["job_title"] + " " +
    df_final["required_skills"] + " " +
    df_final["job_category"] + " " +
    df_final["shop_type"] + " " +
    df_final["shift_type"] + " " +
    df_final["working_days"]
)

In [27]:
df_final.to_csv("/kaggle/working/jobs_final_dataset.csv", index=False)

print("Saved successfully to /kaggle/working/")

Saved successfully to /kaggle/working/


In [28]:
print(df_final.isna().sum())

job_title             0
city                  0
job_category          0
required_skills       0
job_id                0
shop_type             0
salary_per_day        0
salary_type           0
number_of_openings    0
max_distance_km       0
retailer_verified     0
job_status            0
is_blocked            0
shift_type            0
working_days          0
seasonal_job          0
season_name           0
location_area         0
combined_text         0
dtype: int64


In [29]:
safe_df = df_final[
  (df_final["retailer_verified"] == True) &
   (df_final["job_status"] == "approved") &
   (df_final["is_blocked"] == False)
 ].reset_index(drop=True)

In [30]:
print(safe_df.columns)
set(df_final.columns) == set(safe_df.columns)


Index(['job_title', 'city', 'job_category', 'required_skills', 'job_id',
       'shop_type', 'salary_per_day', 'salary_type', 'number_of_openings',
       'max_distance_km', 'retailer_verified', 'job_status', 'is_blocked',
       'shift_type', 'working_days', 'seasonal_job', 'season_name',
       'location_area', 'combined_text'],
      dtype='object')


True

In [31]:
print("Safe jobs:", len(safe_df))


Safe jobs: 49259


# Feature Extraction - TF-IDF Vectorization

In [32]:
df1 = pd.read_csv("/kaggle/working/jobs_final_dataset.csv")

print(df1.shape)
print(df1.columns)

(67968, 19)
Index(['job_title', 'city', 'job_category', 'required_skills', 'job_id',
       'shop_type', 'salary_per_day', 'salary_type', 'number_of_openings',
       'max_distance_km', 'retailer_verified', 'job_status', 'is_blocked',
       'shift_type', 'working_days', 'seasonal_job', 'season_name',
       'location_area', 'combined_text'],
      dtype='object')


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
print(df1.columns)

Index(['job_title', 'city', 'job_category', 'required_skills', 'job_id',
       'shop_type', 'salary_per_day', 'salary_type', 'number_of_openings',
       'max_distance_km', 'retailer_verified', 'job_status', 'is_blocked',
       'shift_type', 'working_days', 'seasonal_job', 'season_name',
       'location_area', 'combined_text'],
      dtype='object')


In [35]:
#The cosine matrix is built only for the required student data dynamically as other job x job calculations will involve the usage of a huge of RAM

from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=5000,
    min_df=3,
    ngram_range=(1, 2)
)

tfidf_matrix = tfidf.fit_transform(safe_df["combined_text"])


student_text = (
    "billing cashier handling customer service "
    "evening weekdays supermarket"
)

student_vector = tfidf.transform([student_text])


from sklearn.metrics.pairwise import cosine_similarity

similarity_scores = cosine_similarity(
    student_vector,      # (1 × features)
    tfidf_matrix         # (N × features)
)[0]


top_n = 5
top_indices = np.argsort(similarity_scores)[::-1][:top_n]

recommended_jobs = safe_df.iloc[top_indices][[
    "job_title", "job_category",
    "shop_type", "shift_type",
    "salary_per_day"
]]


In [36]:
def recommend_similar_jobs(job_index, top_n=5):
    similarity_scores = list(enumerate(cosine_sim[job_index]))
    similarity_scores = sorted(
        similarity_scores,
        key=lambda x: x[1],
        reverse=True
    )
    similarity_scores = similarity_scores[1:top_n+1]

    job_indices = [i[0] for i in similarity_scores]

    return df1.iloc[job_indices][
        ["job_title", "job_category", "shop_type", "shift_type", "salary_per_day"]
    ]

# Wrap Recommendation Logic into a Function

def build_student_text(student_profile):
    return (
        student_profile["skills"] + " " +
        student_profile["preferred_shift"] + " " +
        student_profile["preferred_working_days"] + " " +
        student_profile["preferred_shop_type"]
    )
def recommend_jobs(student_profile, df_final, tfidf, top_n=5):

    # 1. Trust & safety
    safe_df = df_final[
        (df_final["retailer_verified"] == True) &
        (df_final["job_status"] == "approved") &
        (df_final["is_blocked"] == False)
    ].reset_index(drop=True)

    print("After trust & safety:", len(safe_df))

    # 2. Hard filters
    filtered_df = safe_df.copy()

    filtered_df = filtered_df[
        filtered_df["working_days"].str.contains(
            "|".join(student_profile["acceptable_working_days"]),
            case=False,
            na=False
        )
    ]

    filtered_df = filtered_df[
        filtered_df["max_distance_km"] <= student_profile["max_distance_km"]
    ]

    filtered_df = filtered_df[
        filtered_df["shift_type"].isin(student_profile["acceptable_shifts"])
    ]

    print("After hard filters:", len(filtered_df))

    if len(filtered_df) == 0:
        print("⚠️ No jobs after hard filters, applying fallback")
        filtered_df = safe_df

    # --- rest of the logic ---



In [37]:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def recommend_jobs(student_profile, df_final, tfidf, top_n=5):

    # ============================
    # 1. TRUST & SAFETY FILTER
    # ============================
    safe_df = df_final[
        (df_final["retailer_verified"] == True) &
        (df_final["job_status"] == "approved") &
        (df_final["is_blocked"] == False)
    ].reset_index(drop=True)

    # ============================
    # 2. STRICT CONTEXT FILTERS
    # ============================
    filtered_df = safe_df.copy()

    # --- strict shift filter
    filtered_df = filtered_df[
        filtered_df["shift_type"].isin(student_profile["acceptable_shifts"])
    ]

    # --- strict working days (flexible string match)
    filtered_df = filtered_df[
        filtered_df["working_days"].str.contains(
            "|".join(student_profile["acceptable_working_days"]),
            case=False,
            na=False
        )
    ]

    # --- strict distance
    filtered_df = filtered_df[
        filtered_df["max_distance_km"] <= student_profile["max_distance_km"]
    ]

    # ============================
    # 3. CONDITIONAL RELAXATIONS
    # ============================

    # --- Relaxation 1: Distance (+2 km)
    if len(filtered_df) < top_n:
        filtered_df = safe_df[
            safe_df["max_distance_km"] <= student_profile["max_distance_km"] + 2
        ]

    # --- Relaxation 2: Ignore working days
    if len(filtered_df) < top_n:
        filtered_df = safe_df[
            safe_df["shift_type"].isin(student_profile["acceptable_shifts"])
        ]

    # --- Relaxation 3 (last resort): Trust-only
    if len(filtered_df) < top_n:
        filtered_df = safe_df.copy()

    # Safety check
    if len(filtered_df) == 0:
        return "No suitable jobs found."

    # ============================
    # 4. CONTENT SIMILARITY
    # ============================
    tfidf_matrix = tfidf.fit_transform(filtered_df["combined_text"])

    student_text = (
        student_profile["skills"] + " " +
        student_profile["preferred_shift"] + " " +
        student_profile["preferred_working_days"] + " " +
        student_profile["preferred_shop_type"]
    )

    student_vector = tfidf.transform([student_text])

    similarity_scores = cosine_similarity(
        student_vector,
        tfidf_matrix
    )[0]

    # ============================
    # 5. FINAL RANKING
    # ============================
    top_indices = np.argsort(similarity_scores)[::-1][:top_n]

    return filtered_df.iloc[top_indices][[
        "job_title",
        "job_category",
        "shop_type",
        "shift_type",
        "working_days",
        "salary_per_day",
        "location_area"
    ]]



if len(filtered_df) == 0:
        return "No suitable jobs found for given preferences."
    
    # -----------------------------
    # 3. Content-based similarity
    # -----------------------------
    tfidf_matrix = tfidf.fit_transform(filtered_df["combined_text"])

    student_text = build_student_text(student_profile)
    student_vector = tfidf.transform([student_text])

    similarity_scores = cosine_similarity(
        student_vector,
        tfidf_matrix
    )[0]

    # -----------------------------
    # 4. Ranking (hybrid-ready)
    # -----------------------------
    top_indices = np.argsort(similarity_scores)[::-1][:top_n]

    return filtered_df.iloc[top_indices][[
        "job_title",
        "job_category",
        "shop_type",
        "shift_type",
        "working_days",
        "salary_per_day",
        "location_area"
    ]]

# Testing the logic uptil now

In [38]:

student_1 = {
    "skills": "billing cashier handling customer service",
    "preferred_shift": "evening",
    "preferred_working_days": "weekdays",
    "preferred_shop_type": "supermarket",
    "acceptable_shifts": ["evening", "evening+holiday"],
    "acceptable_working_days": ["weekdays"],
    "max_distance_km": 3
}
recommend_jobs(student_1, df_final, tfidf)



Unnamed: 0,job_title,job_category,shop_type,shift_type,working_days,salary_per_day,location_area
14671,billing,billing,petrol pump,evening,weekdays,300,local area
36619,administration clerk (billing),billing,parts supplier,evening+holiday,weekdays+holidays,350,local area
30780,billing analyst,billing,bakery,evening,weekdays,350,local area
30782,billing analyst,billing,clothing store,evening,weekdays,350,local area
11981,billing clerk cum admin assistant,billing,supermarket,evening+holiday,weekdays+holidays,350,local area


In [39]:

student_2 = {
    "skills": "serving food hospitality cleaning",
    "preferred_shift": "holiday",
    "preferred_working_days": "weekends",
    "preferred_shop_type": "bakery",
    "acceptable_shifts": ["holiday", "evening+holiday"],
    "acceptable_working_days": ["weekends", "weekdays+holidays"],
    "max_distance_km": 5
}
recommend_jobs(student_2, df_final, tfidf)


Unnamed: 0,job_title,job_category,shop_type,shift_type,working_days,salary_per_day,location_area
38373,serveyor assistant,serving,petrol pump,evening+holiday,weekends,400,local area
14786,head of anti-money laundering and counter fina...,serving,petrol pump,evening+holiday,weekends,450,local area
20036,counterfeit brand protection (listing agents) ...,serving,bakery,holiday,weekends,300,local area
22144,counter customer service executive (mandarin s...,serving,supermarket,holiday,weekends,450,local area
4871,counter clerk,serving,clothing store,evening+holiday,weekends,350,local area


In [40]:

student_3 = {
    "skills": "sales communication stock handling",
    "preferred_shift": "evening",
    "preferred_working_days": "weekdays+weekends",
    "preferred_shop_type": "clothing store",
    "acceptable_shifts": ["evening"],
    "acceptable_working_days": ["weekdays+weekends"],
    "max_distance_km": 8
}
recommend_jobs(student_3, df_final, tfidf)

Unnamed: 0,job_title,job_category,shop_type,shift_type,working_days,salary_per_day,location_area
18908,sales manager - proactive,sales,clothing store,evening,weekdays,450,local area
31741,sales man,sales,clothing store,holiday,weekends,450,local area
26850,sales executive,sales,clothing store,evening,weekdays,450,local area
26852,sales executive,sales,clothing store,evening,weekdays,450,local area
29700,sales executive,sales,clothing store,evening,weekdays,450,local area


In [41]:

student_4 = {
    "skills": "general helper packing assisting",
    "preferred_shift": "holiday",
    "preferred_working_days": "holidays",
    "preferred_shop_type": "retail shop",
    "acceptable_shifts": ["holiday", "evening+holiday"],
    "acceptable_working_days": ["weekends", "weekdays+holidays"],
    "max_distance_km": 10
}

recommend_jobs(student_4, df_final, tfidf)


Unnamed: 0,job_title,job_category,shop_type,shift_type,working_days,salary_per_day,location_area
46355,assistant general manager or assistant to gene...,retail assistant,retail shop,evening+holiday,weekends,300,local area
11288,accountant ii - general accounting,accounting,retail shop,evening+holiday,weekends,400,local area
18101,*account executive - general ledger (licc),accounting,retail shop,evening+holiday,weekends,450,local area
4732,assistant general clerk,retail assistant,retail shop,evening+holiday,weekends,350,local area
39196,senior general ledger accountant,accounting,retail shop,holiday,weekends,400,local area


In [42]:
len(tfidf.get_feature_names_out())

4444

In [43]:
similarity_scores[:10]

array([0.00749048, 0.00751184, 0.03224974, 0.03224974, 0.00803389,
       0.00801522, 0.01599659, 0.03611501, 0.00576007, 0.02477153])

In [44]:
import joblib
joblib.dump(tfidf, "tfidf_model.joblib")


['tfidf_model.joblib']

In [45]:
df_final.to_csv("final_jobs_dataset.csv", index=False)


# Conclusion

This project presents a hybrid-ready job recommendation system designed to support part-time employment opportunities for students. The system adopts an unsupervised, content-based recommendation approach to effectively handle cold-start scenarios where historical user interaction data is unavailable. Job postings are represented using TF-IDF feature extraction on curated textual attributes, and cosine similarity is employed to rank job relevance with respect to student preferences.

To enhance recommendation quality and realism, the system integrates trust and safety constraints, along with contextual filters such as working hours, working days, and geographic distance. Conditional relaxation strategies are applied to ensure recommendation availability without compromising relevance. Experimental validation using multiple student profiles demonstrates that the system produces personalized and context-aware job recommendations, with meaningful similarity scores and controlled vocabulary size.

Although the current implementation focuses on content-based recommendation, the architecture is designed to evolve into a hybrid and predictive system. By logging user interactions such as job views, saves, and applications, future iterations can incorporate collaborative filtering and supervised learning models to improve personalization and semantic understanding. Overall, the proposed approach offers a scalable, explainable, and practical foundation for a student-centric job recommendation platform, with clear pathways for future enhancement.
