In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import torch
from datasets import Dataset


In [None]:

df = pd.read_csv('upd_labelled_data.csv')

print(f"printing the top 5 rows of the dataset")
print(df.head())

# Prepare dataset by combining user profile and job description, and selecting necessary columns
# Add markers to differentiate user profile and job description
df['input_text'] = "[USER] " + df['user'] + " [JOB] " + df['job']
df = df[['input_text', 'label']]

In [None]:
# Load the model and tokenizer
model_name_or_path = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path).to("cuda" if torch.cuda.is_available() else "cpu")

print(f"model loaded successfully")

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)
print(f"Dataset conversion is done ")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Tokenize the dataset
def preprocess_data(example):
    return tokenizer(example['input_text'], truncation=True, padding='max_length', max_length=128)

tokenized_dataset = dataset.map(preprocess_data, batched=True)

# Split dataset into train and validation sets
train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
validation_dataset = train_test_split['test']

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # You can increase this if you want better results
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=100,  # Log every 100 steps for better tracking
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    greater_is_better=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

# Fine-tune the model
trainer.train()

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111263971111119, max=1.0)…

Epoch,Training Loss,Validation Loss
1,0.0116,0.001874
2,0.0001,3.8e-05
3,0.0,2.6e-05


TrainOutput(global_step=2532, training_loss=0.015911485617702656, metrics={'train_runtime': 203.6489, 'train_samples_per_second': 99.436, 'train_steps_per_second': 12.433, 'total_flos': 670616205696000.0, 'train_loss': 0.015911485617702656, 'epoch': 3.0})

In [10]:

# Function to recommend jobs based on user profile and list of job descriptions
def recommend_jobs(user_profile, job_listings, top_n=5):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    user_profile_encoded = tokenizer("[USER] " + user_profile, return_tensors="pt", padding=True, truncation=True).to(device)
    scores = []
    
    for job in job_listings:
        job_encoded = tokenizer("[JOB] " + job, return_tensors="pt", padding=True, truncation=True).to(device)
        
        # Concatenate user and job encoding for input
        inputs = {
            "input_ids": torch.cat((user_profile_encoded['input_ids'], job_encoded['input_ids']), dim=1),
            "attention_mask": torch.cat((user_profile_encoded['attention_mask'], job_encoded['attention_mask']), dim=1),
        }
        
        with torch.no_grad():
            output = model(**inputs)
            score = output.logits[0][1].item()  # Confidence score for label '1' (recommended)
            scores.append((job, score))
    
    # Sort and return the top N jobs based on score
    print(scores)
    recommended_jobs = sorted(scores, key=lambda x: x[1], reverse=True)[:top_n]
    return [job for job, score in recommended_jobs]

# Example user profile and job listings
user_profile = "Data Scientist with skills in Python, SQL, Machine Learning"
job_listings = [
    "Software Developer role with Python and JavaScript",
    "Data Scientist with SQL, Machine Learning",
    "Backend Developer with Flask and SQL","Marketing Exe","content creator","java developer","Java","software engineer","salesforce developer"
]

# Get job recommendations
recommendations = recommend_jobs(user_profile, job_listings)
print("Recommended Jobs:", recommendations)

[('Software Developer role with Python and JavaScript', -3.746919870376587), ('Data Scientist with SQL, Machine Learning', 4.756860256195068), ('Backend Developer with Flask and SQL', -4.245437145233154), ('Marketing Exe', -4.539279937744141), ('content creator', -3.812018632888794), ('java developer', -3.9749536514282227), ('Java', -3.139751672744751), ('software engineer', -1.6429455280303955), ('salesforce developer', -4.20063591003418)]
Recommended Jobs: ['Data Scientist with SQL, Machine Learning', 'software engineer', 'Java', 'Software Developer role with Python and JavaScript', 'content creator']


In [11]:
test_df=pd.read_csv('/kaggle/input/jobs-dataset/jobs_data.csv')

In [14]:
# Combine relevant columns into a single text representation for each job
test_df['combined_text'] = test_df.apply(lambda row: f"{row['job_title']} {row['skills']}", axis=1)

In [None]:


# Load tokenizer and fine-tuned model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # Replace with your model's tokenizer
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained('/kaggle/working/results/checkpoint-2532')  # Path to fine-tuned model

# Load the CSV file with job listings
job_listings_df = pd.read_csv("/kaggle/input/jobs-dataset/jobs_data.csv")  # Path to your jobs CSV file

# Combine job details (role, skills, description) for each job
job_listings = [
    f"Role:{row['job_title']}, Skills: {row['skills']}"
    for _, row in job_listings_df.iterrows()
]

# Function to recommend jobs using the fine-tuned model
def recommend_jobs(user_profile, job_listings, top_n=5):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    fine_tuned_model.to(device)

    # Encode the user profile
    user_profile_encoded = tokenizer("[USER] " + user_profile, return_tensors="pt", padding=True, truncation=True).to(device)
    scores = []

    # Iterate over each job listing with its index
    for idx, job in enumerate(job_listings):
        # Encode each job listing
        job_encoded = tokenizer("[JOB] " + job, return_tensors="pt", padding=True, truncation=True).to(device)
        
        # Concatenate the user and job input encodings
        inputs = {
            "input_ids": torch.cat((user_profile_encoded['input_ids'], job_encoded['input_ids']), dim=1),
            "attention_mask": torch.cat((user_profile_encoded['attention_mask'], job_encoded['attention_mask']), dim=1),
        }
        
        # Predict with the fine-tuned model
        with torch.no_grad():
            output = fine_tuned_model(**inputs)
            score = output.logits[0][1].item()  # Confidence score for label '1' (recommended)
            scores.append((idx, job, score))  # Store index, job, and score
    
    # Sort jobs by score and select the top N recommendations
    recommended_jobs = sorted(scores, key=lambda x: x[2], reverse=True)[:top_n]
    print(recommended_jobs)
    
    return [(idx, job) for idx, job, score in recommended_jobs]  # Return index and job details

# Define a sample user profile for testing
user_profile = "Java Developer, with backend"

# Get the top 5 job recommendations
recommended_jobs = recommend_jobs(user_profile, job_listings, top_n=5)

# Create a DataFrame of the results using the original job_listings_df
result_df = job_listings_df.loc[[idx for idx, _ in recommended_jobs]].copy()
result_df["Combined Details"] = [job for _, job in recommended_jobs]

# Display recommended jobs DataFrame
print(result_df)




[(366, 'Role:Software Development Engineer (SDE-1) / Full Stack Developer, Skills: Full Stack Developer,Software Development Engineer,Development,Full Stack,Software,Stack,Software development,Software engineering', 5.273569583892822), (2497, 'Role:Python Backend Developer (AWS Serverless), Skills: Django,Rest Api Development,Aws Serverless Architecture,Python Development,Microservices,Flask,Python,Backend', 5.254361629486084), (2325, 'Role:Java Developer, Skills: Multithreading,Banking Sector,Memory Management,Collections,Core Java Development,Design Patterns,OOPS,Java Development', 5.237627983093262), (2075, 'Role:Hiring For Java Developer For Pune Location, Skills: Java Development,J2Ee,Spring Boot,J2Ee Development,Java Programming,Java Coding,Core Java Development,Spring Batch', 5.235193729400635), (2859, 'Role:Java Full Stack Developer, Skills: Java Fullstack,Java Spring Boot,Java Development,Core Java Development,Core Java Programming,Spring Microservices,java Fullstack Developer

In [19]:
import shutil

# Zip the folder
shutil.make_archive('checkpoint-1688', 'zip', '/kaggle/working/results/checkpoint-1688')


'/kaggle/working/checkpoint-1688.zip'