In [None]:
pip install transformers torch pandas scikit-learn


# Data Preprocessing 

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import ast

# Load your dataset
df = pd.read_csv('/Users/innovapathinc/Downloads/Agentic_AI_1st/Agentic_AI/Agentic_AI/job_classification_project/data/jobs.csv')

# Data Cleaning
def clean_text(text):
    # Convert to lowercase and remove extra spaces
    text = text.lower().replace('  ', ' ')
    return text

# Apply cleaning to relevant columns
df['Job Title'] = df['Job Title'].apply(clean_text)
df['Key Skills'] = df['Key Skills'].apply(clean_text)
df['Role Category'] = df['Role Category'].apply(clean_text)
df['Functional Area'] = df['Functional Area'].apply(clean_text)
df['Industry'] = df['Industry'].apply(clean_text)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text(text, tokenizer, max_length=512):
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

# Combine relevant columns into a single text field for tokenization
df['combined_text'] = df['Job Title'] + ' ' + df['Key Skills'] + ' ' + df['Role Category'] + ' ' + df['Functional Area'] + ' ' + df['Industry']

# Apply tokenization to the combined text
df['input_ids'] = df['combined_text'].apply(lambda x: tokenize_text(x, tokenizer)['input_ids'].flatten().tolist())
df['attention_mask'] = df['combined_text'].apply(lambda x: tokenize_text(x, tokenizer)['attention_mask'].flatten().tolist())

# Data Splitting
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Save the splits to CSV files
train_df.to_csv('train_data.csv', index=False)
valid_df.to_csv('valid_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

print("Data preprocessing complete.")


Data preprocessing complete.


In [5]:
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
import pandas as pd
import ast

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
class JobDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)


In [7]:
import pandas as pd
import ast
from sklearn.preprocessing import LabelEncoder

# Load your preprocessed data
train_df = pd.read_csv('train_data.csv')
valid_df = pd.read_csv('valid_data.csv')

# Ensure input_ids and attention_mask are lists of integers
train_df['input_ids'] = train_df['input_ids'].apply(ast.literal_eval)
train_df['attention_mask'] = train_df['attention_mask'].apply(ast.literal_eval)
valid_df['input_ids'] = valid_df['input_ids'].apply(ast.literal_eval)
valid_df['attention_mask'] = valid_df['attention_mask'].apply(ast.literal_eval)

# Combine labels from both datasets to ensure all labels are seen during fitting
all_labels = pd.concat([train_df['Role Category'], valid_df['Role Category']])

# Encode the 'Role Category' column to integers
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

train_df['Role Category'] = label_encoder.transform(train_df['Role Category'])
valid_df['Role Category'] = label_encoder.transform(valid_df['Role Category'])

# Prepare the data
train_encodings = {'input_ids': train_df['input_ids'].tolist(), 'attention_mask': train_df['attention_mask'].tolist()}
valid_encodings = {'input_ids': valid_df['input_ids'].tolist(), 'attention_mask': valid_df['attention_mask'].tolist()}

# Extract labels
train_labels = train_df['Role Category'].tolist()
valid_labels = valid_df['Role Category'].tolist()

train_dataset = JobDataset(train_encodings, train_labels)
valid_dataset = JobDataset(valid_encodings, valid_labels)


In [8]:
# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)


In [9]:
# Training setup
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

optim = AdamW(model.parameters(), lr=5e-5)




In [1]:
import pandas as pd
import ast
from sklearn.preprocessing import LabelEncoder
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import torch
from torch.utils.data import DataLoader, Dataset

# Define a custom dataset
class JobDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Load your preprocessed data
train_df = pd.read_csv('train_data.csv')
valid_df = pd.read_csv('valid_data.csv')

# Ensure input_ids and attention_mask are lists of integers
train_df['input_ids'] = train_df['input_ids'].apply(ast.literal_eval)
train_df['attention_mask'] = train_df['attention_mask'].apply(ast.literal_eval)
valid_df['input_ids'] = valid_df['input_ids'].apply(ast.literal_eval)
valid_df['attention_mask'] = valid_df['attention_mask'].apply(ast.literal_eval)

# Combine labels from both datasets to ensure all labels are seen during fitting
all_labels = pd.concat([train_df['Role Category'], valid_df['Role Category']])

# Encode the 'Role Category' column to integers
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

train_df['Role Category'] = label_encoder.transform(train_df['Role Category'])
valid_df['Role Category'] = label_encoder.transform(valid_df['Role Category'])

# Prepare the data
train_encodings = {'input_ids': train_df['input_ids'].tolist(), 'attention_mask': train_df['attention_mask'].tolist()}
valid_encodings = {'input_ids': valid_df['input_ids'].tolist(), 'attention_mask': valid_df['attention_mask'].tolist()}

# Extract labels
train_labels = train_df['Role Category'].tolist()
valid_labels = valid_df['Role Category'].tolist()

train_dataset = JobDataset(train_encodings, train_labels)
valid_dataset = JobDataset(valid_encodings, valid_labels)

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)

# Training setup
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

optim = AdamW(model.parameters(), lr=5e-5)

# Training loop
for epoch in range(3):  # Number of epochs
    model.train()
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

    # Validation
    model.eval()
    valid_preds, valid_labels = [], []
    with torch.no_grad():
        for batch in valid_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            valid_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            valid_labels.extend(labels.cpu().numpy())

    valid_accuracy = accuracy_score(valid_labels, valid_preds)
    print(f'Epoch {epoch + 1}, Validation Accuracy: {valid_accuracy}')

print("Training complete.")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'accuracy_score' is not defined