In [1]:
!pip install transformers datasets evaluate transformers[torch]

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Interactive Hugging Face Hub login (renders a login widget in the notebook).
# Presumably required for the later trainer.push_to_hub() call — confirm.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle

from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate

# CSV with the job postings; read by prepare_dataset().
# NOTE(review): this is a relative path, so it only points inside the Drive
# mount ('/content/drive') when the working directory is /content — confirm.
data_path = "drive/MyDrive/diploma/jobs-main-data.csv"
# Upper bound on tokens per example; preprocess_function() truncates to it.
max_length = 512

class ModelNames:
    """Identifiers of the pretrained checkpoints this notebook can fine-tune."""

    # Full-size BERT checkpoint.
    bertBaseUncased = "bert-base-uncased"
    # Distilled variant.
    distilbertBaseUncased = "distilbert-base-uncased"

# Checkpoint used for the tokenizer, the model weights and the Trainer's output_dir.
# NOTE(review): the outputs saved in this notebook mention bert-base-uncased, so
# they appear to come from a run with ModelNames.bertBaseUncased — re-run to refresh.
model_name = ModelNames.distilbertBaseUncased

class ModelColumns:
    """Column-name constants for the jobs data frame.

    Presumably these mirror columns of the jobs CSV (verify against the file);
    `target` is the label column that prepare_dataset() creates.
    """

    # Pricing / payment terms
    budget = "budget"
    hourlyRangeMin = "hourlyRangeMin"
    isHourlyPayment = "isHourlyPayment"

    # Posting attributes
    country = "country"
    category = "category"
    workload = "workload"
    duration = "duration"
    skills = "skills"

    # Client history and payment verification
    clientTotalCharge = "clientTotalCharge"
    clientTotalJobsPosted = "clientTotalJobsPosted"
    clientFeedbackScore = "clientFeedbackScore"
    clientPastHires = "clientPastHires"
    isPaymentMethodVerified = "isPaymentMethodVerified"

    # Binary label
    target = "target"


# Class index <-> label name mappings, passed to the model when it is loaded.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

# Module-level singletons: the tokenizer is used by preprocess_function() and
# the Trainer, the accuracy metric by compute_metrics().
tokenizer = AutoTokenizer.from_pretrained(model_name)
accuracy = evaluate.load("accuracy")

def _build_text_dataset(features, labels, limit):
    """Turn title/description rows plus labels into a tokenized Dataset.

    Args:
        features: DataFrame with "title" and "description" columns.
        labels: label values, positionally aligned with `features`.
        limit: keep at most this many leading rows; None keeps every row.
    """
    frame = pd.DataFrame(
        {
            # A missing title/description would make the whole text NaN and
            # crash the tokenizer, so treat it as an empty string instead.
            "text": features["title"].fillna("")
            + "\n"
            + features["description"].fillna("")
        }
    )
    # Assign by position: undersampling with replacement can repeat index
    # labels, which makes index-aligned assignment fragile.
    frame["label"] = np.asarray(labels)

    dataset = Dataset.from_pandas(frame)
    if limit is not None:
        # Subset *before* tokenizing so only the kept rows are processed;
        # min() avoids an out-of-range selection on small datasets.
        dataset = dataset.select(range(min(limit, len(dataset))))
    return dataset.map(preprocess_function, batched=True)


def prepare_dataset(train_limit=64, test_limit=32):
    """Load the jobs CSV and build tokenized train/test datasets.

    Pipeline:
      1. Read `data_path` and drop jobs whose status is "prelead" or
         "in-progress".
      2. Derive the binary target: 0 for status "trashed", 1 otherwise.
      3. Drop bookkeeping columns and rows without country or duration.
      4. Relabel as 0 the positives that have an unverified payment method,
         or that are non-hourly with a budget below 5000.
      5. Split into train/test, undersample the training split and shuffle it.
      6. Join title and description into "text" and tokenize.

    Args:
        train_limit: maximum number of training rows to keep (leading rows of
            the shuffled, undersampled split). None keeps all rows. The
            default of 64 preserves the previously hard-coded subset size.
        test_limit: same for the test split; default 32.

    Returns:
        Tuple (train, test) of datasets.Dataset objects holding "text",
        "label" and the tokenizer outputs.
    """
    df = pd.read_csv(data_path)

    # .copy() so the column assignments below act on an independent frame
    # rather than on a filtered view of the original.
    df = df[~df["status"].isin(["prelead", "in-progress"])].copy()

    # Vectorized equivalent of the row-wise rule "trashed -> 0, else -> 1".
    df[ModelColumns.target] = (df["status"] != "trashed").astype(int)

    df = df.drop(
        columns=[
            "id",
            "uid",
            "score",
            "createdAt",
            "status",
            "postedAt",
            "query",
        ]
    )
    df = df.dropna(subset=[ModelColumns.country, ModelColumns.duration])

    # Jobs that were taken but should not count as positives: either the
    # payment method is not verified, or it is a non-hourly job with a budget
    # below 5000.
    taken = df[ModelColumns.target] == 1
    unverified = df[ModelColumns.isPaymentMethodVerified] == 0
    low_budget_project = (df[ModelColumns.budget] < 5000) & (
        df[ModelColumns.isHourlyPayment] == 0
    )
    df.loc[taken & (unverified | low_budget_project), ModelColumns.target] = 0

    X = df[["title", "description"]]
    y = df[ModelColumns.target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Balance the classes in the training split only; the test split keeps
    # its original class distribution.
    rus = RandomUnderSampler(random_state=42, replacement=True)
    X_train, y_train = rus.fit_resample(X_train, y_train)
    X_train, y_train = shuffle(X_train, y_train, random_state=42)

    train = _build_text_dataset(X_train, y_train, train_limit)
    test = _build_text_dataset(X_test, y_test, test_limit)

    return train, test

def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Texts longer than `max_length` tokens are truncated. Returns the
    tokenizer's output unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=max_length)
    return encoded

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: pair of (model outputs, reference labels); the predicted
            class is the argmax of the outputs along axis 1.

    Returns:
        The result of `accuracy.compute` for the predicted classes.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)


# Tokenized splits; prepare_dataset() returns small subsets here
# (64 training rows, 32 evaluation rows).
train, test = prepare_dataset()

# Pretrained checkpoint with a 2-class sequence-classification head and the
# NEGATIVE/POSITIVE label mappings.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    # NOTE(review): output_dir reuses the checkpoint id, so a local directory
    # with that name is created. Later from_pretrained()/pipeline() calls given
    # `model_name` may then resolve to this directory instead of the Hub
    # checkpoint — confirm this is intended before re-running.
    output_dir=model_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; both strategies match so the
    # best checkpoint can be reloaded when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # No automatic upload during training; pushing is done explicitly later.
    push_to_hub=False,
)

# Pads each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Map:   0%|          | 0/5014 [00:00<?, ? examples/s]

Map:   0%|          | 0/3388 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.650505,0.75
2,No log,0.672681,0.65625


TrainOutput(global_step=8, training_loss=0.7038155794143677, metrics={'train_runtime': 30.4605, 'train_samples_per_second': 4.202, 'train_steps_per_second': 0.263, 'total_flos': 31959771005760.0, 'train_loss': 0.7038155794143677, 'epoch': 2.0})

In [8]:
# Upload the trained model and training artifacts to the Hugging Face Hub.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

events.out.tfevents.1714039480.922a85d4eb90.191.0:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/DenysZakharkevych/bert-base-uncased/commit/239d5e108c80e91dde23bbe0826cef33d30774d0', commit_message='End of training', commit_description='', oid='239d5e108c80e91dde23bbe0826cef33d30774d0', pr_url=None, pr_revision=None, pr_num=None)

In [9]:
from transformers import pipeline

# Run inference with the fine-tuned model held by the trainer. Passing the
# string `model_name` only reaches the fine-tuned weights if a local directory
# of that name (the Trainer's output_dir) exists; otherwise the base checkpoint
# with an untrained classification head would be loaded.
classifier = pipeline(
    "sentiment-analysis",
    model=trainer.model,
    tokenizer=tokenizer,
    # Keep the pipeline's inputs on the same device as the trained model.
    device=trainer.model.device,
)
text = train[1]['text']
# Truncate exactly as during training so long postings do not exceed the
# model's maximum sequence length.
classifier(text, truncation=True, max_length=max_length)

[{'label': 'NEGATIVE', 'score': 0.5195567607879639}]