<a href="https://colab.research.google.com/github/Omkar-talla/-DEEP-LEARNING-BASED-VIDEO-LIGHT-IMAGE-ENHANCEMENT-FOR-IMPROVED-VISIBILITY-/blob/main/FakeJobCompanydetection_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install --upgrade transformers torch datasets scikit-learn requests streamlit -q


import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import requests
import streamlit as st


from google.colab import files
# Ask the user for the dataset through the Colab upload widget.
uploaded = files.upload()  # select fake_job_postings.csv

# Read the first uploaded file and blank out missing cells so the text
# columns can be concatenated below.
csv_name = next(iter(uploaded))
df = pd.read_csv(csv_name).fillna("")


# Join the free-text columns, in this order, into one space-separated string
# per posting.
text_columns = [
    "title",
    "location",
    "department",
    "company_profile",
    "description",
    "requirements",
    "benefits",
]
df["text"] = df[text_columns[0]]
for column in text_columns[1:]:
    df["text"] = df["text"] + " " + df[column]

# Model inputs are the combined text; targets are the "fraudulent" column.
X = df["text"].tolist()
y = df["fraudulent"].tolist()


# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


# Tokenize both splits with identical settings (truncate to 256 tokens, pad).
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
encode_options = dict(truncation=True, padding=True, max_length=256)
train_encodings = tokenizer(X_train, **encode_options)
test_encodings = tokenizer(X_test, **encode_options)


class JobDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence, and
    ``labels`` is a sequence with one label per example. Indexing returns a
    dict holding one tensor per encoding field plus a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split together with its labels.
train_dataset, test_dataset = (
    JobDataset(train_encodings, y_train),
    JobDataset(test_encodings, y_test),
)

# DistilBERT with a two-class sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# Training configuration, collected in one place before being handed over.
training_config = dict(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=50,
    save_steps=200,
    save_total_limit=1,
    # Kept False to avoid the evaluation_strategy error.
    load_best_model_at_end=False,
)
training_args = TrainingArguments(**training_config)


from sklearn.metrics import accuracy_score
def compute_metrics(pred):
    """Return the accuracy of a prediction object as ``{"accuracy": value}``.

    ``pred`` must expose ``label_ids`` (the true labels) and ``predictions``
    (per-class scores); the predicted class is the arg-max over the last axis.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}


# Bundle the model, training arguments, both splits and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)


# Fine-tune on the training split.
trainer.train()

# No evaluation schedule is set in training_args, so without this explicit
# call the held-out split and compute_metrics would never be used and the
# notebook would report training loss only.
eval_metrics = trainer.evaluate()
print(eval_metrics)



# Company verification using OpenCorporates API
def verify_company(company_name):
    """Check whether OpenCorporates returns at least one match for a company name.

    Args:
        company_name: Free-text company name to search for.

    Returns:
        bool: True when the search response lists at least one company.
        False for a blank name, on any network or HTTP failure, or when the
        response lacks the expected ``results -> companies`` structure.
    """
    # A blank query cannot identify a company, so skip the network round trip.
    if not company_name or not str(company_name).strip():
        return False
    try:
        response = requests.get(
            "https://api.opencorporates.com/v0.4/companies/search",
            # Passing the name via params lets requests URL-encode it, so
            # characters such as '&' or '#' cannot corrupt the query string.
            params={"q": company_name},
            # Without a timeout a stalled connection would block forever.
            timeout=5,
        )
        response.raise_for_status()
        return bool(response.json()['results']['companies'])
    except (requests.RequestException, KeyError, TypeError, ValueError):
        # Best-effort check: any failure is reported as "not verified".
        return False

# URL verification
def verify_url(url):
    """Report whether a GET request to ``url`` answers with HTTP 200.

    Args:
        url: Address of the job posting to probe.

    Returns:
        bool: True only when the request completes within 5 seconds and the
        final status code is 200. False for an empty URL, a malformed URL,
        or any connection/timeout error.
    """
    # Nothing to fetch; answer directly instead of letting requests fail.
    if not url:
        return False
    try:
        r = requests.get(url, timeout=5)
    except (requests.RequestException, ValueError):
        # Catch only request/URL errors so KeyboardInterrupt and genuine
        # programming errors are no longer silently swallowed.
        return False
    return r.status_code == 200

# Final prediction combining text + company + URL
def final_prediction(job_text, company_name="", url=""):
    """Label a posting "Fake" or "Real" by combining three signals.

    The signals are: the classifier predicting class 1 (the ``fraudulent``
    label) for the text, the company name failing ``verify_company``, and the
    URL failing ``verify_url``. The posting is "Fake" when at least two of
    the three fire.

    Args:
        job_text: Job description text to classify.
        company_name: Company name to look up; it is checked even when empty.
        url: Posting URL; when empty the URL check is skipped and counts as
            valid.

    Returns:
        str: "Fake" or "Real".
    """
    # The model may still be in training mode after trainer.train(); eval()
    # disables dropout so repeated calls give the same answer.
    model.eval()

    encoding = tokenizer(job_text, truncation=True, padding=True, max_length=256, return_tensors="pt")
    # Inputs must live on the same device as the model weights.
    encoding = {key: val.to(model.device) for key, val in encoding.items()}

    # Gradients are not needed for inference.
    with torch.no_grad():
        outputs = model(**encoding)
    # Softmax is monotonic, so the arg-max of the logits is the predicted class.
    text_pred = torch.argmax(outputs.logits, dim=-1).item()

    company_exists = verify_company(company_name)
    url_valid = verify_url(url) if url else True

    # Each failed check contributes one point.
    fake_score = sum((text_pred == 1, not company_exists, not url_valid))

    return "Fake" if fake_score >= 2 else "Real"


# Streamlit app: collect the posting details and display the combined prediction

# Page heading.
st.title("AI-Powered Fake Job Posting Detector")

# Input widgets: posting text, company name, and an optional posting URL.
job_text = st.text_area("Paste Job Description Text Here")
company_name = st.text_input("Enter Company Name")
job_url = st.text_input("Enter Job Posting URL (optional)")

# On click, either run the combined prediction or ask for the missing text.
predict_clicked = st.button("Predict")
if predict_clicked and job_text:
    prediction = final_prediction(job_text, company_name, job_url)
    st.success(f"Prediction: {prediction}")
elif predict_clicked:
    st.warning("Please enter the job description text!")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m87.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does n

Saving fake_job_postings.csv to fake_job_postings.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33momkartalla88[0m ([33momkartalla88-na[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.2357
100,0.144
150,0.1507
200,0.1304
250,0.0876
300,0.1024
350,0.1396
400,0.0913
450,0.0848
500,0.109


2025-10-06 09:52:04.936 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-10-06 09:52:04.950 Session state does not function when running a script without `streamlit run`


In [None]:
# Write the fine-tuned model first, then the tokenizer, into the same folder
# so both can be reloaded together.
for artifact in (model, tokenizer):
    artifact.save_pretrained("fake_job_model")
print("Model saved!")


Model saved!


In [None]:
def final_prediction(job_text, company_name="", url=""):
    """Label a posting "Fake" or "Real" from the classifier and two external checks.

    One point is scored for each of: the classifier predicting class 1 (the
    ``fraudulent`` label), ``verify_company`` failing for the company name,
    and ``verify_url`` failing for the URL. A single point is enough to
    return "Fake", so a posting is "Real" only when all three checks pass.

    Args:
        job_text: Job description text to classify.
        company_name: Company name to look up; it is checked even when empty.
        url: Posting URL; when empty the URL check is skipped and counts as
            valid.

    Returns:
        str: "Fake" or "Real".
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # make sure model is on the correct device
    # The model may still be in training mode after trainer.train(); eval()
    # disables dropout so the same text always gets the same prediction.
    model.eval()

    # Tokenize and move input to same device
    encoding = tokenizer(
        job_text, truncation=True, padding=True, max_length=256, return_tensors="pt"
    )
    encoding = {key: val.to(device) for key, val in encoding.items()}

    # Model prediction; gradients are not needed for inference.
    with torch.no_grad():
        outputs = model(**encoding)
    # Softmax is monotonic, so the arg-max of the logits is the predicted class.
    text_pred = torch.argmax(outputs.logits, dim=-1).item()

    # External checks
    company_exists = verify_company(company_name)
    url_valid = verify_url(url) if url else True

    # Combine results: each failed check contributes one point.
    fake_score = sum((text_pred == 1, not company_exists, not url_valid))

    return "Fake" if fake_score >= 1 else "Real"




In [None]:
# Example posting intended to be flagged as fake.
job_text = "We are hiring an AI wizard with 50 years of experience. Unlimited salary and magical benefits!"
company_name = "FakeTechXYZ123"
job_url = "https://www.thisisnotarealwebsite123.com/job"

# Score the posting with the combined predictor and show the label.
prediction = final_prediction(job_text, company_name=company_name, url=job_url)
print("Prediction:", prediction)


Prediction: Fake


In [None]:
# Reworded variant of the implausible posting, same company and URL.
job_text = "Hiring an AI genius with 50 years of experience. Unlimited salary and magical benefits!"
company_name = "FakeTechXYZ123"
job_url = "https://www.thisisnotarealwebsite123.com/job"

# Score the posting and show the label.
prediction = final_prediction(job_text, company_name=company_name, url=job_url)
print("Prediction:", prediction)


Prediction: Fake


In [None]:
# Ordinary-looking posting paired with a made-up company name and a URL
# that exists.
job_text = "Hiring a frontend developer with 2 years of React experience. Standard salary and benefits."
company_name = "NonExistentTechCo"
job_url = "https://www.github.com/"

# Score the posting and show the label.
prediction = final_prediction(job_text, company_name=company_name, url=job_url)
print("Prediction:", prediction)


Prediction: Fake


In [None]:
# Work-from-home style posting used as another example input.
job_text = "Earn $5000 per week working from home. No experience required. Just sign up today!"
company_name = "WorkFromHome Money Inc."
job_url = "http://shadyjobs123.biz"

# Score the posting and show the label.
prediction = final_prediction(job_text, company_name=company_name, url=job_url)
print("Prediction:", prediction)


Prediction: Fake


In [None]:
from google.colab import files
import shutil

# Zip the saved model folder
# Pack the saved model directory into fake_job_model.zip.
shutil.make_archive(
    base_name="fake_job_model",
    format='zip',
    root_dir="fake_job_model",
)

# Hand the archive to the browser as a download.
files.download("fake_job_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>