<a href="https://colab.research.google.com/github/Valmik2004/Infosys-Springboard-Internship/blob/main/Day_11_Tasks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()   # select fake_job_postings.csv


Saving fake_job_postings.csv to fake_job_postings.csv


In [2]:
import pandas as pd
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import os

In [3]:
df = pd.read_csv('fake_job_postings.csv')

# Create clean_description if missing
def basic_clean(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

if 'clean_description' not in df.columns:
    df['clean_description'] = df['description'].fillna('').apply(basic_clean)

df = df[df['clean_description'].str.strip()!='']
df.shape


(17878, 19)

In [4]:
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['clean_description'])
y = df['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Save files
joblib.dump(model, 'fake_job_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("Saved model & vectorizer.")


Saved model & vectorizer.


In [5]:
!mkdir -p templates


In [6]:
%%writefile templates/index.html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Fake Job Detection</title>
<style>
    body { font-family: 'Segoe UI'; background: #f3f5f9; text-align: center; padding: 50px; }
    form { background: white; padding: 30px; border-radius: 12px; box-shadow: 0 0 10px rgba(0,0,0,0.1); width: 50%; margin: auto; }
    textarea { width: 90%; height: 120px; padding: 10px; border: 1px solid #ccc; border-radius: 8px; }
    button { padding: 10px 20px; border: none; border-radius: 8px; background-color: #28a745; color: white; cursor: pointer; }
    button:hover { background-color: #218838; }
    .error { color: red; margin-top: 10px; }
    .stats { margin-top: 30px; background: #fff; padding: 15px; border-radius: 10px; width: 50%; margin: auto; }
    .fake { color: red; font-weight: bold; }
    .real { color: green; font-weight: bold; }
    .link-button { margin-top: 20px; display: inline-block; background-color: #007bff; color: white; padding: 10px 15px; border-radius: 8px; text-decoration: none; }
    .link-button:hover { background-color: #0056b3; }
</style>
</head>
<body>
<h2>Fake Job Detection System</h2>

<form action="/predict" method="POST">
<label><b>Enter Job Description:</b></label><br><br>
<textarea name="job_description" placeholder="Paste job post here..."></textarea><br><br>
<button type="submit">Check Authenticity</button>
</form>

{% if error %}
<p class="error">{{ error }}</p>
{% endif %}

<div class="stats">
<h3>Prediction Summary</h3>
<p>ðŸ§© Fake Jobs Detected: <span class="fake">{{ fake }}</span></p>
<p>âœ… Real Jobs Detected: <span class="real">{{ real }}</span></p>
</div>

<a class="link-button" href="/history">ðŸ“œ View History</a>

</body>
</html>


Writing templates/index.html


In [7]:
%%writefile templates/result.html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Prediction Result</title>
<style>
    body { font-family: 'Segoe UI'; background: #f3f5f9; text-align: center; padding: 50px; }
    .card { background: white; padding: 30px; border-radius: 12px; box-shadow: 0 0 10px rgba(0,0,0,0.1); width: 50%; margin: auto; }
    progress { width: 80%; height: 20px; }
    .fake { color: red; font-weight: bold; }
    .real { color: green; font-weight: bold; }
    .stats { margin-top: 30px; background: #fff; padding: 15px; border-radius: 10px; }
    a { color: #007bff; text-decoration: none; }
    a:hover { text-decoration: underline; }
</style>
</head>
<body>

<div class="card">
<h2>Prediction Result</h2>

<p><b>Description:</b> {{ description }}</p>

<h3>Prediction:
{% if label == 'Fake Job' %}
<span class="fake">{{ label }}</span>
{% else %}
<span class="real">{{ label }}</span>
{% endif %}
</h3>

<p>Confidence: {{ confidence }}%</p>
<progress value="{{ confidence }}" max="100"></progress>

<br><br>
<a href="/">ðŸ”™ Back to Home</a>

<div class="stats">
<h3>Summary</h3>
<p>Fake Jobs: <span class="fake">{{ fake }}</span></p>
<p>Real Jobs: <span class="real">{{ real }}</span></p>
</div>

</div>
</body>
</html>


Writing templates/result.html


In [8]:
%%writefile templates/history.html
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Prediction History</title>
<style>
    body { font-family: 'Segoe UI'; background: #f3f5f9; text-align: center; padding: 40px; }
    table { width: 90%; margin: auto; border-collapse: collapse; background: white; }
    th, td { padding: 10px; border: 1px solid #ccc; text-align: center; }
    th { background: #007bff; color: white; }
    tr:nth-child(even) { background: #f2f2f2; }
    a { color: #007bff; text-decoration: none; }
    a:hover { text-decoration: underline; }
</style>
</head>
<body>
<h2>ðŸ“œ Prediction History</h2>
<div>
{{ tables|safe }}
</div>
<br>
<a href="/">ðŸ”™ Back to Home</a>
</body>
</html>


Writing templates/history.html


In [9]:
%%writefile app.py
from flask import Flask, render_template, request
import joblib, csv, os
from datetime import datetime
import pandas as pd

app = Flask(__name__)

# Load model + vectorizer
model = joblib.load('fake_job_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

fake_count = 0
real_count = 0

LOG_FILE = "predictions_log.csv"

# Create CSV if not exists
if not os.path.exists(LOG_FILE):
    with open(LOG_FILE, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(["timestamp", "job_description", "prediction", "confidence"])

@app.route('/')
def home():
    return render_template("index.html", fake=fake_count, real=real_count)

@app.route('/predict', methods=['POST'])
def predict():
    global fake_count, real_count

    job_desc = request.form['job_description'].strip()

    if len(job_desc.split()) < 5:
        return render_template("index.html",
                               error="Please enter at least 5 words.",
                               fake=fake_count, real=real_count)

    X_input = vectorizer.transform([job_desc])
    pred = model.predict(X_input)[0]
    prob = model.predict_proba(X_input)[0][1]

    label = "Fake Job" if pred == 1 else "Real Job"
    confidence = round(prob*100,2) if pred==1 else round((1-prob)*100,2)

    # Update counters
    if pred == 1: fake_count += 1
    else: real_count += 1

    # Log to CSV
    with open(LOG_FILE, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow([
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            job_desc, label, confidence
        ])

    return render_template("result.html",
                           label=label, confidence=confidence,
                           description=job_desc,
                           fake=fake_count, real=real_count)

@app.route('/history')
def history():
    df = pd.read_csv(LOG_FILE)
    return render_template("history.html",
                           tables=df.to_html(index=False))

if __name__ == "__main__":
    app.run()


Writing app.py


In [None]:
!pip install pyngrok flask pandas joblib --quiet

from pyngrok import ngrok
ngrok.set_auth_token("354A7wIzV8oo6I3t1xYLDyPtsBH_4UifAjbTkMFA1pG6hLhgT")

public_url = ngrok.connect(5000)
print("Public URL:", public_url)

!python app.py


Public URL: NgrokTunnel: "https://unbalanced-galen-holdable.ngrok-free.dev" -> "http://localhost:5000"
 * Serving Flask app 'app'
 * Debug mode: off
 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [13/Nov/2025 13:24:38] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [13/Nov/2025 13:24:39] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [13/Nov/2025 13:25:33] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [13/Nov/2025 13:25:39] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [13/Nov/2025 13:25:41] "GET /history HTTP/1.1" 200 -
127.0.0.1 - - [13/Nov/2025 13:25:49] "GET / HTTP/1.1" 200 -
