In [None]:
# Step 1: Install required libraries
!pip install -q pandas

# Step 2: Import libraries
import pandas as pd
import sqlite3
from google.colab import files

# Step 3: Upload CSV file
print("Please upload your job_description.csv file")
uploaded = files.upload()

# Step 4: Load CSV with safe encoding
# Read the file that was actually uploaded instead of assuming a fixed name:
# an upload saved under any other name would otherwise be ignored (a stale
# copy on disk is read) or make the load fail. When nothing was uploaded,
# fall back to the default name so a copy already on disk can still be used.
# If several files were uploaded and none has the default name, the first
# one is taken.
default_csv_name = "job_description.csv"
if default_csv_name in uploaded or not uploaded:
    csv_path = default_csv_name
else:
    csv_path = next(iter(uploaded))
df = pd.read_csv(csv_path, encoding='ISO-8859-1')
print("✅ File loaded successfully!\n")

# Step 5: View the top 5 rows
print("📌 Preview of the data:")
print(df.head())

# Step 6: Display column names
print("\n📌 Column names:")
print(df.columns)

# Step 7: Save to SQLite database
# try/finally guarantees the connection is released even if the write fails,
# so a failed run does not leave the database file held open by the kernel.
conn = sqlite3.connect("job_data.db")
try:
    df.to_sql("job_descriptions", conn, if_exists="replace", index=False)
finally:
    conn.close()

print("\n✅ Data successfully saved to SQLite database named 'job_data.db'")


In [None]:
# Step 8: Clean job descriptions
# Drop every "description:" label (matched case-insensitively, together with
# the whitespace after it), then turn newlines into single spaces.
df['Job Description'] = (
    df['Job Description']
    .str.replace(r"(?i)description:\s*", "", regex=True)
    .str.replace(r"\n", " ", regex=True)
)

# Display cleaned version
print("✅ Cleaned job descriptions:")
print(df['Job Description'].head())


In [None]:
# Step 9: Install transformers and summarization model
!pip install -q transformers sentencepiece

from transformers import pipeline

# Load summarizer (Flan-T5); model and tokenizer come from the same checkpoint
summarizer = pipeline(
    "summarization",
    model="google/flan-t5-small",
    tokenizer="google/flan-t5-small",
)

# Summarize each JD (truncated if too long for model)
def summarize_text(text):
    """Return a short summary of ``text`` or the string "Summary failed".

    Only the first 512 characters of ``text`` are passed to the module-level
    ``summarizer``; the summary is taken from the ``'summary_text'`` entry of
    its first result.

    Args:
        text: The job description to summarize. Values that are not strings
            (for example missing cells) or that contain only whitespace are
            not sent to the model.

    Returns:
        The generated summary, or "Summary failed" when the input is unusable
        or the summarizer raises.
    """
    # Missing or blank cells cannot be summarized; answer directly instead of
    # relying on the model call to blow up.
    if not isinstance(text, str) or not text.strip():
        return "Summary failed"
    try:
        return summarizer(text[:512], max_length=60, min_length=20, do_sample=False)[0]['summary_text']
    except Exception:
        # Catch Exception rather than everything: a bare except would also
        # swallow KeyboardInterrupt, making a long run impossible to stop.
        return "Summary failed"

# Run the summarizer once per job description and keep the result in a new column
df['Summary'] = df['Job Description'].map(summarize_text)

# Show summaries
print("✅ Summarized JDs:")
print(df.loc[:, ['Job Title', 'Summary']].head())


In [None]:
# Save updated DataFrame (with summaries) into database
# The table is replaced wholesale. try/finally releases the connection even
# when the write raises, so a failure does not leave job_data.db held open.
conn = sqlite3.connect("job_data.db")
try:
    df.to_sql("job_descriptions", conn, if_exists="replace", index=False)
finally:
    conn.close()

print("\n✅ Summaries saved to SQLite database.")


In [None]:
from google.colab import files

# Prompt for the candidate CVs. The result rebinds `uploaded`, replacing the
# value stored by the CSV upload step; the extraction cell later iterates over
# it and treats each item as a file name to open.
print("📂 Upload your CV PDFs (you can select multiple files)")
uploaded = files.upload()


In [None]:
!pip install -q PyMuPDF
import fitz  # PyMuPDF

def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    The document is opened with ``fitz.open`` and the per-page results of
    ``get_text()`` are concatenated in page order with no separator.
    """
    with fitz.open(file_path) as pdf:
        page_texts = [pdf_page.get_text() for pdf_page in pdf]
    return "".join(page_texts)

# Create a dictionary with filenames and their extracted text
cv_texts = {filename: extract_text_from_pdf(filename) for filename in uploaded}

# Stop with an actionable message when nothing was uploaded; otherwise the
# preview below would fail with an unexplained lookup error.
if not cv_texts:
    raise ValueError("No CV files were uploaded. Re-run the upload cell and select at least one PDF.")

# Preview one CV's extracted text
# first_cv is a (filename, text) pair for the first uploaded CV.
first_cv = next(iter(cv_texts.items()))
print(f"\n📄 Extracted text from {first_cv[0]}:\n")
print(first_cv[1][:1000])  # Print only first 1000 characters


In [None]:
import re

def extract_cv_sections(cv_text, min_skill_length=3):
    """Pull a handful of labelled fields out of a CV's plain text using regexes.

    Args:
        cv_text: Full text of one CV. ``None`` or an empty string yields the
            empty result below.
        min_skill_length: Minimum number of characters a skill entry must
            have (after stripping) to be kept. The default of 3 keeps the
            original filter; pass 1 to keep short entries such as "R".

    Returns:
        A dict with these keys:
            "name": text following the first "Name:" label, or None.
            "email": first token containing "@" after an "Email:" label, or None.
            "education": strings of the form "<degree text> (YYYY-YYYY)" for
                every span starting with "Bachelor" or "Master" that is
                followed by a parenthesised year range.
            "experience": strings of the form "<title> (YYYY-YYYY)", pairing
                the i-th matched job title with the i-th matched year range.
            "skills": entries found between "Skills" and "Certifications"
                (or the end of the text).
            "certifications": non-blank lines after "Certifications".
    """
    sections = {
        "name": None,
        "email": None,
        "education": [],
        "experience": [],
        "skills": [],
        "certifications": []
    }

    # Nothing to parse (e.g. a PDF that produced no text).
    if not cv_text:
        return sections

    # Extract name
    name_match = re.search(r"Name:\s*(.+)", cv_text)
    if name_match:
        sections["name"] = name_match.group(1).strip()

    # Extract email
    email_match = re.search(r"Email:\s*(\S+@\S+)", cv_text)
    if email_match:
        sections["email"] = email_match.group(1).strip()

    # Extract education
    # Capture the whole degree text, not just the word "Bachelor"/"Master",
    # so the field of study is kept; format it like the experience entries.
    degree_matches = re.findall(r"((?:Bachelor|Master).+?)\s*\((\d{4}-\d{4})\)", cv_text)
    sections["education"] = [f"{degree.strip()} ({years})" for degree, years in degree_matches]

    # Extract work experience
    # Year ranges come from "... at <employer> (YYYY-YYYY)"; titles are two-word
    # phrases at the start of a line ending in one of the listed roles. They
    # are paired purely by position, and zip stops at the shorter list.
    year_ranges = re.findall(r"(?<=at\s).+?\((\d{4}-\d{4})\)", cv_text)
    job_titles = re.findall(r"(?<=\n)[A-Z][a-z]+\s(?:Manager|Engineer|Analyst|Developer)", cv_text)
    sections["experience"] = [f"{title} ({years})" for title, years in zip(job_titles, year_ranges)]

    # Extract skills
    skill_match = re.search(r"Skills\s*(.*?)(?=Certifications|$)", cv_text, re.S)
    if skill_match:
        skills_block = skill_match.group(1)
        # Split on "•" anywhere, but on "-" only when it acts as a list marker
        # (at the start of the block or right after whitespace). A hyphen
        # inside a word, as in "Scikit-learn", must not cut the skill in two.
        fragments = re.split(r"•|(?:^|(?<=\s))-", skills_block)
        stripped = (fragment.strip() for fragment in fragments)
        sections["skills"] = [skill for skill in stripped if len(skill) >= min_skill_length]

    # Extract certifications
    # Everything after the first "Certifications" heading, one entry per non-blank line.
    cert_match = re.search(r"Certifications\s*(.*)", cv_text, re.S)
    if cert_match:
        certifications_block = cert_match.group(1)
        sections["certifications"] = [line.strip() for line in certifications_block.splitlines() if line.strip()]

    return sections

# Parse the previewed CV's text and list every extracted field
cv_data = extract_cv_sections(first_cv[1])
for field_name, field_value in cv_data.items():
    print(f"🔹 {field_name.capitalize()}: {field_value}")


In [None]:
!pip install -q sentence-transformers
from sentence_transformers import SentenceTransformer, util

# Load lightweight but powerful model
# `model` is the sentence-embedding model the later cells call `encode` on
# to embed both the candidate profile and the job description.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')


In [None]:
# Combine candidate sections
# Education, experience, skills and certifications (in that order) are joined
# into one space-separated string; name and email are left out.
candidate_profile = " ".join(
    entry
    for section in ('education', 'experience', 'skills', 'certifications')
    for entry in cv_data[section]
)

# Pick a sample job description from the DataFrame
jd_text = df['Job Description'].iat[0]  # You can loop through all later

# Generate embeddings
jd_embedding = model.encode(jd_text, convert_to_tensor=True)
cv_embedding = model.encode(candidate_profile, convert_to_tensor=True)

# Compute similarity
similarity_score = util.pytorch_cos_sim(cv_embedding, jd_embedding).item()
print(f"\n📊 Semantic Similarity Score with JD 1: {similarity_score:.4f}")


In [None]:
# Shortlist when the similarity score reaches 0.8; otherwise report a miss.
print(
    "✅ Candidate shortlisted for interview!"
    if similarity_score >= 0.8
    else "❌ Candidate does not meet the threshold."
)


In [None]:
import sqlite3
import pandas as pd
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Load JD DataFrame (already loaded)
# One embedding per job description, kept in memory as the array that
# model.encode returns.
df['embedding'] = df['Job Description'].apply(lambda x: model.encode(x))

# Save JD embeddings to SQLite
# Array objects are not among the value types sqlite3 documents as storable,
# so write an explicit byte string for each embedding instead of handing the
# array to the driver. Only the copy that is written is converted; the
# in-memory column keeps the arrays.
# NOTE(review): to restore a vector, read the bytes back with the dtype that
# model.encode produced (presumably float32 — confirm).
jd_table = df.assign(embedding=df['embedding'].map(lambda vector: vector.tobytes()))

# try/finally releases the connection even when the write raises.
conn = sqlite3.connect('job_data.db')
try:
    jd_table.to_sql('job_descriptions', conn, if_exists='replace', index=False)
finally:
    conn.close()
