In [44]:
import pdfplumber
import openpyexcel

## # Load your PDF

## Now let’s extract Invoice Number, Date, Total:

In [46]:
import re
import pandas as pd

In [47]:
# Extract email
email = re.search(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", text)

# Extract phone number (handles +country codes, dashes, spaces)
phone = re.search(r"(\+?\d[\d\-\s]{8,}\d)", text)

# Extract name (usually first line of CV)
lines = text.split("\n")
name = lines[0].strip()

print("Name:", name)
print("Email:", email.group(0) if email else None)
print("Phone:", phone.group(0) if phone else None)

Name: Hafiz Abdullah Nadeem
Email: digitalabd417@gmail.com
Phone: 0318-0450458


## 3. Extract Education, Experience, Skills

In [48]:
import re

# Education section
education = re.findall(r"(B\.Sc\.|M\.Sc\.|B\.Tech|M\.Tech|Ph\.D|University|College).*", text)

# Experience section
experience = re.findall(r"(?:Experience|Work History|Employment).*", text, re.IGNORECASE)

# Skills section
skills = re.findall(r"(?:Skills|Technologies|Expertise).*", text, re.IGNORECASE)

print("Education:", education)
print("Experience:", experience)
print("Skills:", skills)


Education: ['University', 'College']
Experience: ['experience in building and', 'EXPERIENCE', 'experience in data preprocessing, model']
Skills: ['expertise.', 'EXPERTISE MY PROJECTS GITHUB']


## 4. Store in Structured Format

In [49]:
cv_data = {
    "name": name,
    "email": email.group(0) if email else None,
    "phone": phone.group(0) if phone else None,
    "education": education,
    "experience": experience,
    "skills": skills
}

print(cv_data)


{'name': 'Hafiz Abdullah Nadeem', 'email': 'digitalabd417@gmail.com', 'phone': '0318-0450458', 'education': ['University', 'College'], 'experience': ['experience in building and', 'EXPERIENCE', 'experience in data preprocessing, model'], 'skills': ['expertise.', 'EXPERTISE MY PROJECTS GITHUB']}


## Anomoly checks

## 1. Check for Missing Fields

In [50]:
anomalies=[]

if not cv_data["email"]:
    anomalies.append("Missing email")

    if not cv_data["phone"]:
        anomalies.append("Missing phone number")

    if not cv_data["education"]:
        anomalies.append("Missing education information")

    if not cv_data["experience"]:
        anomalies.append("Missing experience information")

    if not cv_data["skills"]:
        anomalies.append("Missing skills information")


## 2. Validate Phone & Email Format

In [51]:
import re

if cv_data["email"] and not re.match(r"[^@]+@[^@]+\.[^@]+", cv_data["email"]):
    anomalies.append("Invalid email format")

if cv_data["phone"] and len(re.sub(r"\D", "", cv_data["phone"])) < 8:
    anomalies.append("Invalid phone number")

## 3. Duplicate or Suspicious Data

In [52]:
if len(set(cv_data["skills"])) != len(cv_data["skills"]):
    anomalies.append("Duplicate skills detected")


## Final Report

In [53]:
report = {
    "cv_data": cv_data,
    "anomalies": anomalies,
    "valid": len(anomalies) == 0
}

print(report)


{'cv_data': {'name': 'Hafiz Abdullah Nadeem', 'email': 'digitalabd417@gmail.com', 'phone': '0318-0450458', 'education': ['University', 'College'], 'experience': ['experience in building and', 'EXPERIENCE', 'experience in data preprocessing, model'], 'skills': ['expertise.', 'EXPERTISE MY PROJECTS GITHUB']}, 'anomalies': [], 'valid': True}


## Step 5: Exporting Results to Dataset

In [54]:
import pandas as pd

df = pd.DataFrame([
    report["cv_data"] | {
        "anomalies": "; ".join(report["anomalies"]),
        "valid": report["valid"]
    }
])

df.to_csv("cv_dataset.csv", index=False)
print("Saved to cv_dataset.csv")


Saved to cv_dataset.csv


In [56]:
import pandas as pd

# Create DataFrame from single CV report
df = pd.DataFrame([
    report["cv_data"] | {
        "anomalies": "; ".join(report["anomalies"]),
        "valid": report["valid"]
    }
])

# Save to Excel
df.to_excel("cv_dataset.xlsx", index=False, engine="openpyxl")

print("Saved to cv_dataset.xlsx")


Saved to cv_dataset.xlsx


In [57]:
import pandas as pd
from sklearn.ensemble import IsolationForest

# Load dataset (from Step 5 Excel/CSV)
df = pd.read_csv("cv_dataset.csv")

# --- Feature engineering ---
# 1. Count number of skills
df["num_skills"] = df["skills"].fillna("").apply(lambda x: len(x.split(",")) if isinstance(x, str) else 0)

# 2. Length of education text
df["edu_length"] = df["education"].fillna("").apply(len)

# 3. Length of experience text
df["exp_length"] = df["experience"].fillna("").apply(len)

# --- Prepare features ---
features = df[["num_skills", "edu_length", "exp_length"]]

# --- Train Isolation Forest ---
model = IsolationForest(contamination=0.1, random_state=42)  # 10% anomalies
df["ml_anomaly"] = model.fit_predict(features)

# In sklearn: -1 = anomaly, 1 = normal
df["ml_anomaly"] = df["ml_anomaly"].map({1: "Normal", -1: "Anomaly"})

# Save updated dataset
df.to_csv("cv_dataset_with_ml.csv", index=False)
print("Saved ML anomaly results to cv_dataset_with_ml.csv")

print(df[["name", "num_skills", "edu_length", "exp_length", "ml_anomaly"]])


Saved ML anomaly results to cv_dataset_with_ml.csv
                    name  num_skills  edu_length  exp_length ml_anomaly
0  Hafiz Abdullah Nadeem           2          25          87     Normal


In [59]:
import pdfplumber
import re
import pandas as pd
import os

def process_cv(file_path):
    """Extracts fields + anomalies from a single CV PDF."""
    with pdfplumber.open(file_path) as pdf:
        text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())

    # --- Extract fields ---
    email = re.search(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", text)
    phone = re.search(r"(\+?\d[\d\-\s]{8,}\d)", text)
    lines = text.split("\n")
    name = lines[0].strip() if lines else None

    education = re.findall(r"(B\.Sc\.|M\.Sc\.|B\.Tech|M\.Tech|Ph\.D|University|College).*", text)
    experience = re.findall(r"(?:Experience|Work History|Employment).*", text, re.IGNORECASE)
    skills = re.findall(r"(?:Skills|Technologies|Expertise).*", text, re.IGNORECASE)

    cv_data = {
        "file": os.path.basename(file_path),
        "name": name,
        "email": email.group(0) if email else None,
        "phone": phone.group(0) if phone else None,
        "education": "; ".join(education),
        "experience": "; ".join(experience),
        "skills": "; ".join(skills),
    }

    # --- Rule-based anomaly checks ---
    anomalies = []
    if not cv_data["email"]:
        anomalies.append("Missing email")
    if not cv_data["phone"]:
        anomalies.append("Missing phone")
    if not cv_data["education"]:
        anomalies.append("Missing education")
    if not cv_data["experience"]:
        anomalies.append("Missing experience")
    if not cv_data["skills"]:
        anomalies.append("Missing skills")

    if email and not re.match(r"[^@]+@[^@]+\.[^@]+", cv_data["email"]):
        anomalies.append("Invalid email")
    if phone and len(re.sub(r"\D", "", cv_data["phone"])) < 8:
        anomalies.append("Invalid phone")

    report = {
        "cv_data": cv_data,
        "anomalies": anomalies,
        "valid": len(anomalies) == 0
    }
    return report


# --- Process all CVs in folder ---
folder = "CVs"   # put your pdfs inside this folder
results = []

for file in os.listdir(folder):
    if file.endswith(".pdf"):
        file_path = os.path.join(folder, file)
        report = process_cv(file_path)
        results.append(report)

# --- Save results to Excel ---
df = pd.DataFrame([
    r["cv_data"] | {"anomalies": "; ".join(r["anomalies"]), "valid": r["valid"]}
    for r in results
])
df.to_excel("all_cvs_dataset.xlsx", index=False, engine="openpyxl")

print(f"Processed {len(results)} CVs → saved to all_cvs_dataset.xlsx")


Processed 3 CVs → saved to all_cvs_dataset.xlsx
