In [0]:
from pyspark.sql.functions import col
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

df_raw = spark.read.table("workspace.resume_project.resume_raw_data")
window_spec = Window.partitionBy("candidate_id").orderBy(col("upload_date").desc())
df_raw = df_raw.withColumn("row_num", row_number().over(window_spec)) \
                   .filter(col("row_num") == 1) \
                   .drop("row_num")

print(f"Total resumes to parse: {df_raw.count()}")
df_raw.show(5)

Total resumes to parse: 603
+--------------------+--------+------------+-------------------+
|              Resume|Category|candidate_id|        upload_date|
+--------------------+--------+------------+-------------------+
| a Christian maga...|   Other|      000038|2023-01-01 00:00:00|
| experienced desi...|   Other|      000047|2023-01-01 00:00:00|
|                 ADP|   Other|      000052|2023-01-01 00:00:00|
|         HR SPECI...|      HR|      000077|2023-01-01 00:00:00|
|               forms|   Other|      000090|2023-01-01 00:00:00|
+--------------------+--------+------------+-------------------+
only showing top 5 rows


### Match skills

In [0]:
skill_keywords = [
    # Programming Languages
    "java", "python", "c", "c++", "c#", "scala", "go", "rust", "ruby", "php", "kotlin", "swift", "typescript", "javascript", "html", "css", "bash", "shell scripting", "matlab", "r",

    # Data Science / ML
    "pandas", "numpy", "scikit-learn", "tensorflow", "keras", "pytorch", "xgboost", "lightgbm", "statsmodels", "opencv", "matplotlib", "seaborn", "nltk", "spacy", "gensim", "mlflow", "huggingface", "transformers", "langchain", "lstm", "cnn", "gan", "nlp", "ocr",

    # Web Development
    "react", "angular", "vue", "next.js", "express.js", "node.js", "flask", "django", "fastapi", "spring", "jsp", "bootstrap", "tailwind", "jquery", "graphql", "websockets", "rest api", "soap",

    # Databases
    "mysql", "postgresql", "oracle", "sql server", "sqlite", "mongodb", "cassandra", "dynamodb", "neo4j", "redis", "elasticsearch", "firebase",

    # DevOps & Cloud
    "docker", "kubernetes", "jenkins", "ansible", "puppet", "terraform", "prometheus", "grafana", "aws", "azure", "gcp", "lambda", "s3", "ec2", "cloudformation", "ci/cd", "git", "bitbucket", "gitlab", "github actions", "nexus", "helm", "nginx", "apache",

    # Testing & QA
    "selenium", "cypress", "junit", "pytest", "postman", "restassured", "jmeter", "appium", "karma", "mocha", "testng", "robot framework", "manual testing", "unit testing", "integration testing", "ui testing", "api testing",

    # Project & Agile Tools
    "jira", "confluence", "trello", "asana", "slack", "monday.com", "ms project", "kanban", "scrum", "agile", "waterfall", "safe", "rally", "notion",

    # BI & Visualization
    "power bi", "tableau", "looker", "qlikview", "metabase", "excel", "google sheets", "data studio", "qlik sense", "redash",

    # Design & Marketing
    "photoshop", "illustrator", "figma", "adobe xd", "canva", "seo", "sem", "google analytics", "email marketing", "social media", "wordpress", "shopify", "ux/ui", "coreldraw",

    # ERP & Enterprise
    "sap", "sap fico", "sap mm", "oracle fusion", "oracle ebs", "netsuite", "salesforce", "zoho crm", "workday", "peopleSoft",

    # Operating Systems / Sys Admin
    "linux", "ubuntu", "windows server", "rhel", "bash", "powershell", "cron", "networking", "vpn", "dns", "load balancing", "firewall", "vmware", "virtualbox", "vagrant",

    # Big Data & Streaming
    "hadoop", "spark", "pyspark", "hive", "pig", "oozie", "kafka", "flink", "storm", "airflow", "databricks",

    # Security
    "owasp", "penetration testing", "nmap", "burpsuite", "wireshark", "firewall", "siem", "splunk", "ids", "encryption", "ssl", "tls", "jwt", "oauth",

    # Soft Skills
    "communication", "teamwork", "leadership", "problem solving", "critical thinking", "adaptability", "time management", "collaboration", "decision making", "creativity", "negotiation"
]


In [0]:
import re

education_pattern = re.compile(r"\b(B\.E|MBA|PhD|M\.Sc|B\.Tech|Bachelor|Master|Doctorate)\b", re.I)
email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
url_pattern = re.compile(
    r"(https?://[^\s]+|www\.[^\s]+)",
    re.IGNORECASE
)

In [0]:
from pyspark.sql.functions import pandas_udf
import pandas as pd

@pandas_udf("string")
def extract_skills(resumes: pd.Series) -> pd.Series:
    def match_skills(text):
        if pd.isnull(text):
            return ""
        text = text.lower()
        found = set()
        for kw in skill_keywords:
            # Match exact word using word boundaries
            pattern = r'\b' + re.escape(kw.lower()) + r'\b'
            if re.search(pattern, text):
                found.add(kw)
        return ", ".join(sorted(found))
    return resumes.apply(match_skills)

In [0]:
@pandas_udf("string")
def extract_emails(resumes: pd.Series) -> pd.Series:
    def get_emails(text):
        if pd.isnull(text):
            return ""
        return ", ".join(set(email_pattern.findall(text.lower())))
    return resumes.apply(get_emails)

In [0]:
@pandas_udf("string")
def extract_urls(resumes: pd.Series) -> pd.Series:
    def get_urls(text):
        if pd.isnull(text):
            return ""
        return ", ".join(set(url_pattern.findall(text.lower())))
    return resumes.apply(get_urls)

In [0]:
@pandas_udf("string")
def extract_education(resumes: pd.Series) -> pd.Series:
    def get_education(text):
        if pd.isnull(text):
            return ""
        matches = education_pattern.findall(text)
        return ", ".join(set(matches))
    return resumes.apply(get_education)

In [0]:
df_parsed = df_raw \
    .withColumn("skills", extract_skills(col("Resume"))) \
    .withColumn("emails", extract_emails(col("Resume"))) \
    .withColumn("urls", extract_urls(col("Resume"))) \
    .withColumn("education_level", extract_education(col("Resume")))

In [0]:
df_parsed.show(1000)

+--------------------+--------------------+------------+-------------------+--------------------+------+--------------------+--------------------+
|              Resume|            Category|candidate_id|        upload_date|              skills|emails|                urls|     education_level|
+--------------------+--------------------+------------+-------------------+--------------------+------+--------------------+--------------------+
| a Christian maga...|               Other|      000038|2023-01-01 00:00:00|                    |      |                    |                    |
| experienced desi...|               Other|      000047|2023-01-01 00:00:00|                    |      |                    |                    |
|                 ADP|               Other|      000052|2023-01-01 00:00:00|                    |      |                    |                    |
|         HR SPECI...|                  HR|      000077|2023-01-01 00:00:00|  communication, sap|      |              

In [0]:
df_parsed.describe()

DataFrame[summary: string, Resume: string, Category: string, candidate_id: string, skills: string, emails: string, urls: string, education_level: string]

In [0]:
from pyspark.sql.functions import col, length, when, regexp_replace

special_chars_pattern = r"[<>&%$#@!{}\[\]\\|^~`]"

special_chars_condition = (
    (length(col("Resume")) - length(regexp_replace(col("Resume"), special_chars_pattern, ""))) > 50
)

empty_extracted_fields = (
    (col("skills").isNull() | (col("skills") == "")) &
    (col("emails").isNull() | (col("emails") == "")) &
    (col("urls").isNull() | (col("urls") == "")) &
    (col("education_level").isNull() | (col("education_level") == ""))
)

df_filtered = df_parsed.withColumn(
    "is_usable",
    (~(length(col("Resume")) < 100) &              # Resume length >= 100
     (~special_chars_condition) &                  # Not too many special chars
     (~empty_extracted_fields))                    # At least one extracted field not empty
)


In [0]:
df_filtered.show(1000)

+--------------------+--------------------+------------+-------------------+--------------------+------+--------------------+--------------------+---------+
|              Resume|            Category|candidate_id|        upload_date|              skills|emails|                urls|     education_level|is_usable|
+--------------------+--------------------+------------+-------------------+--------------------+------+--------------------+--------------------+---------+
| a Christian maga...|               Other|      000038|2023-01-01 00:00:00|                    |      |                    |                    |    false|
| experienced desi...|               Other|      000047|2023-01-01 00:00:00|                    |      |                    |                    |    false|
|                 ADP|               Other|      000052|2023-01-01 00:00:00|                    |      |                    |                    |    false|
|         HR SPECI...|                  HR|      000077|20

In [0]:
df_filtered.groupBy(col("is_usable")) \
    .count() \
    .show()

+---------+-----+
|is_usable|count|
+---------+-----+
|     true|   76|
|    false|  527|
+---------+-----+



In [0]:
df_filtered.printSchema()

root
 |-- Resume: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- candidate_id: string (nullable = true)
 |-- upload_date: timestamp (nullable = true)
 |-- skills: string (nullable = true)
 |-- emails: string (nullable = true)
 |-- urls: string (nullable = true)
 |-- education_level: string (nullable = true)
 |-- is_usable: boolean (nullable = true)



In [0]:
from datetime import datetime, timedelta
from pyspark.sql.functions import lit

tracker_df = spark.read.table("workspace.resume_project.processing_tracker")

last_processed_str = tracker_df.select("last_processed_date").first()[0]
last_processed_date = datetime.strptime(last_processed_str, "%Y-%m-%d")
current_processed_date = last_processed_date + timedelta(days=1)

print(f"Last processed date: {last_processed_date.date()}")
print(f"Current processed date: {current_processed_date.date()}")

effective_date = lit(current_processed_date.date())

Last processed date: 2022-12-31
Current processed date: 2023-01-01


In [0]:
from pyspark.sql.functions import when

df_final = df_filtered.withColumn("effective_start_date", effective_date) \
                    .withColumn("effective_end_date", lit("3000-01-01")) \
                    .withColumn("is_current", when(col("is_usable") == True, True).otherwise(False))

In [0]:
df_final.count()

603

In [0]:
df_final.printSchema()

root
 |-- Resume: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- candidate_id: string (nullable = true)
 |-- upload_date: timestamp (nullable = true)
 |-- skills: string (nullable = true)
 |-- emails: string (nullable = true)
 |-- urls: string (nullable = true)
 |-- education_level: string (nullable = true)
 |-- is_usable: boolean (nullable = true)
 |-- effective_start_date: date (nullable = false)
 |-- effective_end_date: string (nullable = false)
 |-- is_current: boolean (nullable = false)



In [0]:
df_final.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .partitionBy("upload_date") \
    .saveAsTable("workspace.resume_project.resume_parsed_data")