In [None]:
!pip install spacy

In [None]:
!pip install PyMuPDF

In [None]:
# Download spaCy's small English pipeline package (en_core_web_sm).
# NOTE(review): no spacy.load(...) call appears in the visible cells — confirm this model is still needed.
!python -m spacy download en_core_web_sm

In [None]:
!pip install gradio

In [None]:
import fitz # PyMuPDF
import spacy
import gradio as gr
import re
import nltk
from nltk.corpus import stopwords

print("Environment Ready!")

In [None]:
# Fetch the NLTK data used by clean_resume_text:
#   stopwords        -> read via stopwords.words('english')
#   punkt, punkt_tab -> presumably the tokenizer data behind nltk.word_tokenize;
#                       which of the two is required depends on the NLTK version — TODO confirm
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

In [9]:
def extract_text_from_pdf(pdf_file):
    """
    Read all text from a PDF and return it as one whitespace-normalised string.

    Args:
        pdf_file: Either a path string, or an object that exposes the path
            through a ``name`` attribute (e.g. the temp-file wrapper handed
            over by a file-upload widget).

    Returns:
        str: The text of every page concatenated, with each run of whitespace
        collapsed to a single space. Failures are NOT raised: a message
        starting with ``"Error:"`` or ``"Error occurred:"`` is returned
        instead, so callers must check the prefix.
    """
    if pdf_file is None:
        return "Error: No file uploaded"

    # Accept a plain path as well as a wrapper object carrying the path in .name.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    try:
        # The context manager closes the document (and its file handle)
        # even when reading a page raises.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text() for page in doc)

        # split()/join collapses newlines, tabs and repeated spaces.
        cleaned_text = " ".join(text.split())

        if not cleaned_text:
            return "Error: No text found, either it's not in pdf form or it's scanned image"

        return cleaned_text

    except Exception as e:
        # Best-effort contract: report the failure as text instead of raising.
        return f"Error occurred: {str(e)}"

In [10]:
def extract_contact_info(text):
    """
    Find the first email, phone number, LinkedIn URL and GitHub URL in text.

    Args:
        text: Resume text to scan. ``None`` is treated as an empty string.

    Returns:
        dict: Keys ``"Emails"``, ``"Phones"``, ``"LinkedIn"`` and ``"Github"``.
        Each value is the first match found, or a "... Not Found" placeholder
        string when nothing matched. The phone value is the bare ten-digit
        number, without any country/trunk prefix.
    """
    text = text or ""

    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, text)

    # Ten digits starting with 6-9, optionally preceded by a +91 / 91 / 0 prefix.
    # The lookarounds stop the pattern from matching a ten-digit slice inside a
    # longer digit run (ID numbers, account numbers, ...). Only the captured
    # ten-digit group is returned by findall.
    phone_pattern = r'(?<!\d)(?:\+?91[\s-]?|0)?([6-9]\d{9})(?!\d)'
    phones = re.findall(phone_pattern, text)

    # Host names are case-insensitive, so "LinkedIn.com/in/..." must match too.
    linkedin = re.findall(r'linkedin\.com/in/[\w.-]+', text, flags=re.IGNORECASE)

    github = re.findall(r'github\.com/[\w.-]+', text, flags=re.IGNORECASE)

    return {
        "Emails": emails[0] if emails else "Email Not Found",
        "Phones": phones[0] if phones else "Phone Not Found",
        "LinkedIn": linkedin[0] if linkedin else "Link Not Found",
        "Github": github[0] if github else "Link Not Found"
    }

In [11]:
def clean_resume_text(text):
    """
    Normalise resume text into a space-separated string of lowercase words.

    Steps, in order: lowercase; delete email addresses; delete URLs that start
    with ``http://``, ``https://`` or ``www.``; replace every character that is
    not a letter or whitespace with a space (this also removes digits such as
    phone numbers); tokenize with ``nltk.word_tokenize``; drop English stopwords.

    Args:
        text: Raw resume text.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    text = text.lower()

    # Emails go first so the '@' token is removed as a whole.
    text = re.sub(r'\S+@\S+', '', text)
    # Require the scheme separator (or a leading "www.") so that ordinary
    # words which merely begin with "http" (e.g. "httpd", "https") survive.
    text = re.sub(r'(?:https?://|www\.)\S+', '', text)

    # Keep letters and whitespace only; everything else becomes a space so
    # neighbouring words are not glued together.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    filtered_text = [w for w in words if w not in stop_words]

    return " ".join(filtered_text)

In [12]:
def resume_parser(pdf_file):
    """
    Parse an uploaded resume PDF into contact details and cleaned text.

    Args:
        pdf_file: The uploaded file, passed straight to extract_text_from_pdf.

    Returns:
        tuple: ``(contacts, cleaned_text)`` where ``contacts`` is the dict
        produced by extract_contact_info and ``cleaned_text`` is the output of
        clean_resume_text. When text extraction fails, ``contacts`` holds only
        the "Not Found" placeholders and ``cleaned_text`` is the unmodified
        error message.
    """
    raw_text = extract_text_from_pdf(pdf_file)

    # extract_text_from_pdf reports failures as a message string instead of
    # raising. Such a message must not be lower-cased, stop-word filtered or
    # scanned for contacts as if it were resume content.
    if raw_text.startswith(("Error:", "Error occurred:")):
        return extract_contact_info(""), raw_text

    contacts = extract_contact_info(raw_text)

    cleaned_text = clean_resume_text(raw_text)

    return contacts, cleaned_text

In [16]:
# Gradio UI: one file upload in, the contact dict (as JSON) and the cleaned
# text out. resume_parser returns the two outputs in this order.
interface = gr.Interface(
    fn=resume_parser,
    inputs=gr.File(
        label="Upload Resume (PDF only)",
        # Enforce what the label promises: restrict the upload picker to PDFs.
        file_types=[".pdf"],
    ),
    outputs=[
        gr.JSON(label="Contact Details"),
        gr.Textbox(label="Cleaned Text", lines=10),
    ],
    title="AI Resume Parser",
)

In [None]:
# Start the Gradio app defined above.
# NOTE(review): debug=True presumably keeps this cell running and surfaces
# errors in the cell output — confirm against the Gradio launch() docs.
interface.launch(debug=True)