In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install langchain langchain-community langchain-core transformers==4.52.4 pydantic torch

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate
import torch
import re

# Hugging Face checkpoint used for resume extraction.
model_name = "mistralai/Mistral-Nemo-Instruct-2407"

# Tokenizer and model are loaded from the same checkpoint; the model is
# requested in float16 with device placement left to device_map="auto".
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

def generate_text(prompt, max_length=5000, num_return_sequences=1, return_full_text=True):
    """Sample one or more completions for ``prompt`` from the module-level model.

    Args:
        prompt: Text to condition the generation on.
        max_length: Upper bound on the TOTAL sequence length (prompt tokens
            plus generated tokens), passed straight to ``model.generate``.
        num_return_sequences: Number of sampled sequences to return.
        return_full_text: When True (the default, matching the previous
            behaviour) each decoded string covers the whole output sequence.
            When False, the first ``prompt_len`` tokens of every sequence are
            dropped before decoding so only the continuation is returned.

    Returns:
        A list of ``num_return_sequences`` decoded strings.

    Raises:
        ValueError: If the tokenized prompt is already ``max_length`` tokens
            or longer, leaving no room to generate anything.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Fail early with an actionable message instead of deep inside generate().
    prompt_len = inputs["input_ids"].shape[-1]
    if prompt_len >= max_length:
        raise ValueError(
            f"Prompt is {prompt_len} tokens long but max_length is {max_length}; "
            "increase max_length or shorten the prompt."
        )

    # Some tokenizers define no pad token; fall back to EOS so generate() gets
    # an explicit padding id instead of having to pick one itself.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=pad_token_id,
    )

    if not return_full_text:
        outputs = [output[prompt_len:] for output in outputs]

    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

In [None]:
from pypdf import PdfReader

# Read the resume and extract each page's text exactly once. Pages that yield
# no text are skipped, and pages are joined with a newline so the last word of
# one page does not fuse with the first word of the next.
reader = PdfReader("/kaggle/input/resume/resume.pdf")
page_texts = (page.extract_text() for page in reader.pages)
text = "\n".join(page_text for page_text in page_texts if page_text)



# One ResponseSchema per field the model is asked to return, declared in the
# same order in which they are handed to the parser below.
full_name_schema = ResponseSchema(name="full_name", description="The candidate's full name.")
email_schema = ResponseSchema(name="email", description="The candidate's email address.")
education_schema = ResponseSchema(
    name="education", description="A list of the candidate's education entries."
)
skills_schema = ResponseSchema(
    name="skills", description="A list of the candidate's technical and soft skills."
)
experience_schema = ResponseSchema(
    name="experience", description="A list of the candidate's work experience entries."
)

response_schemas = [
    full_name_schema, email_schema, education_schema, skills_schema, experience_schema,
]

# The parser both produces the format instructions embedded in the prompt and
# later parses the model's reply.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

# Prompt template for resume extraction. It has two placeholders: {text} for
# the raw resume text and {format_instructions} for the output parser's format
# instructions; both are filled in via PromptTemplate.format in a later cell.
# NOTE(review): the variable name says "purchase" although the prompt is about
# resumes — presumably carried over from another example; renaming it would
# also require updating the cell that references it.
purchase_extraction_template = """
You are a highly specialized HR assistant tasked with parsing raw resume text into a structured JSON format.

Extract all relevant information from the resume text based *EXACTLY* on the provided JSON schema.
If a field is not present in the text, you must omit it or use an empty list/string as required by the schema.

Resume Text:
"{text}"

Respond ONLY in a JSON markdown code block, strictly adhering to the following structure:
{format_instructions}
"""

In [None]:
# Fill the extraction template with the resume text and the parser's format
# instructions.
resume_prompt_template = PromptTemplate(
    input_variables=["text", "format_instructions"],
    template=purchase_extraction_template,
)
final_prompt = resume_prompt_template.format(
    format_instructions=format_instructions,
    text=text,
)

# generate_text returns a list of completions; only the first one is used.
completions = generate_text(final_prompt, max_length=5000)
response = completions[0]
print(response)

In [None]:
def extract_json_block(text):
    """Return the JSON payload found in ``text`` wrapped in a ```json fence.

    The LAST fenced ```json block is used — presumably because the response
    may also contain an earlier example block from the prompt (verify against
    how the response is produced). If no fenced block exists, the span from
    the first ``{`` to the last ``}`` is used instead, since models sometimes
    answer with bare JSON.

    Args:
        text: Raw model response.

    Returns:
        A string of the form "```json\\n<payload>\\n```".

    Raises:
        ValueError: If ``text`` contains neither a fenced JSON block nor a
            brace-delimited object (previously this surfaced as a bare
            IndexError).
    """
    pattern = r'```json\s*(.*?)\s*```'
    matches = re.findall(pattern, text, re.DOTALL)

    if matches:
        payload = matches[-1]
    else:
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end < start:
            raise ValueError("No JSON code block or JSON object found in the model response.")
        payload = text[start:end + 1]

    return f"```json\n{payload}\n```"

In [None]:
# Isolate the fenced JSON block from the raw model response.
json_text = extract_json_block(response)

In [None]:
# Parse the fenced JSON with the structured output parser built from the response schemas.
output_data = output_parser.parse(json_text)

In [None]:
# Display the parsed result as the cell output.
output_data

In [None]:
# Print the Python type of the parsed result.
print(type(output_data))

In [None]:
!pip install streamlit pyngrok pillow python-docx pdfplumber streamlit-extras pytesseract

In [None]:
%%writefile app.py
import streamlit as st
import base64
import pdfplumber
import json
import io
from PIL import Image
import pytesseract

# Page-level configuration. The icon was a mis-encoded byte sequence; it is
# restored to the intended page emoji.
st.set_page_config(
    page_title="AI CV Parser",
    page_icon="📄",
    layout="wide"
)

# Global CSS for the title, subtitle, upload box and result cards.
st.markdown("""
<style>

html, body, [class*="css"]  {
    font-family: 'Segoe UI', sans-serif;
}

.big-title {
    font-size: 40px;
    font-weight: 800;
    text-align: center;
    color: #2C3E50;
}

.subtitle {
    font-size: 18px;
    text-align: center;
    color: #7F8C8D;
    margin-bottom: 30px;
}

.upload-box {
    border: 2px dashed #6C5CE7;
    padding: 25px;
    border-radius: 15px;
    text-align: center;
    background: #F8F9FF;
}

.result-card {
    background: white;
    padding: 20px;
    border-radius: 15px;
    box-shadow: 0 4px 14px rgba(0,0,0,0.1);
    margin-bottom: 20px;
}

.section-title {
    font-size: 22px;
    font-weight: 700;
    color: #6C5CE7;
    margin-bottom: 10px;
}

</style>
""", unsafe_allow_html=True)

# Page header (emoji restored from its mis-encoded form).
st.markdown("<div class='big-title'>📄 AI CV Parser</div>", unsafe_allow_html=True)
st.markdown("<div class='subtitle'>Upload your CV and extract structured information instantly</div>", unsafe_allow_html=True)



# NOTE(review): Streamlit renders each st.markdown call as its own element, so
# the opening and closing div below most likely do not wrap the uploader widget
# in the styled box — verify in the browser before relying on this layout.
st.markdown("<div class='upload-box'>", unsafe_allow_html=True)
uploaded_file = st.file_uploader("Upload CV (PDF / PNG / JPG)", type=["pdf", "png", "jpg", "jpeg"])
st.markdown("</div>", unsafe_allow_html=True)


def extract_text_from_pdf(file_bytes):
    """Extract the text of every page of a PDF, one newline after each page.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        The concatenated page texts, each followed by "\\n". Returns "" when
        the PDF cannot be opened or read (best-effort; the caller treats empty
        text as an extraction failure).
    """
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            # A page without extractable text may yield None; treat it as an
            # empty page instead of letting a TypeError discard the text of
            # the whole document.
            return "".join(f"{page.extract_text() or ''}\n" for page in pdf.pages)
    except Exception:
        # Deliberate best-effort: any read failure is reported as "no text".
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        return ""


def extract_text_from_image(file_bytes):
    """Run OCR on an image and return the recognized text.

    Args:
        file_bytes: Raw bytes of the image file.

    Returns:
        The string returned by ``pytesseract.image_to_string``. Returns ""
        when the image cannot be opened or OCR fails, mirroring the PDF
        extractor so the caller's empty-text check handles both paths.
    """
    try:
        # The context manager releases the image once OCR is done.
        with Image.open(io.BytesIO(file_bytes)) as image:
            return pytesseract.image_to_string(image)
    except Exception:
        # Best-effort, consistent with extract_text_from_pdf: report failure
        # as "no text" instead of crashing the app with a traceback.
        return ""


def parse_cv_text(raw_text):
    """Pull basic fields out of raw CV text with line-based heuristics.

    Args:
        raw_text: Plain text extracted from the CV.

    Returns:
        A dict with the keys:
          - "Name":  text after the last ":" on the first line containing the
                     word "name", else "Unknown".
          - "Email": first e-mail address found, else "Unknown".
          - "Phone": first phone-like number that either starts with "+" or
                     sits on a line mentioning "phone", else "Unknown".
          - "Skills": distinct lines mentioning python, java or ml as whole
                     words, in order of appearance.
          - "Experience": always an empty list — experience parsing is not
                     implemented, and returning a made-up employer would be
                     shown to the user as if it came from the CV.
    """
    import re  # function-scope import: app.py's import header does not include re

    email_pattern = r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+"
    phone_pattern = r"\+?\d[\d\s().-]{6,}\d"
    # Whole-word match so "html"/"xml" do not count as "ml".
    skill_pattern = r"\b(?:python|java|ml)\b"

    parsed = {
        "Name": "Unknown",
        "Email": "Unknown",
        "Phone": "Unknown",
        "Skills": [],
        "Experience": []
    }

    for raw_line in raw_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        line_lower = line.lower()

        # First match wins, so a later line that merely mentions "name"
        # cannot overwrite the real header; \b keeps "username" out.
        if parsed["Name"] == "Unknown" and re.search(r"\bname\b", line_lower):
            candidate = line.split(":")[-1].strip()
            if candidate:
                parsed["Name"] = candidate

        if parsed["Email"] == "Unknown":
            email_match = re.search(email_pattern, line)
            if email_match:
                parsed["Email"] = email_match.group()

        if parsed["Phone"] == "Unknown":
            phone_match = re.search(phone_pattern, line)
            # Require a leading "+" or the word "phone" so date ranges such
            # as "2019 - 2023" are not mistaken for phone numbers.
            if phone_match and (phone_match.group().startswith("+") or "phone" in line_lower):
                parsed["Phone"] = phone_match.group()

        if re.search(skill_pattern, line_lower) and line not in parsed["Skills"]:
            parsed["Skills"].append(line)

    return parsed


# Main flow: once a file is uploaded and the button is pressed, extract its
# text, parse it and render the result cards. Emoji in the user-facing strings
# are restored from their mis-encoded form.
if uploaded_file:

    st.success("File uploaded successfully!")

    if st.button("Parse CV", use_container_width=True):

        # getvalue() returns the full upload regardless of the stream's
        # current read position, unlike read().
        file_bytes = uploaded_file.getvalue()

        st.info("⏳ Extracting text...")

        # PDFs go through pdfplumber; every other accepted type is an image
        # and goes through OCR.
        if uploaded_file.type == "application/pdf":
            raw_text = extract_text_from_pdf(file_bytes)
        else:
            raw_text = extract_text_from_image(file_bytes)

        if not raw_text.strip():
            st.error("❌ Could not extract text from the file. Try another CV.")
            st.stop()

        st.info("⏳ Parsing CV details...")

        parsed = parse_cv_text(raw_text)

        st.success("✅ CV Parsed Successfully!")

        st.markdown("### 🔍 Extracted Information")

        st.markdown("<div class='result-card'>", unsafe_allow_html=True)
        st.markdown("<div class='section-title'>👤 Personal Details</div>", unsafe_allow_html=True)
        st.write(f"**Name:** {parsed['Name']}")
        st.write(f"**Email:** {parsed['Email']}")
        st.write(f"**Phone:** {parsed['Phone']}")
        st.markdown("</div>", unsafe_allow_html=True)

        st.markdown("<div class='result-card'>", unsafe_allow_html=True)
        st.markdown("<div class='section-title'>🛠 Skills</div>", unsafe_allow_html=True)
        # Show an explicit message rather than an empty line when nothing matched.
        skills = parsed["Skills"]
        st.write(", ".join(skills) if skills else "No skills detected.")
        st.markdown("</div>", unsafe_allow_html=True)

        st.markdown("<div class='result-card'>", unsafe_allow_html=True)
        st.markdown("<div class='section-title'>💼 Experience</div>", unsafe_allow_html=True)

        experience = parsed["Experience"]
        if not experience:
            st.write("No experience entries detected.")
        for exp in experience:
            st.write(f"**{exp['Company']}** — {exp['Role']} ({exp['Years']})")

        st.markdown("</div>", unsafe_allow_html=True)


In [None]:
!pip install pyngrok streamlit

from pyngrok import ngrok
import subprocess
import time
import os

NGROK_AUTH_TOKEN = "35t0D7y6l2yUBqAEjaI0nSNJVFk_67V8CLaVvQXr9ATPjpP5Y"
os.system(f"ngrok config add-authtoken {NGROK_AUTH_TOKEN}")

ngrok.kill()

PORT = 8501
public_url = ngrok.connect(PORT)
print("üîó Your public URL:", public_url)

process = subprocess.Popen(
    ["streamlit", "run", "app.py", "--server.port", str(PORT), "--server.address=0.0.0.0"]
)