<a href="https://colab.research.google.com/github/SachinkumarSakthivel/PROJECT-1/blob/main/healtether.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import os
import io
import requests
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from PyPDF2 import PdfFileReader

# Step 3: Authenticate and access Google Drive

# Load credentials
SCOPES = ['https://www.googleapis.com/auth/drive']
SERVICE_ACCOUNT_FILE = 'https://drive.google.com/drive/folders/1A0GPGrH7rLlAFNJ-RftYzLp28fJ0kKSU'
# Check if the service account file exists
if not os.path.exists(SERVICE_ACCOUNT_FILE):
    raise FileNotFoundError(f"The service account file '{SERVICE_ACCOUNT_FILE}' does not exist. Please check the path.")

credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES)

# Create Google Drive API client
service = build('drive', 'v3', credentials=credentials)

# Function to list files in a Google Drive folder
def list_files_in_folder(folder_id):
    query = f"'{folder_id}' in parents and mimeType='application/pdf'"
    results = service.files().list(q=query).execute()
    items = results.get('files', [])
    return items

# Function to download a file from Google Drive
def download_file(file_id, file_name):
    request = service.files().get_media(fileId=file_id)
    fh = io.FileIO(file_name, 'wb')
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print(f"Download {int(status.progress() * 100)}%.")
    fh.close()

# Function to parse PDF files
def parse_pdf(file_path):
    with open(file_path, 'rb') as f:
        reader = PdfFileReader(f)
        text = ''
        for page in range(reader.numPages):
            text += reader.getPage(page).extract_text()
    return text

# Function to fetch and parse all PDFs in a folder
def fetch_and_parse_pdfs(folder_id):
    files = list_files_in_folder(folder_id)
    for file in files:
        file_id = file['id']
        file_name = file['name']
        print(f"Downloading {file_name}...")
        download_file(file_id, file_name)
        print(f"Parsing {file_name}...")
        resume_text = parse_pdf(file_name)
        print(resume_text)

# Example usage
folder_id = "https://drive.google.com/drive/folders/1A0GPGrH7rLlAFNJ-RftYzLp28fJ0kKSU"
fetch_and_parse_pdfs(folder_id)


FileNotFoundError: The service account file 'https://drive.google.com/drive/folders/1A0GPGrH7rLlAFNJ-RftYzLp28fJ0kKSU' does not exist. Please check the path.

In [None]:
# Resume Data Loader
# !pip install PyPDF2
# !pip install python-docx\
# !pip install google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client PyPDF2

import requests
from PyPDF2 import PdfFileReader
import docx

def fetch_resume(url):
    response = requests.get(url)
    filename = url.split('/')[-1]
    with open(filename, 'wb') as f:
        f.write(response.content)
    return filename

def parse_pdf(file_path):
    with open(file_path, 'rb') as f:
        reader = PdfFileReader(f)
        text = ''
        for page in range(reader.numPages):
            text += reader.getPage(page).extract_text()
    return text

def parse_docx(file_path):
    doc = docx.Document(file_path)
    text = ''
    for para in doc.paragraphs:
        text += para.text + '\n'
    return text

def parse_resume(file_path):
    if file_path.endswith('.pdf'):
        return parse_pdf(file_path)
    elif file_path.endswith('.docx'):
        return parse_docx(file_path)
    else:
        raise ValueError("Unsupported file format")

resume_url = 'https://drive.google.com/file/d/14akVZY0QKo04F3-Ere_vDnIDznfWHaSd/view?usp=drive_link'
file_path = fetch_resume(resume_url)
resume_text = parse_resume(file_path)
print(resume_text

Pre-process Resumes

In [None]:
import re

def extract_sections(resume_text):
    sections = {
        'work_experience': '',
        'education': '',
        'skills': ''
    }

    # Simple regex-based extraction (this can be improved)
    sections['work_experience'] = re.search(r'Work Experience(.*?)(Education|Skills)', resume_text, re.DOTALL).group(1)
    sections['education'] = re.search(r'Education(.*?)(Skills|$)', resume_text, re.DOTALL).group(1)
    sections['skills'] = re.search(r'Skills(.*)', resume_text, re.DOTALL).group(1)

    return sections

resume_sections = extract_sections(resume_text)
print(resume_sections)


Create Embeddings

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

def create_embeddings(text):
    return model.encode(text)

embeddings = {section: create_embeddings(text) for section, text in resume_sections.items()}
print(embeddings)


The Search System

In [None]:
import faiss
import numpy as np

dimension = 384  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)

def add_to_index(embeddings):
    vectors = np.array([embeddings[section] for section in embeddings])
    index.add(vectors)

add_to_index(embeddings)

def search(query):
    query_vector = create_embeddings(query)
    D, I = index.search(query_vector.reshape(1, -1), k=5)
    return I

query = "Python developer with FastAPI experience"
results = search(query)
print(results)


The Front-end

In [None]:
import streamlit as st

st.title('Resume Semantic Search')

query = st.text_input('Enter your search query:')
if query:
    results = search(query)
    st.write(f'Results: {results}')
