In [None]:
import os
from docx import Document
from lxml import etree
import win32com.client  # For handling .doc files
from PyPDF2 import PdfReader  # For handling .pdf files
import ollama

def extract_text_from_shapes_and_textboxes(doc):
    """Collect the text found inside text-box content of a document.

    The document XML is queried directly for every ``w:t`` element that sits
    beneath a ``w:txbxContent`` element.

    Args:
        doc: Object exposing ``element.xpath(query)``; the caller in this
            file passes a python-docx ``Document``.

    Returns:
        The matched texts joined by single spaces with surrounding
        whitespace stripped, or an empty string when nothing matches.
    """
    text_nodes = doc.element.xpath('.//w:txbxContent//w:t')
    # An element without text content reports ``text`` as None, which would
    # make ``str.join`` raise TypeError; skip such nodes.
    fragments = [node.text for node in text_nodes if node.text is not None]
    return ' '.join(fragments).strip()

def extract_text_from_headers_and_footers(doc):
    """Gather the paragraph text of every section's header and footer.

    Sections are visited in order; within each section the header
    paragraphs come first, followed by the footer paragraphs.

    Args:
        doc: Object with a ``sections`` iterable whose items expose
            ``header.paragraphs`` and ``footer.paragraphs``.

    Returns:
        All paragraph texts joined by single spaces, with leading and
        trailing whitespace removed.
    """
    collected = [
        para.text
        for section in doc.sections
        for region in (section.header, section.footer)
        for para in region.paragraphs
    ]
    return ' '.join(collected).strip()

def doc_to_txt(input_path, output_path):
    """Convert a legacy .doc file to UTF-8 plain text via Word automation.

    A Word instance is started through COM, the document's full content
    text is read, and the result is written to ``output_path``. Any failure
    is reported on stdout rather than raised, so a batch run can continue.

    Args:
        input_path: Path of the .doc file to read.
        output_path: Path of the .txt file to create or overwrite.
    """
    try:
        word = win32com.client.Dispatch("Word.Application")
        try:
            # Word may resolve relative paths against its own working
            # directory rather than this process's, so pass an absolute path.
            doc = word.Documents.Open(os.path.abspath(input_path))
            try:
                doc_text = doc.Content.Text
            finally:
                # False = do not save changes; the document is only read.
                doc.Close(False)
        finally:
            # Always shut Word down, otherwise a failed open/read leaves an
            # orphaned Word process behind.
            word.Quit()
        with open(output_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(doc_text)
        print(f"Converted: {input_path} -> {output_path}")
    except Exception as e:
        print(f"Error converting {input_path}: {e}")

def pdf_to_txt(input_path, output_path):
    """Write the text of every page of a PDF to a UTF-8 text file.

    Page texts are separated by newlines in the output. Errors are printed
    instead of being propagated.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path of the .txt file to create or overwrite.
    """
    try:
        page_texts = [page.extract_text() for page in PdfReader(input_path).pages]
        with open(output_path, "w", encoding="utf-8") as out_file:
            out_file.write('\n'.join(page_texts))
        print(f"Converted: {input_path} -> {output_path}")
    except Exception as err:
        print(f"Error converting {input_path}: {err}")

def split_large_text(content, max_chars=7000):
    """Split text into chunks of at most ``max_chars`` characters.

    Each chunk ends at the last space found within the first ``max_chars``
    characters of the remaining text. When that window contains no space
    (one very long token), the text is cut at exactly ``max_chars`` so no
    chunk can exceed the limit.

    Args:
        content: The text to split.
        max_chars: Maximum length of any chunk; must be at least 1.

    Returns:
        A list of chunks in document order. The final remainder is always
        appended, so empty input yields ``[""]``.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    chunks = []
    while len(content) > max_chars:
        split_index = content[:max_chars].rfind(" ")
        if split_index == 0:
            # The only space in the window is the leading one: drop the
            # surrounding whitespace instead of emitting an empty chunk.
            content = content.strip()
            continue
        if split_index < 0:
            # No space at all: a -1 index would slice off everything but the
            # last character and produce an oversized chunk, so cut hard.
            split_index = max_chars
        chunks.append(content[:split_index])
        content = content[split_index:].strip()
    chunks.append(content)
    return chunks

def summarize_with_ollama(chunks, file_name, model="llama3.2:latest"):
    """Summarize resume text chunk by chunk with an Ollama chat model.

    Every chunk is sent with the same resume-oriented prompt. The first
    ``Name:`` line found in any summary is remembered and prepended to each
    summary that does not contain ``Name:`` itself. A chunk whose request
    fails is reported on stdout and left out of the result.

    Args:
        chunks: Text chunks in document order.
        file_name: Source file name, offered to the model as a fallback
            for the candidate's name.
        model: Ollama model identifier to query.

    Returns:
        The per-chunk summaries joined by blank lines; an empty string when
        there are no chunks or every request failed.
    """
    combined_summary = []
    name = None
    total = len(chunks)

    for position, chunk in enumerate(chunks, start=1):
        try:
            print(f"Summarizing chunk {position} of {total}...")
            prompt = (
                f"Summarize the following text while retaining key details relevant to the resume format.\n"
                f"Focus on professional experience, education, skills, and other resume-relevant content.\n"
                f"Rules:\n"
                f"- Name: Refer to the file name ({file_name}) if not explicitly mentioned in the text.\n"
                f"- Age in years\n- Qualification\n"
                f"- Subject Area of Highest Qualification\n- Place of Education for Highest Qualification\n"
                f"- Coding language\n- Spoken language\n- Skill set\n"
                f"- Years of work experience\n- Any links given/email-ID.\n"
                f"Note: This text is part {position} of {total} from a larger document. Ensure continuity while summarizing.\n"
                f"Here is the text: {chunk}"
            )
            response = ollama.chat(
                model=model,
                messages=[{"role": "user", "content": prompt}]
            )
            # Guard against a missing or None content field.
            summary = response.get("message", {}).get("content", "") or ""
            if name is None:
                for line in summary.splitlines():
                    candidate = line.strip()
                    # Tolerate indentation, list bullets and markdown
                    # emphasis in front of the label (e.g. "- Name:",
                    # "**Name:**"), which a plain startswith would miss.
                    if candidate.lstrip("-*# \t").startswith("Name:"):
                        name = candidate
                        break
            combined_summary.append(summary)
        except Exception as e:
            print(f"Error summarizing chunk {position}: {e}")

    # Ensure name is retained if found in any chunk
    if name:
        combined_summary = [
            summary if "Name:" in summary else name + "\n" + summary
            for summary in combined_summary
        ]

    return "\n\n".join(combined_summary)

def _read_docx_text(input_path):
    """Return the text of a .docx file: text boxes, headers/footers, body, tables."""
    doc = Document(input_path)
    shapes_text = extract_text_from_shapes_and_textboxes(doc)
    headers_footers_text = extract_text_from_headers_and_footers(doc)
    body_lines = [paragraph.text for paragraph in doc.paragraphs]
    for table in doc.tables:
        for row in table.rows:
            body_lines.append(' '.join(cell.text.strip() for cell in row.cells))
    combined_text = [text for text in (shapes_text, headers_footers_text) if text]
    combined_text.extend(text for text in body_lines if text.strip())
    return '\n'.join(combined_text)


def _read_pdf_text(input_path):
    """Return the newline-joined text of every page of a PDF."""
    reader = PdfReader(input_path)
    # Treat a page with no extractable text as an empty string.
    return '\n'.join(page.extract_text() or "" for page in reader.pages)


def _write_text_or_summary(full_text, filename, output_path, max_chars):
    """Write ``full_text`` to ``output_path``, summarizing it first when too long."""
    text_to_write = full_text
    if len(full_text) > max_chars:
        chunks = split_large_text(full_text, max_chars=max_chars)
        summarized_text = summarize_with_ollama(chunks, filename)
        if summarized_text.strip():
            text_to_write = summarized_text
        else:
            # Summarization swallows per-chunk errors and may return nothing;
            # keep the extracted text rather than writing an empty file.
            print(f"Summarization produced no output for {filename}; writing full text instead.")
    with open(output_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(text_to_write)


def convert_files_to_txt(input_folder, output_folder, max_chars=7000):
    """Convert every .docx, .doc and .pdf file in a folder to a .txt file.

    Output files keep the input's base name with a ``.txt`` extension.
    Text from .docx and .pdf files longer than ``max_chars`` characters is
    summarized before being written; .doc files are converted verbatim by
    ``doc_to_txt``. Extensions are matched case-insensitively, other files
    are ignored, and a failure on one file is printed without stopping the
    remaining conversions.

    Args:
        input_folder: Directory containing the source documents.
        output_folder: Directory for the .txt files; created if missing.
        max_chars: Length above which extracted text is summarized.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")
        lowered = filename.lower()

        if lowered.endswith(".doc"):
            doc_to_txt(input_path, output_path)
            continue

        if lowered.endswith(".docx"):
            read_text = _read_docx_text
        elif lowered.endswith(".pdf"):
            read_text = _read_pdf_text
        else:
            continue

        try:
            _write_text_or_summary(read_text(input_path), filename, output_path, max_chars)
            print(f"Converted and summarized (if necessary): {filename} -> {output_path}")
        except Exception as e:
            print(f"Error converting {filename}: {e}")

# Cell entry point: convert the documents found in `input_folder` into .txt
# files inside `output_folder` (created by convert_files_to_txt if missing).
# Both paths are specific to one machine; edit them before running.
input_folder = r"C:\Users\polpi\Desktop\data science\project\docker_project\resumes"
output_folder = r"C:\Users\polpi\Desktop\data science\project\docker_project\resume_txt"
convert_files_to_txt(input_folder, output_folder)


In [None]:
import os
import ollama

def split_large_text(content, max_chars=7000):
    """Split text into chunks of at most `max_chars` characters.

    Each chunk ends at the last space found within the first `max_chars`
    characters of the remaining text. When that window contains no space
    (one very long token), the text is cut at exactly `max_chars` so no
    chunk can exceed the limit.

    Args:
        content: The text to split.
        max_chars: Maximum length of any chunk; must be at least 1.

    Returns:
        A list of chunks in document order. The final remainder is always
        appended, so empty input yields `[""]`.

    Raises:
        ValueError: If `max_chars` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    chunks = []
    while len(content) > max_chars:
        split_index = content[:max_chars].rfind(" ")
        if split_index == 0:
            # The only space in the window is the leading one: drop the
            # surrounding whitespace instead of emitting an empty chunk.
            content = content.strip()
            continue
        if split_index < 0:
            # No space at all: a -1 index would slice off everything but the
            # last character and produce an oversized chunk, so cut hard.
            split_index = max_chars
        chunks.append(content[:split_index])
        content = content[split_index:].strip()
    chunks.append(content)
    return chunks

def summarize_with_ollama(chunks, file_name, model="llama3.2:latest"):
    """Summarize chunks of resume text using an Ollama chat model.

    Every chunk is sent with the same resume-oriented prompt. The first
    `Name:` line found in any summary is remembered and prepended to each
    summary that does not contain `Name:` itself. A chunk whose request
    fails is reported on stdout and left out of the result.

    Args:
        chunks: Text chunks in document order.
        file_name: Source file name, offered to the model as a fallback
            for the candidate's name.
        model: Ollama model identifier to query.

    Returns:
        The per-chunk summaries joined by blank lines; an empty string when
        there are no chunks or every request failed.
    """
    combined_summary = []
    name = None
    total = len(chunks)

    for position, chunk in enumerate(chunks, start=1):
        try:
            print(f"Summarizing chunk {position} of {total}...")
            prompt = (
                f"Summarize the following text while retaining key details relevant to the resume format.\n"
                f"Focus on professional experience, education, skills, and other resume-relevant content.\n"
                f"Rules:\n"
                f"- Name: Refer to the file name ({file_name}) if not explicitly mentioned in the text.\n"
                f"- Age in years\n- Qualification\n"
                f"- Subject Area of Highest Qualification\n- Place of Education for Highest Qualification\n"
                f"- Coding language\n- Spoken language\n- Skill set\n"
                f"- Years of work experience\n- Any links given/email-ID.\n"
                f"Note: This text is part {position} of {total} from a larger document. Ensure continuity while summarizing.\n"
                f"Here is the text: {chunk}"
            )
            response = ollama.chat(
                model=model,
                messages=[{"role": "user", "content": prompt}]
            )
            # Guard against a missing or None content field.
            summary = response.get("message", {}).get("content", "") or ""
            if name is None:
                for line in summary.splitlines():
                    candidate = line.strip()
                    # Tolerate indentation, list bullets and markdown
                    # emphasis in front of the label (e.g. "- Name:",
                    # "**Name:**"), which a plain startswith would miss.
                    if candidate.lstrip("-*# \t").startswith("Name:"):
                        name = candidate
                        break
            combined_summary.append(summary)
        except Exception as e:
            print(f"Error summarizing chunk {position}: {e}")

    # Ensure name is retained if found in any chunk
    if name:
        combined_summary = [
            summary if "Name:" in summary else name + "\n" + summary
            for summary in combined_summary
        ]

    return "\n\n".join(combined_summary)

def process_txt_files(input_folder, max_chars=7000):
    """Scan .txt files and summarize in place those exceeding `max_chars`.

    Files at or below the limit are left untouched. A longer file is split
    into chunks, summarized, and overwritten with the summary, but only
    when the summary is non-empty, so a failed summarization run cannot
    wipe the original text. The `.txt` extension is matched
    case-insensitively, and an error on one file is printed without
    stopping the scan.

    Args:
        input_folder: Directory whose .txt files are examined.
        max_chars: Length above which a file is summarized.
    """
    for filename in os.listdir(input_folder):
        if not filename.lower().endswith(".txt"):
            continue
        input_path = os.path.join(input_folder, filename)

        try:
            with open(input_path, "r", encoding="utf-8") as txt_file:
                content = txt_file.read()

            if len(content) <= max_chars:
                print(f"File {filename} is within the character limit.")
                continue

            print(f"File {filename} exceeds {max_chars} characters. Summarizing...")
            chunks = split_large_text(content, max_chars=max_chars)
            summarized_text = summarize_with_ollama(chunks, filename)

            if not summarized_text.strip():
                # Summarization reports per-chunk failures without raising,
                # so an empty result means nothing usable came back.
                print(f"Summarization produced no output for {filename}; original file left unchanged.")
                continue

            with open(input_path, "w", encoding="utf-8") as txt_file:
                txt_file.write(summarized_text)

            print(f"File {filename} has been summarized and replaced.")

        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Replace the path below with your folder containing .txt files.
# NOTE: process_txt_files overwrites any .txt file longer than 7000
# characters with its summary, so keep a copy if the full text is needed.
input_folder = r"C:\Users\polpi\Desktop\data science\project\docker_project\resume_txt"
process_txt_files(input_folder)
