In [1]:
!pip install python-docx pandas tqdm openpyxl requests numpy

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [1]:
import os
# Expose only GPU index 0 to CUDA-aware libraries started from this process.
# NOTE(review): this only affects this kernel and its child processes; if the
# Ollama server runs as a separate, already-started service it may not see
# this setting — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use GPU 0

In [2]:
import os
import docx
import pandas as pd
from tqdm import tqdm
import ollama  # Import the Ollama package

# Function to read DOCX files
def read_docx(file_path):
    """Return the text of a DOCX file, one non-blank paragraph per line.

    Paragraphs whose text is empty or whitespace-only are dropped. Any
    exception raised while opening or reading the document is reported on
    stdout and an empty string is returned instead.
    """
    try:
        kept_lines = []
        for paragraph in docx.Document(file_path).paragraphs:
            if paragraph.text.strip():
                kept_lines.append(paragraph.text)
        return "\n".join(kept_lines)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

# Function to read DOC files (if needed in the future)
def read_doc(file_path):
    """Placeholder reader for legacy .doc files.

    The file is not opened: a notice is printed and an empty string is
    returned for every path.
    """
    notice = "Currently reading only DOCX files. Extend functionality if needed."
    print(notice)
    return ""

# Function to read plain text files
def read_txt(file_path):
    """Return the full contents of a UTF-8 text file.

    Any exception raised while opening or reading the file is reported on
    stdout and an empty string is returned instead.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            text = handle.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        text = ""
    return text

# Function to process files in a folder
def read_files_in_folder(folder_path):
    """Read every supported document in a folder.

    Supported extensions are .docx, .txt and .doc, matched
    case-insensitively so that names such as "CV.DOCX" are not silently
    skipped. Entries are visited in sorted name order because os.listdir
    returns them in arbitrary order, which would otherwise make the row
    order of the output differ between runs.

    Args:
        folder_path (str): Directory to scan (not recursive).

    Returns:
        list[dict]: One {"file_name": ..., "content": ...} record per
        supported file. "content" is whatever the matching reader returned,
        which is an empty string when that reader failed.

    Raises:
        OSError: If folder_path cannot be listed (propagated from os.listdir).
    """
    # Suffix -> reader. ".docx" never ends with ".doc", so order is irrelevant.
    readers = (
        ('.docx', read_docx),   # docx library for .docx files
        ('.txt', read_txt),     # plain text reader for .txt files
        ('.doc', read_doc),     # placeholder for future .doc handling
    )

    data = []
    for file_name in tqdm(sorted(os.listdir(folder_path))):
        lowered_name = file_name.lower()
        reader = next(
            (func for suffix, func in readers if lowered_name.endswith(suffix)),
            None,
        )
        if reader is None:
            continue  # unsupported file type
        file_path = os.path.join(folder_path, file_name)
        data.append({"file_name": file_name, "content": reader(file_path)})
    return data

# Function to extract data using Ollama
def extract_data_with_ollama(content, file_name):
    """Ask the local Ollama model to extract resume fields from a document.

    Args:
        content (str): Text of the document.
        file_name (str): Source file name. The first two underscore-separated
            tokens of the part before the first "." are passed to the model
            as a fallback name hint.

    Returns:
        dict: {"extracted_info": <model reply text>} on success, or
        {"error": <message>} when there is no text to send or the model call
        fails. This function does not raise for those cases.
    """
    # Readers return "" on failure (and always for .doc files). Sending an
    # empty document together with a filename-derived name hint invites the
    # model to invent a record, so report it as a failure instead.
    if not isinstance(content, str) or not content.strip():
        print(f"Error extracting data: no text content in {file_name}")
        return {"error": "No text content to extract from"}

    try:
        inferred_name = " ".join(file_name.split(".")[0].split("_")[:2]).strip()
        prompt = (
            f"Extract the following information from the text in a clean, structured, and homogenous format.\n"
            f"Only extract the requested fields, and avoid adding any extra explanations or summaries.\n"
            f"Rules:\n"
            f"- Start your response with: 'Here is the extracted text:'\n"
            f"- Use 'NA' for missing information.\n"
            f"- Do not use special characters (*, -, etc.) in the response.\n"
            f"- For the Name field, prioritize extracting it from the text. If it is unclear or missing, compare it with this inferred name from the filename: {inferred_name}. The name should be only string without any special characters or numbers.\n"
            f"- Ensure all fields are included even if the value is 'NA'.\n"
            f"\nFields to extract:\n"
            f"- Name\n- Age in years: provide only one plausible age (positive integer) or put NA\n"
            f"- Qualification: provide only the degree like B.Tech, MCA, etc.\n"
            f"- Subject Area of Highest Qualification: like Engineering or Computer Science\n"
            f"- Place of Education for Highest Qualification\n"
            f"- Coding language\n- Spoken language\n- Skill set\n"
            f"- Years of work experience\n- Any links given/email-ID.\n"
            f"\nHere is the text: {content}"
        )
        response = ollama.chat(
            model="llama3.2:latest",
            messages=[{"role": "user", "content": prompt}]
        )
        print("Response:", response)

        # Pull the reply text out of the chat response; fall back to a
        # marker string when the expected keys are absent.
        extracted_data = response.get("message", {}).get("content", "No data extracted")
        # Wrap the reply in a dictionary so it can become a DataFrame row.
        return {"extracted_info": extracted_data}
    except Exception as e:
        print(f"Error extracting data: {e}")
        return {"error": str(e)}

# Function to categorize file names based on keywords
def categorize_file_name(file_name):
    """Map a file name to a keyword category.

    Matching is case-insensitive and the rules are tried in order, so the
    first rule with a keyword contained in the name wins. Names that match
    no rule fall back to 'Workday'.
    """
    lowered = file_name.lower()
    ordered_rules = (
        (('peoplesoft',), 'Peoplesoft'),
        (('react dev',), 'React Developer'),
        (('reactjs', 'react js'), 'ReactJS Developer'),
        (('react',), 'React'),
    )
    for keywords, category in ordered_rules:
        if any(keyword in lowered for keyword in keywords):
            return category
    return 'Workday'

# Function to save extracted data to Excel
def save_to_excel(data, output_path):
    """Write extracted records to an Excel file with a category column.

    Args:
        data (list[dict]): Records, each expected to carry a "file_name" key.
        output_path (str): Destination .xlsx path.

    A "Keyword_Category" column is derived from "file_name" via
    categorize_file_name. When there are no records at all (for example the
    scanned folder held no supported files) a header-only sheet is written
    instead of failing with KeyError on the missing "file_name" column.
    """
    df = pd.DataFrame(data)
    if df.empty and 'file_name' not in df.columns:
        # No rows means pandas could not infer any columns; add the one the
        # categorisation step needs so the lookup below succeeds.
        df['file_name'] = pd.Series(dtype='object')
    # Add keyword category based on file name
    df['Keyword_Category'] = df['file_name'].apply(categorize_file_name)
    df.to_excel(output_path, index=False)

# Main function to integrate everything
def main(folder_path, excel_output):
    """Run the pipeline: read documents, extract fields, save to Excel.

    Args:
        folder_path (str): Folder holding the documents to process.
        excel_output (str): Path of the Excel file to write.
    """
    print("Reading files...")
    documents = read_files_in_folder(folder_path)

    records = []
    print("Extracting data...")
    for document in documents:
        source_name = document["file_name"]
        record = extract_data_with_ollama(document["content"], source_name)
        # Tag the record with its source file so rows stay traceable.
        record["file_name"] = source_name
        records.append(record)

    print("Saving results...")
    save_to_excel(records, excel_output)
    print("Processing completed.")

if __name__ == "__main__":
    # Ask for the input folder interactively; this blocks until a path is typed.
    folder_path = input("Enter the folder path containing DOCX/DOC files: ")
    # Results are written to this workbook; the name is relative, so it lands
    # in the current working directory.
    excel_output = "extracted_dataAV.xlsx"
    main(folder_path, excel_output)


Reading files...


100%|██████████| 1/1 [00:00<00:00, 150.69it/s]

Extracting data...





Response: {'model': 'llama3.2:latest', 'created_at': '2024-12-23T13:59:19.9171209Z', 'message': {'role': 'assistant', 'content': 'Here is the extracted text:\n\nName: Arun Venu\nAge in years: NA\nQualification: BCA, M.Tech (Software Engineering)\nSubject Area of Highest Qualification: Software Engineering\nPlace of Education for Highest Qualification: BITS Pilani\nCoding language: NA\nSpoken language: NA\nSkill set: Peoplesoft FSCM, PeopleTools, Oracle DB, Fluid Pages Conversion, XML/CSV conversion, EFM files Transfer, Database management\nYears of work experience: 2 years\nAny links given/email-ID: NA'}, 'done_reason': 'stop', 'done': True, 'total_duration': 2662679200, 'load_duration': 27440500, 'prompt_eval_count': 940, 'prompt_eval_duration': 575000000, 'eval_count': 114, 'eval_duration': 2058000000}
Saving results...
Processing completed.


In [3]:
import pandas as pd
import re

def preprocess_and_format_excel(input_file_path, output_file_path, column_name):
    """
    Read an Excel sheet, flatten the text of one column into a single
    delimiter-free line, and write the result to a new Excel file.

    The flattened text is stored in a new "formatted_info" column; the
    original column is left untouched. If the requested column is absent a
    message is printed and nothing is written.

    Args:
        input_file_path (str): Path to the input Excel file.
        output_file_path (str): Path to save the formatted Excel file.
        column_name (str): Name of the column to preprocess.
    """
    def normalize_text(text):
        """
        Flatten one cell: collapse whitespace, then drop ':' / '-' delimiters.
        Non-string and blank values are returned unchanged.
        """
        if not (isinstance(text, str) and text.strip()):
            return text

        # Every whitespace run (newlines included) becomes a single space.
        flattened = re.sub(r"\s+", " ", text.strip())

        # A colon or dash following a word, with any surrounding spaces,
        # becomes ": ".
        delimited = re.sub(r"(\w+)\s*[:\-]\s*", r"\1: ", flattened)

        # Newlines were already collapsed above, so this rule cannot match.
        rejoined = re.sub(r"(\w+)\n(\w+)", r"\1: \2", delimited)

        # Finally every colon is removed, including the ones just inserted,
        # so "email-ID: NA" ends up as "email ID NA".
        return rejoined.replace(":", "")

    sheet = pd.read_excel(input_file_path)

    if column_name not in sheet.columns:
        print(f"The column '{column_name}' is missing in the input file.")
        return

    sheet["formatted_info"] = sheet[column_name].apply(normalize_text)

    sheet.to_excel(output_file_path, index=False)
    print(f"Formatted data saved to {output_file_path}")

# File paths (absolute, machine-specific Windows locations).
# NOTE(review): the extraction cell writes "extracted_dataAV.xlsx" in the
# working directory, while this cell reads "extracted_data11.xlsx" from a
# different folder — confirm this is the intended input.
input_file = r"C:\Users\polpi\Desktop\data science\project\docker_project\extracted_data11.xlsx"  # Input file path
output_file = r"C:\Users\polpi\Desktop\data science\project\docker_project\formatted_excel2.xlsx"  # Output file path
column_to_format = "extracted_info"   # Column to normalize

# Run the preprocessing function; it adds a "formatted_info" column and
# writes the result to output_file.
preprocess_and_format_excel(input_file, output_file, column_to_format)

Formatted data saved to C:\Users\polpi\Desktop\data science\project\docker_project\formatted_excel2.xlsx


In [4]:
import pandas as pd

# Define a function to clean and standardize the "formatted_info" column
def clean_formatted_info(text):
    """Shorten field labels and drop the trailing links/email section.

    Non-string values are returned unchanged. For strings, the
    "Here is the extracted text" preamble is removed, three verbose field
    labels are replaced by short ones, and everything from
    "Any links given/email ID" (case-insensitive) to the end of that line
    is deleted.
    """
    if not isinstance(text, str):
        return text  # Skip if not a string

    cleaned = text.replace("Here is the extracted text", "").strip()

    label_renames = (
        ("Age in years", "Age"),
        ("Subject Area of Highest Qualification", "Subject Area"),
        ("Place of Education for Highest Qualification", "Place of Education"),
    )
    for verbose_label, short_label in label_renames:
        cleaned = cleaned.replace(verbose_label, short_label)

    without_links = re.sub(r"Any links given/email ID.*", "", cleaned, flags=re.IGNORECASE)
    return without_links.strip()

# File paths (absolute, machine-specific Windows locations). The input is the
# workbook written by the formatting cell; these names rebind the
# input_file / output_file globals set in that cell.
input_file = r"C:\Users\polpi\Desktop\data science\project\docker_project\formatted_excel2.xlsx"
output_file = r"C:\Users\polpi\Desktop\data science\project\docker_project\cleaned_formatted_excel.xlsx"

# Load the Excel file
df = pd.read_excel(input_file)

# Apply the cleaning function to the "formatted_info" column, overwriting it
# in place (re-running this line alone re-cleans already-cleaned text).
df['formatted_info'] = df['formatted_info'].apply(clean_formatted_info)

# Save the updated DataFrame to a new Excel file (row index is not written)
df.to_excel(output_file, index=False)

print(f"Processed file saved to {output_file}")



Processed file saved to C:\Users\polpi\Desktop\data science\project\docker_project\cleaned_formatted_excel.xlsx
